From cf28b2794b23604c1b45285f139415397998ac6d Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Wed, 20 Nov 2019 13:20:11 -0500 Subject: [PATCH 001/121] created skeleton for greyscale model --- common/ScaLBL.h | 5 + cpu/Greyscale.cpp | 278 ++++++++++++++ gpu/Greyscale.cu | 311 ++++++++++++++++ models/GreyscaleModel.cpp | 568 +++++++++++++++++++++++++++++ models/GreyscaleModel.h | 81 ++++ tests/CMakeLists.txt | 1 + tests/lbpm_greyscale_simulator.cpp | 64 ++++ 7 files changed, 1308 insertions(+) create mode 100644 cpu/Greyscale.cpp create mode 100644 gpu/Greyscale.cu create mode 100644 models/GreyscaleModel.cpp create mode 100644 models/GreyscaleModel.h create mode 100644 tests/lbpm_greyscale_simulator.cpp diff --git a/common/ScaLBL.h b/common/ScaLBL.h index a50ab7ed..efca3be8 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -55,6 +55,11 @@ extern "C" void ScaLBL_D3Q19_AAeven_BGK(double *dist, int start, int finish, int extern "C" void ScaLBL_D3Q19_AAodd_BGK(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz); +// GREYSCALE MODEL +extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz); + +extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz); + // MRT MODEL extern "C" void ScaLBL_D3Q19_AAeven_MRT(double *dist, int start, int finish, int Np, double rlx_setA, double rlx_setB, double Fx, double Fy, double Fz); diff --git a/cpu/Greyscale.cpp b/cpu/Greyscale.cpp new file mode 100644 index 00000000..a800413d --- /dev/null +++ b/cpu/Greyscale.cpp @@ -0,0 +1,278 @@ +extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){ + int n; + // conserved momemnts + double rho,ux,uy,uz,uu; + // non-conserved moments + double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18; + + for (int n=start; n 10Np => odd part of dist) + f1 = dist[nr1]; // reading the f1 data into register fq + + nr2 = neighborList[n+Np]; // neighbor 1 ( < 10Np => even part of dist) + f2 = dist[nr2]; // reading the f2 data into register fq + + // q=3 + nr3 = neighborList[n+2*Np]; // neighbor 4 + f3 = dist[nr3]; + + // q = 4 + nr4 = neighborList[n+3*Np]; // neighbor 3 + f4 = dist[nr4]; + + // q=5 + nr5 = neighborList[n+4*Np]; + f5 = dist[nr5]; + + // q = 6 + nr6 = neighborList[n+5*Np]; + f6 = dist[nr6]; + + // q=7 + nr7 = neighborList[n+6*Np]; + f7 = dist[nr7]; + + // q = 8 + nr8 = neighborList[n+7*Np]; + f8 = dist[nr8]; + + // q=9 + nr9 = neighborList[n+8*Np]; + f9 = dist[nr9]; + + // q = 10 + nr10 = neighborList[n+9*Np]; + f10 = dist[nr10]; + + // q=11 + nr11 = neighborList[n+10*Np]; + f11 = dist[nr11]; + + // q=12 + nr12 = neighborList[n+11*Np]; + f12 = dist[nr12]; + + // q=13 + nr13 = neighborList[n+12*Np]; + f13 = dist[nr13]; + + // q=14 + nr14 = neighborList[n+13*Np]; + f14 = dist[nr14]; + + // q=15 + nr15 = neighborList[n+14*Np]; + f15 = dist[nr15]; + + // q=16 + nr16 = neighborList[n+15*Np]; + f16 = dist[nr16]; + + // q=17 + //fq = dist[18*Np+n]; + nr17 = neighborList[n+16*Np]; + f17 = dist[nr17]; + + // q=18 + nr18 = neighborList[n+17*Np]; + f18 = dist[nr18]; + + rho = f0+f2+f1+f4+f3+f6+f5+f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18+f17; + ux = f1-f2+f7-f8+f9-f10+f11-f12+f13-f14; + uy = f3-f4+f7-f8-f9+f10+f15-f16+f17-f18; + uz = f5-f6+f11-f12-f13+f14+f15-f16-f17+f18; + uu = 1.5*(ux*ux+uy*uy+uz*uz); 
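The relaxation writes that follow implement a single-relaxation-time (BGK) update toward a D3Q19 equilibrium. Reconstructing the form from the coefficients used in the code (weights 1/3, 1/18, 1/36 and forcing prefactors 1/6 and 1/12, i.e. 3*w_q), the intended update for the non-rest directions appears to be

\[ f_q \leftarrow (1-\omega)\, f_q + \omega\, w_q \left[ \rho + 3(\mathbf{e}_q \cdot \mathbf{u}) + \tfrac{9}{2}(\mathbf{e}_q \cdot \mathbf{u})^2 - \tfrac{3}{2}\,\mathbf{u} \cdot \mathbf{u} \right] + 3\, w_q\, (\mathbf{e}_q \cdot \mathbf{F}), \qquad q > 0, \]

with \(\omega\) = rlx, \(\mathbf{u}\) the pre-collision momentum computed just above, and \(\mathbf{F}\) = (Fx, Fy, Fz) the body force. The rest direction (q = 0) as written relaxes toward (1/3)(1.0 - uu) rather than (1/3)(rho - uu), so this sketch matches it only when rho is close to 1; it is a reading of the code above, not a formula stated in the commit.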
+ + // q=0 + dist[n] = f0*(1.0-rlx)+rlx*0.3333333333333333*(1.0-uu); + + // q = 1 + dist[nr2] = f1*(1.0-rlx) + rlx*0.05555555555555555*(rho + 3.0*ux + 4.5*ux*ux - uu) + 0.16666666*Fx; + + // q=2 + dist[nr1] = f2*(1.0-rlx) + rlx*0.05555555555555555*(rho - 3.0*ux + 4.5*ux*ux - uu)- 0.16666666*Fx; + + // q = 3 + dist[nr4] = f3*(1.0-rlx) + + rlx*0.05555555555555555*(rho + 3.0*uy + 4.5*uy*uy - uu) + 0.16666666*Fy; + + // q = 4 + dist[nr3] = f4*(1.0-rlx) + + rlx*0.05555555555555555*(rho - 3.0*uy + 4.5*uy*uy - uu)- 0.16666666*Fy; + + // q = 5 + dist[nr6] = f5*(1.0-rlx) + + rlx*0.05555555555555555*(rho + 3.0*uz + 4.5*uz*uz - uu) + 0.16666666*Fz; + + // q = 6 + dist[nr5] = f6*(1.0-rlx) + + rlx*0.05555555555555555*(rho - 3.0*uz + 4.5*uz*uz - uu) - 0.16666666*Fz; + + // q = 7 + dist[nr8] = f7*(1.0-rlx) + + rlx*0.02777777777777778*(rho + 3.0*(ux+uy) + 4.5*(ux+uy)*(ux+uy) - uu) + 0.08333333333*(Fx+Fy); + + // q = 8 + dist[nr7] = f8*(1.0-rlx) + + rlx*0.02777777777777778*(rho - 3.0*(ux+uy) + 4.5*(ux+uy)*(ux+uy) - uu) - 0.08333333333*(Fx+Fy); + + // q = 9 + dist[nr10] = f9*(1.0-rlx) + + rlx*0.02777777777777778*(rho + 3.0*(ux-uy) + 4.5*(ux-uy)*(ux-uy) - uu) + 0.08333333333*(Fx-Fy); + + // q = 10 + dist[nr9] = f10*(1.0-rlx) + + rlx*0.02777777777777778*(rho - 3.0*(ux-uy) + 4.5*(ux-uy)*(ux-uy) - uu) - 0.08333333333*(Fx-Fy); + + // q = 11 + dist[nr12] = f11*(1.0-rlx) + + rlx*0.02777777777777778*(rho + 3.0*(ux+uz) + 4.5*(ux+uz)*(ux+uz) - uu) + 0.08333333333*(Fx+Fz); + + // q = 12 + dist[nr11] = f12*(1.0-rlx) + + rlx*0.02777777777777778*(rho - 3.0*(ux+uz) + 4.5*(ux+uz)*(ux+uz) - uu) - 0.08333333333*(Fx+Fz); + + // q = 13 + dist[nr14] = f13*(1.0-rlx) + + rlx*0.02777777777777778*(rho + 3.0*(ux-uz) + 4.5*(ux-uz)*(ux-uz) - uu) + 0.08333333333*(Fx-Fz); + + // q= 14 + dist[nr13] = f14*(1.0-rlx) + + rlx*0.02777777777777778*(rho - 3.0*(ux-uz) + 4.5*(ux-uz)*(ux-uz) - uu)- 0.08333333333*(Fx-Fz); + + // q = 15 + dist[nr16] = f15*(1.0-rlx) + + rlx*0.02777777777777778*(rho + 3.0*(uy+uz) + 4.5*(uy+uz)*(uy+uz) - uu) + 0.08333333333*(Fy+Fz); + + // q = 16 + dist[nr15] = f16*(1.0-rlx) + + rlx*0.02777777777777778*(rho - 3.0*(uy+uz) + 4.5*(uy+uz)*(uy+uz) - uu) - 0.08333333333*(Fy+Fz); + + // q = 17 + dist[nr18] = f17*(1.0-rlx) + + rlx*0.02777777777777778*(rho + 3.0*(uy-uz) + 4.5*(uy-uz)*(uy-uz) - uu) + 0.08333333333*(Fy-Fz); + + // q = 18 + dist[nr17] = f18*(1.0-rlx) + + rlx*0.02777777777777778*(rho - 3.0*(uy-uz) + 4.5*(uy-uz)*(uy-uz) - uu) - 0.08333333333*(Fy-Fz); + + } +} \ No newline at end of file diff --git a/gpu/Greyscale.cu b/gpu/Greyscale.cu new file mode 100644 index 00000000..04b5e979 --- /dev/null +++ b/gpu/Greyscale.cu @@ -0,0 +1,311 @@ +#include + +#define NBLOCKS 1024 +#define NTHREADS 256 + +__global__ void dvc_ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){ + int n; + // conserved momemnts + double rho,ux,uy,uz,uu; + // non-conserved moments + double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18; + + int S = Np/NBLOCKS/NTHREADS + 1; + for (int s=0; s 10Np => odd part of dist) + f1 = dist[nr1]; // reading the f1 data into register fq + + nr2 = neighborList[n+Np]; // neighbor 1 ( < 10Np => even part of dist) + f2 = dist[nr2]; // reading the f2 data into register fq + + // q=3 + nr3 = neighborList[n+2*Np]; // neighbor 4 + f3 = dist[nr3]; + + // q = 4 + nr4 = neighborList[n+3*Np]; // neighbor 3 + f4 = dist[nr4]; + + // q=5 + nr5 = neighborList[n+4*Np]; + f5 = dist[nr5]; + + // q = 6 + nr6 = neighborList[n+5*Np]; + f6 = 
dist[nr6]; + + // q=7 + nr7 = neighborList[n+6*Np]; + f7 = dist[nr7]; + + // q = 8 + nr8 = neighborList[n+7*Np]; + f8 = dist[nr8]; + + // q=9 + nr9 = neighborList[n+8*Np]; + f9 = dist[nr9]; + + // q = 10 + nr10 = neighborList[n+9*Np]; + f10 = dist[nr10]; + + // q=11 + nr11 = neighborList[n+10*Np]; + f11 = dist[nr11]; + + // q=12 + nr12 = neighborList[n+11*Np]; + f12 = dist[nr12]; + + // q=13 + nr13 = neighborList[n+12*Np]; + f13 = dist[nr13]; + + // q=14 + nr14 = neighborList[n+13*Np]; + f14 = dist[nr14]; + + // q=15 + nr15 = neighborList[n+14*Np]; + f15 = dist[nr15]; + + // q=16 + nr16 = neighborList[n+15*Np]; + f16 = dist[nr16]; + + // q=17 + //fq = dist[18*Np+n]; + nr17 = neighborList[n+16*Np]; + f17 = dist[nr17]; + + // q=18 + nr18 = neighborList[n+17*Np]; + f18 = dist[nr18]; + + rho = f0+f2+f1+f4+f3+f6+f5+f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18+f17; + ux = f1-f2+f7-f8+f9-f10+f11-f12+f13-f14; + uy = f3-f4+f7-f8-f9+f10+f15-f16+f17-f18; + uz = f5-f6+f11-f12-f13+f14+f15-f16-f17+f18; + uu = 1.5*(ux*ux+uy*uy+uz*uz); + + // q=0 + dist[n] = f0*(1.0-rlx)+rlx*0.3333333333333333*(1.0-uu); + + // q = 1 + dist[nr2] = f1*(1.0-rlx) + rlx*0.05555555555555555*(rho + 3.0*ux + 4.5*ux*ux - uu) + 0.16666666*Fx; + + // q=2 + dist[nr1] = f2*(1.0-rlx) + rlx*0.05555555555555555*(rho - 3.0*ux + 4.5*ux*ux - uu)- 0.16666666*Fx; + + // q = 3 + dist[nr4] = f3*(1.0-rlx) + + rlx*0.05555555555555555*(rho + 3.0*uy + 4.5*uy*uy - uu) + 0.16666666*Fy; + + // q = 4 + dist[nr3] = f4*(1.0-rlx) + + rlx*0.05555555555555555*(rho - 3.0*uy + 4.5*uy*uy - uu)- 0.16666666*Fy; + + // q = 5 + dist[nr6] = f5*(1.0-rlx) + + rlx*0.05555555555555555*(rho + 3.0*uz + 4.5*uz*uz - uu) + 0.16666666*Fz; + + // q = 6 + dist[nr5] = f6*(1.0-rlx) + + rlx*0.05555555555555555*(rho - 3.0*uz + 4.5*uz*uz - uu) - 0.16666666*Fz; + + // q = 7 + dist[nr8] = f7*(1.0-rlx) + + rlx*0.02777777777777778*(rho + 3.0*(ux+uy) + 4.5*(ux+uy)*(ux+uy) - uu) + 0.08333333333*(Fx+Fy); + + // q = 8 + dist[nr7] = f8*(1.0-rlx) + + rlx*0.02777777777777778*(rho - 3.0*(ux+uy) + 4.5*(ux+uy)*(ux+uy) - uu) - 0.08333333333*(Fx+Fy); + + // q = 9 + dist[nr10] = f9*(1.0-rlx) + + rlx*0.02777777777777778*(rho + 3.0*(ux-uy) + 4.5*(ux-uy)*(ux-uy) - uu) + 0.08333333333*(Fx-Fy); + + // q = 10 + dist[nr9] = f10*(1.0-rlx) + + rlx*0.02777777777777778*(rho - 3.0*(ux-uy) + 4.5*(ux-uy)*(ux-uy) - uu) - 0.08333333333*(Fx-Fy); + + // q = 11 + dist[nr12] = f11*(1.0-rlx) + + rlx*0.02777777777777778*(rho + 3.0*(ux+uz) + 4.5*(ux+uz)*(ux+uz) - uu) + 0.08333333333*(Fx+Fz); + + // q = 12 + dist[nr11] = f12*(1.0-rlx) + + rlx*0.02777777777777778*(rho - 3.0*(ux+uz) + 4.5*(ux+uz)*(ux+uz) - uu) - 0.08333333333*(Fx+Fz); + + // q = 13 + dist[nr14] = f13*(1.0-rlx) + + rlx*0.02777777777777778*(rho + 3.0*(ux-uz) + 4.5*(ux-uz)*(ux-uz) - uu) + 0.08333333333*(Fx-Fz); + + // q= 14 + dist[nr13] = f14*(1.0-rlx) + + rlx*0.02777777777777778*(rho - 3.0*(ux-uz) + 4.5*(ux-uz)*(ux-uz) - uu)- 0.08333333333*(Fx-Fz); + + // q = 15 + dist[nr16] = f15*(1.0-rlx) + + rlx*0.02777777777777778*(rho + 3.0*(uy+uz) + 4.5*(uy+uz)*(uy+uz) - uu) + 0.08333333333*(Fy+Fz); + + // q = 16 + dist[nr15] = f16*(1.0-rlx) + + rlx*0.02777777777777778*(rho - 3.0*(uy+uz) + 4.5*(uy+uz)*(uy+uz) - uu) - 0.08333333333*(Fy+Fz); + + // q = 17 + dist[nr18] = f17*(1.0-rlx) + + rlx*0.02777777777777778*(rho + 3.0*(uy-uz) + 4.5*(uy-uz)*(uy-uz) - uu) + 0.08333333333*(Fy-Fz); + + // q = 18 + dist[nr17] = f18*(1.0-rlx) + + rlx*0.02777777777777778*(rho - 3.0*(uy-uz) + 4.5*(uy-uz)*(uy-uz) - uu) - 0.08333333333*(Fy-Fz); + } + } +} + +extern "C" void 
ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){ + + dvc_ScaLBL_D3Q19_AAeven_Greyscale<<>>(dist,start,finish,Np,rlx,Fx,Fy,Fz); + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_AAeven_Greyscale: %s \n",cudaGetErrorString(err)); + } +} + +extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){ + dvc_ScaLBL_D3Q19_AAodd_Greyscale<<>>(neighborList,dist,start,finish,Np,rlx,Fx,Fy,Fz); + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_AAeven_Greyscale: %s \n",cudaGetErrorString(err)); + } +} \ No newline at end of file diff --git a/models/GreyscaleModel.cpp b/models/GreyscaleModel.cpp new file mode 100644 index 00000000..980d15b5 --- /dev/null +++ b/models/GreyscaleModel.cpp @@ -0,0 +1,568 @@ +/* +color lattice boltzmann model + */ +#include "models/GreyscaleModel.h" +#include "analysis/distance.h" +#include "analysis/morphology.h" +#include +#include + +ScaLBL_GreyscaleModel::ScaLBL_GreyscaleModel(int RANK, int NP, MPI_Comm COMM): +rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tau(0), +Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0), +Nx(0),Ny(0),Nz(0),N(0),Np(0),nprocx(0),nprocy(0),nprocz(0),BoundaryCondition(0),Lx(0),Ly(0),Lz(0),comm(COMM) +{ + SignDist.resize(Nx,Ny,Nz); SignDist.fill(0); + +} +ScaLBL_GreyscaleModel::~ScaLBL_GreyscaleModel(){ + +} + +void ScaLBL_GreyscaleModel::ReadParams(string filename){ + // read the input database + db = std::make_shared( filename ); + domain_db = db->getDatabase( "Domain" ); + greyscale_db = db->getDatabase( "Greyscale" ); + analysis_db = db->getDatabase( "Analysis" ); + vis_db = db->getDatabase( "Visualization" ); + + // set defaults + timestepMax = 100000; + tau = 1.0; + tolerance = 0.01; + Fx = Fy = Fz = 0.0; + Restart=false; + din=dout=1.0; + flux=0.0; + + // Color Model parameters + if (greyscale_db->keyExists( "timestepMax" )){ + timestepMax = greyscale_db->getScalar( "timestepMax" ); + } + if (greyscale_db->keyExists( "tau" )){ + tau = greyscale_db->getScalar( "tauA" ); + } + if (greyscale_db->keyExists( "F" )){ + Fx = greyscale_db->getVector( "F" )[0]; + Fy = greyscale_db->getVector( "F" )[1]; + Fz = greyscale_db->getVector( "F" )[2]; + } + if (greyscale_db->keyExists( "Restart" )){ + Restart = greyscale_db->getScalar( "Restart" ); + } + if (greyscale_db->keyExists( "din" )){ + din = greyscale_db->getScalar( "din" ); + } + if (greyscale_db->keyExists( "dout" )){ + dout = greyscale_db->getScalar( "dout" ); + } + if (greyscale_db->keyExists( "flux" )){ + flux = greyscale_db->getScalar( "flux" ); + } + if (greyscale_db->keyExists( "tolerance" )){ + tolerance = greyscale_db->getScalar( "tolerance" ); + } + BoundaryCondition = 0; + if (domain_db->keyExists( "BC" )){ + BoundaryCondition = domain_db->getScalar( "BC" ); + } +} + +void ScaLBL_GreyscaleModel::SetDomain(){ + Dm = std::shared_ptr(new Domain(domain_db,comm)); // full domain for analysis + Mask = std::shared_ptr(new Domain(domain_db,comm)); // mask domain removes immobile phases + // domain parameters + Nx = Dm->Nx; + Ny = Dm->Ny; + Nz = Dm->Nz; + Lx = Dm->Lx; + Ly = Dm->Ly; + Lz = Dm->Lz; + N = Nx*Ny*Nz; + id = new signed char [N]; + for (int i=0; iid[i] = 1; // initialize this way + MPI_Barrier(comm); + Dm->CommInit(); + MPI_Barrier(comm); + // Read domain parameters + rank = Dm->rank(); + nprocx = 
Dm->nprocx(); + nprocy = Dm->nprocy(); + nprocz = Dm->nprocz(); +} + +void ScaLBL_GreyscaleModel::ReadInput(){ + + sprintf(LocalRankString,"%05d",rank); + sprintf(LocalRankFilename,"%s%s","ID.",LocalRankString); + sprintf(LocalRestartFile,"%s%s","Restart.",LocalRankString); + + if (domain_db->keyExists( "Filename" )){ + auto Filename = domain_db->getScalar( "Filename" ); + Mask->Decomp(Filename); + } + else{ + Mask->ReadIDs(); + } + for (int i=0; iid[i]; // save what was read + + // Generate the signed distance map + // Initialize the domain and communication + Array id_solid(Nx,Ny,Nz); + int count = 0; + // Solve for the position of the solid phase + for (int k=0;kid[n]; + if (label > 0) id_solid(i,j,k) = 1; + else id_solid(i,j,k) = 0; + } + } + } + // Initialize the signed distance function + for (int k=0;kgetVector( "ComponentLabels" ); + auto PorosityList = greyscale_db->getVector( "PorosityList" ); + auto PermeabilityList = greyscale_db->getVector( "PermeabilityList" ); + + NLABELS=LabelList.size(); + if (NLABELS != PorosityList.size()){ + ERROR("Error: ComponentLabels and PorosityList must be the same length! \n"); + } + + double label_count[NLABELS]; + double label_count_global[NLABELS]; + // Assign the labels + + for (int idx=0; idxid[n] = 0; // set mask to zero since this is an immobile component + } + } + // fluid labels are reserved / negative labels are immobile + if (VALUE == 1) POROSITY=1.0; + else if (VALUE == 2) POROSITY=1.0; + else if (VALUE < 1) POROSITY = 0.0; + int idx = Map(i,j,k); + if (!(idx < 0)) + Porosity[idx] = POROSITY; + } + } + } + + if (NLABELS != PermeabilityList.size()){ + ERROR("Error: ComponentLabels and PermeabilityList must be the same length! \n"); + } + for (int k=1;kid[n] = 0; // set mask to zero since this is an immobile component + } + } + // fluid labels are reserved / negative labels are immobile + if (VALUE == 1) PERMEABILITY=1.0; + else if (VALUE == 2) PERMEABILITY=1.0; + else if (VALUE < 1) PERMEABILITY = 0.0; + int idx = Map(i,j,k); + if (!(idx < 0)) + Permeability[idx] = PERMEABILITY; + } + } + } + + + // Set Dm to match Mask + for (int i=0; iid[i] = Mask->id[i]; + + for (int idx=0; idxComm, label_count[idx]); + + if (rank==0){ + printf("Component labels: %lu \n",NLABELS); + for (unsigned int idx=0; idxid[i] = Mask->id[i]; + Mask->CommInit(); + Np=Mask->PoreCount(); + //........................................................................... + if (rank==0) printf ("Create ScaLBL_Communicator \n"); + // Create a communicator for the device (will use optimized layout) + // ScaLBL_Communicator ScaLBL_Comm(Mask); // original + ScaLBL_Comm = std::shared_ptr(new ScaLBL_Communicator(Mask)); + + int Npad=(Np/16 + 2)*16; + if (rank==0) printf ("Set up memory efficient layout, %i | %i | %i \n", Np, Npad, N); + Map.resize(Nx,Ny,Nz); Map.fill(-2); + auto neighborList= new int[18*Npad]; + Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Mask->id,Np); + MPI_Barrier(comm); + + //........................................................................... + // MAIN VARIABLES ALLOCATED HERE + //........................................................................... + // LBM variables + if (rank==0) printf ("Allocating distributions \n"); + //......................device distributions................................. + dist_mem_size = Np*sizeof(double); + neighborSize=18*(Np*sizeof(int)); + //........................................................................... 
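To give a sense of scale for the device allocations that follow, the per-rank footprint is dominated by the 19 distributions plus the velocity, porosity, permeability and pressure arrays. A minimal standalone sketch of that budget (Np below is a hypothetical site count, not a value taken from the source):

#include <cstdio>

int main() {
    long long Np = 1000000;                          // hypothetical number of fluid sites per rank
    long long fq      = 19LL * Np * sizeof(double);  // distributions
    long long vel     =  3LL * Np * sizeof(double);  // Velocity
    long long scalars =  3LL * Np * sizeof(double);  // Porosity, Permeability, Pressure
    long long map     =        Np * sizeof(int);     // dvcMap
    long long nbr     = 18LL * Np * sizeof(int);     // NeighborList (ignoring the 16-site padding)
    std::printf("approx. %.1f MB of device memory per rank\n",
                (fq + vel + scalars + map + nbr) / (1024.0 * 1024.0));
    return 0;
}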
+ ScaLBL_AllocateDeviceMemory((void **) &NeighborList, neighborSize); + ScaLBL_AllocateDeviceMemory((void **) &dvcMap, sizeof(int)*Np); + ScaLBL_AllocateDeviceMemory((void **) &fq, 19*dist_mem_size); + ScaLBL_AllocateDeviceMemory((void **) &Permeability, sizeof(double)*Np); + ScaLBL_AllocateDeviceMemory((void **) &Porosity, sizeof(double)*Np); + ScaLBL_AllocateDeviceMemory((void **) &Pressure, sizeof(double)*Np); + ScaLBL_AllocateDeviceMemory((void **) &Velocity, 3*sizeof(double)*Np); + //........................................................................... + // Update GPU data structures + if (rank==0) printf ("Setting up device map and neighbor list \n"); + fflush(stdout); + int *TmpMap; + TmpMap=new int[Np]; + for (int k=1; kLastExterior(); idx++){ + int n = TmpMap[idx]; + if (n > Nx*Ny*Nz){ + printf("Bad value! idx=%i \n"); + TmpMap[idx] = Nx*Ny*Nz-1; + } + } + for (int idx=ScaLBL_Comm->FirstInterior(); idxLastInterior(); idx++){ + int n = TmpMap[idx]; + if (n > Nx*Ny*Nz){ + printf("Bad value! idx=%i \n"); + TmpMap[idx] = Nx*Ny*Nz-1; + } + } + ScaLBL_CopyToDevice(dvcMap, TmpMap, sizeof(int)*Np); + ScaLBL_DeviceBarrier(); + delete [] TmpMap; + + // copy the neighbor list + ScaLBL_CopyToDevice(NeighborList, neighborList, neighborSize); + // initialize phi based on PhaseLabel (include solid component labels) + double *Poros, *Perm; + Poros = new double[Np]; + Perm = new double[Np]; + AssignComponentLabels(Poros,Perm); + ScaLBL_CopyToDevice(Porosity, Poros, Np*sizeof(double)); + ScaLBL_CopyToDevice(Permeability, Perm, Np*sizeof(double)); +} + +/******************************************************** + * AssignComponentLabels * + ********************************************************/ + +void ScaLBL_GreyscaleModel::Initialize(){ + + if (rank==0) printf ("Initializing distributions \n"); + ScaLBL_D3Q19_Init(fq, Np); + /* + * This function initializes model + */ + if (Restart == true){ + if (rank==0){ + printf("Reading restart file! 
\n"); + } + + // Read in the restart file to CPU buffers + int *TmpMap; + TmpMap = new int[Np]; + + double *cDist; + cDist = new double[19*Np]; + ScaLBL_CopyToHost(TmpMap, dvcMap, Np*sizeof(int)); + + ifstream File(LocalRestartFile,ios::binary); + int idx; + double value; + for (int n=0; n analysis_db; + timestep=0; + double rlx = 1.0/tau; + double error = 1.0; + double flow_rate_previous = 0.0; + while (timestep < timestepMax && error > tolerance) { + //************************************************************************/ + timestep++; + ScaLBL_Comm->SendD3Q19AA(fq); //READ FROM NORMAL + ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz); + ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE + ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + timestep++; + ScaLBL_Comm->SendD3Q19AA(fq); //READ FORM NORMAL + ScaLBL_D3Q19_AAeven_Greyscale(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz); + ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE + ScaLBL_D3Q19_AAeven_Greyscale(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + //************************************************************************/ + + if (timestep%1000==0){ + ScaLBL_D3Q19_Momentum(fq,Velocity, Np); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_Comm->RegularLayout(Map,&Velocity[0],Velocity_x); + ScaLBL_Comm->RegularLayout(Map,&Velocity[Np],Velocity_y); + ScaLBL_Comm->RegularLayout(Map,&Velocity[2*Np],Velocity_z); + + double count_loc=0; + double count; + double vax,vay,vaz; + double vax_loc,vay_loc,vaz_loc; + vax_loc = vay_loc = vaz_loc = 0.f; + for (int k=1; k 0){ + vax_loc += Velocity_x(i,j,k); + vay_loc += Velocity_y(i,j,k); + vaz_loc += Velocity_z(i,j,k); + count_loc+=1.0; + } + } + } + } + MPI_Allreduce(&vax_loc,&vax,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&vay_loc,&vay,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&vaz_loc,&vaz,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&count_loc,&count,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + + vax /= count; + vay /= count; + vaz /= count; + + double force_mag = sqrt(Fx*Fx+Fy*Fy+Fz*Fz); + double dir_x = Fx/force_mag; + double dir_y = Fy/force_mag; + double dir_z = Fz/force_mag; + if (force_mag == 0.0){ + // default to z direction + dir_x = 0.0; + dir_y = 0.0; + dir_z = 1.0; + force_mag = 1.0; + } + double flow_rate = (vax*dir_x + vay*dir_y + vaz*dir_z); + + error = fabs(flow_rate - flow_rate_previous) / fabs(flow_rate); + flow_rate_previous = flow_rate; + + //if (rank==0) printf("Computing Minkowski functionals \n"); + Morphology.ComputeScalar(SignDist,0.f); + //Morphology.PrintAll(); + double mu = (tau-0.5)/3.f; + double Vs = Morphology.V(); + double As = Morphology.A(); + double Hs = Morphology.H(); + double Xs = Morphology.X(); + Vs=sumReduce( Dm->Comm, Vs); + As=sumReduce( Dm->Comm, As); + Hs=sumReduce( Dm->Comm, Hs); + Xs=sumReduce( Dm->Comm, Xs); + double h = Dm->voxel_length; + double absperm = h*h*mu*Mask->Porosity()*flow_rate / force_mag; + if (rank==0) { + printf(" %f\n",absperm); + FILE * log_file = fopen("Permeability.csv","a"); + fprintf(log_file,"%i %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g\n",timestep, Fx, Fy, Fz, mu, + h*h*h*Vs,h*h*As,h*Hs,Xs,vax,vay,vaz, absperm); + fclose(log_file); + } + } + } + PROFILE_STOP("Loop"); + PROFILE_SAVE("lbpm_greyscale_simulator",1); + 
//************************************************************************ + ScaLBL_DeviceBarrier(); + MPI_Barrier(comm); + stoptime = MPI_Wtime(); + if (rank==0) printf("-------------------------------------------------------------------\n"); + // Compute the walltime per timestep + cputime = (stoptime - starttime)/timestep; + // Performance obtained from each node + double MLUPS = double(Np)/cputime/1000000; + + if (rank==0) printf("********************************************************\n"); + if (rank==0) printf("CPU time = %f \n", cputime); + if (rank==0) printf("Lattice update rate (per core)= %f MLUPS \n", MLUPS); + MLUPS *= nprocs; + if (rank==0) printf("Lattice update rate (total)= %f MLUPS \n", MLUPS); + if (rank==0) printf("********************************************************\n"); + + // ************************************************************************ +} + + +void ScaLBL_GreyscaleModel::WriteDebug(){ + // Copy back final phase indicator field and convert to regular layout +/* ScaLBL_CopyToHost(Porosity.data(), Poros, sizeof(double)*N); + + FILE *OUTFILE; + sprintf(LocalRankFilename,"Phase.%05i.raw",rank); + OUTFILE = fopen(LocalRankFilename,"wb"); + fwrite(PhaseField.data(),8,N,OUTFILE); + fclose(OUTFILE); + + ScaLBL_Comm->RegularLayout(Map,&Den[0],PhaseField); + FILE *AFILE; + sprintf(LocalRankFilename,"A.%05i.raw",rank); + AFILE = fopen(LocalRankFilename,"wb"); + fwrite(PhaseField.data(),8,N,AFILE); + fclose(AFILE); + + ScaLBL_Comm->RegularLayout(Map,&Den[Np],PhaseField); + FILE *BFILE; + sprintf(LocalRankFilename,"B.%05i.raw",rank); + BFILE = fopen(LocalRankFilename,"wb"); + fwrite(PhaseField.data(),8,N,BFILE); + fclose(BFILE); + + ScaLBL_Comm->RegularLayout(Map,Pressure,PhaseField); + FILE *PFILE; + sprintf(LocalRankFilename,"Pressure.%05i.raw",rank); + PFILE = fopen(LocalRankFilename,"wb"); + fwrite(PhaseField.data(),8,N,PFILE); + fclose(PFILE); + + ScaLBL_Comm->RegularLayout(Map,&Velocity[0],PhaseField); + FILE *VELX_FILE; + sprintf(LocalRankFilename,"Velocity_X.%05i.raw",rank); + VELX_FILE = fopen(LocalRankFilename,"wb"); + fwrite(PhaseField.data(),8,N,VELX_FILE); + fclose(VELX_FILE); + + ScaLBL_Comm->RegularLayout(Map,&Velocity[Np],PhaseField); + FILE *VELY_FILE; + sprintf(LocalRankFilename,"Velocity_Y.%05i.raw",rank); + VELY_FILE = fopen(LocalRankFilename,"wb"); + fwrite(PhaseField.data(),8,N,VELY_FILE); + fclose(VELY_FILE); + + ScaLBL_Comm->RegularLayout(Map,&Velocity[2*Np],PhaseField); + FILE *VELZ_FILE; + sprintf(LocalRankFilename,"Velocity_Z.%05i.raw",rank); + VELZ_FILE = fopen(LocalRankFilename,"wb"); + fwrite(PhaseField.data(),8,N,VELZ_FILE); + fclose(VELZ_FILE); + + * + */ + +} diff --git a/models/GreyscaleModel.h b/models/GreyscaleModel.h new file mode 100644 index 00000000..37ddf28f --- /dev/null +++ b/models/GreyscaleModel.h @@ -0,0 +1,81 @@ +/* +Implementation of color lattice boltzmann model + */ +#include +#include +#include +#include +#include +#include +#include + +#include "common/Communication.h" +#include "common/MPI_Helpers.h" +#include "common/Database.h" +#include "common/ScaLBL.h" +#include "ProfilerApp.h" +#include "threadpool/thread_pool.h" + +class ScaLBL_GreyscaleModel{ +public: + ScaLBL_GreyscaleModel(int RANK, int NP, MPI_Comm COMM); + ~ScaLBL_GreyscaleModel(); + + // functions in they should be run + void ReadParams(string filename); + void ReadParams(std::shared_ptr db0); + void SetDomain(); + void ReadInput(); + void Create(); + void Initialize(); + void Run(); + void WriteDebug(); + + bool Restart,pBC; + int 
timestep,timestepMax; + int BoundaryCondition; + double tau; + double tolerance; + double Fx,Fy,Fz,flux; + double din,dout; + + int Nx,Ny,Nz,N,Np; + int rank,nprocx,nprocy,nprocz,nprocs; + double Lx,Ly,Lz; + + std::shared_ptr Dm; // this domain is for analysis + std::shared_ptr Mask; // this domain is for lbm + std::shared_ptr ScaLBL_Comm; + + // input database + std::shared_ptr db; + std::shared_ptr domain_db; + std::shared_ptr greyscale_db; + std::shared_ptr analysis_db; + std::shared_ptr vis_db; + + IntArray Map; + DoubleArray SignDist; + signed char *id; + int *NeighborList; + int *dvcMap; + double *fq; + double *Permeability; + double *Porosity; + double *Velocity; + double *Pressure; + +private: + MPI_Comm comm; + + int dist_mem_size; + int neighborSize; + // filenames + char LocalRankString[8]; + char LocalRankFilename[40]; + char LocalRestartFile[40]; + + void AssignComponentLabels(double *Porosity, double *Permeablity); + +}; + diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 8d600321..8b14a9dc 100755 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -3,6 +3,7 @@ #ADD_LBPM_EXECUTABLE( lbpm_nondarcy_simulator ) ADD_LBPM_EXECUTABLE( lbpm_color_simulator ) ADD_LBPM_EXECUTABLE( lbpm_permeability_simulator ) +ADD_LBPM_EXECUTABLE( lbpm_greyscale_simulator ) #ADD_LBPM_EXECUTABLE( lbpm_BGK_simulator ) #ADD_LBPM_EXECUTABLE( lbpm_color_macro_simulator ) ADD_LBPM_EXECUTABLE( lbpm_dfh_simulator ) diff --git a/tests/lbpm_greyscale_simulator.cpp b/tests/lbpm_greyscale_simulator.cpp new file mode 100644 index 00000000..9ab7c385 --- /dev/null +++ b/tests/lbpm_greyscale_simulator.cpp @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "common/ScaLBL.h" +#include "common/Communication.h" +#include "common/MPI_Helpers.h" +#include "models/GreyscaleModel.h" +//#define WRITE_SURFACES + +/* + * Simulator for two-phase flow in porous media + * James E. 
McClure 2013-2014 + */ + +using namespace std; + + +int main(int argc, char **argv) +{ + //***************************************** + // ***** MPI STUFF **************** + //***************************************** + // Initialize MPI + int rank,nprocs; + MPI_Init(&argc,&argv); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); + { + // parallel domain size (# of sub-domains) + int nprocx,nprocy,nprocz; + int iproc,jproc,kproc; + + if (rank == 0){ + printf("********************************************************\n"); + printf("Running Greyscale Single Phase Permeability Calculation \n"); + printf("********************************************************\n"); + } + // Initialize compute device + int device=ScaLBL_SetDevice(rank); + ScaLBL_DeviceBarrier(); + MPI_Barrier(comm); + + + ScaLBL_MRTModel MRT(rank,nprocs,comm); + auto filename = argv[1]; + MRT.ReadParams(filename); + MRT.SetDomain(); // this reads in the domain + MRT.ReadInput(); + MRT.Create(); // creating the model will create data structure to match the pore structure and allocate variables + MRT.Initialize(); // initializing the model will set initial conditions for variables + MRT.Run(); + MRT.VelocityField(); + } + // **************************************************** + MPI_Barrier(comm); + MPI_Finalize(); + // **************************************************** +} From 2abcf030286f148af2bcbea34d5bf67708b01f92 Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Thu, 21 Nov 2019 13:01:24 -0500 Subject: [PATCH 002/121] greyscale update --- tests/lbpm_greyscale_simulator.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/lbpm_greyscale_simulator.cpp b/tests/lbpm_greyscale_simulator.cpp index 9ab7c385..0744a214 100644 --- a/tests/lbpm_greyscale_simulator.cpp +++ b/tests/lbpm_greyscale_simulator.cpp @@ -47,15 +47,14 @@ int main(int argc, char **argv) MPI_Barrier(comm); - ScaLBL_MRTModel MRT(rank,nprocs,comm); + ScaLBL_GreyscaleModel Greyscale(rank,nprocs,comm); auto filename = argv[1]; - MRT.ReadParams(filename); - MRT.SetDomain(); // this reads in the domain - MRT.ReadInput(); - MRT.Create(); // creating the model will create data structure to match the pore structure and allocate variables - MRT.Initialize(); // initializing the model will set initial conditions for variables - MRT.Run(); - MRT.VelocityField(); + Greyscale.ReadParams(filename); + Greyscale.SetDomain(); // this reads in the domain + Greyscale.ReadInput(); + Greyscale.Create(); // creating the model will create data structure to match the pore structure and allocate variables + Greyscale.Initialize(); // initializing the model will set initial conditions for variables + Greyscale.Run(); } // **************************************************** MPI_Barrier(comm); From 3cd5053ec9b2635ab4bc91943eae0b8fe545d262 Mon Sep 17 00:00:00 2001 From: Mark Berrill Date: Thu, 21 Nov 2019 13:29:26 -0500 Subject: [PATCH 003/121] Copying halo when reading grid file --- models/ColorModel.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 13d71b4d..d21153a9 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -4,6 +4,7 @@ color lattice boltzmann model #include "models/ColorModel.h" #include "analysis/distance.h" #include "analysis/morphology.h" +#include "common/Communication.h" #include "common/ReadMicroCT.h" #include #include @@ -191,8 +192,17 @@ void ScaLBL_ColorModel::ReadInput(){ IMAGE_INDEX++; } 
else if (domain_db->keyExists( "GridFile" )){ + // Read the local domain data auto input_id = readMicroCT( *domain_db, MPI_COMM_WORLD ); - for (int i=0; iid[i] = input_id(i); + // Fill the halo (assuming GCW of 1) + array size0 = { input_id.size(0), input_id.size(1), input_id.size(2) }; + ArraySize size1 = { Mask->Nx, Mask->Ny, Mask->Nz }; + ASSERT( size1[0] == size0[0]+2 && size1[1] == size0[1]+2 && size1[2] == size0[2]+2 ); + fillHalo fill( MPI_COMM_WORLD, Mask->rank_info, size0, { 1, 1, 1 }, 0, 1 ); + Array id_view; + id_view.viewRaw( size1, Mask->id ); + fill.copy( input_id, id_view ); + fill.fill( id_view ); } else if (domain_db->keyExists( "Filename" )){ auto Filename = domain_db->getScalar( "Filename" ); From 86beafab8acb435a76b03381e8631d909b55c6fe Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Thu, 21 Nov 2019 13:43:32 -0500 Subject: [PATCH 004/121] save the work for cpu version --- common/ScaLBL.h | 6 +- cpu/Greyscale.cpp | 277 ++++++++++++++++++++--------- models/GreyscaleModel.cpp | 166 ++++++++++++++--- models/GreyscaleModel.h | 11 +- tests/lbpm_greyscale_simulator.cpp | 16 +- 5 files changed, 356 insertions(+), 120 deletions(-) diff --git a/common/ScaLBL.h b/common/ScaLBL.h index efca3be8..ecb1ffed 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -56,9 +56,11 @@ extern "C" void ScaLBL_D3Q19_AAeven_BGK(double *dist, int start, int finish, int extern "C" void ScaLBL_D3Q19_AAodd_BGK(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz); // GREYSCALE MODEL -extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz); +extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz, + double *Poros,double *Perm, double *Velocity); -extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz); +extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz, + double *Poros,double *Perm, double *Velocity); // MRT MODEL extern "C" void ScaLBL_D3Q19_AAeven_MRT(double *dist, int start, int finish, int Np, double rlx_setA, double rlx_setB, double Fx, diff --git a/cpu/Greyscale.cpp b/cpu/Greyscale.cpp index a800413d..fa9a1f49 100644 --- a/cpu/Greyscale.cpp +++ b/cpu/Greyscale.cpp @@ -1,9 +1,19 @@ -extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){ +#include + +extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz, + double *Poros,double *Perm, double *Velocity){ int n; // conserved momemnts - double rho,ux,uy,uz,uu; + double rho,vx,vy,vz,v_mag; + double ux,uy,uz,u_mag; + //double uu; // non-conserved moments double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18; + double GeoFun;//geometric function from Guo's PRE 66, 036304 (2002) + double porosity; + double perm;//voxel permeability + double c0, c1; //Guo's model parameters + double mu = (1.0/rlx-0.5)/3.0;//kinematic viscosity for (int n=start; n ScaLBL_GreyscaleModel::ScaLBL_GreyscaleModel(int RANK, int NP, MPI_Comm COMM): -rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tau(0), -Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0), +rank(RANK), nprocs(NP), 
Restart(0),timestep(0),timestepMax(0),tau(0),Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0), Nx(0),Ny(0),Nz(0),N(0),Np(0),nprocx(0),nprocy(0),nprocz(0),BoundaryCondition(0),Lx(0),Ly(0),Lz(0),comm(COMM) { - SignDist.resize(Nx,Ny,Nz); SignDist.fill(0); + SignDist.resize(Nx,Ny,Nz); + SignDist.fill(0); } ScaLBL_GreyscaleModel::~ScaLBL_GreyscaleModel(){ @@ -35,13 +35,17 @@ void ScaLBL_GreyscaleModel::ReadParams(string filename){ Restart=false; din=dout=1.0; flux=0.0; + dp = 10.0; //unit of 'dp': voxel // Color Model parameters if (greyscale_db->keyExists( "timestepMax" )){ timestepMax = greyscale_db->getScalar( "timestepMax" ); } if (greyscale_db->keyExists( "tau" )){ - tau = greyscale_db->getScalar( "tauA" ); + tau = greyscale_db->getScalar( "tau" ); + } + if (greyscale_db->keyExists( "dp" )){ + dp = greyscale_db->getScalar( "dp" ); } if (greyscale_db->keyExists( "F" )){ Fx = greyscale_db->getVector( "F" )[0]; @@ -80,6 +84,12 @@ void ScaLBL_GreyscaleModel::SetDomain(){ Ly = Dm->Ly; Lz = Dm->Lz; N = Nx*Ny*Nz; + + SignDist.resize(Nx,Ny,Nz); + Velocity_x.resize(Nx,Ny,Nz); + Velocity_y.resize(Nx,Ny,Nz); + Velocity_z.resize(Nx,Ny,Nz); + id = new signed char [N]; for (int i=0; iid[i] = 1; // initialize this way MPI_Barrier(comm); @@ -140,6 +150,9 @@ void ScaLBL_GreyscaleModel::ReadInput(){ if (rank == 0) cout << "Domain set." << endl; } +/******************************************************** + * AssignComponentLabels * + ********************************************************/ void ScaLBL_GreyscaleModel::AssignComponentLabels(double *Porosity, double *Permeablity) { size_t NLABELS=0; @@ -182,8 +195,14 @@ void ScaLBL_GreyscaleModel::AssignComponentLabels(double *Porosity, double *Perm else if (VALUE == 2) POROSITY=1.0; else if (VALUE < 1) POROSITY = 0.0; int idx = Map(i,j,k); - if (!(idx < 0)) - Porosity[idx] = POROSITY; + if (!(idx < 0)){ + if (POROSITY<=0.0){ + ERROR("Error: Porosity for grey voxels must be 0.0 < Porosity <= 1.0 !\n"); + } + else{ + Porosity[idx] = POROSITY; + } + } } } } @@ -205,13 +224,21 @@ void ScaLBL_GreyscaleModel::AssignComponentLabels(double *Porosity, double *Perm //Mask->id[n] = 0; // set mask to zero since this is an immobile component } } - // fluid labels are reserved / negative labels are immobile + // Permeability of fluid labels are reserved + // NOTE: the voxel permeability of apparent pore nodes should be infinity + // TODO: Need to revise the PERMEABILITY of nodes whose VALUE=1 and 2 if (VALUE == 1) PERMEABILITY=1.0; else if (VALUE == 2) PERMEABILITY=1.0; else if (VALUE < 1) PERMEABILITY = 0.0; int idx = Map(i,j,k); - if (!(idx < 0)) - Permeability[idx] = PERMEABILITY; + if (!(idx < 0)){ + if (PERMEABILITY<=0.0){ + ERROR("Error: Permeability for grey voxel must be > 0.0 ! 
\n"); + } + else{ + Permeability[idx] = PERMEABILITY; + } + } } } } @@ -229,7 +256,7 @@ void ScaLBL_GreyscaleModel::AssignComponentLabels(double *Porosity, double *Perm POROSITY=PorosityList[idx]; PERMEABILITY=PermeabilityList[idx]; double volume_fraction = double(label_count_global[idx])/double((Nx-2)*(Ny-2)*(Nz-2)*nprocs); - printf(" label=%d, porosity=%f, permeability=%f, volume fraction==%f\n",VALUE,POROSITY,PERMEABILITY,volume_fraction); + printf(" label=%d, porosity=%.3g, permeability=%.3g, volume fraction==%.3g\n",VALUE,POROSITY,PERMEABILITY,volume_fraction); } } @@ -324,9 +351,6 @@ void ScaLBL_GreyscaleModel::Create(){ ScaLBL_CopyToDevice(Permeability, Perm, Np*sizeof(double)); } -/******************************************************** - * AssignComponentLabels * - ********************************************************/ void ScaLBL_GreyscaleModel::Initialize(){ @@ -387,10 +411,6 @@ void ScaLBL_GreyscaleModel::Run(){ //......................................... Minkowski Morphology(Mask); - DoubleArray Velocity_x(Nx,Ny,Nz); - DoubleArray Velocity_y(Nx,Ny,Nz); - DoubleArray Velocity_z(Nx,Ny,Nz); - DoubleArray Pressure(Nx,Ny,Nz); //************ MAIN ITERATION LOOP ***************************************/ PROFILE_START("Loop"); @@ -403,21 +423,21 @@ void ScaLBL_GreyscaleModel::Run(){ //************************************************************************/ timestep++; ScaLBL_Comm->SendD3Q19AA(fq); //READ FROM NORMAL - ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz); + ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE - ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz); + ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); ScaLBL_DeviceBarrier(); MPI_Barrier(comm); timestep++; ScaLBL_Comm->SendD3Q19AA(fq); //READ FORM NORMAL - ScaLBL_D3Q19_AAeven_Greyscale(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz); + ScaLBL_D3Q19_AAeven_Greyscale(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE - ScaLBL_D3Q19_AAeven_Greyscale(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz); + ScaLBL_D3Q19_AAeven_Greyscale(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); ScaLBL_DeviceBarrier(); MPI_Barrier(comm); //************************************************************************/ if (timestep%1000==0){ - ScaLBL_D3Q19_Momentum(fq,Velocity, Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + //ScaLBL_D3Q19_Momentum(fq,Velocity, Np); + //ScaLBL_DeviceBarrier(); MPI_Barrier(comm); ScaLBL_Comm->RegularLayout(Map,&Velocity[0],Velocity_x); ScaLBL_Comm->RegularLayout(Map,&Velocity[Np],Velocity_y); ScaLBL_Comm->RegularLayout(Map,&Velocity[2*Np],Velocity_z); @@ -509,6 +529,106 @@ void ScaLBL_GreyscaleModel::Run(){ // ************************************************************************ } +void ScaLBL_GreyscaleModel::VelocityField(){ + +/* Minkowski Morphology(Mask); + int SIZE=Np*sizeof(double); + ScaLBL_D3Q19_Momentum(fq,Velocity, Np); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_CopyToHost(&VELOCITY[0],&Velocity[0],3*SIZE); + + 
memcpy(Morphology.SDn.data(), Distance.data(), Nx*Ny*Nz*sizeof(double)); + Morphology.Initialize(); + Morphology.UpdateMeshValues(); + Morphology.ComputeLocal(); + Morphology.Reduce(); + + double count_loc=0; + double count; + double vax,vay,vaz; + double vax_loc,vay_loc,vaz_loc; + vax_loc = vay_loc = vaz_loc = 0.f; + for (int n=0; nLastExterior(); n++){ + vax_loc += VELOCITY[n]; + vay_loc += VELOCITY[Np+n]; + vaz_loc += VELOCITY[2*Np+n]; + count_loc+=1.0; + } + + for (int n=ScaLBL_Comm->FirstInterior(); nLastInterior(); n++){ + vax_loc += VELOCITY[n]; + vay_loc += VELOCITY[Np+n]; + vaz_loc += VELOCITY[2*Np+n]; + count_loc+=1.0; + } + MPI_Allreduce(&vax_loc,&vax,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&vay_loc,&vay,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&vaz_loc,&vaz,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&count_loc,&count,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + + vax /= count; + vay /= count; + vaz /= count; + + double mu = (tau-0.5)/3.f; + if (rank==0) printf("Fx Fy Fz mu Vs As Js Xs vx vy vz\n"); + if (rank==0) printf("%.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g\n",Fx, Fy, Fz, mu, + Morphology.V(),Morphology.A(),Morphology.J(),Morphology.X(),vax,vay,vaz); + */ + + std::vector visData; + fillHalo fillData(Dm->Comm,Dm->rank_info,{Dm->Nx-2,Dm->Ny-2,Dm->Nz-2},{1,1,1},0,1); + + auto VxVar = std::make_shared(); + auto VyVar = std::make_shared(); + auto VzVar = std::make_shared(); + auto SignDistVar = std::make_shared(); + + IO::initialize("","silo","false"); + // Create the MeshDataStruct + visData.resize(1); + visData[0].meshName = "domain"; + visData[0].mesh = std::make_shared( Dm->rank_info,Dm->Nx-2,Dm->Ny-2,Dm->Nz-2,Dm->Lx,Dm->Ly,Dm->Lz ); + SignDistVar->name = "SignDist"; + SignDistVar->type = IO::VariableType::VolumeVariable; + SignDistVar->dim = 1; + SignDistVar->data.resize(Dm->Nx-2,Dm->Ny-2,Dm->Nz-2); + visData[0].vars.push_back(SignDistVar); + + VxVar->name = "Velocity_x"; + VxVar->type = IO::VariableType::VolumeVariable; + VxVar->dim = 1; + VxVar->data.resize(Dm->Nx-2,Dm->Ny-2,Dm->Nz-2); + visData[0].vars.push_back(VxVar); + VyVar->name = "Velocity_y"; + VyVar->type = IO::VariableType::VolumeVariable; + VyVar->dim = 1; + VyVar->data.resize(Dm->Nx-2,Dm->Ny-2,Dm->Nz-2); + visData[0].vars.push_back(VyVar); + VzVar->name = "Velocity_z"; + VzVar->type = IO::VariableType::VolumeVariable; + VzVar->dim = 1; + VzVar->data.resize(Dm->Nx-2,Dm->Ny-2,Dm->Nz-2); + visData[0].vars.push_back(VzVar); + + Array& SignData = visData[0].vars[0]->data; + Array& VelxData = visData[0].vars[1]->data; + Array& VelyData = visData[0].vars[2]->data; + Array& VelzData = visData[0].vars[3]->data; + + ASSERT(visData[0].vars[0]->name=="SignDist"); + ASSERT(visData[0].vars[1]->name=="Velocity_x"); + ASSERT(visData[0].vars[2]->name=="Velocity_y"); + ASSERT(visData[0].vars[3]->name=="Velocity_z"); + + fillData.copy(SignDist,SignData); + fillData.copy(Velocity_x,VelxData); + fillData.copy(Velocity_y,VelyData); + fillData.copy(Velocity_z,VelzData); + + IO::writeData( timestep, visData, Dm->Comm ); + +} void ScaLBL_GreyscaleModel::WriteDebug(){ // Copy back final phase indicator field and convert to regular layout diff --git a/models/GreyscaleModel.h b/models/GreyscaleModel.h index 37ddf28f..9b970a65 100644 --- a/models/GreyscaleModel.h +++ b/models/GreyscaleModel.h @@ -30,6 +30,7 @@ public: void Initialize(); void Run(); void WriteDebug(); + void VelocityField(); bool Restart,pBC; int timestep,timestepMax; @@ -38,6 +39,7 @@ public: double tolerance; double Fx,Fy,Fz,flux; double 
din,dout; + double dp;//solid particle diameter, unit in voxel int Nx,Ny,Nz,N,Np; int rank,nprocx,nprocy,nprocz,nprocs; @@ -54,16 +56,19 @@ public: std::shared_ptr analysis_db; std::shared_ptr vis_db; - IntArray Map; - DoubleArray SignDist; signed char *id; int *NeighborList; int *dvcMap; double *fq; - double *Permeability; + double *Permeability;//grey voxel permeability double *Porosity; double *Velocity; double *Pressure; + IntArray Map; + DoubleArray SignDist; + DoubleArray Velocity_x; + DoubleArray Velocity_y; + DoubleArray Velocity_z; private: MPI_Comm comm; diff --git a/tests/lbpm_greyscale_simulator.cpp b/tests/lbpm_greyscale_simulator.cpp index 9ab7c385..e15797e3 100644 --- a/tests/lbpm_greyscale_simulator.cpp +++ b/tests/lbpm_greyscale_simulator.cpp @@ -47,15 +47,15 @@ int main(int argc, char **argv) MPI_Barrier(comm); - ScaLBL_MRTModel MRT(rank,nprocs,comm); + ScaLBL_GreyscaleModel GreyscaleModel(rank,nprocs,comm); auto filename = argv[1]; - MRT.ReadParams(filename); - MRT.SetDomain(); // this reads in the domain - MRT.ReadInput(); - MRT.Create(); // creating the model will create data structure to match the pore structure and allocate variables - MRT.Initialize(); // initializing the model will set initial conditions for variables - MRT.Run(); - MRT.VelocityField(); + GreyscaleModel.ReadParams(filename); + GreyscaleModel.SetDomain(); // this reads in the domain + GreyscaleModel.ReadInput(); + GreyscaleModel.Create(); // creating the model will create data structure to match the pore structure and allocate variables + GreyscaleModel.Initialize(); // initializing the model will set initial conditions for variables + GreyscaleModel.Run(); + GreyscaleModel.VelocityField(); } // **************************************************** MPI_Barrier(comm); From a4b0f3e26eafa27d5de3b81008e99f060ce86a49 Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Thu, 21 Nov 2019 14:05:58 -0500 Subject: [PATCH 005/121] gpu version of greyscale LBM is also updated --- gpu/Greyscale.cu | 287 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 198 insertions(+), 89 deletions(-) diff --git a/gpu/Greyscale.cu b/gpu/Greyscale.cu index 04b5e979..3365c6f9 100644 --- a/gpu/Greyscale.cu +++ b/gpu/Greyscale.cu @@ -3,12 +3,20 @@ #define NBLOCKS 1024 #define NTHREADS 256 -__global__ void dvc_ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){ +__global__ void dvc_ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz, + double *Poros,double *Perm, double *Velocity){ int n; // conserved momemnts - double rho,ux,uy,uz,uu; + double rho,vx,vy,vz,v_mag; + double ux,uy,uz,u_mag; + //double uu; // non-conserved moments double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18; + double GeoFun;//geometric function from Guo's PRE 66, 036304 (2002) + double porosity; + double perm;//voxel permeability + double c0, c1; //Guo's model parameters + double mu = (1.0/rlx-0.5)/3.0;//kinematic viscosity int S = Np/NBLOCKS/NTHREADS + 1; for (int s=0; s>>(dist,start,finish,Np,rlx,Fx,Fy,Fz); + dvc_ScaLBL_D3Q19_AAeven_Greyscale<<>>(dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity); cudaError_t err = cudaGetLastError(); if (cudaSuccess != err){ @@ -301,11 +410,11 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finis } } -extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double 
Fx, double Fy, double Fz){ - dvc_ScaLBL_D3Q19_AAodd_Greyscale<<>>(neighborList,dist,start,finish,Np,rlx,Fx,Fy,Fz); +extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity){ + dvc_ScaLBL_D3Q19_AAodd_Greyscale<<>>(neighborList,dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity); cudaError_t err = cudaGetLastError(); if (cudaSuccess != err){ - printf("CUDA error in ScaLBL_D3Q19_AAeven_Greyscale: %s \n",cudaGetErrorString(err)); + printf("CUDA error in ScaLBL_D3Q19_AAodd_Greyscale: %s \n",cudaGetErrorString(err)); } -} \ No newline at end of file +} From cf5a284f6dd7ef2ac3b03128c11d2eb960651c03 Mon Sep 17 00:00:00 2001 From: James McClure Date: Tue, 3 Dec 2019 13:01:37 -0500 Subject: [PATCH 006/121] adding subphase functionality --- example/Workflow/HelperFunctions.R | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/example/Workflow/HelperFunctions.R b/example/Workflow/HelperFunctions.R index a86ee6fe..497cb262 100644 --- a/example/Workflow/HelperFunctions.R +++ b/example/Workflow/HelperFunctions.R @@ -7,11 +7,26 @@ ReadSubphase<-function(PATH){ S<-read.csv(FILE,head=TRUE,sep=" ") S$Vw<-S$Vwc+S$Vwd S$Vn<-S$Vnc+S$Vnd + S$Aw<-S$Awc+S$Awd + S$An<-S$Anc+S$And + S$Hw<-S$Hwc+S$Hwd + S$Hn<-S$Hnc+S$Hnd + S$Xw<-S$Xwc+S$Xwd + S$Xn<-S$Xnc+S$Xnd + S$Sw<-S$Vw/(S$Vn+S$Vw) + S$pw<-(S$pwc*S$Vwc+S$pwd*S$Vwd) / (S$Vwc+S$Vwd) + S$pn<-(S$pnc*S$Vnc+S$pnd*S$Vnd) / (S$Vnc+S$Vnd) + S$Qwx<-S$Vw*(S$Pwc_x+S$Pwd_x)/(S$Mwc+S$Mwd) S$Qnx<-S$Vn*(S$Pnc_x+S$Pnd_x)/(S$Mnc+S$Mnd) - S$Krn<-S$nun*S$Qnx/S$Fx - S$Krw<-S$nuw*S$Qwx/S$Fx + S$Qwy<-S$Vw*(S$Pwc_y+S$Pwd_y)/(S$Mwc+S$Mwd) + S$Qny<-S$Vn*(S$Pnc_y+S$Pnd_y)/(S$Mnc+S$Mnd) + S$Qwz<-S$Vw*(S$Pwc_z+S$Pwd_z)/(S$Mwc+S$Mwd) + S$Qnz<-S$Vn*(S$Pnc_z+S$Pnd_z)/(S$Mnc+S$Mnd) + + S$Krn<-S$nun*S$Qnz/S$Fz + S$Krw<-S$nuw*S$Qwz/S$Fz S$Case<-PATH return(S) } From a67d3f8b6900e032c02d2b879f0c666aab86c0fd Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Mon, 9 Dec 2019 14:44:58 -0500 Subject: [PATCH 007/121] calculate the medium porosity if read domain from Filename --- common/Domain.cpp | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/common/Domain.cpp b/common/Domain.cpp index 0fa8545a..82bcaee2 100644 --- a/common/Domain.cpp +++ b/common/Domain.cpp @@ -585,6 +585,50 @@ void Domain::Decomp(std::string Filename) MPI_Recv(id,N,MPI_CHAR,0,15,Comm,MPI_STATUS_IGNORE); } MPI_Barrier(Comm); + + // Compute the porosity + double sum; + double sum_local=0.0; + double iVol_global = 1.0/(1.0*(Nx-2)*(Ny-2)*(Nz-2)*nprocs); + if (BoundaryCondition > 0) iVol_global = 1.0/(1.0*(Nx-2)*nprocx*(Ny-2)*nprocy*((Nz-2)*nprocz-6)); + //......................................................... + // If external boundary conditions are applied remove solid + if (BoundaryCondition > 0 && kproc() == 0){ + if (inlet_layers_z < 4) inlet_layers_z=4; + for (int k=0; k 0 && kproc() == nprocz-1){ + if (outlet_layers_z < 4) outlet_layers_z=4; + for (int k=Nz-outlet_layers_z; k 0){ + sum_local+=1.0; + } + } + } + } + MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,Comm); + porosity = sum*iVol_global; + if (rank()==0) printf("Media porosity = %f \n",porosity); + //......................................................... 
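Stated as a formula, the porosity printed above is the fluid-voxel fraction of the halo-stripped interior, aggregated over all ranks (a reading of the sums in the code; for BC > 0 the routine first opens the inlet/outlet layers to fluid and shrinks the reference volume accordingly, per iVol_global above):

\[ \phi = \frac{1}{(N_x-2)(N_y-2)(N_z-2)\, n_{procs}} \sum_{\text{interior } (i,j,k)} \mathbf{1}\{\, id(i,j,k) > 0 \,\}. \]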
} void Domain::AggregateLabels(char *FILENAME){ From 9c48b3de7070734e6ba3b637a369e634787bd321 Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Mon, 9 Dec 2019 15:17:27 -0500 Subject: [PATCH 008/121] enable single phase abs-perm simulator to read medium from Filename --- common/Domain.cpp | 44 ++++++++++++++++++++++++++++++++++++++++++++ models/MRTModel.cpp | 13 ++++++++----- 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/common/Domain.cpp b/common/Domain.cpp index 0fa8545a..82bcaee2 100644 --- a/common/Domain.cpp +++ b/common/Domain.cpp @@ -585,6 +585,50 @@ void Domain::Decomp(std::string Filename) MPI_Recv(id,N,MPI_CHAR,0,15,Comm,MPI_STATUS_IGNORE); } MPI_Barrier(Comm); + + // Compute the porosity + double sum; + double sum_local=0.0; + double iVol_global = 1.0/(1.0*(Nx-2)*(Ny-2)*(Nz-2)*nprocs); + if (BoundaryCondition > 0) iVol_global = 1.0/(1.0*(Nx-2)*nprocx*(Ny-2)*nprocy*((Nz-2)*nprocz-6)); + //......................................................... + // If external boundary conditions are applied remove solid + if (BoundaryCondition > 0 && kproc() == 0){ + if (inlet_layers_z < 4) inlet_layers_z=4; + for (int k=0; k 0 && kproc() == nprocz-1){ + if (outlet_layers_z < 4) outlet_layers_z=4; + for (int k=Nz-outlet_layers_z; k 0){ + sum_local+=1.0; + } + } + } + } + MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,Comm); + porosity = sum*iVol_global; + if (rank()==0) printf("Media porosity = %f \n",porosity); + //......................................................... } void Domain::AggregateLabels(char *FILENAME){ diff --git a/models/MRTModel.cpp b/models/MRTModel.cpp index 04fe937d..e2984b2a 100644 --- a/models/MRTModel.cpp +++ b/models/MRTModel.cpp @@ -93,16 +93,19 @@ void ScaLBL_MRTModel::SetDomain(){ } void ScaLBL_MRTModel::ReadInput(){ - int rank=Dm->rank(); - size_t readID; - //....................................................................... - //....................................................................... 
- Mask->ReadIDs(); sprintf(LocalRankString,"%05d",Dm->rank()); sprintf(LocalRankFilename,"%s%s","ID.",LocalRankString); sprintf(LocalRestartFile,"%s%s","Restart.",LocalRankString); + if (domain_db->keyExists( "Filename" )){ + auto Filename = domain_db->getScalar( "Filename" ); + Mask->Decomp(Filename); + } + else{ + Mask->ReadIDs(); + } + // Generate the signed distance map // Initialize the domain and communication Array id_solid(Nx,Ny,Nz); From bbd2a6e34a81d1349ffbc4ac4a729164bd7f2133 Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Mon, 9 Dec 2019 15:56:44 -0500 Subject: [PATCH 009/121] update output (*.out and Permeability.csv) for Greyscale --- models/GreyscaleModel.cpp | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/models/GreyscaleModel.cpp b/models/GreyscaleModel.cpp index cf66d6f4..5af10205 100644 --- a/models/GreyscaleModel.cpp +++ b/models/GreyscaleModel.cpp @@ -498,13 +498,24 @@ void ScaLBL_GreyscaleModel::Run(){ Xs=sumReduce( Dm->Comm, Xs); double h = Dm->voxel_length; double absperm = h*h*mu*Mask->Porosity()*flow_rate / force_mag; - if (rank==0) { - printf(" %f\n",absperm); - FILE * log_file = fopen("Permeability.csv","a"); - fprintf(log_file,"%i %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g\n",timestep, Fx, Fy, Fz, mu, - h*h*h*Vs,h*h*As,h*Hs,Xs,vax,vay,vaz, absperm); - fclose(log_file); - } + + if (rank==0){ + printf(" AbsPerm = %.5g [micron^2]\n",absperm); + bool WriteHeader=false; + FILE * log_file = fopen("Permeability.csv","r"); + if (log_file != NULL) + fclose(log_file); + else + WriteHeader=true; + log_file = fopen("Permeability.csv","a"); + if (WriteHeader) + fprintf(log_file,"timesteps Fx Fy Fz mu Vs As Hs Xs vax vay vaz absperm \n", + timestep,Fx,Fy,Fz,mu,h*h*h*Vs,h*h*As,h*Hs,Xs,vax,vay,vaz,absperm); + + fprintf(log_file,"%i %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g\n",timestep, Fx, Fy, Fz, mu, + h*h*h*Vs,h*h*As,h*Hs,Xs,vax,vay,vaz, absperm); + fclose(log_file); + } } } PROFILE_STOP("Loop"); From 5f85b767d6f5fd01463506d832d44f6be02b2ba5 Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Mon, 9 Dec 2019 22:30:36 -0500 Subject: [PATCH 010/121] add debugging for greyscale lbm --- models/GreyscaleModel.cpp | 56 +++++++++++++++--------------- tests/lbpm_greyscale_simulator.cpp | 1 + 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/models/GreyscaleModel.cpp b/models/GreyscaleModel.cpp index 5af10205..2ec3b85e 100644 --- a/models/GreyscaleModel.cpp +++ b/models/GreyscaleModel.cpp @@ -643,34 +643,36 @@ void ScaLBL_GreyscaleModel::VelocityField(){ void ScaLBL_GreyscaleModel::WriteDebug(){ // Copy back final phase indicator field and convert to regular layout -/* ScaLBL_CopyToHost(Porosity.data(), Poros, sizeof(double)*N); + DoubleArray PhaseField(Nx,Ny,Nz); - FILE *OUTFILE; - sprintf(LocalRankFilename,"Phase.%05i.raw",rank); - OUTFILE = fopen(LocalRankFilename,"wb"); - fwrite(PhaseField.data(),8,N,OUTFILE); - fclose(OUTFILE); + //ScaLBL_CopyToHost(Porosity.data(), Poros, sizeof(double)*N); - ScaLBL_Comm->RegularLayout(Map,&Den[0],PhaseField); - FILE *AFILE; - sprintf(LocalRankFilename,"A.%05i.raw",rank); - AFILE = fopen(LocalRankFilename,"wb"); - fwrite(PhaseField.data(),8,N,AFILE); - fclose(AFILE); - - ScaLBL_Comm->RegularLayout(Map,&Den[Np],PhaseField); - FILE *BFILE; - sprintf(LocalRankFilename,"B.%05i.raw",rank); - BFILE = fopen(LocalRankFilename,"wb"); - fwrite(PhaseField.data(),8,N,BFILE); - fclose(BFILE); - - ScaLBL_Comm->RegularLayout(Map,Pressure,PhaseField); - FILE *PFILE; - 
sprintf(LocalRankFilename,"Pressure.%05i.raw",rank); - PFILE = fopen(LocalRankFilename,"wb"); - fwrite(PhaseField.data(),8,N,PFILE); - fclose(PFILE); +// FILE *OUTFILE; +// sprintf(LocalRankFilename,"Phase.%05i.raw",rank); +// OUTFILE = fopen(LocalRankFilename,"wb"); +// fwrite(PhaseField.data(),8,N,OUTFILE); +// fclose(OUTFILE); +// +// ScaLBL_Comm->RegularLayout(Map,&Den[0],PhaseField); +// FILE *AFILE; +// sprintf(LocalRankFilename,"A.%05i.raw",rank); +// AFILE = fopen(LocalRankFilename,"wb"); +// fwrite(PhaseField.data(),8,N,AFILE); +// fclose(AFILE); +// +// ScaLBL_Comm->RegularLayout(Map,&Den[Np],PhaseField); +// FILE *BFILE; +// sprintf(LocalRankFilename,"B.%05i.raw",rank); +// BFILE = fopen(LocalRankFilename,"wb"); +// fwrite(PhaseField.data(),8,N,BFILE); +// fclose(BFILE); +// +// ScaLBL_Comm->RegularLayout(Map,Pressure,PhaseField); +// FILE *PFILE; +// sprintf(LocalRankFilename,"Pressure.%05i.raw",rank); +// PFILE = fopen(LocalRankFilename,"wb"); +// fwrite(PhaseField.data(),8,N,PFILE); +// fclose(PFILE); ScaLBL_Comm->RegularLayout(Map,&Velocity[0],PhaseField); FILE *VELX_FILE; @@ -693,7 +695,5 @@ void ScaLBL_GreyscaleModel::WriteDebug(){ fwrite(PhaseField.data(),8,N,VELZ_FILE); fclose(VELZ_FILE); - * - */ } diff --git a/tests/lbpm_greyscale_simulator.cpp b/tests/lbpm_greyscale_simulator.cpp index 9f910a32..61322d6d 100644 --- a/tests/lbpm_greyscale_simulator.cpp +++ b/tests/lbpm_greyscale_simulator.cpp @@ -55,6 +55,7 @@ int main(int argc, char **argv) Greyscale.Initialize(); // initializing the model will set initial conditions for variables Greyscale.Run(); Greyscale.VelocityField(); + Greyscale.WriteDebug(); } // **************************************************** MPI_Barrier(comm); From cddcfa0188d932f755981e92ede2d71c5394d267 Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Tue, 10 Dec 2019 16:54:42 -0500 Subject: [PATCH 011/121] fix the bug and now have a workable greyscale BGK model in both CPU and GPU --- cpu/Greyscale.cpp | 54 +++++++++++++++++---------------------- gpu/Greyscale.cu | 52 +++++++++++++++++-------------------- models/GreyscaleModel.cpp | 14 +++++++++- 3 files changed, 60 insertions(+), 60 deletions(-) diff --git a/cpu/Greyscale.cpp b/cpu/Greyscale.cpp index fa9a1f49..48e61a56 100644 --- a/cpu/Greyscale.cpp +++ b/cpu/Greyscale.cpp @@ -1,6 +1,6 @@ #include -extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz, +extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Gx, double Gy, double Gz, double *Poros,double *Perm, double *Velocity){ int n; // conserved momemnts @@ -14,6 +14,7 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finis double perm;//voxel permeability double c0, c1; //Guo's model parameters double mu = (1.0/rlx-0.5)/3.0;//kinematic viscosity + double Fx, Fy, Fz;//The total body force including Brinkman force and user-specified (Gx,Gy,Gz) for (int n=start; nRegularLayout(Map,&Porosity[0],PhaseField); + FILE *POROS_FILE; + sprintf(LocalRankFilename,"Porosity.%05i.raw",rank); + POROS_FILE = fopen(LocalRankFilename,"wb"); + fwrite(PhaseField.data(),8,N,POROS_FILE); + fclose(POROS_FILE); + ScaLBL_Comm->RegularLayout(Map,&Permeability[0],PhaseField); + FILE *PERM_FILE; + sprintf(LocalRankFilename,"Permeability.%05i.raw",rank); + PERM_FILE = fopen(LocalRankFilename,"wb"); + fwrite(PhaseField.data(),8,N,PERM_FILE); + fclose(PERM_FILE); } From e66a92142fd6c8da55d732d54c98d53c4cc38248 Mon 
Sep 17 00:00:00 2001 From: Mark Berrill Date: Thu, 12 Dec 2019 13:58:51 -0500 Subject: [PATCH 012/121] Adding MPIFLAGS option --- cmake/libraries.cmake | 2 +- cmake/macros.cmake | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cmake/libraries.cmake b/cmake/libraries.cmake index 54d70b5d..ebc37f8f 100644 --- a/cmake/libraries.cmake +++ b/cmake/libraries.cmake @@ -77,7 +77,7 @@ MACRO( CONFIGURE_MPI ) ENDIF () ELSE () # Search for the MPI executable in the current directory - FIND_PROGRAM ( MPIEXEC NAMES mpiexec mpirun lamexec PATHS ${MPI_DIRECTORY}/bin NO_DEFAULT_PATH ) + FIND_PROGRAM( MPIEXEC NAMES mpiexec mpirun lamexec PATHS ${MPI_DIRECTORY}/bin NO_DEFAULT_PATH ) IF ( NOT MPIEXEC ) MESSAGE( FATAL_ERROR "Could not locate mpi executable" ) ENDIF() diff --git a/cmake/macros.cmake b/cmake/macros.cmake index 8791616c..d1c8dbe7 100644 --- a/cmake/macros.cmake +++ b/cmake/macros.cmake @@ -848,7 +848,7 @@ FUNCTION( ADD_${PROJ}_TEST EXEFILE ${ARGN} ) ADD_PROJ_PROVISIONAL_TEST( ${EXEFILE} ) CREATE_TEST_NAME( ${EXEFILE} ${ARGN} ) IF ( USE_MPI_FOR_SERIAL_TESTS ) - ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} "${MPIEXEC_NUMPROC_FLAG}" 1 $ ${ARGN} ) + ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} ${MPIFLAGS} "${MPIEXEC_NUMPROC_FLAG}" 1 $ ${ARGN} ) SET_PROPERTY( TEST ${TESTNAME} APPEND PROPERTY ENVIRONMENT OMPI_MCA_hwloc_base_binding_policy=none ) ELSE() ADD_TEST( NAME ${TESTNAME} COMMAND $ ${ARGN} ) @@ -877,7 +877,7 @@ FUNCTION( ADD_${PROJ}_WEEKLY_TEST EXEFILE PROCS ${ARGN} ) ELSEIF( ${PROCS} STREQUAL "1" ) CREATE_TEST_NAME( "${EXEFILE}_WEEKLY" ${ARGN} ) IF ( USE_MPI_FOR_SERIAL_TESTS ) - ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} "${MPIEXEC_NUMPROC_FLAG}" 1 $ ${ARGN} ) + ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} ${MPIFLAGS} "${MPIEXEC_NUMPROC_FLAG}" 1 $ ${ARGN} ) SET_PROPERTY( TEST ${TESTNAME} APPEND PROPERTY ENVIRONMENT OMPI_MCA_hwloc_base_binding_policy=none ) ELSE() ADD_TEST( NAME ${TESTNAME} COMMAND $ ${ARGN} ) @@ -909,7 +909,7 @@ FUNCTION( ADD_${PROJ}_TEST_PARALLEL EXEFILE PROCS ${ARGN} ) ELSEIF ( ${PROCS} GREATER ${TEST_MAX_PROCS} ) MESSAGE("Disabling test ${TESTNAME} (exceeds maximum number of processors ${TEST_MAX_PROCS})") ELSE() - ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} "${MPIEXEC_NUMPROC_FLAG}" ${PROCS} $ ${ARGN} ) + ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} ${MPIFLAGS} "${MPIEXEC_NUMPROC_FLAG}" ${PROCS} $ ${ARGN} ) SET_PROPERTY( TEST ${TESTNAME} APPEND PROPERTY ENVIRONMENT OMPI_MCA_hwloc_base_binding_policy=none ) SET_TESTS_PROPERTIES( ${TESTNAME} PROPERTIES FAIL_REGULAR_EXPRESSION "${TEST_FAIL_REGULAR_EXPRESSION}" PROCESSORS ${PROCS} ) ADD_RESOURCE_LOCK( ${TESTNAME} ${EXEFILE} ${ARGN} ) @@ -930,7 +930,7 @@ MACRO( ADD_${PROJ}_TEST_THREAD_MPI EXEFILE PROCS THREADS ${ARGN} ) SET_TESTS_PROPERTIES ( ${TESTNAME} PROPERTIES FAIL_REGULAR_EXPRESSION "${TEST_FAIL_REGULAR_EXPRESSION}" PROCESSORS ${TOT_PROCS} ) ADD_RESOURCE_LOCK( ${TESTNAME} ${EXEFILE} ${ARGN} ) ELSEIF ( USE_MPI OR USE_EXT_MPI ) - ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} "${MPIEXEC_NUMPROC_FLAG}" ${PROCS} $ ${ARGN} ) + ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} ${MPIFLAGS} "${MPIEXEC_NUMPROC_FLAG}" ${PROCS} $ ${ARGN} ) SET_PROPERTY( TEST ${TESTNAME} APPEND PROPERTY ENVIRONMENT OMPI_MCA_hwloc_base_binding_policy=none ) SET_TESTS_PROPERTIES ( ${TESTNAME} PROPERTIES FAIL_REGULAR_EXPRESSION "${TEST_FAIL_REGULAR_EXPRESSION}" PROCESSORS ${TOT_PROCS} ) ADD_RESOURCE_LOCK( ${TESTNAME} ${EXEFILE} ${ARGN} ) @@ -966,7 +966,7 @@ FUNCTION( ADD_${PROJ}_EXAMPLE EXEFILE PROCS ${ARGN} ) ADD_TEST( 
NAME ${TESTNAME} COMMAND $ ${ARGN} ) ELSEIF ( USE_EXT_MPI AND NOT (${PROCS} GREATER ${TEST_MAX_PROCS}) ) CREATE_TEST_NAME( "example--${EXEFILE}_${PROCS}procs" ${ARGN} ) - ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} "${MPIEXEC_NUMPROC_FLAG}" ${PROCS} $ ${ARGN} ) + ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} ${MPIFLAGS} "${MPIEXEC_NUMPROC_FLAG}" ${PROCS} $ ${ARGN} ) SET_PROPERTY( TEST ${TESTNAME} APPEND PROPERTY ENVIRONMENT OMPI_MCA_hwloc_base_binding_policy=none ) ENDIF() SET_TESTS_PROPERTIES( ${TESTNAME} PROPERTIES FAIL_REGULAR_EXPRESSION "${TEST_FAIL_REGULAR_EXPRESSION}" PROCESSORS ${PROCS} ) From f0a7732f21e27756de72c96400c4e646c628d7f5 Mon Sep 17 00:00:00 2001 From: Mark Berrill Date: Thu, 2 Jan 2020 13:23:51 -0500 Subject: [PATCH 013/121] Updating StackTrace and improving performance converting uCT data --- StackTrace/StackTrace.cpp | 26 ++++++-- StackTrace/StackTrace.h | 11 ++++ StackTrace/Utilities.cpp | 58 ++++++++++++++--- StackTrace/Utilities.h | 18 +++++ StackTrace/string_view.h | 2 +- analysis/runAnalysis.cpp | 11 ++-- common/Communication.hpp | 12 ++-- common/ReadMicroCT.cpp | 37 +++++------ common/Utilities.cpp | 116 ++++++++++++++++++++++++++++++++- common/Utilities.h | 31 +++++++++ tests/lbpm_color_simulator.cpp | 70 ++++++++++---------- 11 files changed, 303 insertions(+), 89 deletions(-) diff --git a/StackTrace/StackTrace.cpp b/StackTrace/StackTrace.cpp index e9292990..55a24352 100644 --- a/StackTrace/StackTrace.cpp +++ b/StackTrace/StackTrace.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -348,8 +349,11 @@ static inline int exec3( const char *cmd, FUNCTION &fun ) if ( buffer[0] != 0 ) fun( buffer ); } - auto status = pclose( pipe ); - int code = WEXITSTATUS( status ); + int code = pclose( pipe ); + if ( errno == ECHILD ) { + errno = 0; + code = 0; + } std::this_thread::yield(); // Allow any signals to process resetSignal( SIGCHLD ); // Clear child exited return code; @@ -1741,7 +1745,7 @@ std::vector StackTrace::defaultSignalsToCatch() * Set the signal handlers * ****************************************************************************/ static std::function abort_fun; -static StackTrace::abort_error rethrow() +StackTrace::abort_error rethrow() { StackTrace::abort_error error; #ifdef USE_LINUX @@ -1775,14 +1779,14 @@ static StackTrace::abort_error rethrow() } return error; } -static void term_func_abort( int sig ) +void StackTrace::terminateFunctionSignal( int sig ) { StackTrace::abort_error err; err.type = StackTrace::terminateType::signal; err.signal = sig; err.bytes = StackTrace::Utilities::getMemoryUsage(); err.stack = StackTrace::backtrace(); - err.stackType = StackTrace::printStackType::global; + err.stackType = StackTrace::getDefaultStackType(); abort_fun( err ); } static bool signals_set[256] = { false }; @@ -1829,7 +1833,7 @@ void StackTrace::setErrorHandler( std::function allSignalsToCatch(); @@ -289,6 +293,13 @@ multi_stack_info generateFromString( const std::vector &str ); multi_stack_info generateFromString( const std::string &str ); +//! Set default stack type +void setDefaultStackType( StackTrace::printStackType ); + +//! 
Get default stack type +StackTrace::printStackType getDefaultStackType(); + + } // namespace StackTrace diff --git a/StackTrace/Utilities.cpp b/StackTrace/Utilities.cpp index 734a0056..11f05777 100644 --- a/StackTrace/Utilities.cpp +++ b/StackTrace/Utilities.cpp @@ -8,8 +8,10 @@ #include #include #include +#include #include #include +#include #ifdef USE_MPI #include "mpi.h" @@ -19,6 +21,10 @@ #include "MemoryApp.h" #endif +#ifdef USE_GCOV +extern "C" void __gcov_flush( void ); +#endif + #define perr std::cerr @@ -65,6 +71,12 @@ // clang-format on +#ifdef __GNUC__ +#define USE_ABI +#include +#endif + + namespace StackTrace { @@ -96,13 +108,12 @@ inline size_t findfirst( const std::vector &X, TYPE Y ) /**************************************************************************** * Function to terminate the program * ****************************************************************************/ -static bool abort_throwException = false; -static printStackType abort_stackType = printStackType::global; -static int force_exit = 0; +static bool abort_throwException = false; +static int force_exit = 0; void Utilities::setAbortBehavior( bool throwException, int stackType ) { abort_throwException = throwException; - abort_stackType = static_cast( stackType ); + StackTrace::setDefaultStackType( static_cast( stackType ) ); } void Utilities::abort( const std::string &message, const std::string &filename, const int line ) { @@ -112,16 +123,28 @@ void Utilities::abort( const std::string &message, const std::string &filename, err.type = terminateType::abort; err.line = line; err.bytes = Utilities::getMemoryUsage(); - err.stackType = abort_stackType; + err.stackType = StackTrace::getDefaultStackType(); err.stack = StackTrace::backtrace(); throw err; } -static void terminate( const StackTrace::abort_error &err ) +static std::mutex terminate_mutex; +static inline void callAbort() { +#ifdef USE_GCOV + __gcov_flush(); +#endif + terminate_mutex.unlock(); + std::abort(); +} +void Utilities::terminate( const StackTrace::abort_error &err ) +{ + // Lock mutex to ensure multiple threads do not try to abort simultaneously + terminate_mutex.lock(); + // Clear the error handlers clearErrorHandler(); // Print the message and abort if ( force_exit > 1 ) { - std::abort(); + callAbort(); } else if ( !abort_throwException ) { // Use MPI_abort (will terminate all processes) force_exit = 2; @@ -135,10 +158,11 @@ static void terminate( const StackTrace::abort_error &err ) MPI_Abort( MPI_COMM_WORLD, -1 ); } #endif - std::abort(); + callAbort(); } else { perr << err.what(); - std::abort(); + perr.flush(); + callAbort(); } } @@ -149,7 +173,7 @@ static void terminate( const StackTrace::abort_error &err ) static void setTerminateErrorHandler() { // Set the terminate routine for runtime errors - StackTrace::setErrorHandler( terminate ); + StackTrace::setErrorHandler( Utilities::terminate ); } void Utilities::setErrorHandlers() { @@ -293,4 +317,18 @@ std::string Utilities::exec( const string_view &cmd, int &exit_code ) } +/**************************************************************************** + * Get the type name * + ****************************************************************************/ +std::string Utilities::getTypeName( const std::type_info &id ) +{ + std::string name = id.name(); +#if defined( USE_ABI ) + int status; + name = abi::__cxa_demangle( name.c_str(), 0, 0, &status ); +#endif + return name; +} + + } // namespace StackTrace diff --git a/StackTrace/Utilities.h b/StackTrace/Utilities.h index 10ed9085..83c8d7aa 
100644 --- a/StackTrace/Utilities.h +++ b/StackTrace/Utilities.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "StackTrace/StackTrace.h" #include "StackTrace/string_view.h" @@ -28,9 +29,14 @@ void abort( const std::string &message, const std::string &filename, const int l void setAbortBehavior( bool throwException, int stackType = 2 ); +//! Function to terminate the application +void terminate( const StackTrace::abort_error &err ); + + //! Function to set the error handlers void setErrorHandlers(); + //! Function to clear the error handlers void clearErrorHandlers(); @@ -92,6 +98,18 @@ void cause_segfault(); std::string exec( const StackTrace::string_view &cmd, int &exit_code ); +//! Return the hopefully demangled name of the given type +std::string getTypeName( const std::type_info &id ); + + +//! Return the hopefully demangled name of the given type +template +inline std::string getTypeName() +{ + return getTypeName( typeid( TYPE ) ); +} + + } // namespace Utilities } // namespace StackTrace diff --git a/StackTrace/string_view.h b/StackTrace/string_view.h index d83d1f24..ee729f63 100644 --- a/StackTrace/string_view.h +++ b/StackTrace/string_view.h @@ -119,7 +119,7 @@ public: int result = 0; for ( int i = 0; i < N && result == 0; i++ ) if ( d_data[i] != other[i] ) - result = d_data[i] < other[i] ? -i : i; + result = d_data[i] < other[i] ? -( i + 1 ) : ( i + 1 ); if ( result == 0 ) result = size() == other.size() ? 0 : size() < other.size() ? -1 : 1; return result; diff --git a/analysis/runAnalysis.cpp b/analysis/runAnalysis.cpp index caa03b1b..6c76f58b 100644 --- a/analysis/runAnalysis.cpp +++ b/analysis/runAnalysis.cpp @@ -767,6 +767,8 @@ void runAnalysis::run(int timestep, std::shared_ptr input_db, TwoPhase double *Pressure, double *Velocity, double *fq, double *Den) { int N = d_N[0]*d_N[1]*d_N[2]; + NULL_USE( N ); + NULL_USE( Phi ); auto db = input_db->getDatabase( "Analysis" ); //int timestep = db->getWithDefault( "timestep", 0 ); @@ -937,8 +939,6 @@ void runAnalysis::run(int timestep, std::shared_ptr input_db, TwoPhase ******************************************************************/ void runAnalysis::basic(int timestep, std::shared_ptr input_db, SubPhase &Averages, const double *Phi, double *Pressure, double *Velocity, double *fq, double *Den) { - int N = d_N[0]*d_N[1]*d_N[2]; - // Check which analysis steps we need to perform auto color_db = input_db->getDatabase( "Color" ); auto vis_db = input_db->getDatabase( "Visualization" ); @@ -954,7 +954,7 @@ void runAnalysis::basic(int timestep, std::shared_ptr input_db, SubPha finish(); } - PROFILE_START("run"); + PROFILE_START("basic"); // Copy the appropriate variables to the host (so we can spawn new threads) ScaLBL_DeviceBarrier(); @@ -983,7 +983,6 @@ void runAnalysis::basic(int timestep, std::shared_ptr input_db, SubPha } PROFILE_STOP("Copy data to host"); - PROFILE_START("run",1); // Spawn threads to do the analysis work //if (timestep%d_restart_interval==0){ // if ( matches(type,AnalysisType::ComputeAverages) ) { @@ -1036,12 +1035,11 @@ void runAnalysis::basic(int timestep, std::shared_ptr input_db, SubPha d_wait_vis = d_tpool.add_work(work); } - PROFILE_STOP("run"); + PROFILE_STOP("basic"); } void runAnalysis::WriteVisData(int timestep, std::shared_ptr input_db, SubPhase &Averages, const double *Phi, double *Pressure, double *Velocity, double *fq, double *Den) { - int N = d_N[0]*d_N[1]*d_N[2]; auto color_db = input_db->getDatabase( "Color" ); auto vis_db = input_db->getDatabase( "Visualization" ); //int timestep = 
color_db->getWithDefault( "timestep", 0 ); @@ -1068,7 +1066,6 @@ void runAnalysis::WriteVisData(int timestep, std::shared_ptr input_db, d_wait_vis = d_tpool.add_work(work2); //Averages.WriteVis = false; - // } PROFILE_STOP("write vis"); } diff --git a/common/Communication.hpp b/common/Communication.hpp index cb9f3f18..33fed3a7 100644 --- a/common/Communication.hpp +++ b/common/Communication.hpp @@ -44,9 +44,9 @@ Array redistribute( const RankInfoStruct& src_rank, const Array& src if ( !src_data.empty() ) { int i1[3] = { src_size[0] * src_rank.ix, src_size[1] * src_rank.jy, src_size[2] * src_rank.kz }; int i2[3] = { i1[0] + src_size[0] - 1, i1[1] + src_size[1] - 1, i1[2] + src_size[2] - 1 }; - for ( size_t i=0; i redistribute( const RankInfoStruct& src_rank, const Array& src Array dst_data( dst_size[0], dst_size[1], dst_size[2] ); int i1[3] = { dst_size[0] * dst_rank.ix, dst_size[1] * dst_rank.jy, dst_size[2] * dst_rank.kz }; int i2[3] = { i1[0] + dst_size[0] - 1, i1[1] + dst_size[1] - 1, i1[2] + dst_size[2] - 1 }; - for ( size_t i=0; i readMicroCT( const Database& domain, MPI_Comm comm ) auto n = domain.getVector( "n" ); int rank = comm_rank(MPI_COMM_WORLD); auto nproc = domain.getVector( "nproc" ); - auto ReadValues = domain.getVector( "ReadValues" ); - auto WriteValues = domain.getVector( "WriteValues" ); RankInfoStruct rankInfo( rank, nproc[0], nproc[1], nproc[2] ); // Determine the largest file number to get @@ -95,29 +93,26 @@ Array readMicroCT( const Database& domain, MPI_Comm comm ) ERROR( "Invalid name for first file" ); } data = readMicroCT( filename ); - - // Relabel the data - for (int k = 0; k<1024; k++){ - for (int j = 0; j<1024; j++){ - for (int i = 0; i<1024; i++){ - //n = k*Nfx*Nfy + j*Nfx + i; - //char locval = loc_id[n]; - char locval = data(i,j,k); - for (int idx=0; idx( "ReadValues" ); + auto WriteValues = domain.getVector( "WriteValues" ); + ASSERT( ReadValues.size() == WriteValues.size() ); + int readMaxValue = 0; + for ( auto v : ReadValues ) + readMaxValue = std::max( data.max()+1, v ); + std::vector map( readMaxValue + 1, -1 ); + for ( size_t i=0; i= 0 && t <= readMaxValue ); + data(i) = map[t]; + } + return data; } diff --git a/common/Utilities.cpp b/common/Utilities.cpp index f6d810af..1cf764be 100644 --- a/common/Utilities.cpp +++ b/common/Utilities.cpp @@ -1,10 +1,116 @@ #include "common/Utilities.h" +#include "StackTrace/StackTrace.h" +#include "StackTrace/ErrorHandlers.h" + +#ifdef USE_TIMER +#include "MemoryApp.h" +#include "ProfilerApp.h" +#endif + +#ifdef USE_MPI +#include "mpi.h" +#endif -#include #include +#include +#include -// Factor a number into it's prime factors +// Mutex for Utility functions +static std::mutex Utilities_mutex; + + +/**************************************************************************** + * Function to perform the default startup/shutdown sequences * + ****************************************************************************/ +void Utilities::startup( int argc, char **argv ) +{ + NULL_USE( argc ); + NULL_USE( argv ); + // Disable OpenMP + Utilities::setenv( "OMP_NUM_THREADS", "1" ); + Utilities::setenv( "MKL_NUM_THREADS", "1" ); + // Start MPI +#ifdef USE_MPI + int provided; + MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &provided ); + if ( provided < MPI_THREAD_MULTIPLE ) { + int rank; + MPI_Comm_rank( MPI_COMM_WORLD, &rank ); + if ( rank == 0 ) + std::cerr << "Warning: Failed to start MPI with necessary thread support, thread support will be disabled" << std::endl; + } + StackTrace::globalCallStackInitialize( 
MPI_COMM_WORLD ); +#endif + // Set the error handlers + Utilities::setAbortBehavior( true, 3 ); + Utilities::setErrorHandlers(); +} +void Utilities::shutdown() +{ + // Clear the error handlers + Utilities::clearErrorHandlers(); + StackTrace::clearSignals(); + StackTrace::clearSymbols(); + int rank = 0; +#ifdef USE_MPI + MPI_Comm_rank( MPI_COMM_WORLD, &rank ); + StackTrace::globalCallStackFinalize(); + MPI_Barrier( MPI_COMM_WORLD ); + MPI_Finalize(); +#endif +#ifdef USE_TIMER + PROFILE_DISABLE(); + auto memory = MemoryApp::getMemoryStats(); + if ( rank == 0 && memory.N_new > memory.N_delete ) + MemoryApp::print( std::cout ); +#endif +} + + +/**************************************************************************** + * Function to set an environemental variable * + ****************************************************************************/ +void Utilities::setenv( const std::string &name, const std::string &value ) +{ + Utilities_mutex.lock(); +#if defined( USE_LINUX ) || defined( USE_MAC ) + bool pass = false; + if ( !value.empty() ) + pass = ::setenv( name.data(), value.data(), 1 ) == 0; + else + pass = ::unsetenv( name.data() ) == 0; +#elif defined( USE_WINDOWS ) + bool pass = SetEnvironmentVariable( name.data(), value.data() ) != 0; +#else +#error Unknown OS +#endif + Utilities_mutex.unlock(); + if ( !pass ) { + char msg[1024]; + if ( !value.empty() ) + sprintf( + msg, "Error setting enviornmental variable: %s=%s\n", name.data(), value.data() ); + else + sprintf( msg, "Error clearing enviornmental variable: %s\n", name.data() ); + ERROR( msg ); + } +} +std::string Utilities::getenv( const std::string &name ) +{ + std::string var; + Utilities_mutex.lock(); + auto tmp = std::getenv( name.data() ); + if ( tmp ) + var = std::string( tmp ); + Utilities_mutex.unlock(); + return var; +} + + +/**************************************************************************** + * Factor a number into it's prime factors * + ****************************************************************************/ std::vector Utilities::factor(size_t number) { if ( number<=3 ) @@ -54,9 +160,13 @@ std::vector Utilities::factor(size_t number) } -// Dummy function to prevent compiler from optimizing away variable +/**************************************************************************** + * Dummy function to prevent compiler from optimizing away variable * + ****************************************************************************/ void Utilities::nullUse( void* data ) { NULL_USE(data); } + + diff --git a/common/Utilities.h b/common/Utilities.h index 90cb4008..da579966 100644 --- a/common/Utilities.h +++ b/common/Utilities.h @@ -25,6 +25,37 @@ using StackTrace::Utilities::sleep_ms; using StackTrace::Utilities::sleep_s; +/*! + * \brief Start MPI, error handlers + * \details This routine will peform the default startup sequence + * \param argc argc from main + * \param argv argv from main + */ +void startup( int argc, char **argv ); + +/*! + * \brief Stop MPI, error handlers + * \details This routine will peform the default shutdown sequence to match startup + */ +void shutdown(); + + +/*! + * Get an environmental variable + * @param name The name of the environmental variable + * @return The value of the enviornmental variable + */ +std::string getenv( const std::string &name ); + + +/*! + * Set an environmental variable + * @param name The name of the environmental variable + * @param value The value to set + */ +void setenv( const std::string &name, const std::string &value ); + + //! 
std::string version of sprintf inline std::string stringf( const char *format, ... ); diff --git a/tests/lbpm_color_simulator.cpp b/tests/lbpm_color_simulator.cpp index e8e675e2..1f63c653 100644 --- a/tests/lbpm_color_simulator.cpp +++ b/tests/lbpm_color_simulator.cpp @@ -7,6 +7,7 @@ #include #include "models/ColorModel.h" +#include "common/Utilities.h" //#define WRE_SURFACES @@ -15,7 +16,6 @@ * James E. McClure 2013-2014 */ -using namespace std; //************************************************************************* // Implementation of Two-Phase Immiscible LBM using CUDA @@ -23,27 +23,26 @@ using namespace std; int main(int argc, char **argv) { - // Initialize MPI - int provided_thread_support = -1; - MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE,&provided_thread_support); - MPI_Comm comm; - MPI_Comm_dup(MPI_COMM_WORLD,&comm); - int rank = comm_rank(comm); - int nprocs = comm_size(comm); - if ( rank==0 && provided_thread_support Date: Mon, 13 Jan 2020 22:50:37 -0500 Subject: [PATCH 014/121] save the work --- gpu/Greyscale.cu | 468 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 468 insertions(+) diff --git a/gpu/Greyscale.cu b/gpu/Greyscale.cu index 18bfba58..fdb0a462 100644 --- a/gpu/Greyscale.cu +++ b/gpu/Greyscale.cu @@ -394,6 +394,474 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist } } +__global__ void dvc_ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int finish, int Np, double rlx, double Gx, double Gy, double Gz, + double *Poros,double *Perm, double *Velocity, double Den){ + int n; + double vx,vy,vz,v_mag; + double ux,uy,uz,u_mag; + // conserved momemnts + double rho,jx,jy,jz; + // non-conserved moments + double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18; + double m3,m5,m7; + double GeoFun;//geometric function from Guo's PRE 66, 036304 (2002) + double porosity; + double perm;//voxel permeability + double c0, c1; //Guo's model parameters + double mu = (1.0/rlx-0.5)/3.0;//kinematic viscosity + double Fx, Fy, Fz;//The total body force including Brinkman force and user-specified (Gx,Gy,Gz) + double rlx_setA = rlx; + double rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA); + + int S = Np/NBLOCKS/NTHREADS + 1; + for (int s=0; s>>(dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity); From 3b23fca11864c313da091bd375000dfb70d39a88 Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Tue, 14 Jan 2020 12:00:22 -0500 Subject: [PATCH 015/121] change specifier of printf to correct the output for very large image --- common/Domain.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/Domain.cpp b/common/Domain.cpp index 82bcaee2..48bfed15 100644 --- a/common/Domain.cpp +++ b/common/Domain.cpp @@ -391,7 +391,7 @@ void Domain::Decomp(std::string Filename) for (int idx=0; idx Date: Tue, 14 Jan 2020 12:01:33 -0500 Subject: [PATCH 016/121] change specifier of printf to correct the output for very large image --- common/Domain.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/Domain.cpp b/common/Domain.cpp index 82bcaee2..48bfed15 100644 --- a/common/Domain.cpp +++ b/common/Domain.cpp @@ -391,7 +391,7 @@ void Domain::Decomp(std::string Filename) for (int idx=0; idx Date: Fri, 17 Jan 2020 18:46:28 -0500 Subject: [PATCH 017/121] save the work, CPU versions seem to work, but need non-unity porosity test --- common/Domain.cpp | 2 +- common/ScaLBL.h | 7 + cpu/Greyscale.cpp | 1070 +++++++++++++++++++++++++++++++++++++ models/GreyscaleModel.cpp | 20 +- models/GreyscaleModel.h | 1 + 5 
files changed, 1093 insertions(+), 7 deletions(-) diff --git a/common/Domain.cpp b/common/Domain.cpp index 82bcaee2..48bfed15 100644 --- a/common/Domain.cpp +++ b/common/Domain.cpp @@ -391,7 +391,7 @@ void Domain::Decomp(std::string Filename) for (int idx=0; idx 10Np => odd part of dist) + fq = dist[nread]; // reading the f1 data into register fq + pressure = fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jx = fq; + m4 = -4.0*fq; + m9 = 2.0*fq; + m10 = -4.0*fq; + + // q=2 + nread = neighborList[n+Np]; // neighbor 1 ( < 10Np => even part of dist) + fq = dist[nread]; // reading the f2 data into register fq + pressure += fq; + m1 -= 11.0*(fq); + m2 -= 4.0*(fq); + jx -= fq; + m4 += 4.0*(fq); + m9 += 2.0*(fq); + m10 -= 4.0*(fq); + + // q=3 + nread = neighborList[n+2*Np]; // neighbor 4 + fq = dist[nread]; + pressure += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jy = fq; + m6 = -4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 = fq; + m12 = -2.0*fq; + + // q = 4 + nread = neighborList[n+3*Np]; // neighbor 3 + fq = dist[nread]; + pressure += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jy -= fq; + m6 += 4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 += fq; + m12 -= 2.0*fq; + + // q=5 + nread = neighborList[n+4*Np]; + fq = dist[nread]; + pressure += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jz = fq; + m8 = -4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 -= fq; + m12 += 2.0*fq; + + // q = 6 + nread = neighborList[n+5*Np]; + fq = dist[nread]; + pressure += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jz -= fq; + m8 += 4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 -= fq; + m12 += 2.0*fq; + + // q=7 + nread = neighborList[n+6*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jy += fq; + m6 += fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 = fq; + m16 = fq; + m17 = -fq; + + // q = 8 + nread = neighborList[n+7*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jy -= fq; + m6 -= fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 += fq; + m16 -= fq; + m17 += fq; + + // q=9 + nread = neighborList[n+8*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jy -= fq; + m6 -= fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 -= fq; + m16 += fq; + m17 += fq; + + // q = 10 + nread = neighborList[n+9*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jy += fq; + m6 += fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 -= fq; + m16 -= fq; + m17 -= fq; + + // q=11 + nread = neighborList[n+10*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jz += fq; + m8 += fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 = fq; + m16 -= fq; + m18 = fq; + + // q=12 + nread = neighborList[n+11*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jz -= fq; + m8 -= fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 += fq; + m16 += fq; + m18 -= fq; + + // q=13 + nread = neighborList[n+12*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jz -= fq; + m8 -= fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 -= fq; + m16 -= fq; + m18 -= fq; + + // q=14 + nread = neighborList[n+13*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jz += fq; + m8 += fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 -= fq; + m16 += fq; + m18 += fq; + + // 
q=15 + nread = neighborList[n+14*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jy += fq; + m6 += fq; + jz += fq; + m8 += fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 = fq; + m17 += fq; + m18 -= fq; + + // q=16 + nread = neighborList[n+15*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jy -= fq; + m6 -= fq; + jz -= fq; + m8 -= fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 += fq; + m17 -= fq; + m18 += fq; + + // q=17 + nread = neighborList[n+16*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jy += fq; + m6 += fq; + jz -= fq; + m8 -= fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 -= fq; + m17 += fq; + m18 += fq; + + // q=18 + nread = neighborList[n+17*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jy -= fq; + m6 -= fq; + jz += fq; + m8 += fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 -= fq; + m17 -= fq; + m18 -= fq; + //---------------------------------------------------------------------// + + porosity = Poros[n]; + perm = Perm[n]; + + c0 = 0.5*(1.0+porosity*0.5*mu/perm); + if (porosity==1.0) c0 = 0.5;//i.e. apparent pore nodes + GeoFun = 1.75/sqrt(150.0*porosity*porosity*porosity); + c1 = porosity*0.5*GeoFun/sqrt(perm); + if (porosity==1.0) c1 = 0.0;//i.e. apparent pore nodes + + vx = jx/Den+0.5*porosity*Gx; + vy = jy/Den+0.5*porosity*Gy; + vz = jz/Den+0.5*porosity*Gz; + v_mag=sqrt(vx*vx+vy*vy+vz*vz); + ux = vx/(c0+sqrt(c0*c0+c1*v_mag)); + uy = vy/(c0+sqrt(c0*c0+c1*v_mag)); + uz = vz/(c0+sqrt(c0*c0+c1*v_mag)); + u_mag=sqrt(ux*ux+uy*uy+uz*uz); + + //Update the total force to include linear (Darcy) and nonlinear (Forchheimer) drags due to the porous medium + Fx = Den*(-porosity*mu/perm*ux - porosity*GeoFun/sqrt(perm)*u_mag*ux + porosity*Gx); + Fy = Den*(-porosity*mu/perm*uy - porosity*GeoFun/sqrt(perm)*u_mag*uy + porosity*Gy); + Fz = Den*(-porosity*mu/perm*uz - porosity*GeoFun/sqrt(perm)*u_mag*uz + porosity*Gz); + if (porosity==1.0){ + Fx=Den*Gx; + Fy=Den*Gy; + Fz=Den*Gz; + } + + //Calculate pressure for Incompressible-MRT model + pressure=0.5/porosity*(pressure-0.5*Den*u_mag*u_mag/porosity); + + //..............carry out relaxation process............................................... +// m1 = m1 + rlx_setA*((-30*Den+19*(jx*jx+jy*jy+jz*jz)/Den/porosity + 57*pressure*porosity) - m1); +// m2 = m2 + rlx_setA*((12*Den - 5.5*(jx*jx+jy*jy+jz*jz)/Den/porosity-27*pressure*porosity) - m2); +// m4 = m4 + rlx_setB*((-0.6666666666666666*jx) - m4); +// m6 = m6 + rlx_setB*((-0.6666666666666666*jy) - m6); +// m8 = m8 + rlx_setB*((-0.6666666666666666*jz) - m8); +// m9 = m9 + rlx_setA*(((2*jx*jx-jy*jy-jz*jz)/Den/porosity) - m9); +// m10 = m10 + rlx_setA*(-0.5*((2*jx*jx-jy*jy-jz*jz)/Den/porosity)- m10); +// m11 = m11 + rlx_setA*(((jy*jy-jz*jz)/Den/porosity) - m11); +// m12 = m12 + rlx_setA*(-0.5*((jy*jy-jz*jz)/Den/porosity)- m12); +// m13 = m13 + rlx_setA*((jx*jy/Den/porosity) - m13); +// m14 = m14 + rlx_setA*((jy*jz/Den/porosity) - m14); +// m15 = m15 + rlx_setA*((jx*jz/Den/porosity) - m15); +// m16 = m16 + rlx_setB*( - m16); +// m17 = m17 + rlx_setB*( - m17); +// m18 = m18 + rlx_setB*( - m18); + //....................................................................................................... + + //..............carry out relaxation process............................................... 
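The velocity and body-force update carried out just above follows Guo's drag model for porous media (PRE 66, 036304 (2002)), as noted in the kernel comments. A compact sketch of that step in isolation, with only the formulas taken from the kernel above (the struct and function names are invented for illustration):

#include <cmath>

// Given bare momentum (jx,jy,jz), density rho, porosity eps, voxel permeability K,
// kinematic viscosity mu and applied acceleration (Gx,Gy,Gz), return the corrected
// velocity and the total body force (Darcy + Forchheimer drag plus the applied force).
struct GreyForce { double ux, uy, uz, Fx, Fy, Fz; };

GreyForce GuoDrag(double jx, double jy, double jz, double rho, double eps,
                  double K, double mu, double Gx, double Gy, double Gz)
{
    double c0 = 0.5 * (1.0 + eps * 0.5 * mu / K);
    double GeoFun = 1.75 / sqrt(150.0 * eps * eps * eps);
    double c1 = eps * 0.5 * GeoFun / sqrt(K);
    if (eps == 1.0) { c0 = 0.5; c1 = 0.0; }   // open pore voxels carry no drag

    // provisional velocity including half of the applied acceleration
    double vx = jx / rho + 0.5 * eps * Gx;
    double vy = jy / rho + 0.5 * eps * Gy;
    double vz = jz / rho + 0.5 * eps * Gz;
    double v = sqrt(vx * vx + vy * vy + vz * vz);

    GreyForce r;
    r.ux = vx / (c0 + sqrt(c0 * c0 + c1 * v));
    r.uy = vy / (c0 + sqrt(c0 * c0 + c1 * v));
    r.uz = vz / (c0 + sqrt(c0 * c0 + c1 * v));
    double u = sqrt(r.ux * r.ux + r.uy * r.uy + r.uz * r.uz);

    // linear (Darcy) and nonlinear (Forchheimer) drag plus the applied force
    r.Fx = rho * (-eps * mu / K * r.ux - eps * GeoFun / sqrt(K) * u * r.ux + eps * Gx);
    r.Fy = rho * (-eps * mu / K * r.uy - eps * GeoFun / sqrt(K) * u * r.uy + eps * Gy);
    r.Fz = rho * (-eps * mu / K * r.uz - eps * GeoFun / sqrt(K) * u * r.uz + eps * Gz);
    if (eps == 1.0) { r.Fx = rho * Gx; r.Fy = rho * Gy; r.Fz = rho * Gz; }
    return r;
}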
+ m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1); + m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2); + m4 = m4 + rlx_setB*((-0.6666666666666666*ux*Den) - m4); + m6 = m6 + rlx_setB*((-0.6666666666666666*uy*Den) - m6); + m8 = m8 + rlx_setB*((-0.6666666666666666*uz*Den) - m8); + m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9); + m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10); + m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11); + m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12); + m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13); + m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14); + m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15); + m16 = m16 + rlx_setB*( - m16); + m17 = m17 + rlx_setB*( - m17); + m18 = m18 + rlx_setB*( - m18); + //....................................................................................................... + + jx+=0.5*Fx;//There is no collision for momentum, but they must be updated subject to the body force + jy+=0.5*Fy; + jz+=0.5*Fz; + //.................inverse transformation...................................................... + // q=0 + //fq = mrt_V1*rho-mrt_V2*m1+mrt_V3*m2; + fq = mrt_V1*Den-mrt_V2*m1+mrt_V3*m2 + + 0.3333333333333333*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + dist[n] = fq; + + // q = 1 + //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jx-m4)+mrt_V6*(m9-m10) + 0.16666666*Fx; + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jx-m4)+mrt_V6*(m9-m10) + +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + nread = neighborList[n+Np]; + dist[nread] = fq; + + // q=2 + //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m4-jx)+mrt_V6*(m9-m10) - 0.16666666*Fx; + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m4-jx)+mrt_V6*(m9-m10) + +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(-3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + nread = neighborList[n]; + dist[nread] = fq; + + // q = 3 + //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jy-m6)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) + 0.16666666*Fy; + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jy-m6)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) + +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + nread = neighborList[n+3*Np]; + dist[nread] = fq; + + // q = 4 + //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m6-jy)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) - 0.16666666*Fy; + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m6-jy)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) + +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + nread = neighborList[n+2*Np]; + dist[nread] = fq; + + // q = 5 + //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jz-m8)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) + 0.16666666*Fz; + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jz-m8)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) + +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(3. + (6.*uz)/porosity)); + nread = neighborList[n+5*Np]; + dist[nread] = fq; + + // q = 6 + //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m8-jz)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) - 0.16666666*Fz; + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m8-jz)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) + +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. 
- (3.*uy)/porosity) + Fz*(-3. + (6.*uz)/porosity)); + nread = neighborList[n+4*Np]; + dist[nread] = fq; + + // q = 7 + //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jy)+0.025*(m4+m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m16-m17) + 0.08333333333*(Fx+Fy); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jy)+0.025*(m4+m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m16-m17) + +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(ux + uy))/porosity) + + Fz*(0. - (3.*uz)/porosity)); + nread = neighborList[n+7*Np]; + dist[nread] = fq; + + // q = 8 + //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jy)-0.025*(m4+m6) +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m17-m16) - 0.08333333333*(Fx+Fy); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jy)-0.025*(m4+m6) +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m17-m16) + +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uy))/porosity) + Fy*(-3. - (9.*(-ux - uy))/porosity - (3.*uy)/porosity) + + Fz*(0. - (3.*uz)/porosity)); + nread = neighborList[n+6*Np]; + dist[nread] = fq; + + // q = 9 + //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jy)+0.025*(m4-m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13+0.125*(m16+m17) + 0.08333333333*(Fx-Fy); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jy)+0.025*(m4-m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13+0.125*(m16+m17) + +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux - uy))/porosity) + Fy*(-3. - (9.*(ux - uy))/porosity - (3.*uy)/porosity) + + Fz*(0. - (3.*uz)/porosity)); + nread = neighborList[n+9*Np]; + dist[nread] = fq; + + // q = 10 + //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jx)+0.025*(m6-m4)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13-0.125*(m16+m17)- 0.08333333333*(Fx-Fy); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jx)+0.025*(m6-m4)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13-0.125*(m16+m17) + +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(-ux + uy))/porosity) + + Fz*(0. - (3.*uz)/porosity)); + nread = neighborList[n+8*Np]; + dist[nread] = fq; + + // q = 11 + //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jz)+0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m18-m16) + 0.08333333333*(Fx+Fz); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jz)+0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m18-m16) + +0.027777777777777776*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux + uz))/porosity) + + Fz*(3. - (3.*uz)/porosity + (9.*(ux + uz))/porosity)); + nread = neighborList[n+11*Np]; + dist[nread] = fq; + + // q = 12 + //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jz)-0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m16-m18) - 0.08333333333*(Fx+Fz); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jz)-0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m16-m18) + +0.027777777777777776*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uz))/porosity) + + Fz*(-3. 
- (9.*(-ux - uz))/porosity - (3.*uz)/porosity)); + nread = neighborList[n+10*Np]; + dist[nread]= fq; + + // q = 13 + //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jz)+0.025*(m4-m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15-0.125*(m16+m18) + 0.08333333333*(Fx-Fz); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jz)+0.025*(m4-m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15-0.125*(m16+m18) + +0.027777777777777776*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux - uz))/porosity) + + Fz*(-3. - (9.*(ux - uz))/porosity - (3.*uz)/porosity)); + nread = neighborList[n+13*Np]; + dist[nread] = fq; + + // q= 14 + //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jx)+0.025*(m8-m4)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15+0.125*(m16+m18) - 0.08333333333*(Fx-Fz); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jx)+0.025*(m8-m4)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15+0.125*(m16+m18) + +0.027777777777777776*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uz))/porosity) + + Fz*(3. - (3.*uz)/porosity + (9.*(-ux + uz))/porosity)); + nread = neighborList[n+12*Np]; + dist[nread] = fq; + + // q = 15 + //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jy+jz)+0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m17-m18) + 0.08333333333*(Fy+Fz); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy+jz)+0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m17-m18) + +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy + uz))/porosity) + + Fz*(3. - (3.*uz)/porosity + (9.*(uy + uz))/porosity)); + nread = neighborList[n+15*Np]; + dist[nread] = fq; + + // q = 16 + //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jy+jz)-0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m18-m17)- 0.08333333333*(Fy+Fz); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jy+jz)-0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m18-m17) + +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy - uz))/porosity) + + Fz*(-3. - (9.*(-uy - uz))/porosity - (3.*uz)/porosity)); + nread = neighborList[n+14*Np]; + dist[nread] = fq; + + // q = 17 + //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jz)+0.025*(m6-m8)-mrt_V6*m9-mrt_V7*m10-0.25*m14+0.125*(m17+m18) + 0.08333333333*(Fy-Fz); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jz)+0.025*(m6-m8)-mrt_V6*m9-mrt_V7*m10-0.25*m14+0.125*(m17+m18) + +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy - uz))/porosity) + + Fz*(-3. - (9.*(uy - uz))/porosity - (3.*uz)/porosity)); + nread = neighborList[n+17*Np]; + dist[nread] = fq; + + // q = 18 + //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jy)+0.025*(m8-m6)-mrt_V6*m9-mrt_V7*m10-0.25*m14-0.125*(m17+m18) - 0.08333333333*(Fy-Fz); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jy)+0.025*(m8-m6)-mrt_V6*m9-mrt_V7*m10-0.25*m14-0.125*(m17+m18) + +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy + uz))/porosity) + + Fz*(3. - (3.*uz)/porosity + (9.*(-uy + uz))/porosity)); + nread = neighborList[n+16*Np]; + dist[nread] = fq; + //........................................................................ 
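The long 0.027777777777777776*(1. - 0.5*rlx)*(...) expressions in the inverse transformation above are the per-direction Guo forcing terms, with the velocity rescaled by the local porosity. A hedged sketch of the generic form for one D3Q19 direction (ForcingTerm and its argument layout are illustrative, not names from the patch):

// Guo forcing contribution for one lattice direction e with weight w
// (1/3, 1/18 or 1/36 for D3Q19); rlx = 1/tau and eps is the local porosity.
double ForcingTerm(const double e[3], double w, double rlx, double eps,
                   const double u[3], const double F[3])
{
    double eF = e[0] * F[0] + e[1] * F[1] + e[2] * F[2];
    double uF = u[0] * F[0] + u[1] * F[1] + u[2] * F[2];
    double eu = e[0] * u[0] + e[1] * u[1] + e[2] * u[2];
    return w * (1.0 - 0.5 * rlx) * (3.0 * eF - 3.0 * uF / eps + 9.0 * eu * eF / eps);
}

For example, e = {1,1,0} with w = 1/36 reproduces the q = 7 term written out above.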
+ + //Update velocity on device + Velocity[0*Np+n] = ux; + Velocity[1*Np+n] = uy; + Velocity[2*Np+n] = uz; + } +} + diff --git a/models/GreyscaleModel.cpp b/models/GreyscaleModel.cpp index 2ce6ff5e..1cdae815 100644 --- a/models/GreyscaleModel.cpp +++ b/models/GreyscaleModel.cpp @@ -8,7 +8,7 @@ color lattice boltzmann model #include ScaLBL_GreyscaleModel::ScaLBL_GreyscaleModel(int RANK, int NP, MPI_Comm COMM): -rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tau(0),Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0), +rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tau(0),Den(0),Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0), Nx(0),Ny(0),Nz(0),N(0),Np(0),nprocx(0),nprocy(0),nprocz(0),BoundaryCondition(0),Lx(0),Ly(0),Lz(0),comm(COMM) { SignDist.resize(Nx,Ny,Nz); @@ -30,6 +30,7 @@ void ScaLBL_GreyscaleModel::ReadParams(string filename){ // set defaults timestepMax = 100000; tau = 1.0; + Den = 1.0;//constant density tolerance = 0.01; Fx = Fy = Fz = 0.0; Restart=false; @@ -37,13 +38,16 @@ void ScaLBL_GreyscaleModel::ReadParams(string filename){ flux=0.0; dp = 10.0; //unit of 'dp': voxel - // Color Model parameters + // Greyscale Model parameters if (greyscale_db->keyExists( "timestepMax" )){ timestepMax = greyscale_db->getScalar( "timestepMax" ); } if (greyscale_db->keyExists( "tau" )){ tau = greyscale_db->getScalar( "tau" ); } + if (greyscale_db->keyExists( "Den" )){ + Den = greyscale_db->getScalar( "Den" ); + } if (greyscale_db->keyExists( "dp" )){ dp = greyscale_db->getScalar( "dp" ); } @@ -423,15 +427,19 @@ void ScaLBL_GreyscaleModel::Run(){ //************************************************************************/ timestep++; ScaLBL_Comm->SendD3Q19AA(fq); //READ FROM NORMAL - ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); + //ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); + ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE - ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); + //ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); + ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); ScaLBL_DeviceBarrier(); MPI_Barrier(comm); timestep++; ScaLBL_Comm->SendD3Q19AA(fq); //READ FORM NORMAL - ScaLBL_D3Q19_AAeven_Greyscale(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); + //ScaLBL_D3Q19_AAeven_Greyscale(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); + ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE - ScaLBL_D3Q19_AAeven_Greyscale(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); + //ScaLBL_D3Q19_AAeven_Greyscale(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); + 
ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); ScaLBL_DeviceBarrier(); MPI_Barrier(comm); //************************************************************************/ diff --git a/models/GreyscaleModel.h b/models/GreyscaleModel.h index 9b970a65..ac939aed 100644 --- a/models/GreyscaleModel.h +++ b/models/GreyscaleModel.h @@ -36,6 +36,7 @@ public: int timestep,timestepMax; int BoundaryCondition; double tau; + double Den;//constant density double tolerance; double Fx,Fy,Fz,flux; double din,dout; From 7e4e91a06beb1ee75633e31e0e25e3fe1e032e45 Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Sat, 18 Jan 2020 18:43:20 -0500 Subject: [PATCH 018/121] The CPU version of incompressible MRT greyscale model is available now --- cpu/Greyscale.cpp | 298 +++++++++++++++------------------------------- 1 file changed, 93 insertions(+), 205 deletions(-) diff --git a/cpu/Greyscale.cpp b/cpu/Greyscale.cpp index 11a8eb5c..95cf516b 100644 --- a/cpu/Greyscale.cpp +++ b/cpu/Greyscale.cpp @@ -724,169 +724,113 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int pressure=0.5/porosity*(pressure-0.5*Den*u_mag*u_mag/porosity); //..............carry out relaxation process............................................... -// m1 = m1 + rlx_setA*((-30*Den+19*(jx*jx+jy*jy+jz*jz)/Den/porosity + 57*pressure*porosity) - m1); -// m2 = m2 + rlx_setA*((12*Den - 5.5*(jx*jx+jy*jy+jz*jz)/Den/porosity-27*pressure*porosity) - m2); -// m4 = m4 + rlx_setB*((-0.6666666666666666*jx) - m4); -// m6 = m6 + rlx_setB*((-0.6666666666666666*jy) - m6); -// m8 = m8 + rlx_setB*((-0.6666666666666666*jz) - m8); -// m9 = m9 + rlx_setA*(((2*jx*jx-jy*jy-jz*jz)/Den/porosity) - m9); -// m10 = m10 + rlx_setA*(-0.5*((2*jx*jx-jy*jy-jz*jz)/Den/porosity)- m10); -// m11 = m11 + rlx_setA*(((jy*jy-jz*jz)/Den/porosity) - m11); -// m12 = m12 + rlx_setA*(-0.5*((jy*jy-jz*jz)/Den/porosity)- m12); -// m13 = m13 + rlx_setA*((jx*jy/Den/porosity) - m13); -// m14 = m14 + rlx_setA*((jy*jz/Den/porosity) - m14); -// m15 = m15 + rlx_setA*((jx*jz/Den/porosity) - m15); -// m16 = m16 + rlx_setB*( - m16); -// m17 = m17 + rlx_setB*( - m17); -// m18 = m18 + rlx_setB*( - m18); - //....................................................................................................... - - //..............carry out relaxation process............................................... 
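The commit above moves the force contribution out of the inverse transformation and into the moment relaxation: each moment relaxes toward its equilibrium and then receives a (1 - s/2)-weighted force moment, while the conserved momenta absorb the full force. A minimal sketch of that pattern (RelaxMoment is an illustrative helper, not part of the patch; the actual equilibria and force moments are the expressions added in the hunk that follows):

// One MRT moment update with Guo-style forcing in moment space:
//   m_new = m + s*(m_eq - m) + (1 - s/2)*Fhat
double RelaxMoment(double m, double m_eq, double s, double Fhat)
{
    return m + s * (m_eq - m) + (1.0 - 0.5 * s) * Fhat;
}

// Example matching the new m9 line in this hunk: equilibrium
// Den*(2*ux*ux - uy*uy - uz*uz)/porosity and force moment
// (4*Fx*ux - 2*Fy*uy - 2*Fz*uz)/porosity, relaxed with rlx_setA,
// while the momentum itself is updated in full, e.g. jx = jx + Fx.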
- m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1); - m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2); - m4 = m4 + rlx_setB*((-0.6666666666666666*ux*Den) - m4); - m6 = m6 + rlx_setB*((-0.6666666666666666*uy*Den) - m6); - m8 = m8 + rlx_setB*((-0.6666666666666666*uz*Den) - m8); - m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9); - m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10); - m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11); - m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12); - m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13); - m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14); - m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15); + m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1) + + (1-0.5*rlx_setA)*38*(Fx*ux+Fy*uy+Fz*uz)/porosity; + m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2) + + (1-0.5*rlx_setA)*11*(-Fx*ux-Fy*uy-Fz*uz)/porosity; + jx = jx + Fx; + m4 = m4 + rlx_setB*((-0.6666666666666666*ux*Den) - m4) + + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx); + jy = jy + Fy; + m6 = m6 + rlx_setB*((-0.6666666666666666*uy*Den) - m6) + + (1-0.5*rlx_setB)*(-0.6666666666666666*Fy); + jz = jz + Fz; + m8 = m8 + rlx_setB*((-0.6666666666666666*uz*Den) - m8) + + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz); + m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9) + + (1-0.5*rlx_setA)*(4*Fx*ux-2*Fy*uy-2*Fz*uz)/porosity; + m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10) + + (1-0.5*rlx_setA)*(-2*Fx*ux+Fy*uy+Fz*uz)/porosity; + m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11) + + (1-0.5*rlx_setA)*(2*Fy*uy-2*Fz*uz)/porosity; + m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12) + + (1-0.5*rlx_setA)*(-Fy*uy+Fz*uz)/porosity; + m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13) + + (1-0.5*rlx_setA)*(Fy*ux+Fx*uy)/porosity; + m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14) + + (1-0.5*rlx_setA)*(Fz*uy+Fy*uz)/porosity; + m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15) + + (1-0.5*rlx_setA)*(Fz*ux+Fx*uz)/porosity; m16 = m16 + rlx_setB*( - m16); m17 = m17 + rlx_setB*( - m17); m18 = m18 + rlx_setB*( - m18); //....................................................................................................... - jx+=0.5*Fx;//There is no collision for momentum, but they must be updated subject to the body force - jy+=0.5*Fy; - jz+=0.5*Fz; //.................inverse transformation...................................................... // q=0 - //fq = mrt_V1*rho-mrt_V2*m1+mrt_V3*m2; - fq = mrt_V1*Den-mrt_V2*m1+mrt_V3*m2 - + 0.3333333333333333*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den-mrt_V2*m1+mrt_V3*m2; dist[n] = fq; // q = 1 - //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jx-m4)+mrt_V6*(m9-m10) + 0.16666666*Fx; - fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jx-m4)+mrt_V6*(m9-m10) - +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jx-m4)+mrt_V6*(m9-m10); dist[1*Np+n] = fq; // q=2 - //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m4-jx)+mrt_V6*(m9-m10) - 0.16666666*Fx; - fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m4-jx)+mrt_V6*(m9-m10) - +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(-3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. 
- (3.*uz)/porosity)); + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m4-jx)+mrt_V6*(m9-m10); dist[2*Np+n] = fq; // q = 3 - //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jy-m6)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) + 0.16666666*Fy; - fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jy-m6)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) - +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jy-m6)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12); dist[3*Np+n] = fq; // q = 4 - //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m6-jy)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) - 0.16666666*Fy; - fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m6-jy)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) - +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m6-jy)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12); dist[4*Np+n] = fq; // q = 5 - //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jz-m8)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) + 0.16666666*Fz; - fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jz-m8)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) - +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(3. + (6.*uz)/porosity)); + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jz-m8)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11); dist[5*Np+n] = fq; // q = 6 - //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m8-jz)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) - 0.16666666*Fz; - fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m8-jz)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) - +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(-3. + (6.*uz)/porosity)); + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m8-jz)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11); dist[6*Np+n] = fq; // q = 7 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jy)+0.025*(m4+m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m16-m17) + 0.08333333333*(Fx+Fy); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jy)+0.025*(m4+m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m16-m17) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(ux + uy))/porosity) + - Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jy)+0.025*(m4+m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m16-m17); dist[7*Np+n] = fq; // q = 8 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jy)-0.025*(m4+m6) +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m17-m16) - 0.08333333333*(Fx+Fy); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jy)-0.025*(m4+m6) +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m17-m16) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uy))/porosity) + Fy*(-3. - (9.*(-ux - uy))/porosity - (3.*uy)/porosity) + - Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jy)-0.025*(m4+m6) +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m17-m16); dist[8*Np+n] = fq; // q = 9 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jy)+0.025*(m4-m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13+0.125*(m16+m17) + 0.08333333333*(Fx-Fy); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jy)+0.025*(m4-m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13+0.125*(m16+m17) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux - uy))/porosity) + Fy*(-3. 
- (9.*(ux - uy))/porosity - (3.*uy)/porosity) + - Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jy)+0.025*(m4-m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13+0.125*(m16+m17); dist[9*Np+n] = fq; // q = 10 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jx)+0.025*(m6-m4)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13-0.125*(m16+m17)- 0.08333333333*(Fx-Fy); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jx)+0.025*(m6-m4)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13-0.125*(m16+m17) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(-ux + uy))/porosity) + - Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jx)+0.025*(m6-m4)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13-0.125*(m16+m17); dist[10*Np+n] = fq; // q = 11 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jz)+0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m18-m16) + 0.08333333333*(Fx+Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jz)+0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m18-m16) - +0.027777777777777776*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(ux + uz))/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jz)+0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m18-m16); dist[11*Np+n] = fq; // q = 12 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jz)-0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m16-m18) - 0.08333333333*(Fx+Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jz)-0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m16-m18) - +0.027777777777777776*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uz))/porosity) + - Fz*(-3. - (9.*(-ux - uz))/porosity - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jz)-0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m16-m18); dist[12*Np+n] = fq; // q = 13 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jz)+0.025*(m4-m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15-0.125*(m16+m18) + 0.08333333333*(Fx-Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jz)+0.025*(m4-m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15-0.125*(m16+m18) - +0.027777777777777776*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux - uz))/porosity) + - Fz*(-3. - (9.*(ux - uz))/porosity - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jz)+0.025*(m4-m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15-0.125*(m16+m18); dist[13*Np+n] = fq; // q= 14 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jx)+0.025*(m8-m4)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15+0.125*(m16+m18) - 0.08333333333*(Fx-Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jx)+0.025*(m8-m4)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15+0.125*(m16+m18) - +0.027777777777777776*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uz))/porosity) + - Fz*(3. 
- (3.*uz)/porosity + (9.*(-ux + uz))/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jx)+0.025*(m8-m4)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15+0.125*(m16+m18); dist[14*Np+n] = fq; // q = 15 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jy+jz)+0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m17-m18) + 0.08333333333*(Fy+Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy+jz)+0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m17-m18) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(uy + uz))/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy+jz)+0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m17-m18); dist[15*Np+n] = fq; // q = 16 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jy+jz)-0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m18-m17)- 0.08333333333*(Fy+Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jy+jz)-0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m18-m17) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy - uz))/porosity) + - Fz*(-3. - (9.*(-uy - uz))/porosity - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jy+jz)-0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m18-m17); dist[16*Np+n] = fq; // q = 17 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jz)+0.025*(m6-m8)-mrt_V6*m9-mrt_V7*m10-0.25*m14+0.125*(m17+m18) + 0.08333333333*(Fy-Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jz)+0.025*(m6-m8)-mrt_V6*m9-mrt_V7*m10-0.25*m14+0.125*(m17+m18) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy - uz))/porosity) + - Fz*(-3. - (9.*(uy - uz))/porosity - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jz)+0.025*(m6-m8)-mrt_V6*m9-mrt_V7*m10-0.25*m14+0.125*(m17+m18); dist[17*Np+n] = fq; // q = 18 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jy)+0.025*(m8-m6)-mrt_V6*m9-mrt_V7*m10-0.25*m14-0.125*(m17+m18) - 0.08333333333*(Fy-Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jy)+0.025*(m8-m6)-mrt_V6*m9-mrt_V7*m10-0.25*m14-0.125*(m17+m18) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(-uy + uz))/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jy)+0.025*(m8-m6)-mrt_V6*m9-mrt_V7*m10-0.25*m14-0.125*(m17+m18); dist[18*Np+n] = fq; //........................................................................ @@ -1258,186 +1202,130 @@ extern "C" void ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double *dis pressure=0.5/porosity*(pressure-0.5*Den*u_mag*u_mag/porosity); //..............carry out relaxation process............................................... 
-// m1 = m1 + rlx_setA*((-30*Den+19*(jx*jx+jy*jy+jz*jz)/Den/porosity + 57*pressure*porosity) - m1); -// m2 = m2 + rlx_setA*((12*Den - 5.5*(jx*jx+jy*jy+jz*jz)/Den/porosity-27*pressure*porosity) - m2); -// m4 = m4 + rlx_setB*((-0.6666666666666666*jx) - m4); -// m6 = m6 + rlx_setB*((-0.6666666666666666*jy) - m6); -// m8 = m8 + rlx_setB*((-0.6666666666666666*jz) - m8); -// m9 = m9 + rlx_setA*(((2*jx*jx-jy*jy-jz*jz)/Den/porosity) - m9); -// m10 = m10 + rlx_setA*(-0.5*((2*jx*jx-jy*jy-jz*jz)/Den/porosity)- m10); -// m11 = m11 + rlx_setA*(((jy*jy-jz*jz)/Den/porosity) - m11); -// m12 = m12 + rlx_setA*(-0.5*((jy*jy-jz*jz)/Den/porosity)- m12); -// m13 = m13 + rlx_setA*((jx*jy/Den/porosity) - m13); -// m14 = m14 + rlx_setA*((jy*jz/Den/porosity) - m14); -// m15 = m15 + rlx_setA*((jx*jz/Den/porosity) - m15); -// m16 = m16 + rlx_setB*( - m16); -// m17 = m17 + rlx_setB*( - m17); -// m18 = m18 + rlx_setB*( - m18); - //....................................................................................................... - - //..............carry out relaxation process............................................... - m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1); - m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2); - m4 = m4 + rlx_setB*((-0.6666666666666666*ux*Den) - m4); - m6 = m6 + rlx_setB*((-0.6666666666666666*uy*Den) - m6); - m8 = m8 + rlx_setB*((-0.6666666666666666*uz*Den) - m8); - m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9); - m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10); - m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11); - m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12); - m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13); - m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14); - m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15); + m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1) + + (1-0.5*rlx_setA)*38*(Fx*ux+Fy*uy+Fz*uz)/porosity; + m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2) + + (1-0.5*rlx_setA)*11*(-Fx*ux-Fy*uy-Fz*uz)/porosity; + jx = jx + Fx; + m4 = m4 + rlx_setB*((-0.6666666666666666*ux*Den) - m4) + + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx); + jy = jy + Fy; + m6 = m6 + rlx_setB*((-0.6666666666666666*uy*Den) - m6) + + (1-0.5*rlx_setB)*(-0.6666666666666666*Fy); + jz = jz + Fz; + m8 = m8 + rlx_setB*((-0.6666666666666666*uz*Den) - m8) + + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz); + m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9) + + (1-0.5*rlx_setA)*(4*Fx*ux-2*Fy*uy-2*Fz*uz)/porosity; + m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10) + + (1-0.5*rlx_setA)*(-2*Fx*ux+Fy*uy+Fz*uz)/porosity; + m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11) + + (1-0.5*rlx_setA)*(2*Fy*uy-2*Fz*uz)/porosity; + m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12) + + (1-0.5*rlx_setA)*(-Fy*uy+Fz*uz)/porosity; + m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13) + + (1-0.5*rlx_setA)*(Fy*ux+Fx*uy)/porosity; + m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14) + + (1-0.5*rlx_setA)*(Fz*uy+Fy*uz)/porosity; + m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15) + + (1-0.5*rlx_setA)*(Fz*ux+Fx*uz)/porosity; m16 = m16 + rlx_setB*( - m16); m17 = m17 + rlx_setB*( - m17); m18 = m18 + rlx_setB*( - m18); //....................................................................................................... 
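For reference, the rewritten relaxation above amounts to the following moment-space update. Symbols are mapped onto the code's variables (ε = porosity, K = perm, ρ0 = Den, μ = mu, G = (Gx,Gy,Gz) the applied body force, s_k the relaxation rates rlx_setA / rlx_setB), and u, F are the porosity-corrected velocity and total (body plus Darcy/Forchheimer drag) force computed earlier in the kernel; the same expressions appear explicitly in the GPU version later in this series, and the scheme is the Guo-type porous-media forcing cited in the kernel comments (PRE 66, 036304 (2002)):

\[
\mathbf{v} = \frac{\mathbf{j}}{\rho_0} + \frac{\epsilon}{2}\,\mathbf{G}, \qquad
\mathbf{u} = \frac{\mathbf{v}}{c_0 + \sqrt{c_0^2 + c_1\,|\mathbf{v}|}}, \qquad
c_0 = \frac{1}{2}\Big(1 + \frac{\epsilon\,\mu}{2K}\Big), \quad
c_1 = \frac{\epsilon\,F_\epsilon}{2\sqrt{K}}, \quad
F_\epsilon = \frac{1.75}{\sqrt{150\,\epsilon^3}}
\]
\[
\mathbf{F} = \rho_0\Big(-\frac{\epsilon\,\mu}{K}\,\mathbf{u} - \frac{\epsilon\,F_\epsilon}{\sqrt{K}}\,|\mathbf{u}|\,\mathbf{u} + \epsilon\,\mathbf{G}\Big), \qquad
m_k \leftarrow m_k + s_k\big(m_k^{\mathrm{eq}}(\rho_0,p,\mathbf{u},\epsilon) - m_k\big) + \Big(1 - \frac{s_k}{2}\Big)\hat{F}_k, \qquad
\mathbf{j} \leftarrow \mathbf{j} + \mathbf{F}
\]

Here \(\hat{F}_k\) is the moment-space projection of the forcing, e.g. \(\hat{F}_1 = 38\,(\mathbf{F}\cdot\mathbf{u})/\epsilon\), \(\hat{F}_4 = -\tfrac{2}{3}F_x\), \(\hat{F}_{13} = (F_x u_y + F_y u_x)/\epsilon\), matching the terms added above. Because the half-force correction is now carried by \(\mathbf{u}\) and \(\hat{F}_k\), the old explicit \(0.5\,\mathbf{F}\) momentum shift and the per-direction force terms in the inverse transformation become redundant, which is why they are removed below.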
- - jx+=0.5*Fx;//There is no collision for momentum, but they must be updated subject to the body force - jy+=0.5*Fy; - jz+=0.5*Fz; + //.................inverse transformation...................................................... // q=0 - //fq = mrt_V1*rho-mrt_V2*m1+mrt_V3*m2; - fq = mrt_V1*Den-mrt_V2*m1+mrt_V3*m2 - + 0.3333333333333333*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den-mrt_V2*m1+mrt_V3*m2; dist[n] = fq; // q = 1 - //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jx-m4)+mrt_V6*(m9-m10) + 0.16666666*Fx; - fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jx-m4)+mrt_V6*(m9-m10) - +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jx-m4)+mrt_V6*(m9-m10); nread = neighborList[n+Np]; dist[nread] = fq; // q=2 - //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m4-jx)+mrt_V6*(m9-m10) - 0.16666666*Fx; - fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m4-jx)+mrt_V6*(m9-m10) - +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(-3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m4-jx)+mrt_V6*(m9-m10); nread = neighborList[n]; dist[nread] = fq; // q = 3 - //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jy-m6)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) + 0.16666666*Fy; - fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jy-m6)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) - +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jy-m6)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12); nread = neighborList[n+3*Np]; dist[nread] = fq; // q = 4 - //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m6-jy)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) - 0.16666666*Fy; - fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m6-jy)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12) - +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m6-jy)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12); nread = neighborList[n+2*Np]; dist[nread] = fq; // q = 5 - //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(jz-m8)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) + 0.16666666*Fz; - fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jz-m8)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) - +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(3. + (6.*uz)/porosity)); + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jz-m8)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11); nread = neighborList[n+5*Np]; dist[nread] = fq; // q = 6 - //fq = mrt_V1*rho-mrt_V4*m1-mrt_V5*m2+0.1*(m8-jz)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) - 0.16666666*Fz; - fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m8-jz)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11) - +0.05555555555555555*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(-3. + (6.*uz)/porosity)); + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m8-jz)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11); nread = neighborList[n+4*Np]; dist[nread] = fq; // q = 7 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jy)+0.025*(m4+m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m16-m17) + 0.08333333333*(Fx+Fy); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jy)+0.025*(m4+m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m16-m17) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux + uy))/porosity) + Fy*(3. 
- (3.*uy)/porosity + (9.*(ux + uy))/porosity) + - Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jy)+0.025*(m4+m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m16-m17); nread = neighborList[n+7*Np]; dist[nread] = fq; // q = 8 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jy)-0.025*(m4+m6) +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m17-m16) - 0.08333333333*(Fx+Fy); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jy)-0.025*(m4+m6) +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m17-m16) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uy))/porosity) + Fy*(-3. - (9.*(-ux - uy))/porosity - (3.*uy)/porosity) + - Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jy)-0.025*(m4+m6) +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m17-m16); nread = neighborList[n+6*Np]; dist[nread] = fq; // q = 9 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jy)+0.025*(m4-m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13+0.125*(m16+m17) + 0.08333333333*(Fx-Fy); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jy)+0.025*(m4-m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13+0.125*(m16+m17) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux - uy))/porosity) + Fy*(-3. - (9.*(ux - uy))/porosity - (3.*uy)/porosity) + - Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jy)+0.025*(m4-m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13+0.125*(m16+m17); nread = neighborList[n+9*Np]; dist[nread] = fq; // q = 10 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jx)+0.025*(m6-m4)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13-0.125*(m16+m17)- 0.08333333333*(Fx-Fy); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jx)+0.025*(m6-m4)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13-0.125*(m16+m17) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(-ux + uy))/porosity) + - Fz*(0. - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jx)+0.025*(m6-m4)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13-0.125*(m16+m17); nread = neighborList[n+8*Np]; dist[nread] = fq; // q = 11 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jz)+0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m18-m16) + 0.08333333333*(Fx+Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jz)+0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m18-m16) - +0.027777777777777776*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(ux + uz))/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jz)+0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m18-m16); nread = neighborList[n+11*Np]; dist[nread] = fq; // q = 12 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jz)-0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m16-m18) - 0.08333333333*(Fx+Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jz)-0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m16-m18) - +0.027777777777777776*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uz))/porosity) + - Fz*(-3. 
- (9.*(-ux - uz))/porosity - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jz)-0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m16-m18); nread = neighborList[n+10*Np]; dist[nread]= fq; // q = 13 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jz)+0.025*(m4-m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15-0.125*(m16+m18) + 0.08333333333*(Fx-Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jz)+0.025*(m4-m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15-0.125*(m16+m18) - +0.027777777777777776*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux - uz))/porosity) + - Fz*(-3. - (9.*(ux - uz))/porosity - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jz)+0.025*(m4-m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15-0.125*(m16+m18); nread = neighborList[n+13*Np]; dist[nread] = fq; // q= 14 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jx)+0.025*(m8-m4)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15+0.125*(m16+m18) - 0.08333333333*(Fx-Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jx)+0.025*(m8-m4)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15+0.125*(m16+m18) - +0.027777777777777776*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(-ux + uz))/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jx)+0.025*(m8-m4)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15+0.125*(m16+m18); nread = neighborList[n+12*Np]; dist[nread] = fq; // q = 15 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jy+jz)+0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m17-m18) + 0.08333333333*(Fy+Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy+jz)+0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m17-m18) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(uy + uz))/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy+jz)+0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m17-m18); nread = neighborList[n+15*Np]; dist[nread] = fq; // q = 16 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2-0.1*(jy+jz)-0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m18-m17)- 0.08333333333*(Fy+Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jy+jz)-0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m18-m17) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy - uz))/porosity) + - Fz*(-3. - (9.*(-uy - uz))/porosity - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jy+jz)-0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m18-m17); nread = neighborList[n+14*Np]; dist[nread] = fq; // q = 17 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jz)+0.025*(m6-m8)-mrt_V6*m9-mrt_V7*m10-0.25*m14+0.125*(m17+m18) + 0.08333333333*(Fy-Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jz)+0.025*(m6-m8)-mrt_V6*m9-mrt_V7*m10-0.25*m14+0.125*(m17+m18) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy - uz))/porosity) + - Fz*(-3. 
- (9.*(uy - uz))/porosity - (3.*uz)/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jz)+0.025*(m6-m8)-mrt_V6*m9-mrt_V7*m10-0.25*m14+0.125*(m17+m18); nread = neighborList[n+17*Np]; dist[nread] = fq; // q = 18 - //fq = mrt_V1*rho+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jy)+0.025*(m8-m6)-mrt_V6*m9-mrt_V7*m10-0.25*m14-0.125*(m17+m18) - 0.08333333333*(Fy-Fz); - fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jy)+0.025*(m8-m6)-mrt_V6*m9-mrt_V7*m10-0.25*m14-0.125*(m17+m18) - +0.027777777777777776*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(-uy + uz))/porosity)); + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jy)+0.025*(m8-m6)-mrt_V6*m9-mrt_V7*m10-0.25*m14-0.125*(m17+m18); nread = neighborList[n+16*Np]; dist[nread] = fq; //........................................................................ From 060bee7b4495324841b51d205e701fd05fb152fe Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Sat, 18 Jan 2020 22:52:04 -0500 Subject: [PATCH 019/121] GPU version of incompressible greysacle MRT model is ready --- cpu/Greyscale.cpp | 2 - gpu/Greyscale.cu | 1313 +++++++++++++++++++++++++++++++-------------- 2 files changed, 920 insertions(+), 395 deletions(-) diff --git a/cpu/Greyscale.cpp b/cpu/Greyscale.cpp index 95cf516b..f2be769e 100644 --- a/cpu/Greyscale.cpp +++ b/cpu/Greyscale.cpp @@ -391,7 +391,6 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int double jx,jy,jz; // non-conserved moments double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18; - double m3,m5,m7; double fq; //double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18; double GeoFun;//geometric function from Guo's PRE 66, 036304 (2002) @@ -851,7 +850,6 @@ extern "C" void ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double *dis double jx,jy,jz; // non-conserved moments double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18; - double m3,m5,m7; double fq; //double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18; double GeoFun;//geometric function from Guo's PRE 66, 036304 (2002) diff --git a/gpu/Greyscale.cu b/gpu/Greyscale.cu index fdb0a462..5b8273fe 100644 --- a/gpu/Greyscale.cu +++ b/gpu/Greyscale.cu @@ -396,14 +396,17 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist __global__ void dvc_ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int finish, int Np, double rlx, double Gx, double Gy, double Gz, double *Poros,double *Perm, double *Velocity, double Den){ + int n; double vx,vy,vz,v_mag; double ux,uy,uz,u_mag; + double pressure;//defined for this incompressible model // conserved momemnts - double rho,jx,jy,jz; + double jx,jy,jz; // non-conserved moments double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18; - double m3,m5,m7; + double fq; + //double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18; double GeoFun;//geometric function from Guo's PRE 66, 036304 (2002) double porosity; double perm;//voxel permeability @@ -413,6 +416,20 @@ __global__ void dvc_ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, double rlx_setA = rlx; double rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA); + const double mrt_V1=0.05263157894736842; + const double mrt_V2=0.012531328320802; + const double mrt_V3=0.04761904761904762; + const double mrt_V4=0.004594820384294068; + const double mrt_V5=0.01587301587301587; + const double mrt_V6=0.0555555555555555555555555; + const double mrt_V7=0.02777777777777778; + 
const double mrt_V8=0.08333333333333333; + const double mrt_V9=0.003341687552213868; + const double mrt_V10=0.003968253968253968; + const double mrt_V11=0.01388888888888889; + const double mrt_V12=0.04166666666666666; + + int S = Np/NBLOCKS/NTHREADS + 1; for (int s=0; s 10Np => odd part of dist) + fq = dist[nread]; // reading the f1 data into register fq + pressure = fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jx = fq; + m4 = -4.0*fq; + m9 = 2.0*fq; + m10 = -4.0*fq; + + // q=2 + nread = neighborList[n+Np]; // neighbor 1 ( < 10Np => even part of dist) + fq = dist[nread]; // reading the f2 data into register fq + pressure += fq; + m1 -= 11.0*(fq); + m2 -= 4.0*(fq); + jx -= fq; + m4 += 4.0*(fq); + m9 += 2.0*(fq); + m10 -= 4.0*(fq); + + // q=3 + nread = neighborList[n+2*Np]; // neighbor 4 + fq = dist[nread]; + pressure += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jy = fq; + m6 = -4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 = fq; + m12 = -2.0*fq; + + // q = 4 + nread = neighborList[n+3*Np]; // neighbor 3 + fq = dist[nread]; + pressure += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jy -= fq; + m6 += 4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 += fq; + m12 -= 2.0*fq; + + // q=5 + nread = neighborList[n+4*Np]; + fq = dist[nread]; + pressure += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jz = fq; + m8 = -4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 -= fq; + m12 += 2.0*fq; + + // q = 6 + nread = neighborList[n+5*Np]; + fq = dist[nread]; + pressure += fq; + m1 -= 11.0*fq; + m2 -= 4.0*fq; + jz -= fq; + m8 += 4.0*fq; + m9 -= fq; + m10 += 2.0*fq; + m11 -= fq; + m12 += 2.0*fq; + + // q=7 + nread = neighborList[n+6*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jy += fq; + m6 += fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 = fq; + m16 = fq; + m17 = -fq; + + // q = 8 + nread = neighborList[n+7*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jy -= fq; + m6 -= fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 += fq; + m16 -= fq; + m17 += fq; + + // q=9 + nread = neighborList[n+8*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jy -= fq; + m6 -= fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 -= fq; + m16 += fq; + m17 += fq; + + // q = 10 + nread = neighborList[n+9*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jy += fq; + m6 += fq; + m9 += fq; + m10 += fq; + m11 += fq; + m12 += fq; + m13 -= fq; + m16 -= fq; + m17 -= fq; + + // q=11 + nread = neighborList[n+10*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jz += fq; + m8 += fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 = fq; + m16 -= fq; + m18 = fq; + + // q=12 + nread = neighborList[n+11*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jz -= fq; + m8 -= fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 += fq; + m16 += fq; + m18 -= fq; + + // q=13 + nread = neighborList[n+12*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx += fq; + m4 += fq; + jz -= fq; + m8 -= fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 -= fq; + m16 -= fq; + m18 -= fq; + + // q=14 + nread = neighborList[n+13*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jx -= fq; + m4 -= fq; + jz += fq; + m8 += fq; + m9 += fq; + m10 += fq; + m11 -= fq; + m12 -= fq; + m15 -= fq; + m16 += fq; + m18 += fq; + 
+ // q=15 + nread = neighborList[n+14*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jy += fq; + m6 += fq; + jz += fq; + m8 += fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 = fq; + m17 += fq; + m18 -= fq; + + // q=16 + nread = neighborList[n+15*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jy -= fq; + m6 -= fq; + jz -= fq; + m8 -= fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 += fq; + m17 -= fq; + m18 += fq; + + // q=17 + nread = neighborList[n+16*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jy += fq; + m6 += fq; + jz -= fq; + m8 -= fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 -= fq; + m17 += fq; + m18 += fq; + + // q=18 + nread = neighborList[n+17*Np]; + fq = dist[nread]; + pressure += fq; + m1 += 8.0*fq; + m2 += fq; + jy -= fq; + m6 -= fq; + jz += fq; + m8 += fq; + m9 -= 2.0*fq; + m10 -= 2.0*fq; + m14 -= fq; + m17 -= fq; + m18 -= fq; + //---------------------------------------------------------------------// + + porosity = Poros[n]; + perm = Perm[n]; + + c0 = 0.5*(1.0+porosity*0.5*mu/perm); + if (porosity==1.0) c0 = 0.5;//i.e. apparent pore nodes + GeoFun = 1.75/sqrt(150.0*porosity*porosity*porosity); + c1 = porosity*0.5*GeoFun/sqrt(perm); + if (porosity==1.0) c1 = 0.0;//i.e. apparent pore nodes + + vx = jx/Den+0.5*porosity*Gx; + vy = jy/Den+0.5*porosity*Gy; + vz = jz/Den+0.5*porosity*Gz; + v_mag=sqrt(vx*vx+vy*vy+vz*vz); + ux = vx/(c0+sqrt(c0*c0+c1*v_mag)); + uy = vy/(c0+sqrt(c0*c0+c1*v_mag)); + uz = vz/(c0+sqrt(c0*c0+c1*v_mag)); + u_mag=sqrt(ux*ux+uy*uy+uz*uz); + + //Update the total force to include linear (Darcy) and nonlinear (Forchheimer) drags due to the porous medium + Fx = Den*(-porosity*mu/perm*ux - porosity*GeoFun/sqrt(perm)*u_mag*ux + porosity*Gx); + Fy = Den*(-porosity*mu/perm*uy - porosity*GeoFun/sqrt(perm)*u_mag*uy + porosity*Gy); + Fz = Den*(-porosity*mu/perm*uz - porosity*GeoFun/sqrt(perm)*u_mag*uz + porosity*Gz); + if (porosity==1.0){ + Fx=Den*Gx; + Fy=Den*Gy; + Fz=Den*Gz; + } + + //Calculate pressure for Incompressible-MRT model + pressure=0.5/porosity*(pressure-0.5*Den*u_mag*u_mag/porosity); + + //..............carry out relaxation process............................................... 
+ m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1) + + (1-0.5*rlx_setA)*38*(Fx*ux+Fy*uy+Fz*uz)/porosity; + m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2) + + (1-0.5*rlx_setA)*11*(-Fx*ux-Fy*uy-Fz*uz)/porosity; + jx = jx + Fx; + m4 = m4 + rlx_setB*((-0.6666666666666666*ux*Den) - m4) + + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx); + jy = jy + Fy; + m6 = m6 + rlx_setB*((-0.6666666666666666*uy*Den) - m6) + + (1-0.5*rlx_setB)*(-0.6666666666666666*Fy); + jz = jz + Fz; + m8 = m8 + rlx_setB*((-0.6666666666666666*uz*Den) - m8) + + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz); + m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9) + + (1-0.5*rlx_setA)*(4*Fx*ux-2*Fy*uy-2*Fz*uz)/porosity; + m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10) + + (1-0.5*rlx_setA)*(-2*Fx*ux+Fy*uy+Fz*uz)/porosity; + m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11) + + (1-0.5*rlx_setA)*(2*Fy*uy-2*Fz*uz)/porosity; + m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12) + + (1-0.5*rlx_setA)*(-Fy*uy+Fz*uz)/porosity; + m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13) + + (1-0.5*rlx_setA)*(Fy*ux+Fx*uy)/porosity; + m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14) + + (1-0.5*rlx_setA)*(Fz*uy+Fy*uz)/porosity; + m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15) + + (1-0.5*rlx_setA)*(Fz*ux+Fx*uz)/porosity; + m16 = m16 + rlx_setB*( - m16); + m17 = m17 + rlx_setB*( - m17); + m18 = m18 + rlx_setB*( - m18); + //....................................................................................................... + + //.................inverse transformation...................................................... + // q=0 + fq = mrt_V1*Den-mrt_V2*m1+mrt_V3*m2; + dist[n] = fq; + + // q = 1 + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jx-m4)+mrt_V6*(m9-m10); + nread = neighborList[n+Np]; + dist[nread] = fq; + + // q=2 + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m4-jx)+mrt_V6*(m9-m10); + nread = neighborList[n]; + dist[nread] = fq; + + // q = 3 + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jy-m6)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12); + nread = neighborList[n+3*Np]; + dist[nread] = fq; + + // q = 4 + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m6-jy)+mrt_V7*(m10-m9)+mrt_V8*(m11-m12); + nread = neighborList[n+2*Np]; + dist[nread] = fq; + + // q = 5 + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(jz-m8)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11); + nread = neighborList[n+5*Np]; + dist[nread] = fq; + + // q = 6 + fq = mrt_V1*Den-mrt_V4*m1-mrt_V5*m2+0.1*(m8-jz)+mrt_V7*(m10-m9)+mrt_V8*(m12-m11); + nread = neighborList[n+4*Np]; + dist[nread] = fq; + + // q = 7 + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jy)+0.025*(m4+m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m16-m17); + nread = neighborList[n+7*Np]; + dist[nread] = fq; + + // q = 8 + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jy)-0.025*(m4+m6) +mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12+0.25*m13+0.125*(m17-m16); + nread = neighborList[n+6*Np]; + dist[nread] = fq; + + // q = 9 + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jy)+0.025*(m4-m6)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13+0.125*(m16+m17); + nread = neighborList[n+9*Np]; + dist[nread] = fq; + + // q = 10 + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jx)+0.025*(m6-m4)+mrt_V7*m9+mrt_V11*m10+mrt_V8*m11+mrt_V12*m12-0.25*m13-0.125*(m16+m17); + nread = neighborList[n+8*Np]; + dist[nread] = fq; + + // q = 11 + fq = 
mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx+jz)+0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m18-m16); + nread = neighborList[n+11*Np]; + dist[nread] = fq; + + // q = 12 + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jx+jz)-0.025*(m4+m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12+0.25*m15+0.125*(m16-m18); + nread = neighborList[n+10*Np]; + dist[nread]= fq; + + // q = 13 + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jx-jz)+0.025*(m4-m8)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15-0.125*(m16+m18); + nread = neighborList[n+13*Np]; + dist[nread] = fq; + + // q= 14 + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jx)+0.025*(m8-m4)+mrt_V7*m9+mrt_V11*m10-mrt_V8*m11-mrt_V12*m12-0.25*m15+0.125*(m16+m18); + nread = neighborList[n+12*Np]; + dist[nread] = fq; + + // q = 15 + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy+jz)+0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m17-m18); + nread = neighborList[n+15*Np]; + dist[nread] = fq; + + // q = 16 + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2-0.1*(jy+jz)-0.025*(m6+m8)-mrt_V6*m9-mrt_V7*m10+0.25*m14+0.125*(m18-m17); + nread = neighborList[n+14*Np]; + dist[nread] = fq; + + // q = 17 + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jy-jz)+0.025*(m6-m8)-mrt_V6*m9-mrt_V7*m10-0.25*m14+0.125*(m17+m18); + nread = neighborList[n+17*Np]; + dist[nread] = fq; + + // q = 18 + fq = mrt_V1*Den+mrt_V9*m1+mrt_V10*m2+0.1*(jz-jy)+0.025*(m8-m6)-mrt_V6*m9-mrt_V7*m10-0.25*m14-0.125*(m17+m18); + nread = neighborList[n+16*Np]; + dist[nread] = fq; + //........................................................................ + + //Update velocity on device + Velocity[0*Np+n] = ux; + Velocity[1*Np+n] = uy; + Velocity[2*Np+n] = uz; + + } + } +} + extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity){ @@ -873,6 +1378,7 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finis } extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity){ + dvc_ScaLBL_D3Q19_AAodd_Greyscale<<>>(neighborList,dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity); cudaError_t err = cudaGetLastError(); @@ -880,3 +1386,24 @@ extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, in printf("CUDA error in ScaLBL_D3Q19_AAodd_Greyscale: %s \n",cudaGetErrorString(err)); } } + +extern "C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity,double Den){ + + dvc_ScaLBL_D3Q19_AAeven_Greyscale_IMRT<<>>(dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity,Den); + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_AAeven_Greyscale_IMRT: %s \n",cudaGetErrorString(err)); + } +} + +extern "C" void ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity,double Den){ + + dvc_ScaLBL_D3Q19_AAodd_Greyscale_IMRT<<>>(neighborList,dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity,Den); + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_AAodd_Greyscale_IMRT: %s \n",cudaGetErrorString(err)); + } +} + From 
783d7ff7b2cf9027ab9f26d48fb5e5595c18fa81 Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Mon, 20 Jan 2020 13:17:03 -0500 Subject: [PATCH 020/121] remove the reserved labels (i.e. Label=1,2) which store some pre-defined voxel perm values that may accidentally overwrite use-defined labels --- models/GreyscaleModel.cpp | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/models/GreyscaleModel.cpp b/models/GreyscaleModel.cpp index 1cdae815..ac44c6d3 100644 --- a/models/GreyscaleModel.cpp +++ b/models/GreyscaleModel.cpp @@ -194,10 +194,6 @@ void ScaLBL_GreyscaleModel::AssignComponentLabels(double *Porosity, double *Perm //Mask->id[n] = 0; // set mask to zero since this is an immobile component } } - // fluid labels are reserved / negative labels are immobile - if (VALUE == 1) POROSITY=1.0; - else if (VALUE == 2) POROSITY=1.0; - else if (VALUE < 1) POROSITY = 0.0; int idx = Map(i,j,k); if (!(idx < 0)){ if (POROSITY<=0.0){ @@ -228,19 +224,13 @@ void ScaLBL_GreyscaleModel::AssignComponentLabels(double *Porosity, double *Perm //Mask->id[n] = 0; // set mask to zero since this is an immobile component } } - // Permeability of fluid labels are reserved - // NOTE: the voxel permeability of apparent pore nodes should be infinity - // TODO: Need to revise the PERMEABILITY of nodes whose VALUE=1 and 2 - if (VALUE == 1) PERMEABILITY=1.0; - else if (VALUE == 2) PERMEABILITY=1.0; - else if (VALUE < 1) PERMEABILITY = 0.0; int idx = Map(i,j,k); if (!(idx < 0)){ if (PERMEABILITY<=0.0){ ERROR("Error: Permeability for grey voxel must be > 0.0 ! \n"); } else{ - Permeability[idx] = PERMEABILITY; + Permeability[idx] = PERMEABILITY/Dm->voxel_length/Dm->voxel_length; } } } @@ -254,13 +244,15 @@ void ScaLBL_GreyscaleModel::AssignComponentLabels(double *Porosity, double *Perm for (int idx=0; idxComm, label_count[idx]); if (rank==0){ + printf("Image resolution: %.5g [um/voxel]\n",Dm->voxel_length); printf("Component labels: %lu \n",NLABELS); for (unsigned int idx=0; idxvoxel_length/Dm->voxel_length,volume_fraction); } } From fb33408a956bffbca4c20f8b6fc77f527686e1ee Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Mon, 20 Jan 2020 19:29:03 -0500 Subject: [PATCH 021/121] 1.disable debug write-out; 2. 
add a weighted porosity --- models/GreyscaleModel.cpp | 17 +++++++++++++---- models/GreyscaleModel.h | 1 + tests/lbpm_greyscale_simulator.cpp | 2 +- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/models/GreyscaleModel.cpp b/models/GreyscaleModel.cpp index ac44c6d3..b1f6603e 100644 --- a/models/GreyscaleModel.cpp +++ b/models/GreyscaleModel.cpp @@ -8,7 +8,7 @@ color lattice boltzmann model #include ScaLBL_GreyscaleModel::ScaLBL_GreyscaleModel(int RANK, int NP, MPI_Comm COMM): -rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tau(0),Den(0),Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0), +rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tau(0),Den(0),Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0),GreyPorosity(0), Nx(0),Ny(0),Nz(0),N(0),Np(0),nprocx(0),nprocy(0),nprocz(0),BoundaryCondition(0),Lx(0),Ly(0),Lz(0),comm(COMM) { SignDist.resize(Nx,Ny,Nz); @@ -243,6 +243,13 @@ void ScaLBL_GreyscaleModel::AssignComponentLabels(double *Porosity, double *Perm for (int idx=0; idxComm, label_count[idx]); + //Initialize a weighted porosity after considering grey voxels + GreyPorosity=0.0; + for (unsigned int idx=0; idxvoxel_length); printf("Component labels: %lu \n",NLABELS); @@ -251,11 +258,12 @@ void ScaLBL_GreyscaleModel::AssignComponentLabels(double *Porosity, double *Perm POROSITY=PorosityList[idx]; PERMEABILITY=PermeabilityList[idx]; double volume_fraction = double(label_count_global[idx])/double((Nx-2)*(Ny-2)*(Nz-2)*nprocs); - printf(" label=%d, porosity=%.3g, permeability=%.3g [um^2] (=%.3g [voxel^2]), volume fraction=%.3g\n", + printf(" label=%d: porosity=%.3g, permeability=%.3g [um^2] (=%.3g [voxel^2]), volume fraction=%.3g\n", VALUE,POROSITY,PERMEABILITY,PERMEABILITY/Dm->voxel_length/Dm->voxel_length,volume_fraction); + printf(" effective porosity=%.3g\n",volume_fraction*POROSITY); } + printf("The weighted porosity, considering both open and grey voxels, is %.3g\n",GreyPorosity); } - } @@ -497,7 +505,8 @@ void ScaLBL_GreyscaleModel::Run(){ Hs=sumReduce( Dm->Comm, Hs); Xs=sumReduce( Dm->Comm, Xs); double h = Dm->voxel_length; - double absperm = h*h*mu*Mask->Porosity()*flow_rate / force_mag; + //double absperm = h*h*mu*Mask->Porosity()*flow_rate / force_mag; + double absperm = h*h*mu*GreyPorosity*flow_rate / force_mag; if (rank==0){ printf(" AbsPerm = %.5g [micron^2]\n",absperm); diff --git a/models/GreyscaleModel.h b/models/GreyscaleModel.h index ac939aed..b427218b 100644 --- a/models/GreyscaleModel.h +++ b/models/GreyscaleModel.h @@ -41,6 +41,7 @@ public: double Fx,Fy,Fz,flux; double din,dout; double dp;//solid particle diameter, unit in voxel + double GreyPorosity; int Nx,Ny,Nz,N,Np; int rank,nprocx,nprocy,nprocz,nprocs; diff --git a/tests/lbpm_greyscale_simulator.cpp b/tests/lbpm_greyscale_simulator.cpp index 61322d6d..ef253cd7 100644 --- a/tests/lbpm_greyscale_simulator.cpp +++ b/tests/lbpm_greyscale_simulator.cpp @@ -55,7 +55,7 @@ int main(int argc, char **argv) Greyscale.Initialize(); // initializing the model will set initial conditions for variables Greyscale.Run(); Greyscale.VelocityField(); - Greyscale.WriteDebug(); + //Greyscale.WriteDebug(); } // **************************************************** MPI_Barrier(comm); From e3afe1eba80700d65b2b00e8ab9b65354ff201a5 Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Tue, 21 Jan 2020 14:31:33 -0500 Subject: [PATCH 022/121] add a restart utitlity to the greyscale simulator --- models/GreyscaleModel.cpp | 95 ++++++++++++++++++++---------- tests/lbpm_greyscale_simulator.cpp | 2 +- 2 files changed, 65 insertions(+), 
32 deletions(-) diff --git a/models/GreyscaleModel.cpp b/models/GreyscaleModel.cpp index b1f6603e..06077780 100644 --- a/models/GreyscaleModel.cpp +++ b/models/GreyscaleModel.cpp @@ -1,5 +1,5 @@ /* -color lattice boltzmann model +Greyscale lattice boltzmann model */ #include "models/GreyscaleModel.h" #include "analysis/distance.h" @@ -7,6 +7,12 @@ color lattice boltzmann model #include #include +template +void DeleteArray( const TYPE *p ) +{ + delete [] p; +} + ScaLBL_GreyscaleModel::ScaLBL_GreyscaleModel(int RANK, int NP, MPI_Comm COMM): rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tau(0),Den(0),Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0),GreyPorosity(0), Nx(0),Ny(0),Nz(0),N(0),Np(0),nprocx(0),nprocy(0),nprocz(0),BoundaryCondition(0),Lx(0),Ly(0),Lz(0),comm(COMM) @@ -117,6 +123,7 @@ void ScaLBL_GreyscaleModel::ReadInput(){ Mask->Decomp(Filename); } else{ + if (rank==0) printf("Filename of input image is not found, reading ID.0* instead."); Mask->ReadIDs(); } for (int i=0; iid[i]; // save what was read @@ -357,39 +364,23 @@ void ScaLBL_GreyscaleModel::Create(){ void ScaLBL_GreyscaleModel::Initialize(){ - if (rank==0) printf ("Initializing distributions \n"); ScaLBL_D3Q19_Init(fq, Np); - /* - * This function initializes model - */ + if (Restart == true){ if (rank==0){ - printf("Reading restart file! \n"); + printf("Initializing distributions from Restart! \n"); } - // Read in the restart file to CPU buffers - int *TmpMap; - TmpMap = new int[Np]; - - double *cDist; - cDist = new double[19*Np]; - ScaLBL_CopyToHost(TmpMap, dvcMap, Np*sizeof(int)); - - ifstream File(LocalRestartFile,ios::binary); - int idx; - double value; - for (int n=0; n cfq; + cfq = std::shared_ptr(new double[19*Np],DeleteArray); + FILE *File; + File=fopen(LocalRestartFile,"rb"); + fread(cfq.get(),sizeof(double),19*Np,File); + fclose(File); + // Copy the restart data to the GPU - ScaLBL_CopyToDevice(fq,cDist,19*Np*sizeof(double)); + ScaLBL_CopyToDevice(fq,cfq.get(),19*Np*sizeof(double)); ScaLBL_DeviceBarrier(); MPI_Barrier(comm); @@ -400,6 +391,21 @@ void ScaLBL_GreyscaleModel::Run(){ int nprocs=nprocx*nprocy*nprocz; const RankInfoStruct rank_info(rank,nprocx,nprocy,nprocz); + int analysis_interval = 1000; // number of timesteps in between in situ analysis + int visualization_interval = 1000; + int restart_interval = 10000; // number of timesteps in between in saving distributions for restart + if (analysis_db->keyExists( "analysis_interval" )){ + analysis_interval = analysis_db->getScalar( "analysis_interval" ); + } + if (analysis_db->keyExists( "visualization_interval" )){ + visualization_interval = analysis_db->getScalar( "visualization_interval" ); + } + if (analysis_db->keyExists( "restart_interval" )){ + restart_interval = analysis_db->getScalar( "restart_interval" ); + } + if (greyscale_db->keyExists( "timestep" )){ + timestep = greyscale_db->getScalar( "timestep" ); + } if (rank==0){ printf("********************************************************\n"); @@ -418,8 +424,7 @@ void ScaLBL_GreyscaleModel::Run(){ //************ MAIN ITERATION LOOP ***************************************/ PROFILE_START("Loop"); - //std::shared_ptr analysis_db; - timestep=0; + auto current_db = db->cloneDatabase(); double rlx = 1.0/tau; double error = 1.0; double flow_rate_previous = 0.0; @@ -443,7 +448,7 @@ void ScaLBL_GreyscaleModel::Run(){ ScaLBL_DeviceBarrier(); MPI_Barrier(comm); //************************************************************************/ - if (timestep%1000==0){ + if (timestep%analysis_interval==0){ 
//ScaLBL_D3Q19_Momentum(fq,Velocity, Np); //ScaLBL_DeviceBarrier(); MPI_Barrier(comm); ScaLBL_Comm->RegularLayout(Map,&Velocity[0],Velocity_x); @@ -518,7 +523,7 @@ void ScaLBL_GreyscaleModel::Run(){ WriteHeader=true; log_file = fopen("Permeability.csv","a"); if (WriteHeader) - fprintf(log_file,"timesteps Fx Fy Fz mu Vs As Hs Xs vax vay vaz absperm \n", + fprintf(log_file,"timestep Fx Fy Fz mu Vs As Hs Xs vax vay vaz AbsPerm \n", timestep,Fx,Fy,Fz,mu,h*h*h*Vs,h*h*As,h*Hs,Xs,vax,vay,vaz,absperm); fprintf(log_file,"%i %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g %.8g\n",timestep, Fx, Fy, Fz, mu, @@ -526,7 +531,35 @@ void ScaLBL_GreyscaleModel::Run(){ fclose(log_file); } } + + if (timestep%visualization_interval==0){ + VelocityField(); + } + + if (timestep%restart_interval==0){ + //Use rank=0 write out Restart.db + if (rank==0) { + greyscale_db->putScalar("timestep",timestep); + greyscale_db->putScalar( "Restart", true ); + current_db->putDatabase("Greyscale", greyscale_db); + std::ofstream OutStream("Restart.db"); + current_db->print(OutStream, ""); + OutStream.close(); + + } + //Write out Restart data. + std::shared_ptr cfq; + cfq = std::shared_ptr(new double[19*Np],DeleteArray); + ScaLBL_CopyToHost(cfq.get(),fq,19*Np*sizeof(double));// Copy restart data to the CPU + + FILE *RESTARTFILE; + RESTARTFILE=fopen(LocalRestartFile,"wb"); + fwrite(cfq.get(),sizeof(double),19*Np,RESTARTFILE); + fclose(RESTARTFILE); + MPI_Barrier(comm); + } } + PROFILE_STOP("Loop"); PROFILE_SAVE("lbpm_greyscale_simulator",1); //************************************************************************ diff --git a/tests/lbpm_greyscale_simulator.cpp b/tests/lbpm_greyscale_simulator.cpp index ef253cd7..b7ed442e 100644 --- a/tests/lbpm_greyscale_simulator.cpp +++ b/tests/lbpm_greyscale_simulator.cpp @@ -54,7 +54,7 @@ int main(int argc, char **argv) Greyscale.Create(); // creating the model will create data structure to match the pore structure and allocate variables Greyscale.Initialize(); // initializing the model will set initial conditions for variables Greyscale.Run(); - Greyscale.VelocityField(); + //Greyscale.VelocityField(); //Greyscale.WriteDebug(); } // **************************************************** From 0372b9d1e8379ba0e2c3e4f14aeb6eece9946c77 Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Tue, 21 Jan 2020 23:24:10 -0500 Subject: [PATCH 023/121] save the work, update how flow_rate is computed in greyscale simulator --- models/GreyscaleModel.cpp | 36 ++++++++++++++++++++---------------- models/GreyscaleModel.h | 1 + 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/models/GreyscaleModel.cpp b/models/GreyscaleModel.cpp index 06077780..36f853b1 100644 --- a/models/GreyscaleModel.cpp +++ b/models/GreyscaleModel.cpp @@ -99,6 +99,7 @@ void ScaLBL_GreyscaleModel::SetDomain(){ Velocity_x.resize(Nx,Ny,Nz); Velocity_y.resize(Nx,Ny,Nz); Velocity_z.resize(Nx,Ny,Nz); + PorosityMap.resize(Nx,Ny,Nz); id = new signed char [N]; for (int i=0; iid[i] = 1; // initialize this way @@ -449,37 +450,40 @@ void ScaLBL_GreyscaleModel::Run(){ //************************************************************************/ if (timestep%analysis_interval==0){ - //ScaLBL_D3Q19_Momentum(fq,Velocity, Np); - //ScaLBL_DeviceBarrier(); MPI_Barrier(comm); ScaLBL_Comm->RegularLayout(Map,&Velocity[0],Velocity_x); ScaLBL_Comm->RegularLayout(Map,&Velocity[Np],Velocity_y); ScaLBL_Comm->RegularLayout(Map,&Velocity[2*Np],Velocity_z); + ScaLBL_Comm->RegularLayout(Map,Porosity,PorosityMap); double count_loc=0; double count; double 
vax,vay,vaz; - double vax_loc,vay_loc,vaz_loc; - vax_loc = vay_loc = vaz_loc = 0.f; + double px_loc,py_loc,pz_loc; + double px,py,pz; + double mass_loc,mass_glb; + + px_loc = py_loc = pz_loc = 0.f; + mass_loc = 0.f; for (int k=1; k 0){ - vax_loc += Velocity_x(i,j,k); - vay_loc += Velocity_y(i,j,k); - vaz_loc += Velocity_z(i,j,k); - count_loc+=1.0; + px_loc += Velocity_x(i,j,k)*Den*PorosityMap(i,j,k); + py_loc += Velocity_y(i,j,k)*Den*PorosityMap(i,j,k); + pz_loc += Velocity_z(i,j,k)*Den*PorosityMap(i,j,k); + mass_loc += Den*PorosityMap(i,j,k); } } } } - MPI_Allreduce(&vax_loc,&vax,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - MPI_Allreduce(&vay_loc,&vay,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - MPI_Allreduce(&vaz_loc,&vaz,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - MPI_Allreduce(&count_loc,&count,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&px_loc, &px, 1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&py_loc, &py, 1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&pz_loc, &pz, 1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&mass_loc,&mass_glb,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - vax /= count; - vay /= count; - vaz /= count; + vax = px/mass_glb; + vay = py/mass_glb; + vaz = pz/mass_glb; double force_mag = sqrt(Fx*Fx+Fy*Fy+Fz*Fz); double dir_x = Fx/force_mag; @@ -492,7 +496,7 @@ void ScaLBL_GreyscaleModel::Run(){ dir_z = 1.0; force_mag = 1.0; } - double flow_rate = (vax*dir_x + vay*dir_y + vaz*dir_z); + double flow_rate = (px*dir_x + py*dir_y + pz*dir_z)/mass_glb; error = fabs(flow_rate - flow_rate_previous) / fabs(flow_rate); flow_rate_previous = flow_rate; diff --git a/models/GreyscaleModel.h b/models/GreyscaleModel.h index b427218b..d1399053 100644 --- a/models/GreyscaleModel.h +++ b/models/GreyscaleModel.h @@ -71,6 +71,7 @@ public: DoubleArray Velocity_x; DoubleArray Velocity_y; DoubleArray Velocity_z; + DoubleArray PorosityMap; private: MPI_Comm comm; From 2cee75ae977883539a860da7c1d595e495771d93 Mon Sep 17 00:00:00 2001 From: Mark Berrill Date: Thu, 21 Nov 2019 13:29:26 -0500 Subject: [PATCH 024/121] Copying halo when reading grid file --- models/ColorModel.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 69b5f485..ad0f6d66 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -4,6 +4,7 @@ color lattice boltzmann model #include "models/ColorModel.h" #include "analysis/distance.h" #include "analysis/morphology.h" +#include "common/Communication.h" #include "common/ReadMicroCT.h" #include #include @@ -191,8 +192,17 @@ void ScaLBL_ColorModel::ReadInput(){ IMAGE_INDEX++; } else if (domain_db->keyExists( "GridFile" )){ + // Read the local domain data auto input_id = readMicroCT( *domain_db, MPI_COMM_WORLD ); - for (int i=0; iid[i] = input_id(i); + // Fill the halo (assuming GCW of 1) + array size0 = { input_id.size(0), input_id.size(1), input_id.size(2) }; + ArraySize size1 = { Mask->Nx, Mask->Ny, Mask->Nz }; + ASSERT( size1[0] == size0[0]+2 && size1[1] == size0[1]+2 && size1[2] == size0[2]+2 ); + fillHalo fill( MPI_COMM_WORLD, Mask->rank_info, size0, { 1, 1, 1 }, 0, 1 ); + Array id_view; + id_view.viewRaw( size1, Mask->id ); + fill.copy( input_id, id_view ); + fill.fill( id_view ); } else if (domain_db->keyExists( "Filename" )){ auto Filename = domain_db->getScalar( "Filename" ); From 0006695d5f65d6f6a6853266b3f647a206f50336 Mon Sep 17 00:00:00 2001 From: Mark Berrill Date: Thu, 12 Dec 2019 13:58:51 -0500 Subject: [PATCH 025/121] Adding MPIFLAGS option --- cmake/libraries.cmake | 2 +- cmake/macros.cmake | 
10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cmake/libraries.cmake b/cmake/libraries.cmake index 54d70b5d..ebc37f8f 100644 --- a/cmake/libraries.cmake +++ b/cmake/libraries.cmake @@ -77,7 +77,7 @@ MACRO( CONFIGURE_MPI ) ENDIF () ELSE () # Search for the MPI executable in the current directory - FIND_PROGRAM ( MPIEXEC NAMES mpiexec mpirun lamexec PATHS ${MPI_DIRECTORY}/bin NO_DEFAULT_PATH ) + FIND_PROGRAM( MPIEXEC NAMES mpiexec mpirun lamexec PATHS ${MPI_DIRECTORY}/bin NO_DEFAULT_PATH ) IF ( NOT MPIEXEC ) MESSAGE( FATAL_ERROR "Could not locate mpi executable" ) ENDIF() diff --git a/cmake/macros.cmake b/cmake/macros.cmake index 8791616c..d1c8dbe7 100644 --- a/cmake/macros.cmake +++ b/cmake/macros.cmake @@ -848,7 +848,7 @@ FUNCTION( ADD_${PROJ}_TEST EXEFILE ${ARGN} ) ADD_PROJ_PROVISIONAL_TEST( ${EXEFILE} ) CREATE_TEST_NAME( ${EXEFILE} ${ARGN} ) IF ( USE_MPI_FOR_SERIAL_TESTS ) - ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} "${MPIEXEC_NUMPROC_FLAG}" 1 $ ${ARGN} ) + ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} ${MPIFLAGS} "${MPIEXEC_NUMPROC_FLAG}" 1 $ ${ARGN} ) SET_PROPERTY( TEST ${TESTNAME} APPEND PROPERTY ENVIRONMENT OMPI_MCA_hwloc_base_binding_policy=none ) ELSE() ADD_TEST( NAME ${TESTNAME} COMMAND $ ${ARGN} ) @@ -877,7 +877,7 @@ FUNCTION( ADD_${PROJ}_WEEKLY_TEST EXEFILE PROCS ${ARGN} ) ELSEIF( ${PROCS} STREQUAL "1" ) CREATE_TEST_NAME( "${EXEFILE}_WEEKLY" ${ARGN} ) IF ( USE_MPI_FOR_SERIAL_TESTS ) - ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} "${MPIEXEC_NUMPROC_FLAG}" 1 $ ${ARGN} ) + ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} ${MPIFLAGS} "${MPIEXEC_NUMPROC_FLAG}" 1 $ ${ARGN} ) SET_PROPERTY( TEST ${TESTNAME} APPEND PROPERTY ENVIRONMENT OMPI_MCA_hwloc_base_binding_policy=none ) ELSE() ADD_TEST( NAME ${TESTNAME} COMMAND $ ${ARGN} ) @@ -909,7 +909,7 @@ FUNCTION( ADD_${PROJ}_TEST_PARALLEL EXEFILE PROCS ${ARGN} ) ELSEIF ( ${PROCS} GREATER ${TEST_MAX_PROCS} ) MESSAGE("Disabling test ${TESTNAME} (exceeds maximum number of processors ${TEST_MAX_PROCS})") ELSE() - ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} "${MPIEXEC_NUMPROC_FLAG}" ${PROCS} $ ${ARGN} ) + ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} ${MPIFLAGS} "${MPIEXEC_NUMPROC_FLAG}" ${PROCS} $ ${ARGN} ) SET_PROPERTY( TEST ${TESTNAME} APPEND PROPERTY ENVIRONMENT OMPI_MCA_hwloc_base_binding_policy=none ) SET_TESTS_PROPERTIES( ${TESTNAME} PROPERTIES FAIL_REGULAR_EXPRESSION "${TEST_FAIL_REGULAR_EXPRESSION}" PROCESSORS ${PROCS} ) ADD_RESOURCE_LOCK( ${TESTNAME} ${EXEFILE} ${ARGN} ) @@ -930,7 +930,7 @@ MACRO( ADD_${PROJ}_TEST_THREAD_MPI EXEFILE PROCS THREADS ${ARGN} ) SET_TESTS_PROPERTIES ( ${TESTNAME} PROPERTIES FAIL_REGULAR_EXPRESSION "${TEST_FAIL_REGULAR_EXPRESSION}" PROCESSORS ${TOT_PROCS} ) ADD_RESOURCE_LOCK( ${TESTNAME} ${EXEFILE} ${ARGN} ) ELSEIF ( USE_MPI OR USE_EXT_MPI ) - ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} "${MPIEXEC_NUMPROC_FLAG}" ${PROCS} $ ${ARGN} ) + ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} ${MPIFLAGS} "${MPIEXEC_NUMPROC_FLAG}" ${PROCS} $ ${ARGN} ) SET_PROPERTY( TEST ${TESTNAME} APPEND PROPERTY ENVIRONMENT OMPI_MCA_hwloc_base_binding_policy=none ) SET_TESTS_PROPERTIES ( ${TESTNAME} PROPERTIES FAIL_REGULAR_EXPRESSION "${TEST_FAIL_REGULAR_EXPRESSION}" PROCESSORS ${TOT_PROCS} ) ADD_RESOURCE_LOCK( ${TESTNAME} ${EXEFILE} ${ARGN} ) @@ -966,7 +966,7 @@ FUNCTION( ADD_${PROJ}_EXAMPLE EXEFILE PROCS ${ARGN} ) ADD_TEST( NAME ${TESTNAME} COMMAND $ ${ARGN} ) ELSEIF ( USE_EXT_MPI AND NOT (${PROCS} GREATER ${TEST_MAX_PROCS}) ) CREATE_TEST_NAME( "example--${EXEFILE}_${PROCS}procs" ${ARGN} ) - ADD_TEST( 
NAME ${TESTNAME} COMMAND ${MPIEXEC} "${MPIEXEC_NUMPROC_FLAG}" ${PROCS} $ ${ARGN} ) + ADD_TEST( NAME ${TESTNAME} COMMAND ${MPIEXEC} ${MPIFLAGS} "${MPIEXEC_NUMPROC_FLAG}" ${PROCS} $ ${ARGN} ) SET_PROPERTY( TEST ${TESTNAME} APPEND PROPERTY ENVIRONMENT OMPI_MCA_hwloc_base_binding_policy=none ) ENDIF() SET_TESTS_PROPERTIES( ${TESTNAME} PROPERTIES FAIL_REGULAR_EXPRESSION "${TEST_FAIL_REGULAR_EXPRESSION}" PROCESSORS ${PROCS} ) From 3c854fd002c02650e1400a44ea705dd1c84d9810 Mon Sep 17 00:00:00 2001 From: Mark Berrill Date: Thu, 2 Jan 2020 13:23:51 -0500 Subject: [PATCH 026/121] Updating StackTrace and improving performance converting uCT data --- StackTrace/StackTrace.cpp | 26 ++++++-- StackTrace/StackTrace.h | 11 ++++ StackTrace/Utilities.cpp | 58 ++++++++++++++--- StackTrace/Utilities.h | 18 +++++ StackTrace/string_view.h | 2 +- analysis/runAnalysis.cpp | 11 ++-- common/Communication.hpp | 12 ++-- common/ReadMicroCT.cpp | 37 +++++------ common/Utilities.cpp | 116 ++++++++++++++++++++++++++++++++- common/Utilities.h | 31 +++++++++ tests/lbpm_color_simulator.cpp | 70 ++++++++++---------- 11 files changed, 303 insertions(+), 89 deletions(-) diff --git a/StackTrace/StackTrace.cpp b/StackTrace/StackTrace.cpp index e9292990..55a24352 100644 --- a/StackTrace/StackTrace.cpp +++ b/StackTrace/StackTrace.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -348,8 +349,11 @@ static inline int exec3( const char *cmd, FUNCTION &fun ) if ( buffer[0] != 0 ) fun( buffer ); } - auto status = pclose( pipe ); - int code = WEXITSTATUS( status ); + int code = pclose( pipe ); + if ( errno == ECHILD ) { + errno = 0; + code = 0; + } std::this_thread::yield(); // Allow any signals to process resetSignal( SIGCHLD ); // Clear child exited return code; @@ -1741,7 +1745,7 @@ std::vector StackTrace::defaultSignalsToCatch() * Set the signal handlers * ****************************************************************************/ static std::function abort_fun; -static StackTrace::abort_error rethrow() +StackTrace::abort_error rethrow() { StackTrace::abort_error error; #ifdef USE_LINUX @@ -1775,14 +1779,14 @@ static StackTrace::abort_error rethrow() } return error; } -static void term_func_abort( int sig ) +void StackTrace::terminateFunctionSignal( int sig ) { StackTrace::abort_error err; err.type = StackTrace::terminateType::signal; err.signal = sig; err.bytes = StackTrace::Utilities::getMemoryUsage(); err.stack = StackTrace::backtrace(); - err.stackType = StackTrace::printStackType::global; + err.stackType = StackTrace::getDefaultStackType(); abort_fun( err ); } static bool signals_set[256] = { false }; @@ -1829,7 +1833,7 @@ void StackTrace::setErrorHandler( std::function allSignalsToCatch(); @@ -289,6 +293,13 @@ multi_stack_info generateFromString( const std::vector &str ); multi_stack_info generateFromString( const std::string &str ); +//! Set default stack type +void setDefaultStackType( StackTrace::printStackType ); + +//! 
Get default stack type +StackTrace::printStackType getDefaultStackType(); + + } // namespace StackTrace diff --git a/StackTrace/Utilities.cpp b/StackTrace/Utilities.cpp index 734a0056..11f05777 100644 --- a/StackTrace/Utilities.cpp +++ b/StackTrace/Utilities.cpp @@ -8,8 +8,10 @@ #include #include #include +#include #include #include +#include #ifdef USE_MPI #include "mpi.h" @@ -19,6 +21,10 @@ #include "MemoryApp.h" #endif +#ifdef USE_GCOV +extern "C" void __gcov_flush( void ); +#endif + #define perr std::cerr @@ -65,6 +71,12 @@ // clang-format on +#ifdef __GNUC__ +#define USE_ABI +#include +#endif + + namespace StackTrace { @@ -96,13 +108,12 @@ inline size_t findfirst( const std::vector &X, TYPE Y ) /**************************************************************************** * Function to terminate the program * ****************************************************************************/ -static bool abort_throwException = false; -static printStackType abort_stackType = printStackType::global; -static int force_exit = 0; +static bool abort_throwException = false; +static int force_exit = 0; void Utilities::setAbortBehavior( bool throwException, int stackType ) { abort_throwException = throwException; - abort_stackType = static_cast( stackType ); + StackTrace::setDefaultStackType( static_cast( stackType ) ); } void Utilities::abort( const std::string &message, const std::string &filename, const int line ) { @@ -112,16 +123,28 @@ void Utilities::abort( const std::string &message, const std::string &filename, err.type = terminateType::abort; err.line = line; err.bytes = Utilities::getMemoryUsage(); - err.stackType = abort_stackType; + err.stackType = StackTrace::getDefaultStackType(); err.stack = StackTrace::backtrace(); throw err; } -static void terminate( const StackTrace::abort_error &err ) +static std::mutex terminate_mutex; +static inline void callAbort() { +#ifdef USE_GCOV + __gcov_flush(); +#endif + terminate_mutex.unlock(); + std::abort(); +} +void Utilities::terminate( const StackTrace::abort_error &err ) +{ + // Lock mutex to ensure multiple threads do not try to abort simultaneously + terminate_mutex.lock(); + // Clear the error handlers clearErrorHandler(); // Print the message and abort if ( force_exit > 1 ) { - std::abort(); + callAbort(); } else if ( !abort_throwException ) { // Use MPI_abort (will terminate all processes) force_exit = 2; @@ -135,10 +158,11 @@ static void terminate( const StackTrace::abort_error &err ) MPI_Abort( MPI_COMM_WORLD, -1 ); } #endif - std::abort(); + callAbort(); } else { perr << err.what(); - std::abort(); + perr.flush(); + callAbort(); } } @@ -149,7 +173,7 @@ static void terminate( const StackTrace::abort_error &err ) static void setTerminateErrorHandler() { // Set the terminate routine for runtime errors - StackTrace::setErrorHandler( terminate ); + StackTrace::setErrorHandler( Utilities::terminate ); } void Utilities::setErrorHandlers() { @@ -293,4 +317,18 @@ std::string Utilities::exec( const string_view &cmd, int &exit_code ) } +/**************************************************************************** + * Get the type name * + ****************************************************************************/ +std::string Utilities::getTypeName( const std::type_info &id ) +{ + std::string name = id.name(); +#if defined( USE_ABI ) + int status; + name = abi::__cxa_demangle( name.c_str(), 0, 0, &status ); +#endif + return name; +} + + } // namespace StackTrace diff --git a/StackTrace/Utilities.h b/StackTrace/Utilities.h index 10ed9085..83c8d7aa 
100644 --- a/StackTrace/Utilities.h +++ b/StackTrace/Utilities.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "StackTrace/StackTrace.h" #include "StackTrace/string_view.h" @@ -28,9 +29,14 @@ void abort( const std::string &message, const std::string &filename, const int l void setAbortBehavior( bool throwException, int stackType = 2 ); +//! Function to terminate the application +void terminate( const StackTrace::abort_error &err ); + + //! Function to set the error handlers void setErrorHandlers(); + //! Function to clear the error handlers void clearErrorHandlers(); @@ -92,6 +98,18 @@ void cause_segfault(); std::string exec( const StackTrace::string_view &cmd, int &exit_code ); +//! Return the hopefully demangled name of the given type +std::string getTypeName( const std::type_info &id ); + + +//! Return the hopefully demangled name of the given type +template +inline std::string getTypeName() +{ + return getTypeName( typeid( TYPE ) ); +} + + } // namespace Utilities } // namespace StackTrace diff --git a/StackTrace/string_view.h b/StackTrace/string_view.h index d83d1f24..ee729f63 100644 --- a/StackTrace/string_view.h +++ b/StackTrace/string_view.h @@ -119,7 +119,7 @@ public: int result = 0; for ( int i = 0; i < N && result == 0; i++ ) if ( d_data[i] != other[i] ) - result = d_data[i] < other[i] ? -i : i; + result = d_data[i] < other[i] ? -( i + 1 ) : ( i + 1 ); if ( result == 0 ) result = size() == other.size() ? 0 : size() < other.size() ? -1 : 1; return result; diff --git a/analysis/runAnalysis.cpp b/analysis/runAnalysis.cpp index caa03b1b..6c76f58b 100644 --- a/analysis/runAnalysis.cpp +++ b/analysis/runAnalysis.cpp @@ -767,6 +767,8 @@ void runAnalysis::run(int timestep, std::shared_ptr input_db, TwoPhase double *Pressure, double *Velocity, double *fq, double *Den) { int N = d_N[0]*d_N[1]*d_N[2]; + NULL_USE( N ); + NULL_USE( Phi ); auto db = input_db->getDatabase( "Analysis" ); //int timestep = db->getWithDefault( "timestep", 0 ); @@ -937,8 +939,6 @@ void runAnalysis::run(int timestep, std::shared_ptr input_db, TwoPhase ******************************************************************/ void runAnalysis::basic(int timestep, std::shared_ptr input_db, SubPhase &Averages, const double *Phi, double *Pressure, double *Velocity, double *fq, double *Den) { - int N = d_N[0]*d_N[1]*d_N[2]; - // Check which analysis steps we need to perform auto color_db = input_db->getDatabase( "Color" ); auto vis_db = input_db->getDatabase( "Visualization" ); @@ -954,7 +954,7 @@ void runAnalysis::basic(int timestep, std::shared_ptr input_db, SubPha finish(); } - PROFILE_START("run"); + PROFILE_START("basic"); // Copy the appropriate variables to the host (so we can spawn new threads) ScaLBL_DeviceBarrier(); @@ -983,7 +983,6 @@ void runAnalysis::basic(int timestep, std::shared_ptr input_db, SubPha } PROFILE_STOP("Copy data to host"); - PROFILE_START("run",1); // Spawn threads to do the analysis work //if (timestep%d_restart_interval==0){ // if ( matches(type,AnalysisType::ComputeAverages) ) { @@ -1036,12 +1035,11 @@ void runAnalysis::basic(int timestep, std::shared_ptr input_db, SubPha d_wait_vis = d_tpool.add_work(work); } - PROFILE_STOP("run"); + PROFILE_STOP("basic"); } void runAnalysis::WriteVisData(int timestep, std::shared_ptr input_db, SubPhase &Averages, const double *Phi, double *Pressure, double *Velocity, double *fq, double *Den) { - int N = d_N[0]*d_N[1]*d_N[2]; auto color_db = input_db->getDatabase( "Color" ); auto vis_db = input_db->getDatabase( "Visualization" ); //int timestep = 
color_db->getWithDefault( "timestep", 0 ); @@ -1068,7 +1066,6 @@ void runAnalysis::WriteVisData(int timestep, std::shared_ptr input_db, d_wait_vis = d_tpool.add_work(work2); //Averages.WriteVis = false; - // } PROFILE_STOP("write vis"); } diff --git a/common/Communication.hpp b/common/Communication.hpp index cb9f3f18..33fed3a7 100644 --- a/common/Communication.hpp +++ b/common/Communication.hpp @@ -44,9 +44,9 @@ Array redistribute( const RankInfoStruct& src_rank, const Array& src if ( !src_data.empty() ) { int i1[3] = { src_size[0] * src_rank.ix, src_size[1] * src_rank.jy, src_size[2] * src_rank.kz }; int i2[3] = { i1[0] + src_size[0] - 1, i1[1] + src_size[1] - 1, i1[2] + src_size[2] - 1 }; - for ( size_t i=0; i redistribute( const RankInfoStruct& src_rank, const Array& src Array dst_data( dst_size[0], dst_size[1], dst_size[2] ); int i1[3] = { dst_size[0] * dst_rank.ix, dst_size[1] * dst_rank.jy, dst_size[2] * dst_rank.kz }; int i2[3] = { i1[0] + dst_size[0] - 1, i1[1] + dst_size[1] - 1, i1[2] + dst_size[2] - 1 }; - for ( size_t i=0; i readMicroCT( const Database& domain, MPI_Comm comm ) auto n = domain.getVector( "n" ); int rank = comm_rank(MPI_COMM_WORLD); auto nproc = domain.getVector( "nproc" ); - auto ReadValues = domain.getVector( "ReadValues" ); - auto WriteValues = domain.getVector( "WriteValues" ); RankInfoStruct rankInfo( rank, nproc[0], nproc[1], nproc[2] ); // Determine the largest file number to get @@ -95,29 +93,26 @@ Array readMicroCT( const Database& domain, MPI_Comm comm ) ERROR( "Invalid name for first file" ); } data = readMicroCT( filename ); - - // Relabel the data - for (int k = 0; k<1024; k++){ - for (int j = 0; j<1024; j++){ - for (int i = 0; i<1024; i++){ - //n = k*Nfx*Nfy + j*Nfx + i; - //char locval = loc_id[n]; - char locval = data(i,j,k); - for (int idx=0; idx( "ReadValues" ); + auto WriteValues = domain.getVector( "WriteValues" ); + ASSERT( ReadValues.size() == WriteValues.size() ); + int readMaxValue = 0; + for ( auto v : ReadValues ) + readMaxValue = std::max( data.max()+1, v ); + std::vector map( readMaxValue + 1, -1 ); + for ( size_t i=0; i= 0 && t <= readMaxValue ); + data(i) = map[t]; + } + return data; } diff --git a/common/Utilities.cpp b/common/Utilities.cpp index f6d810af..1cf764be 100644 --- a/common/Utilities.cpp +++ b/common/Utilities.cpp @@ -1,10 +1,116 @@ #include "common/Utilities.h" +#include "StackTrace/StackTrace.h" +#include "StackTrace/ErrorHandlers.h" + +#ifdef USE_TIMER +#include "MemoryApp.h" +#include "ProfilerApp.h" +#endif + +#ifdef USE_MPI +#include "mpi.h" +#endif -#include #include +#include +#include -// Factor a number into it's prime factors +// Mutex for Utility functions +static std::mutex Utilities_mutex; + + +/**************************************************************************** + * Function to perform the default startup/shutdown sequences * + ****************************************************************************/ +void Utilities::startup( int argc, char **argv ) +{ + NULL_USE( argc ); + NULL_USE( argv ); + // Disable OpenMP + Utilities::setenv( "OMP_NUM_THREADS", "1" ); + Utilities::setenv( "MKL_NUM_THREADS", "1" ); + // Start MPI +#ifdef USE_MPI + int provided; + MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &provided ); + if ( provided < MPI_THREAD_MULTIPLE ) { + int rank; + MPI_Comm_rank( MPI_COMM_WORLD, &rank ); + if ( rank == 0 ) + std::cerr << "Warning: Failed to start MPI with necessary thread support, thread support will be disabled" << std::endl; + } + StackTrace::globalCallStackInitialize( 
MPI_COMM_WORLD ); +#endif + // Set the error handlers + Utilities::setAbortBehavior( true, 3 ); + Utilities::setErrorHandlers(); +} +void Utilities::shutdown() +{ + // Clear the error handlers + Utilities::clearErrorHandlers(); + StackTrace::clearSignals(); + StackTrace::clearSymbols(); + int rank = 0; +#ifdef USE_MPI + MPI_Comm_rank( MPI_COMM_WORLD, &rank ); + StackTrace::globalCallStackFinalize(); + MPI_Barrier( MPI_COMM_WORLD ); + MPI_Finalize(); +#endif +#ifdef USE_TIMER + PROFILE_DISABLE(); + auto memory = MemoryApp::getMemoryStats(); + if ( rank == 0 && memory.N_new > memory.N_delete ) + MemoryApp::print( std::cout ); +#endif +} + + +/**************************************************************************** + * Function to set an environemental variable * + ****************************************************************************/ +void Utilities::setenv( const std::string &name, const std::string &value ) +{ + Utilities_mutex.lock(); +#if defined( USE_LINUX ) || defined( USE_MAC ) + bool pass = false; + if ( !value.empty() ) + pass = ::setenv( name.data(), value.data(), 1 ) == 0; + else + pass = ::unsetenv( name.data() ) == 0; +#elif defined( USE_WINDOWS ) + bool pass = SetEnvironmentVariable( name.data(), value.data() ) != 0; +#else +#error Unknown OS +#endif + Utilities_mutex.unlock(); + if ( !pass ) { + char msg[1024]; + if ( !value.empty() ) + sprintf( + msg, "Error setting enviornmental variable: %s=%s\n", name.data(), value.data() ); + else + sprintf( msg, "Error clearing enviornmental variable: %s\n", name.data() ); + ERROR( msg ); + } +} +std::string Utilities::getenv( const std::string &name ) +{ + std::string var; + Utilities_mutex.lock(); + auto tmp = std::getenv( name.data() ); + if ( tmp ) + var = std::string( tmp ); + Utilities_mutex.unlock(); + return var; +} + + +/**************************************************************************** + * Factor a number into it's prime factors * + ****************************************************************************/ std::vector Utilities::factor(size_t number) { if ( number<=3 ) @@ -54,9 +160,13 @@ std::vector Utilities::factor(size_t number) } -// Dummy function to prevent compiler from optimizing away variable +/**************************************************************************** + * Dummy function to prevent compiler from optimizing away variable * + ****************************************************************************/ void Utilities::nullUse( void* data ) { NULL_USE(data); } + + diff --git a/common/Utilities.h b/common/Utilities.h index 90cb4008..da579966 100644 --- a/common/Utilities.h +++ b/common/Utilities.h @@ -25,6 +25,37 @@ using StackTrace::Utilities::sleep_ms; using StackTrace::Utilities::sleep_s; +/*! + * \brief Start MPI, error handlers + * \details This routine will peform the default startup sequence + * \param argc argc from main + * \param argv argv from main + */ +void startup( int argc, char **argv ); + +/*! + * \brief Stop MPI, error handlers + * \details This routine will peform the default shutdown sequence to match startup + */ +void shutdown(); + + +/*! + * Get an environmental variable + * @param name The name of the environmental variable + * @return The value of the enviornmental variable + */ +std::string getenv( const std::string &name ); + + +/*! + * Set an environmental variable + * @param name The name of the environmental variable + * @param value The value to set + */ +void setenv( const std::string &name, const std::string &value ); + + //! 
std::string version of sprintf inline std::string stringf( const char *format, ... ); diff --git a/tests/lbpm_color_simulator.cpp b/tests/lbpm_color_simulator.cpp index e8e675e2..1f63c653 100644 --- a/tests/lbpm_color_simulator.cpp +++ b/tests/lbpm_color_simulator.cpp @@ -7,6 +7,7 @@ #include #include "models/ColorModel.h" +#include "common/Utilities.h" //#define WRE_SURFACES @@ -15,7 +16,6 @@ * James E. McClure 2013-2014 */ -using namespace std; //************************************************************************* // Implementation of Two-Phase Immiscible LBM using CUDA @@ -23,27 +23,26 @@ using namespace std; int main(int argc, char **argv) { - // Initialize MPI - int provided_thread_support = -1; - MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE,&provided_thread_support); - MPI_Comm comm; - MPI_Comm_dup(MPI_COMM_WORLD,&comm); - int rank = comm_rank(comm); - int nprocs = comm_size(comm); - if ( rank==0 && provided_thread_support Date: Wed, 22 Jan 2020 12:01:29 -0500 Subject: [PATCH 027/121] Fixing compile warnings --- IO/netcdf.cpp | 1 + analysis/SubPhase.cpp | 10 ++-- analysis/SubPhase.h | 2 +- analysis/TwoPhase.cpp | 7 ++- analysis/dcel.cpp | 6 ++- analysis/morphology.cpp | 18 +------- analysis/uCT.cpp | 2 + cmake/libraries.cmake | 4 ++ common/Domain.cpp | 42 ++++++++--------- common/Domain.h | 10 ++-- common/MPI_Helpers.h | 6 +++ common/ScaLBL.cpp | 25 +++++----- models/ColorModel.cpp | 53 ++++++++++----------- models/DFHModel.cpp | 1 - models/MRTModel.cpp | 4 -- tests/GenerateSphereTest.cpp | 5 +- tests/TestBubbleDFH.cpp | 4 +- tests/TestColorGradDFH.cpp | 6 +-- tests/TestCommD3Q19.cpp | 27 +++-------- tests/TestFluxBC.cpp | 20 ++------ tests/TestForceD3Q19.cpp | 14 ++---- tests/TestInterfaceSpeed.cpp | 6 +-- tests/TestMap.cpp | 6 +-- tests/TestMassConservationD3Q7.cpp | 25 ++-------- tests/TestNetcdf.cpp | 1 - tests/TestSubphase.cpp | 6 +-- tests/TestTopo3D.cpp | 3 +- tests/TestTorus.cpp | 7 ++- tests/TestTorusEvolve.cpp | 10 ++-- tests/lbpm_morph_pp.cpp | 8 ++-- tests/lbpm_morphdrain_pp.cpp | 31 +++++-------- tests/lbpm_morphopen_pp.cpp | 26 ++++------- tests/lbpm_permeability_simulator.cpp | 9 +--- tests/lbpm_refine_pp.cpp | 29 ++++++------ tests/lbpm_serial_decomp.cpp | 39 +++++++--------- tests/lbpm_uCT_pp.cpp | 66 +++++++++++++-------------- tests/pmmc_cylinder.cpp | 10 ---- 37 files changed, 215 insertions(+), 334 deletions(-) diff --git a/IO/netcdf.cpp b/IO/netcdf.cpp index e355c344..b36bb6d6 100644 --- a/IO/netcdf.cpp +++ b/IO/netcdf.cpp @@ -189,6 +189,7 @@ std::vector getAttDim( int fid, const std::string& att ) { std::vector dim(1,0); int err = nc_inq_attlen( fid, NC_GLOBAL, att.c_str(), dim.data() ); + CHECK_NC_ERR( err ); return dim; } std::vector getVarNames( int fid ) diff --git a/analysis/SubPhase.cpp b/analysis/SubPhase.cpp index 0848ded1..76541ffd 100644 --- a/analysis/SubPhase.cpp +++ b/analysis/SubPhase.cpp @@ -169,7 +169,6 @@ void SubPhase::Basic(){ nb.reset(); wb.reset(); - double nA,nB; double count_w = 0.0; double count_n = 0.0; @@ -297,8 +296,8 @@ void SubPhase::Basic(){ double saturation=gwb.V/(gwb.V + gnb.V); double water_flow_rate=gwb.V*(gwb.Px*dir_x + gwb.Py*dir_y + gwb.Pz*dir_z)/gwb.M / Dm->Volume; double not_water_flow_rate=gnb.V*(gnb.Px*dir_x + gnb.Py*dir_y + gnb.Pz*dir_z)/gnb.M/ Dm->Volume; - double total_flow_rate = water_flow_rate + not_water_flow_rate; - double fractional_flow= water_flow_rate / total_flow_rate; + //double total_flow_rate = water_flow_rate + not_water_flow_rate; + //double fractional_flow = water_flow_rate / total_flow_rate; 
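// Note on the lines that follow: the effective permeability is a Darcy-form estimate,
// k_eff ~ (kinematic viscosity) * (phase-averaged flow rate) / (driving force magnitude),
// with the voxel length squared (h*h) converting the lattice-unit value to physical units.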
double h = Dm->voxel_length; double krn = h*h*nu_n*not_water_flow_rate / force_mag ; @@ -697,7 +696,8 @@ void SubPhase::Full(){ } -void SubPhase::AggregateLabels(char *FILENAME){ +void SubPhase::AggregateLabels( const std::string& filename ) +{ int nx = Dm->Nx; int ny = Dm->Ny; @@ -721,7 +721,7 @@ void SubPhase::AggregateLabels(char *FILENAME){ } MPI_Barrier(Dm->Comm); - Dm->AggregateLabels(FILENAME); + Dm->AggregateLabels( filename ); } diff --git a/analysis/SubPhase.h b/analysis/SubPhase.h index 683fc46a..71b87ef0 100644 --- a/analysis/SubPhase.h +++ b/analysis/SubPhase.h @@ -101,7 +101,7 @@ public: void Basic(); void Full(); void Write(int time); - void AggregateLabels(char *FILENAME); + void AggregateLabels( const std::string& filename ); private: FILE *TIMELOG; diff --git a/analysis/TwoPhase.cpp b/analysis/TwoPhase.cpp index 9b87daef..9b2e5fd8 100644 --- a/analysis/TwoPhase.cpp +++ b/analysis/TwoPhase.cpp @@ -204,6 +204,7 @@ TwoPhase::~TwoPhase() void TwoPhase::ColorToSignedDistance(double Beta, DoubleArray &ColorData, DoubleArray &DistData) { + NULL_USE( Beta ); /*double factor,temp,value; factor=0.5/Beta; // Initialize to -1,1 (segmentation) @@ -627,8 +628,8 @@ void TwoPhase::ComputeLocal() void TwoPhase::AssignComponentLabels() { - int LabelNWP=1; - int LabelWP=2; + //int LabelNWP=1; + //int LabelWP=2; // NOTE: labeling the wetting phase components is tricky! One sandstone media had over 800,000 components // NumberComponents_WP = ComputeGlobalPhaseComponent(Dm->Nx-2,Dm->Ny-2,Dm->Nz-2,Dm->rank_info,PhaseID,LabelWP,Label_WP); // treat all wetting phase is connected @@ -1172,6 +1173,8 @@ void TwoPhase::Reduce() void TwoPhase::NonDimensionalize(double D, double viscosity, double IFT) { + NULL_USE( viscosity ); + NULL_USE( IFT ); awn_global *= D; ans_global *= D; ans_global *= D; diff --git a/analysis/dcel.cpp b/analysis/dcel.cpp index 4c7be292..ca21c0e6 100644 --- a/analysis/dcel.cpp +++ b/analysis/dcel.cpp @@ -352,6 +352,8 @@ double DECL::EdgeAngle(int edge) void Isosurface(DoubleArray &A, const double &v) { + NULL_USE( v ); + Point P,Q; Point PlaceHolder; Point C0,C1,C2,C3,C4,C5,C6,C7; @@ -562,7 +564,7 @@ void Isosurface(DoubleArray &A, const double &v) if (P.z == 1.0 && Q.z == 1.0) HalfEdge[idx_edge][3] = -6; // ghost twin for z=1 face } // Find all the angles - for (int idx=0; idx int nx = Dm->Nx; int ny = Dm->Ny; int nz = Dm->Nz; - int iproc = Dm->iproc(); - int jproc = Dm->jproc(); - int kproc = Dm->kproc(); int nprocx = Dm->nprocx(); int nprocy = Dm->nprocy(); int nprocz = Dm->nprocz(); @@ -122,7 +119,6 @@ double MorphOpen(DoubleArray &SignDist, signed char *id, std::shared_ptr int sendtag,recvtag; sendtag = recvtag = 7; - int x,y,z; int ii,jj,kk; int Nx = nx; int Ny = ny; @@ -336,9 +332,6 @@ double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptrNx; int ny = Dm->Ny; int nz = Dm->Nz; - int iproc = Dm->iproc(); - int jproc = Dm->jproc(); - int kproc = Dm->kproc(); int nprocx = Dm->nprocx(); int nprocy = Dm->nprocy(); int nprocz = Dm->nprocz(); @@ -427,7 +420,6 @@ double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptr &id, std::shared_ptr Dm, double TargetGrowth){ - +double MorphGrow(DoubleArray &BoundaryDist, DoubleArray &Dist, Array &id, std::shared_ptr Dm, double TargetGrowth) +{ int Nx = Dm->Nx; int Ny = Dm->Ny; int Nz = Dm->Nz; - int iproc = Dm->iproc(); - int jproc = Dm->jproc(); - int kproc = Dm->kproc(); - int nprocx = Dm->nprocx(); - int nprocy = Dm->nprocy(); - int nprocz = Dm->nprocz(); int rank = Dm->rank(); double count=0.0; diff 
--git a/analysis/uCT.cpp b/analysis/uCT.cpp index 6a327432..912f8e85 100644 --- a/analysis/uCT.cpp +++ b/analysis/uCT.cpp @@ -157,6 +157,7 @@ void solve( const Array& VOL, Array& Mean, Array& ID, // int depth = 5; // float sigsq=0.1; int nlm_count = NLM3D( MultiScaleSmooth, Mean, Dist, NonLocalMean, depth, sigsq); + NULL_USE( nlm_count ); fillFloat.fill(NonLocalMean); } @@ -201,6 +202,7 @@ void refine( const Array& Dist_coarse, // int depth = 3; // float sigsq = 0.1; int nlm_count = NLM3D( MultiScaleSmooth, Mean, Dist, NonLocalMean, depth, sigsq); + NULL_USE( nlm_count ); fillFloat.fill(NonLocalMean); segment( NonLocalMean, ID, 0.001 ); for (size_t i=0; i db ) INSIST(nprocs == nproc[0]*nproc[1]*nproc[2],"Fatal error in processor count!"); } -void Domain::Decomp(std::string Filename) +void Domain::Decomp( const std::string& Filename ) { //....................................................................... // Reading the domain information file @@ -251,7 +254,6 @@ void Domain::Decomp(std::string Filename) int nprocs, nprocx, nprocy, nprocz, nx, ny, nz; int64_t global_Nx,global_Ny,global_Nz; int64_t i,j,k,n; - int BC=0; int64_t xStart,yStart,zStart; int checkerSize; //int inlet_layers_x, inlet_layers_y, inlet_layers_z; @@ -331,7 +333,7 @@ void Domain::Decomp(std::string Filename) if (RANK==0){ printf("Input media: %s\n",Filename.c_str()); printf("Relabeling %lu values\n",ReadValues.size()); - for (int idx=0; idxkeyExists( "image_sequence" )){ auto ImageList = color_db->getVector( "image_sequence"); int IMAGE_INDEX = color_db->getWithDefault( "image_index", 0 ); - int IMAGE_COUNT = ImageList.size(); std::string first_image = ImageList[IMAGE_INDEX]; Mask->Decomp(first_image); IMAGE_INDEX++; @@ -195,9 +194,9 @@ void ScaLBL_ColorModel::ReadInput(){ // Read the local domain data auto input_id = readMicroCT( *domain_db, MPI_COMM_WORLD ); // Fill the halo (assuming GCW of 1) - array size0 = { input_id.size(0), input_id.size(1), input_id.size(2) }; - ArraySize size1 = { Mask->Nx, Mask->Ny, Mask->Nz }; - ASSERT( size1[0] == size0[0]+2 && size1[1] == size0[1]+2 && size1[2] == size0[2]+2 ); + array size0 = { (int) input_id.size(0), (int) input_id.size(1), (int) input_id.size(2) }; + ArraySize size1 = { (size_t) Mask->Nx, (size_t) Mask->Ny, (size_t) Mask->Nz }; + ASSERT( (int) size1[0] == size0[0]+2 && (int) size1[1] == size0[1]+2 && (int) size1[2] == size0[2]+2 ); fillHalo fill( MPI_COMM_WORLD, Mask->rank_info, size0, { 1, 1, 1 }, 0, 1 ); Array id_view; id_view.viewRaw( size1, Mask->id ); @@ -216,7 +215,6 @@ void ScaLBL_ColorModel::ReadInput(){ // Generate the signed distance map // Initialize the domain and communication Array id_solid(Nx,Ny,Nz); - int count = 0; // Solve for the position of the solid phase for (int k=0;kSDs(i,j,k) = 2.0*double(id_solid(i,j,k))-1.0; } @@ -266,7 +263,7 @@ void ScaLBL_ColorModel::AssignComponentLabels(double *phase) double label_count_global[NLABELS]; // Assign the labels - for (int idx=0; idxid[i] = Mask->id[i]; - for (int idx=0; idxComm, label_count[idx]); + for (size_t idx=0; idxComm, label_count[idx]); if (rank==0){ printf("Component labels: %lu \n",NLABELS); @@ -373,16 +371,16 @@ void ScaLBL_ColorModel::Create(){ } // check that TmpMap is valid for (int idx=0; idxLastExterior(); idx++){ - int n = TmpMap[idx]; + auto n = TmpMap[idx]; if (n > Nx*Ny*Nz){ - printf("Bad value! idx=%i \n"); + printf("Bad value! 
idx=%i \n", n); TmpMap[idx] = Nx*Ny*Nz-1; } } for (int idx=ScaLBL_Comm->FirstInterior(); idxLastInterior(); idx++){ - int n = TmpMap[idx]; - if (n > Nx*Ny*Nz){ - printf("Bad value! idx=%i \n"); + auto n = TmpMap[idx]; + if ( n > Nx*Ny*Nz ){ + printf("Bad value! idx=%i \n",n); TmpMap[idx] = Nx*Ny*Nz-1; } } @@ -553,8 +551,9 @@ void ScaLBL_ColorModel::Run(){ } if (color_db->keyExists( "residual_endpoint_threshold" )){ - RESIDUAL_ENDPOINT_THRESHOLD = color_db->getScalar( "residual_endpoint_threshold" ); + RESIDUAL_ENDPOINT_THRESHOLD = color_db->getScalar( "residual_endpoint_threshold" ); } + NULL_USE( RESIDUAL_ENDPOINT_THRESHOLD ); if (color_db->keyExists( "noise_threshold" )){ NOISE_THRESHOLD = color_db->getScalar( "noise_threshold" ); USE_BUMP_RATE = true; @@ -874,7 +873,7 @@ void ScaLBL_ColorModel::Run(){ WriteHeader=true; kr_log_file = fopen("relperm.csv","a"); if (WriteHeader) - fprintf(kr_log_file,"timesteps sat.water eff.perm.oil eff.perm.water eff.perm.oil.connected eff.perm.water.connected eff.perm.oil.disconnected eff.perm.water.disconnected cap.pressure cap.pressure.connected pressure.drop Ca M\n",CURRENT_STEADY_TIMESTEPS,current_saturation,kAeff,kBeff,pAB,viscous_pressure_drop,Ca,Mobility); + fprintf(kr_log_file,"timesteps sat.water eff.perm.oil eff.perm.water eff.perm.oil.connected eff.perm.water.connected eff.perm.oil.disconnected eff.perm.water.disconnected cap.pressure cap.pressure.connected pressure.drop Ca M\n"); fprintf(kr_log_file,"%i %.5g %.5g %.5g %.5g %.5g %.5g %.5g %.5g %.5g %.5g %.5g %.5g\n",CURRENT_STEADY_TIMESTEPS,current_saturation,kAeff,kBeff,kAeff_connected,kBeff_connected,kAeff_disconnected,kBeff_disconnected,pAB,pAB_connected,viscous_pressure_drop,Ca,Mobility); fclose(kr_log_file); @@ -937,7 +936,7 @@ void ScaLBL_ColorModel::Run(){ else if (USE_SEED){ delta_volume = volA*Dm->Volume - initial_volume; CURRENT_MORPH_TIMESTEPS += analysis_interval; - double massChange = SeedPhaseField(seed_water); + //double massChange = SeedPhaseField(seed_water); if (rank==0) printf("***Seed water in oil %f, volume change %f / %f ***\n", seed_water, delta_volume, delta_volume_target); } else if (USE_MORPHOPEN_OIL){ @@ -1010,7 +1009,6 @@ void ScaLBL_ColorModel::Run(){ double ScaLBL_ColorModel::ImageInit(std::string Filename){ - bool suppress = false; if (rank==0) printf("Re-initializing fluids from file: %s \n", Filename.c_str()); Mask->Decomp(Filename); for (int i=0; iid[i]; // save what was read @@ -1080,10 +1078,9 @@ double ScaLBL_ColorModel::MorphOpenConnected(double target_volume_change){ ComputeGlobalBlobIDs(nx-2,ny-2,nz-2,Dm->rank_info,phase,Averages->SDs,vF,vS,phase_label,Dm->Comm); MPI_Barrier(Dm->Comm); - int count_oil=0; - int count_connected=0; - int count_porespace=0; - int count_water=0; + long long count_connected=0; + long long count_porespace=0; + long long count_water=0; for (int k=1; kComm, count); + double volume_initial = sumReduce( Dm->Comm, count); /* sprintf(LocalRankFilename,"phi_initial.%05i.raw",rank); FILE *INPUT = fopen(LocalRankFilename,"wb"); @@ -1352,16 +1348,16 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta } } } - volume_connected = sumReduce( Dm->Comm, count); + double volume_connected = sumReduce( Dm->Comm, count); second_biggest = sumReduce( Dm->Comm, second_biggest); - int reach_x, reach_y, reach_z; + /*int reach_x, reach_y, reach_z; for (int k=0; k phase_distance CalcDist(phase_distance,phase_id,*Dm); @@ -1417,7 +1413,6 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta for 
(int k=0; kSDs(i,j,k) > 0.f){ if (d < 3.f){ @@ -1441,7 +1436,7 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta } } } - volume_final= sumReduce( Dm->Comm, count); + double volume_final= sumReduce( Dm->Comm, count); delta_volume = (volume_final-volume_initial); if (rank == 0) printf("MorphInit: change fluid volume fraction by %f \n", delta_volume/volume_initial); diff --git a/models/DFHModel.cpp b/models/DFHModel.cpp index 7c7898de..4eb03bea 100644 --- a/models/DFHModel.cpp +++ b/models/DFHModel.cpp @@ -114,7 +114,6 @@ void ScaLBL_DFHModel::SetDomain(){ } void ScaLBL_DFHModel::ReadInput(){ - size_t readID; //....................................................................... if (rank == 0) printf("Read input media... \n"); //....................................................................... diff --git a/models/MRTModel.cpp b/models/MRTModel.cpp index cd52aa2f..9ba733ae 100644 --- a/models/MRTModel.cpp +++ b/models/MRTModel.cpp @@ -94,7 +94,6 @@ void ScaLBL_MRTModel::SetDomain(){ void ScaLBL_MRTModel::ReadInput(){ int rank=Dm->rank(); - size_t readID; //....................................................................... //....................................................................... Mask->ReadIDs(); @@ -106,7 +105,6 @@ void ScaLBL_MRTModel::ReadInput(){ // Generate the signed distance map // Initialize the domain and communication Array id_solid(Nx,Ny,Nz); - int count = 0; // Solve for the position of the solid phase for (int k=0;kgetScalar( "nspheres"); //printf("Set domain \n"); - int BoundaryCondition=1; + //int BoundaryCondition=1; //Nz += 2; //Nx = Ny = Nz; // Cubic domain int N = Nx*Ny*Nz; @@ -396,7 +396,7 @@ int main(int argc, char **argv) int sum = 0; double sum_local; double iVol_global = 1.0/(1.0*(Nx-2)*(Ny-2)*(Nz-2)*nprocs); - double porosity, pore_vol; + double porosity; //........................................................................... DoubleArray SignDist(Nx,Ny,Nz); //....................................................................... @@ -450,7 +450,6 @@ int main(int argc, char **argv) } } sum=0; - pore_vol = 0.0; for ( k=1;kRegularLayout(Map,Phi,PhaseField); FILE *OUTFILE; - sprintf(LocalRankFilename,"Phase.raw",rank); + sprintf(LocalRankFilename,"Phase.raw"); OUTFILE = fopen(LocalRankFilename,"wb"); fwrite(PhaseField.data(),8,N,OUTFILE); fclose(OUTFILE); diff --git a/tests/TestColorGradDFH.cpp b/tests/TestColorGradDFH.cpp index 2b0abf32..d6376d82 100644 --- a/tests/TestColorGradDFH.cpp +++ b/tests/TestColorGradDFH.cpp @@ -53,9 +53,6 @@ int main(int argc, char **argv) int Nx = db->getVector( "n" )[0]; int Ny = db->getVector( "n" )[1]; int Nz = db->getVector( "n" )[2]; - int nprocx = db->getVector( "nproc" )[0]; - int nprocy = db->getVector( "nproc" )[1]; - int nprocz = db->getVector( "nproc" )[2]; if (rank==0){ printf("********************************************************\n"); @@ -64,7 +61,7 @@ int main(int argc, char **argv) } // Get the rank info - std::shared_ptr Dm(new Domain(db,comm)); + auto Dm = std::make_shared(db,comm); Nx += 2; Ny += 2; Nz += 2; @@ -111,7 +108,6 @@ int main(int argc, char **argv) MPI_Barrier(comm); //......................device distributions................................. 
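// The pointer conversions in the hunks above, replacing std::shared_ptr<T> x(new T(...)) with
// auto x = std::make_shared<T>(...), are behavior-preserving: make_shared allocates the object and
// its control block in one step and avoids repeating the type name, which is why the same pattern
// recurs across these test drivers.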
- int dist_mem_size = Np*sizeof(double); int neighborSize=18*Np*sizeof(int); if (rank==0) printf ("Allocating distributions \n"); int *NeighborList; diff --git a/tests/TestCommD3Q19.cpp b/tests/TestCommD3Q19.cpp index 0c3988c8..e1fa821f 100644 --- a/tests/TestCommD3Q19.cpp +++ b/tests/TestCommD3Q19.cpp @@ -49,7 +49,7 @@ extern void GlobalFlipScaLBL_D3Q19_Init(double *dist, IntArray Map, int Np, int {1,1,0},{-1,-1,0},{1,-1,0},{-1,1,0},{1,0,1},{-1,0,-1},{1,0,-1},{-1,0,1}, {0,1,1},{0,-1,-1},{0,1,-1},{0,-1,1}}; - int q,i,j,k,n,N; + int q,i,j,k; int Cqx,Cqy,Cqz; // Discrete velocity int x,y,z; // Global indices int xn,yn,zn; // Global indices of neighbor @@ -59,8 +59,6 @@ extern void GlobalFlipScaLBL_D3Q19_Init(double *dist, IntArray Map, int Np, int Y = Ny*nprocy; Z = Nz*nprocz; NULL_USE(Z); - N = (Nx+2)*(Ny+2)*(Nz+2); // size of the array including halo - for (k=0; kid[n] = id[n]; } @@ -270,7 +255,7 @@ int main(int argc, char **argv) for (k=1;kgetVector( "n" )[0]; int Ny = db->getVector( "n" )[1]; int Nz = db->getVector( "n" )[2]; - int nprocx = db->getVector( "nproc" )[0]; - int nprocy = db->getVector( "nproc" )[1]; - int nprocz = db->getVector( "nproc" )[2]; - std::shared_ptr Dm(new Domain(db,comm)); + auto Dm = std::make_shared(db,comm); Nx += 2; Ny+=2; Nz += 2; Nx = Ny = Nz; // Cubic domain @@ -55,8 +48,7 @@ int main (int argc, char **argv) //....................................................................... // Assign the phase ID //....................................................................... - char *id; - id = new char[N]; + auto id = new char[N]; for (k=0;k 1e-12){ error = 1; printf(" Inlet error %f \n",err); @@ -185,7 +175,7 @@ int main (int argc, char **argv) // Consider a larger number of timesteps and simulate flow double Fx, Fy, Fz; double tau = 1.0; - double mu=(tau-0.5)/3.0; + //double mu=(tau-0.5)/3.0; double rlx_setA=1.0/tau; double rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA); dout=1.f; diff --git a/tests/TestForceD3Q19.cpp b/tests/TestForceD3Q19.cpp index 65453122..b8f88aae 100644 --- a/tests/TestForceD3Q19.cpp +++ b/tests/TestForceD3Q19.cpp @@ -457,24 +457,16 @@ int main (int argc, char **argv) double *x = new double[1]; ASSERT(x!=NULL); } - - // set the error code - // Note: the error code should be consistent across all processors - int error = 0; int Np = 1; - int Q = 9; + //int Q = 9; double Fx = 1.0; double Fy = 1.0; double Fz = 1.0; - double *dist; - double * Velocity; - - dist = new double [19*Np]; - Velocity = new double [3*Np]; - + auto dist = new double [19*Np]; + //auto Velocity = new double [3*Np for (int n=0; ngetVector( "n" )[1]; int Nz = domain_db->getVector( "n" )[2]; - std::shared_ptr Dm(new Domain(domain_db,comm)); + auto Dm = std::make_shared(domain_db,comm); Nx+=2; Ny+=2; Nz+=2; @@ -44,7 +44,7 @@ int main (int argc, char *argv[]) Dm->CommInit(); - std::shared_ptr Averages(new TwoPhase(Dm)); + auto Averages = std::make_shared(Dm); int timestep=0; double Cx,Cy,Cz; diff --git a/tests/TestMap.cpp b/tests/TestMap.cpp index 3e56cdf9..a47c0d9e 100644 --- a/tests/TestMap.cpp +++ b/tests/TestMap.cpp @@ -56,11 +56,7 @@ int main(int argc, char **argv) int Nx = db->getVector( "n" )[0]; int Ny = db->getVector( "n" )[1]; int Nz = db->getVector( "n" )[2]; - int nprocx = db->getVector( "nproc" )[0]; - int nprocy = db->getVector( "nproc" )[1]; - int nprocz = db->getVector( "nproc" )[2]; - - std::shared_ptr Dm(new Domain(db,comm)); + auto Dm = std::make_shared(db,comm); Nx += 2; Ny += 2; diff --git a/tests/TestMassConservationD3Q7.cpp 
b/tests/TestMassConservationD3Q7.cpp index d6f4243d..bbfe8cae 100644 --- a/tests/TestMassConservationD3Q7.cpp +++ b/tests/TestMassConservationD3Q7.cpp @@ -66,9 +66,6 @@ inline void InitializeBubble(ScaLBL_ColorModel &ColorModel, double BubbleRadius) int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI int rank,nprocs; MPI_Init(&argc,&argv); @@ -76,19 +73,6 @@ int main(int argc, char **argv) MPI_Comm_rank(comm,&rank); MPI_Comm_size(comm,&nprocs); // parallel domain size (# of sub-domains) - int nprocx,nprocy,nprocz; - int iproc,jproc,kproc; - int sendtag,recvtag; - //***************************************** - // MPI ranks for all 18 neighbors - //********************************** - int rank_x,rank_y,rank_z,rank_X,rank_Y,rank_Z; - int rank_xy,rank_XY,rank_xY,rank_Xy; - int rank_xz,rank_XZ,rank_xZ,rank_Xz; - int rank_yz,rank_YZ,rank_yZ,rank_Yz; - //********************************** - MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; if (rank == 0){ printf("********************************************************\n"); @@ -110,7 +94,6 @@ int main(int argc, char **argv) Ny = CM.Ny; Nz = CM.Nz; N = Nx*Ny*Nz; - int dist_mem_size = N*sizeof(double); //CM.ReadInput(); double radius=0.4*double(Nx); @@ -142,11 +125,9 @@ int main(int argc, char **argv) CM.Run(); int D3Q7[7][3]={{0,0,0},{1,0,0},{-1,0,0},{0,1,0},{0,-1,0},{0,0,1},{0,0,-1}}; // Compare and make sure mass is conserved at every lattice site - double *Error; - Error = new double [N]; - double *A_q, *B_q; - A_q = new double [7*Np]; - B_q = new double [7*Np]; + auto Error = new double[N]; + auto A_q = new double[7*Np]; + //auto B_q = new double[7*Np]; bool CleanCheck = true; double original,final, sum_q; double total_mass_A_0 = 0.0; diff --git a/tests/TestNetcdf.cpp b/tests/TestNetcdf.cpp index 7b6bae02..5ea5139f 100644 --- a/tests/TestNetcdf.cpp +++ b/tests/TestNetcdf.cpp @@ -14,7 +14,6 @@ void load( const std::string& ); void test_NETCDF( UnitTest& ut ) { const int rank = comm_rank( MPI_COMM_WORLD ); - const int size = comm_size( MPI_COMM_WORLD ); int nprocx = 2; int nprocy = 2; int nprocz = 2; diff --git a/tests/TestSubphase.cpp b/tests/TestSubphase.cpp index 8eb479bc..fd6383be 100644 --- a/tests/TestSubphase.cpp +++ b/tests/TestSubphase.cpp @@ -60,13 +60,11 @@ int main(int argc, char **argv) } // Get the rank info - std::shared_ptr Dm(new Domain(db,comm)); - // const RankInfoStruct rank_info(rank,nprocx,nprocy,nprocz); - std::shared_ptr Averages(new SubPhase(Dm)); + auto Dm = std::make_shared(db,comm); + auto Averages = std::make_shared(Dm); Nx += 2; Ny += 2; Nz += 2; - int N = Nx*Ny*Nz; //....................................................................... for ( k=1;k Dm(new Domain(db,comm)); + auto Dm = std::make_shared(db,comm); Nx += 2; Ny += 2; Nz += 2; - int N = Nx*Ny*Nz; //....................................................................... for ( k=1;k Dm(new Domain(db,comm)); + auto Dm = std::make_shared(db,comm); // const RankInfoStruct rank_info(rank,nprocx,nprocy,nprocz); - std::shared_ptr Averages(new TwoPhase(Dm)); + auto Averages = std::make_shared(Dm); Nx += 2; Ny += 2; Nz += 2; - int N = Nx*Ny*Nz; //....................................................................... 
for ( k=1;kUpdateSolid(); diff --git a/tests/TestTorusEvolve.cpp b/tests/TestTorusEvolve.cpp index dedec45d..1a65d268 100644 --- a/tests/TestTorusEvolve.cpp +++ b/tests/TestTorusEvolve.cpp @@ -60,12 +60,11 @@ int main(int argc, char **argv) } // Get the rank info - std::shared_ptr Dm(new Domain(db,comm)); + auto Dm = std::make_shared(db,comm); Nx += 2; Ny += 2; Nz += 2; - int N = Nx*Ny*Nz; //....................................................................... for ( k=1;kAggregateLabels(FILENAME); + auto filename2 = READFILE + ".morph.raw"; + if (rank==0) printf("Writing file to: %s \n", filename2.c_str()); + Mask->AggregateLabels(filename2); } MPI_Barrier(comm); diff --git a/tests/lbpm_morphdrain_pp.cpp b/tests/lbpm_morphdrain_pp.cpp index 4994e081..8d73b1e4 100644 --- a/tests/lbpm_morphdrain_pp.cpp +++ b/tests/lbpm_morphdrain_pp.cpp @@ -32,10 +32,7 @@ int main(int argc, char **argv) //....................................................................... // Reading the domain information file //....................................................................... - int n, nprocx, nprocy, nprocz, nx, ny, nz; - char LocalRankString[8]; char LocalRankFilename[40]; - char FILENAME[128]; string filename; double SW,Rcrit_new; @@ -43,8 +40,10 @@ int main(int argc, char **argv) filename=argv[1]; Rcrit_new=0.f; //SW=strtod(argv[2],NULL); - } - else ERROR("No input database provided\n"); + } else { + ERROR("No input database provided\n"); + } + NULL_USE( Rcrit_new ); // read the input database auto db = std::make_shared( filename ); auto domain_db = db->getDatabase( "Domain" ); @@ -62,19 +61,16 @@ int main(int argc, char **argv) if (rank==0) printf("Performing morphological opening with target saturation %f \n", SW); // GenerateResidual(id,nx,ny,nz,Saturation); - nx = size[0]; - ny = size[1]; - nz = size[2]; - nprocx = nproc[0]; - nprocy = nproc[1]; - nprocz = nproc[2]; + int nx = size[0]; + int ny = size[1]; + int nz = size[2]; - int N = (nx+2)*(ny+2)*(nz+2); + size_t N = (nx+2)*(ny+2)*(nz+2); std::shared_ptr Dm (new Domain(domain_db,comm)); std::shared_ptr Mask (new Domain(domain_db,comm)); // std::shared_ptr Dm (new Domain(nx,ny,nz,rank,nprocx,nprocy,nprocz,Lx,Ly,Lz,BC)); - for (n=0; nid[n]=1; + for (size_t n=0; nid[n]=1; Dm->CommInit(); signed char *id; @@ -116,7 +112,6 @@ int main(int argc, char **argv) for (int k=0;kAggregateLabels(FILENAME); + auto filename2 = READFILE + ".morphdrain.raw"; + if (rank==0) printf("Writing file to: %s \n", filename2.data() ); + Mask->AggregateLabels( filename2 ); } MPI_Barrier(comm); diff --git a/tests/lbpm_morphopen_pp.cpp b/tests/lbpm_morphopen_pp.cpp index 48106a97..f8819348 100644 --- a/tests/lbpm_morphopen_pp.cpp +++ b/tests/lbpm_morphopen_pp.cpp @@ -32,10 +32,7 @@ int main(int argc, char **argv) //....................................................................... // Reading the domain information file //....................................................................... 
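// NULL_USE(Rcrit_new) above (and again in the next tool) marks a variable that is declared but no
// longer consumed, so the compiler does not warn about it; this is the recurring mechanism used
// throughout this patch to silence unused-variable warnings without deleting the declaration.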
- int n, nprocx, nprocy, nprocz, nx, ny, nz; - char LocalRankString[8]; char LocalRankFilename[40]; - char FILENAME[128]; string filename; double SW,Rcrit_new; @@ -45,6 +42,7 @@ int main(int argc, char **argv) //SW=strtod(argv[2],NULL); } else ERROR("No input database provided\n"); + NULL_USE( Rcrit_new ); // read the input database auto db = std::make_shared( filename ); auto domain_db = db->getDatabase( "Domain" ); @@ -69,19 +67,16 @@ int main(int argc, char **argv) if (rank==0) printf("Performing morphological opening with target saturation %f \n", SW); // GenerateResidual(id,nx,ny,nz,Saturation); - nx = size[0]; - ny = size[1]; - nz = size[2]; - nprocx = nproc[0]; - nprocy = nproc[1]; - nprocz = nproc[2]; + int nx = size[0]; + int ny = size[1]; + int nz = size[2]; - int N = (nx+2)*(ny+2)*(nz+2); + size_t N = (nx+2)*(ny+2)*(nz+2); std::shared_ptr Dm (new Domain(domain_db,comm)); std::shared_ptr Mask (new Domain(domain_db,comm)); // std::shared_ptr Dm (new Domain(nx,ny,nz,rank,nprocx,nprocy,nprocz,Lx,Ly,Lz,BC)); - for (n=0; nid[n]=1; + for (size_t n=0; nid[n]=1; Dm->CommInit(); signed char *id; @@ -119,7 +114,6 @@ int main(int argc, char **argv) for (int k=0;kAggregateLabels(FILENAME); + auto filename2 = READFILE + ".morphopen.raw"; + if (rank==0) printf("Writing file to: %s \n", filename2.data()); + Mask->AggregateLabels(filename2); } MPI_Barrier(comm); diff --git a/tests/lbpm_permeability_simulator.cpp b/tests/lbpm_permeability_simulator.cpp index 4fb5bbac..dbcfb96b 100644 --- a/tests/lbpm_permeability_simulator.cpp +++ b/tests/lbpm_permeability_simulator.cpp @@ -23,9 +23,6 @@ using namespace std; int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI int rank,nprocs; MPI_Init(&argc,&argv); @@ -33,10 +30,6 @@ int main(int argc, char **argv) MPI_Comm_rank(comm,&rank); MPI_Comm_size(comm,&nprocs); { - // parallel domain size (# of sub-domains) - int nprocx,nprocy,nprocz; - int iproc,jproc,kproc; - if (rank == 0){ printf("********************************************************\n"); printf("Running Single Phase Permeability Calculation \n"); @@ -44,10 +37,10 @@ int main(int argc, char **argv) } // Initialize compute device int device=ScaLBL_SetDevice(rank); + NULL_USE( device ); ScaLBL_DeviceBarrier(); MPI_Barrier(comm); - ScaLBL_MRTModel MRT(rank,nprocs,comm); auto filename = argv[1]; MRT.ReadParams(filename); diff --git a/tests/lbpm_refine_pp.cpp b/tests/lbpm_refine_pp.cpp index 4b903d78..d90dbb04 100644 --- a/tests/lbpm_refine_pp.cpp +++ b/tests/lbpm_refine_pp.cpp @@ -26,10 +26,9 @@ int main(int argc, char **argv) //....................................................................... // Reading the domain information file //....................................................................... 
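// The int -> size_t changes above (the array length N and the initialization loop index) keep the
// index type consistent with the size expression and prevent (nx+2)*(ny+2)*(nz+2) from overflowing
// int on very large domains, in line with this patch's warning cleanup.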
- int nprocx, nprocy, nprocz, nx, ny, nz, nspheres; double Lx, Ly, Lz; + Lx = Ly = Lz = 1.0; int i,j,k,n; - int BC=0; string filename; if (argc > 1){ @@ -47,12 +46,12 @@ int main(int argc, char **argv) auto ReadValues = domain_db->getVector( "ReadValues" ); auto WriteValues = domain_db->getVector( "WriteValues" ); - nx = size[0]; - ny = size[1]; - nz = size[2]; - nprocx = nproc[0]; - nprocy = nproc[1]; - nprocz = nproc[2]; + int nx = size[0]; + int ny = size[1]; + int nz = size[2]; + int nprocx = nproc[0]; + int nprocy = nproc[1]; + int nprocz = nproc[2]; // Check that the number of processors >= the number of ranks if ( rank==0 ) { @@ -66,10 +65,9 @@ int main(int argc, char **argv) char LocalRankFilename[40]; - int rnx,rny,rnz; - rnx=2*nx; - rny=2*ny; - rnz=2*nz; + int rnx=2*nx; + int rny=2*ny; + int rnz=2*nz; if (rank==0) printf("Refining mesh to %i x %i x %i \n",rnx,rny,rnz); @@ -128,13 +126,12 @@ int main(int argc, char **argv) } } - int ri,rj,rk,rn; //refined mesh indices //char *RefineLabel; //RefineLabel = new char [rnx*rny*rnz]; Array RefineLabel(rnx,rny,rnz); - for (rk=1; rkgetVector( "L" ); auto size = domain_db->getVector( "n" ); auto nproc = domain_db->getVector( "nproc" ); - int BoundaryCondition = domain_db->getScalar( "BC" ); + //int BoundaryCondition = domain_db->getScalar( "BC" ); int nx = size[0]; int ny = size[1]; int nz = size[2]; @@ -91,10 +91,10 @@ int main(int argc, char **argv) printf("Number of MPI ranks required: %i \n", nprocx*nprocy*nprocz); printf("Number of MPI ranks used: %i \n", nprocs); printf("Full domain size: %i x %i x %i \n",nx*nprocx,ny*nprocy,nz*nprocz); - printf("target value = %f \n",target); - printf("background value = %f \n",background); - printf("cylinder center = %i, %i, %i \n",center[0],center[1],center[2]); - printf("cylinder radius = %f \n",CylRad); + printf("target value = %f \n",target); + printf("background value = %f \n",background); + printf("cylinder center = %i, %i, %i \n",center[0],center[1],center[2]); + printf("cylinder radius = %f \n",CylRad); } if ( nprocs < nprocx*nprocy*nprocz ){ ERROR("Insufficient number of processors"); @@ -196,19 +196,19 @@ int main(int argc, char **argv) filter_src( *Dm[0], LOCVOL[0] ); // Set up the mask to be distance to cylinder (crop outside cylinder) - if (rank==0) printf("Cropping with cylinder: %i, %i, %i, radius=%f \n",Dm[0]->nprocx()*Nx[0],Dm[0]->nprocy()*Ny[0],Dm[0]->nprocz()*Nz[0],CylRad); + if (rank==0) printf("Cropping with cylinder: %i, %i, %i, radius=%f \n",Dm[0]->nprocx()*Nx[0],Dm[0]->nprocy()*Ny[0],Dm[0]->nprocz()*Nz[0],CylRad); for (int k=0;kiproc()*Nx[0]+i-1); - float y= float (Dm[0]->jproc()*Ny[0]+j-1); - float z= float(Dm[0]->kproc()*Nz[0]+k-1); - float cx = float(center[0] - offset[0]); - float cy = float(center[1] - offset[1]); - float cz = float(center[2] - offset[2]); + //float x= float(Dm[0]->iproc()*Nx[0]+i-1); + float y= float (Dm[0]->jproc()*Ny[0]+j-1); + float z= float(Dm[0]->kproc()*Nz[0]+k-1); + //float cx = float(center[0] - offset[0]); + float cy = float(center[1] - offset[1]); + float cz = float(center[2] - offset[2]); // distance from the center line MASK(i,j,k) = sqrt((z-cz)*(z-cz) + (y-cy)*(y-cy)); - //if (sqrt(((z-cz)*(z-cz) + (y-cy)*(y-cy)) ) > CylRad) LOCVOL[0](i,j,k)=background; + //if (sqrt(((z-cz)*(z-cz) + (y-cy)*(y-cy)) ) > CylRad) LOCVOL[0](i,j,k)=background; } } } @@ -219,18 +219,18 @@ int main(int argc, char **argv) float THRESHOLD=0.5*(target+background); float mean_plus=0; float mean_minus=0; - float min_value = LOCVOL[0](0); - float max_value = 
LOCVOL[0](0); + float min_value = LOCVOL[0](0); + float max_value = LOCVOL[0](0); int count_plus=0; int count_minus=0; for (int k=1;k 0){ // direction to background / target is the same if (fabs(tmp-target) > fabs(tmp-background)) tmp=background; // tmp closer to background @@ -241,20 +241,20 @@ int main(int argc, char **argv) mean_plus += tmp; count_plus++; } - else { + else { mean_minus += tmp; count_minus++; } - if (tmp < min_value) min_value = tmp; - if (tmp > max_value) max_value = tmp; - } + if (tmp < min_value) min_value = tmp; + if (tmp > max_value) max_value = tmp; + } } } } - count_plus=sumReduce( Dm[0]->Comm, count_plus); - count_minus=sumReduce( Dm[0]->Comm, count_minus); - if (rank==0) printf("minimum value=%f, max value=%f \n",min_value,max_value); - if (rank==0) printf("plus=%i, minus=%i \n",count_plus,count_minus); + count_plus=sumReduce( Dm[0]->Comm, count_plus); + count_minus=sumReduce( Dm[0]->Comm, count_minus); + if (rank==0) printf("minimum value=%f, max value=%f \n",min_value,max_value); + if (rank==0) printf("plus=%i, minus=%i \n",count_plus,count_minus); ASSERT( count_plus > 0 && count_minus > 0 ); MPI_Barrier(comm); mean_plus = sumReduce( Dm[0]->Comm, mean_plus ) / count_plus; @@ -262,25 +262,25 @@ int main(int argc, char **argv) MPI_Barrier(comm); if (rank==0) printf(" Region 1 mean (+): %f, Region 2 mean (-): %f \n",mean_plus, mean_minus); - //if (rank==0) printf("Scale the input data (size = %i) \n",LOCVOL[0].length()); + //if (rank==0) printf("Scale the input data (size = %i) \n",LOCVOL[0].length()); for (size_t i=0; i CylRad ){ - LOCVOL[0](i)=background; + if ( MASK(i) > CylRad ){ + LOCVOL[0](i)=background; } if ( LOCVOL[0](i) >= THRESHOLD ) { auto tmp = LOCVOL[0](i)/ mean_plus; LOCVOL[0](i) = std::min( tmp, 1.0f ); } - else { + else { auto tmp = -LOCVOL[0](i)/mean_minus; LOCVOL[0](i) = std::max( tmp, -1.0f ); } - //LOCVOL[0](i) = MASK(i); + //LOCVOL[0](i) = MASK(i); } // Fill the source data for the coarse meshes - if (rank==0) printf("Coarsen the mesh for N_levels=%i \n",N_levels); - MPI_Barrier(comm); + if (rank==0) printf("Coarsen the mesh for N_levels=%i \n",N_levels); + MPI_Barrier(comm); PROFILE_START("CoarsenMesh"); for (int i=1; i filter(ratio[0],ratio[1],ratio[2]); diff --git a/tests/pmmc_cylinder.cpp b/tests/pmmc_cylinder.cpp index a573fee3..3c5e25b6 100644 --- a/tests/pmmc_cylinder.cpp +++ b/tests/pmmc_cylinder.cpp @@ -11,17 +11,11 @@ int main (int argc, char **argv) { - // printf("Radius = %s \n,"RADIUS); - int SIZE = N*N*N; int Nx,Ny,Nz; Nx = Ny = Nz = N; int i,j,k,p,q,r; -// double *Solid; // cylinder -// double *Phase; // region of the cylinder -// Solid = new double [SIZE]; -// Phase = new double [SIZE]; DoubleArray SignDist(Nx,Ny,Nz); DoubleArray Phase(Nx,Ny,Nz); double fluid_isovalue = 0.0; @@ -36,9 +30,6 @@ int main (int argc, char **argv) //........................................................................... double awn,ans,aws,lwns,nwp_volume; double As; - double dEs,dAwn,dAns; // Global surface energy (calculated by rank=0) - double awn_global,ans_global,aws_global,lwns_global,nwp_volume_global; - double As_global; // bool add=1; // Set to false if any corners contain nw-phase ( F > fluid_isovalue) int cube[8][3] = {{0,0,0},{1,0,0},{0,1,0},{1,1,0},{0,0,1},{1,0,1},{0,1,1},{1,1,1}}; // cube corners // int count_in=0,count_out=0; @@ -75,7 +66,6 @@ int main (int argc, char **argv) int n_local_nws_pts; int c; - int newton_steps = 0; //........................................................................... 
int ncubes = (Nx-2)*(Ny-2)*(Nz-2); // Exclude the "upper" halo IntArray cubeList(3,ncubes); From acb2d30454421c4a7516f7b05189fac34d671f71 Mon Sep 17 00:00:00 2001 From: Mark Berrill Date: Wed, 22 Jan 2020 12:19:04 -0500 Subject: [PATCH 028/121] Fixing compile warnings --- cpu/D3Q19.cpp | 515 +++++++++++++++++++--------------------- cpu/dfh.cpp | 33 ++- tests/TestBubbleDFH.cpp | 34 +-- 3 files changed, 267 insertions(+), 315 deletions(-) diff --git a/cpu/D3Q19.cpp b/cpu/D3Q19.cpp index 2af59883..6b858d61 100644 --- a/cpu/D3Q19.cpp +++ b/cpu/D3Q19.cpp @@ -243,8 +243,6 @@ extern "C" double ScaLBL_D3Q19_Flux_BC_z(double *disteven, double *distodd, doub // odd distributions in disteven and even distributions in distodd. int n,N; // distributions - double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9; - double f10,f11,f12,f13,f14,f15,f16,f17,f18; double din = 0.f; N = Nx*Ny*Nz; @@ -256,26 +254,26 @@ extern "C" double ScaLBL_D3Q19_Flux_BC_z(double *disteven, double *distodd, doub // Read distributions from "opposite" memory convention //........................................................................ //........................................................................ - f1 = distodd[n]; - f3 = distodd[N+n]; - f5 = distodd[2*N+n]; - f7 = distodd[3*N+n]; - f9 = distodd[4*N+n]; - f11 = distodd[5*N+n]; - f13 = distodd[6*N+n]; - f15 = distodd[7*N+n]; - f17 = distodd[8*N+n]; + double f1 = distodd[n]; + double f3 = distodd[N+n]; + //double f5 = distodd[2*N+n]; + double f7 = distodd[3*N+n]; + double f9 = distodd[4*N+n]; + //double f11 = distodd[5*N+n]; + double f13 = distodd[6*N+n]; + //double f15 = distodd[7*N+n]; + double f17 = distodd[8*N+n]; //........................................................................ - f0 = disteven[n]; - f2 = disteven[N+n]; - f4 = disteven[2*N+n]; - f6 = disteven[3*N+n]; - f8 = disteven[4*N+n]; - f10 = disteven[5*N+n]; - f12 = disteven[6*N+n]; - f14 = disteven[7*N+n]; - f16 = disteven[8*N+n]; - f18 = disteven[9*N+n]; + double f0 = disteven[n]; + double f2 = disteven[N+n]; + double f4 = disteven[2*N+n]; + double f6 = disteven[3*N+n]; + double f8 = disteven[4*N+n]; + double f10 = disteven[5*N+n]; + double f12 = disteven[6*N+n]; + //double f14 = disteven[7*N+n]; + double f16 = disteven[8*N+n]; + //double f18 = disteven[9*N+n]; //................................................... 
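// The five reads commented out above are the distributions with a +z velocity component
// (f5, f11, f14, f15, f18); at this z-face flux boundary they are the unknown incoming populations
// and do not enter the computation of the inlet density din, so dropping the reads removes
// unused-variable warnings without changing the result.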
// Determine the outlet flow velocity @@ -288,59 +286,58 @@ extern "C" double ScaLBL_D3Q19_Flux_BC_z(double *disteven, double *distodd, doub } extern "C" double ScaLBL_D3Q19_AAodd_Flux_BC_z(int *d_neighborList, int *list, double *dist, double flux, - double area, int count, int Np){ + double area, int count, int Np) +{ int idx, n; int nread; // distributions - double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9; - double f10,f11,f12,f13,f14,f15,f16,f17,f18; double factor = 1.f/(area); double sum = 0.f; for (idx=0; idx 0.f){ nA = 1.0; nB = 0.f; @@ -60,15 +59,13 @@ extern "C" void ScaLBL_DFH_Init(double *Phi, double *Den, double *Aq, double *Bq // LBM based on density functional hydrodynamics extern "C" void ScaLBL_D3Q19_AAeven_DFH(int *neighborList, double *dist, double *Aq, double *Bq, double *Den, double *Phi, double *Gradient, double *SolidForce, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, - double Fx, double Fy, double Fz, int start, int finish, int Np){ - - int ijk,nn,n; + double Fx, double Fy, double Fz, int start, int finish, int Np) +{ double fq; // conserved momemnts double rho,jx,jy,jz; // non-conserved moments double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18; - double m3,m5,m7; double nA,nB; // number density double a1,b1,a2,b2,nAB,delta; double C,nx,ny,nz; //color gradient magnitude and direction @@ -586,7 +583,7 @@ extern "C" void ScaLBL_D3Q19_AAodd_DFH(int *neighborList, double *dist, double * double *Phi, double *Gradient, double *SolidForce, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, double Fx, double Fy, double Fz, int start, int finish, int Np){ - int n,nn,ijk,nread; + int nread; int nr1,nr2,nr3,nr4,nr5,nr6; int nr7,nr8,nr9,nr10; int nr11,nr12,nr13,nr14; @@ -596,7 +593,6 @@ extern "C" void ScaLBL_D3Q19_AAodd_DFH(int *neighborList, double *dist, double * double rho,jx,jy,jz; // non-conserved moments double m1,m2,m4,m6,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18; - double m3,m5,m7; double nA,nB; // number density double a1,b1,a2,b2,nAB,delta; double C,nx,ny,nz; //color gradient magnitude and direction @@ -1182,12 +1178,12 @@ extern "C" void ScaLBL_D3Q19_AAodd_DFH(int *neighborList, double *dist, double * } extern "C" void ScaLBL_D3Q7_AAodd_DFH(int *neighborList, double *Aq, double *Bq, - double *Den, double *Phi, int start, int finish, int Np){ - - int idx,n,nread; - double fq,nA,nB; + double *Den, double *Phi, int start, int finish, int Np) +{ for (int n=start; n Dm(new Domain(domain_db,comm)); + auto Dm = std::make_shared(domain_db,comm); for (int i=0; iNx*Dm->Ny*Dm->Nz; i++) Dm->id[i] = 1; - std::shared_ptr Averages( new TwoPhase(Dm) ); + auto Averages = std::make_shared(Dm); // TwoPhase Averages(Dm); Dm->CommInit(); // Mask that excludes the solid phase - std::shared_ptr Mask(new Domain(domain_db,comm)); + auto Mask = std::make_shared(domain_db,comm); MPI_Barrier(comm); Nx+=2; Ny+=2; Nz += 2; @@ -191,8 +186,7 @@ int main(int argc, char **argv) // printf("Local File Name = %s \n",LocalRankFilename); // .......... READ THE INPUT FILE ....................................... // char value; - char *id; - id = new char[N]; + auto id = new char[N]; double sum; //........................................................................... if (rank == 0) cout << "Setting up bubble..." 
<< endl; @@ -244,19 +238,17 @@ int main(int argc, char **argv) // Initialize communication structures in averaging domain for (i=0; iNx*Mask->Ny*Mask->Nz; i++) Mask->id[i] = id[i]; Mask->CommInit(); - double *PhaseLabel; - PhaseLabel = new double[N]; + auto PhaseLabel = new double[N]; //........................................................................... if (rank==0) printf ("Create ScaLBL_Communicator \n"); // Create a communicator for the device (will use optimized layout) - std::shared_ptr ScaLBL_Comm(new ScaLBL_Communicator(Mask)); + auto ScaLBL_Comm = std::make_shared(Mask); int Npad=(Np/16 + 2)*16; if (rank==0) printf ("Set up memory efficient layout Npad=%i \n",Npad); - int *neighborList; IntArray Map(Nx,Ny,Nz); - neighborList= new int[18*Npad]; + auto neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Mask->id,Np); MPI_Barrier(comm); @@ -515,9 +507,8 @@ int main(int argc, char **argv) // Copy back final phase indicator field and convert to regular layout DoubleArray PhaseField(Nx,Ny,Nz); ScaLBL_Comm->RegularLayout(Map,Phi,PhaseField); - FILE *OUTFILE; sprintf(LocalRankFilename,"Phase.raw"); - OUTFILE = fopen(LocalRankFilename,"wb"); + auto OUTFILE = fopen(LocalRankFilename,"wb"); fwrite(PhaseField.data(),8,N,OUTFILE); fclose(OUTFILE); @@ -535,9 +526,8 @@ int main(int argc, char **argv) } } } - FILE *GFILE; sprintf(LocalRankFilename,"Gradient.raw"); - GFILE = fopen(LocalRankFilename,"wb"); + auto GFILE = fopen(LocalRankFilename,"wb"); fwrite(GradNorm.data(),8,N,GFILE); fclose(GFILE); @@ -545,14 +535,12 @@ int main(int argc, char **argv) DoubleArray Rho2(Nx,Ny,Nz); ScaLBL_Comm->RegularLayout(Map,&Den[0],Rho1); ScaLBL_Comm->RegularLayout(Map,&Den[Np],Rho2); - FILE *RFILE1; sprintf(LocalRankFilename,"Rho1.raw"); - RFILE1 = fopen(LocalRankFilename,"wb"); + auto RFILE1 = fopen(LocalRankFilename,"wb"); fwrite(Rho1.data(),8,N,RFILE1); fclose(RFILE1); - FILE *RFILE2; sprintf(LocalRankFilename,"Rho2.raw"); - RFILE2 = fopen(LocalRankFilename,"wb"); + auto RFILE2 = fopen(LocalRankFilename,"wb"); fwrite(Rho2.data(),8,N,RFILE2); fclose(RFILE2); From d1f714a82e32be3e43d24446e5ff375453aff0be Mon Sep 17 00:00:00 2001 From: Mark Berrill Date: Tue, 28 Jan 2020 08:51:32 -0500 Subject: [PATCH 029/121] Adding MPI wrapper class --- CMakeLists.txt | 344 +- IO/MeshDatabase.cpp | 145 +- IO/MeshDatabase.h | 4 +- IO/PIO.cpp | 12 +- IO/PackData.cpp | 105 + IO/PackData.h | 78 + common/MPI_Helpers.hpp => IO/PackData.hpp | 9 +- IO/Writer.cpp | 20 +- IO/Writer.h | 4 +- IO/netcdf.cpp | 12 +- IO/netcdf.h | 4 +- IO/silo.cpp | 2 +- IO/silo.h | 2 +- IO/silo.hpp | 2 +- analysis/Minkowski.cpp | 16 +- analysis/Minkowski.h | 2 +- analysis/SubPhase.cpp | 154 +- analysis/SubPhase.h | 2 +- analysis/TwoPhase.cpp | 83 +- analysis/TwoPhase.h | 2 +- analysis/analysis.cpp | 83 +- analysis/analysis.h | 8 +- analysis/distance.cpp | 2 +- analysis/morphology.cpp | 106 +- analysis/runAnalysis.cpp | 49 +- analysis/runAnalysis.h | 8 +- analysis/uCT.cpp | 11 +- cmake/FindHIP.cmake | 579 ++++ common/Communication.h | 198 +- common/Communication.hpp | 53 +- common/Domain.cpp | 225 +- common/Domain.h | 6 +- common/MPI.I | 1143 +++++++ common/MPI.cpp | 3758 +++++++++++++++++++++ common/MPI.h | 1152 +++++++ common/MPI_Helpers.cpp | 266 -- common/MPI_Helpers.h | 239 -- common/ReadMicroCT.cpp | 4 +- common/ReadMicroCT.h | 3 +- common/ScaLBL.cpp | 226 +- common/ScaLBL.h | 3 +- common/SpherePack.cpp | 1 - common/SpherePack.h | 1 - common/UnitTest.cpp | 211 +- common/UnitTest.h | 71 +- 
common/UtilityMacros.h | 28 +- cpu/BGK.cpp | 5 +- cpu/Color.cpp | 51 +- cpu/exe/lb2_Color_mpi.cpp | 2 +- cpu/exe/lb2_Color_wia_mpi_bubble.cpp | 2 +- models/ColorModel.cpp | 57 +- models/ColorModel.h | 6 +- models/DFHModel.cpp | 35 +- models/DFHModel.h | 6 +- models/MRTModel.cpp | 49 +- models/MRTModel.h | 6 +- tests/BlobAnalyzeParallel.cpp | 21 +- tests/BlobIdentifyParallel.cpp | 9 +- tests/ColorToBinary.cpp | 9 +- tests/ComponentLabel.cpp | 9 +- tests/GenerateSphereTest.cpp | 75 +- tests/TestBlobAnalyze.cpp | 17 +- tests/TestBlobIdentify.cpp | 37 +- tests/TestBlobIdentifyCorners.cpp | 5 +- tests/TestBubble.cpp | 46 +- tests/TestBubbleDFH.cpp | 32 +- tests/TestColorBubble.cpp | 14 +- tests/TestColorGrad.cpp | 24 +- tests/TestColorGradDFH.cpp | 18 +- tests/TestColorMassBounceback.cpp | 32 +- tests/TestColorSquareTube.cpp | 14 +- tests/TestCommD3Q19.cpp | 23 +- tests/TestDatabase.cpp | 9 +- tests/TestFluxBC.cpp | 18 +- tests/TestForceD3Q19.cpp | 7 +- tests/TestForceMoments.cpp | 30 +- tests/TestInterfaceSpeed.cpp | 32 +- tests/TestMRT.cpp | 38 +- tests/TestMap.cpp | 17 +- tests/TestMassConservationD3Q7.cpp | 11 +- tests/TestMicroCTReader.cpp | 10 +- tests/TestMomentsD3Q19.cpp | 9 +- tests/TestNetcdf.cpp | 10 +- tests/TestPoiseuille.cpp | 18 +- tests/TestPressVel.cpp | 23 +- tests/TestSegDist.cpp | 13 +- tests/TestSubphase.cpp | 9 +- tests/TestTopo3D.cpp | 9 +- tests/TestTorus.cpp | 9 +- tests/TestTorusEvolve.cpp | 9 +- tests/TestTwoPhase.cpp | 11 +- tests/TestWriter.cpp | 21 +- tests/convertIO.cpp | 15 +- tests/hello_world.cpp | 11 +- tests/lb2_CMT_wia.cpp | 2 +- tests/lb2_Color_blob_wia_mpi.cpp | 48 +- tests/lbpm_BGK_simulator.cpp | 33 +- tests/lbpm_captube_pp.cpp | 16 +- tests/lbpm_color_macro_simulator.cpp | 36 +- tests/lbpm_color_simulator.cpp | 14 +- tests/lbpm_dfh_simulator.cpp | 12 +- tests/lbpm_disc_pp.cpp | 24 +- tests/lbpm_inkbottle_pp.cpp | 20 +- tests/lbpm_juanes_bench_disc_pp.cpp | 26 +- tests/lbpm_minkowski_scalar.cpp | 23 +- tests/lbpm_morph_pp.cpp | 22 +- tests/lbpm_morphdrain_pp.cpp | 12 +- tests/lbpm_morphopen_pp.cpp | 12 +- tests/lbpm_nondarcy_simulator.cpp | 28 +- tests/lbpm_nonnewtonian_simulator.cpp | 83 +- tests/lbpm_nonnewtonian_simulator.h | 40 +- tests/lbpm_permeability_simulator.cpp | 13 +- tests/lbpm_plates_pp.cpp | 20 +- tests/lbpm_porenetwork_pp.cpp | 20 +- tests/lbpm_random_pp.cpp | 13 +- tests/lbpm_refine_pp.cpp | 9 +- tests/lbpm_segmented_decomp.cpp | 20 +- tests/lbpm_segmented_pp.cpp | 9 +- tests/lbpm_sphere_pp.cpp | 18 +- tests/lbpm_squaretube_pp.cpp | 20 +- tests/lbpm_uCT_maskfilter.cpp | 16 +- tests/lbpm_uCT_pp.cpp | 37 +- tests/testCommunication.cpp | 34 +- tests/test_dcel_minkowski.cpp | 8 +- tests/test_dcel_tri_normal.cpp | 4 +- 125 files changed, 8530 insertions(+), 2541 deletions(-) create mode 100644 IO/PackData.cpp create mode 100644 IO/PackData.h rename common/MPI_Helpers.hpp => IO/PackData.hpp (95%) create mode 100644 cmake/FindHIP.cmake create mode 100644 common/MPI.I create mode 100644 common/MPI.cpp create mode 100644 common/MPI.h delete mode 100644 common/MPI_Helpers.cpp delete mode 100644 common/MPI_Helpers.h diff --git a/CMakeLists.txt b/CMakeLists.txt index acc2c2dc..1e7eeaea 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,170 +1,174 @@ -# Set some CMake properties -CMAKE_MINIMUM_REQUIRED( VERSION 3.9 ) - - -MESSAGE("====================") -MESSAGE("Configuring LBPM-WIA") -MESSAGE("====================") - - -# Set the project name -SET( PROJ LBPM ) # Set the project name for CMake -SET( LBPM_LIB lbpm-wia ) # Set the final library 
name -SET( LBPM_INC ) # Set an optional subfolder for includes (e.g. include/name/...) -SET( TEST_MAX_PROCS 16 ) - - -# Initialize the project -PROJECT( ${PROJ} LANGUAGES CXX ) - - -# Prevent users from building in place -IF ("${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_BINARY_DIR}" ) - MESSAGE( FATAL_ERROR "Building code in place is a bad idea" ) -ENDIF() - - -# Set the default C++ standard -SET( CMAKE_CXX_EXTENSIONS OFF ) -IF ( NOT CMAKE_CXX_STANDARD ) - IF ( CXX_STD ) - MESSAGE( FATAL_ERROR "CXX_STD is obsolete, please set CMAKE_CXX_STANDARD" ) - ENDIF() - SET( CMAKE_CXX_STANDARD 14 ) -ENDIF() -IF ( ( "${CMAKE_CXX_STANDARD}" GREATER "90" ) OR ( "${CMAKE_CXX_STANDARD}" LESS "14" ) ) - MESSAGE( FATAL_ERROR "C++14 or newer required" ) -ENDIF() - - -# Set source/install paths -SET( ${PROJ}_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ) -SET( ${PROJ}_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}" ) -IF( ${PROJ}_INSTALL_DIR ) - SET( ${PROJ}_INSTALL_DIR "${${PROJ}_INSTALL_DIR}" ) -ELSEIF( PREFIX ) - SET( ${PROJ}_INSTALL_DIR "${PREFIX}" ) -ELSEIF( NOT ${PROJ}_INSTALL_DIR ) - SET( ${PROJ}_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}" ) -ENDIF() -INCLUDE_DIRECTORIES( "${${PROJ}_INSTALL_DIR}/include" ) -SET( CMAKE_MODULE_PATH ${${PROJ}_SOURCE_DIR} ${${PROJ}_SOURCE_DIR}/cmake ) - - -# Include macros -INCLUDE( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/macros.cmake" ) -INCLUDE( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/libraries.cmake" ) -INCLUDE( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/LBPM-macros.cmake" ) - - -# Check if we are only compiling docs -CHECK_ENABLE_FLAG( ONLY_BUILD_DOCS 0 ) - - -# Set testing paramaters -SET( DROP_METHOD "http" ) -SET( DROP_SITE "" ) -SET( DROP_LOCATION "/CDash/submit.php?project=LBPM-WIA" ) -SET( TRIGGER_SITE "" ) -SET( DROP_SITE_CDASH TRUE ) -ENABLE_TESTING() -INCLUDE( CTest ) - - -# Check the compile mode and compile flags -IF ( NOT ONLY_BUILD_DOCS ) - CONFIGURE_SYSTEM() -ENDIF() - - -# Add some directories to include -INCLUDE_DIRECTORIES( "${${PROJ}_INSTALL_DIR}/include" ) - - -# Create the target for documentation -ADD_CUSTOM_TARGET( doc ) -ADD_CUSTOM_TARGET( latex_docs ) -CHECK_ENABLE_FLAG( USE_DOXYGEN 1 ) -CHECK_ENABLE_FLAG( USE_LATEX 1 ) -FILE( MAKE_DIRECTORY "${${PROJ}_INSTALL_DIR}/doc" ) -IF ( USE_DOXYGEN ) - SET( DOXYFILE_LATEX YES ) - SET( DOXYFILE_IN "${${PROJ}_SOURCE_DIR}/doxygen/Doxyfile.in" ) - SET( DOXY_HEADER_FILE "${${PROJ}_SOURCE_DIR}/doxygen/html/header.html" ) - SET( DOXY_FOOTER_FILE "${${PROJ}_SOURCE_DIR}/doxygen/html/footer.html" ) - SET( DOXYFILE_OUTPUT_DIR "${${PROJ}_INSTALL_DIR}/doc" ) - SET( DOXYFILE_SRC_HTML_DIR "${${PROJ}_SOURCE_DIR}/doxygen/html" ) - SET( DOXYFILE_SOURCE_DIR "${${PROJ}_SOURCE_DIR}" ) - SET( REL_PACKAGE_HTML "" ) - SET( DOXYGEN_MACROS "" ) - MESSAGE("DOXYGEN_MACROS = ${DOXYGEN_MACROS}") - INCLUDE( "${${PROJ}_SOURCE_DIR}/cmake/UseDoxygen.cmake" ) - IF ( DOXYGEN_FOUND ) - ADD_DEPENDENCIES( doxygen latex_docs ) - ADD_DEPENDENCIES( doc latex_docs doxygen ) - ELSE() - SET( USE_DOXYGEN 0 ) - ENDIF() -ENDIF() - - -# Create custom targets for build-test, check, and distclean -ADD_CUSTOM_TARGET( build-test ) -ADD_CUSTOM_TARGET( build-examples ) -ADD_CUSTOM_TARGET( check COMMAND make test ) -ADD_DISTCLEAN( analysis null_timer tests liblbpm-wia.* cpu gpu example common IO threadpool StackTrace ) - - -# Check for CUDA -CHECK_ENABLE_FLAG( USE_CUDA 0 ) -NULL_USE( CMAKE_CUDA_FLAGS ) -IF ( USE_CUDA ) - ADD_DEFINITIONS( -DUSE_CUDA ) - ENABLE_LANGUAGE( CUDA ) -ENDIF() - - -# Configure external packages -IF ( NOT ONLY_BUILD_DOCS ) - CONFIGURE_MPI() # MPI must be 
before other libraries - CONFIGURE_MIC() - CONFIGURE_NETCDF() - CONFIGURE_SILO() - CONFIGURE_LBPM() - CONFIGURE_TIMER( 0 "${${PROJ}_INSTALL_DIR}/null_timer" ) - CONFIGURE_LINE_COVERAGE() - # Set the external library link list - SET( EXTERNAL_LIBS ${EXTERNAL_LIBS} ${TIMER_LIBS} ) -ENDIF() - - - -# Macro to create 1,2,4 processor tests -MACRO( ADD_LBPM_TEST_1_2_4 EXENAME ${ARGN} ) - ADD_LBPM_TEST( ${EXENAME} ${ARGN} ) - ADD_LBPM_TEST_PARALLEL( ${EXENAME} 2 ${ARGN} ) - ADD_LBPM_TEST_PARALLEL( ${EXENAME} 4 ${ARGN} ) -ENDMACRO() - - -# Add the src directories -IF ( NOT ONLY_BUILD_DOCS ) - BEGIN_PACKAGE_CONFIG( lbpm-wia-library ) - ADD_PACKAGE_SUBDIRECTORY( common ) - ADD_PACKAGE_SUBDIRECTORY( analysis ) - ADD_PACKAGE_SUBDIRECTORY( IO ) - ADD_PACKAGE_SUBDIRECTORY( threadpool ) - ADD_PACKAGE_SUBDIRECTORY( StackTrace ) - ADD_PACKAGE_SUBDIRECTORY( models ) - IF ( USE_CUDA ) - ADD_PACKAGE_SUBDIRECTORY( gpu ) - ELSE() - ADD_PACKAGE_SUBDIRECTORY( cpu ) - ENDIF() - INSTALL_LBPM_TARGET( lbpm-wia-library ) - ADD_SUBDIRECTORY( tests ) - ADD_SUBDIRECTORY( example ) - #ADD_SUBDIRECTORY( workflows ) - INSTALL_PROJ_LIB() -ENDIF() - +# Set some CMake properties +CMAKE_MINIMUM_REQUIRED( VERSION 3.9 ) + + +MESSAGE("====================") +MESSAGE("Configuring LBPM-WIA") +MESSAGE("====================") + + +# Set the project name +SET( PROJ LBPM ) # Set the project name for CMake +SET( LBPM_LIB lbpm-wia ) # Set the final library name +SET( LBPM_INC ) # Set an optional subfolder for includes (e.g. include/name/...) +SET( TEST_MAX_PROCS 16 ) + + +# Initialize the project +PROJECT( ${PROJ} LANGUAGES CXX ) + + +# Prevent users from building in place +IF ("${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_BINARY_DIR}" ) + MESSAGE( FATAL_ERROR "Building code in place is a bad idea" ) +ENDIF() + + +# Set the default C++ standard +SET( CMAKE_CXX_EXTENSIONS OFF ) +IF ( NOT CMAKE_CXX_STANDARD ) + IF ( CXX_STD ) + MESSAGE( FATAL_ERROR "CXX_STD is obsolete, please set CMAKE_CXX_STANDARD" ) + ENDIF() + SET( CMAKE_CXX_STANDARD 14 ) +ENDIF() +IF ( ( "${CMAKE_CXX_STANDARD}" GREATER "90" ) OR ( "${CMAKE_CXX_STANDARD}" LESS "14" ) ) + MESSAGE( FATAL_ERROR "C++14 or newer required" ) +ENDIF() + + +# Set source/install paths +SET( ${PROJ}_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ) +SET( ${PROJ}_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}" ) +IF( ${PROJ}_INSTALL_DIR ) + SET( ${PROJ}_INSTALL_DIR "${${PROJ}_INSTALL_DIR}" ) +ELSEIF( PREFIX ) + SET( ${PROJ}_INSTALL_DIR "${PREFIX}" ) +ELSEIF( NOT ${PROJ}_INSTALL_DIR ) + SET( ${PROJ}_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}" ) +ENDIF() +INCLUDE_DIRECTORIES( "${${PROJ}_INSTALL_DIR}/include" ) +SET( CMAKE_MODULE_PATH ${${PROJ}_SOURCE_DIR} ${${PROJ}_SOURCE_DIR}/cmake ) + + +# Include macros +INCLUDE( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/macros.cmake" ) +INCLUDE( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/libraries.cmake" ) +INCLUDE( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/LBPM-macros.cmake" ) + + +# Check if we are only compiling docs +CHECK_ENABLE_FLAG( ONLY_BUILD_DOCS 0 ) + + +# Set testing paramaters +SET( DROP_METHOD "http" ) +SET( DROP_SITE "" ) +SET( DROP_LOCATION "/CDash/submit.php?project=LBPM-WIA" ) +SET( TRIGGER_SITE "" ) +SET( DROP_SITE_CDASH TRUE ) +ENABLE_TESTING() +INCLUDE( CTest ) + + +# Check the compile mode and compile flags +IF ( NOT ONLY_BUILD_DOCS ) + CONFIGURE_SYSTEM() +ENDIF() + + +# Add some directories to include +INCLUDE_DIRECTORIES( "${${PROJ}_INSTALL_DIR}/include" ) + + +# Create the target for documentation +ADD_CUSTOM_TARGET( doc ) +ADD_CUSTOM_TARGET( latex_docs ) 
+CHECK_ENABLE_FLAG( USE_DOXYGEN 1 ) +CHECK_ENABLE_FLAG( USE_LATEX 1 ) +FILE( MAKE_DIRECTORY "${${PROJ}_INSTALL_DIR}/doc" ) +IF ( USE_DOXYGEN ) + SET( DOXYFILE_LATEX YES ) + SET( DOXYFILE_IN "${${PROJ}_SOURCE_DIR}/doxygen/Doxyfile.in" ) + SET( DOXY_HEADER_FILE "${${PROJ}_SOURCE_DIR}/doxygen/html/header.html" ) + SET( DOXY_FOOTER_FILE "${${PROJ}_SOURCE_DIR}/doxygen/html/footer.html" ) + SET( DOXYFILE_OUTPUT_DIR "${${PROJ}_INSTALL_DIR}/doc" ) + SET( DOXYFILE_SRC_HTML_DIR "${${PROJ}_SOURCE_DIR}/doxygen/html" ) + SET( DOXYFILE_SOURCE_DIR "${${PROJ}_SOURCE_DIR}" ) + SET( REL_PACKAGE_HTML "" ) + SET( DOXYGEN_MACROS "" ) + MESSAGE("DOXYGEN_MACROS = ${DOXYGEN_MACROS}") + INCLUDE( "${${PROJ}_SOURCE_DIR}/cmake/UseDoxygen.cmake" ) + IF ( DOXYGEN_FOUND ) + ADD_DEPENDENCIES( doxygen latex_docs ) + ADD_DEPENDENCIES( doc latex_docs doxygen ) + ELSE() + SET( USE_DOXYGEN 0 ) + ENDIF() +ENDIF() + + +# Create custom targets for build-test, check, and distclean +ADD_CUSTOM_TARGET( build-test ) +ADD_CUSTOM_TARGET( build-examples ) +ADD_CUSTOM_TARGET( check COMMAND make test ) +ADD_DISTCLEAN( analysis null_timer tests liblbpm-wia.* cpu gpu example common IO threadpool StackTrace ) + + +# Check for CUDA +CHECK_ENABLE_FLAG( USE_CUDA 0 ) +CHECK_ENABLE_FLAG( USE_HIP 0 ) +NULL_USE( CMAKE_CUDA_FLAGS ) +IF ( USE_CUDA ) + ADD_DEFINITIONS( -DUSE_CUDA ) + ENABLE_LANGUAGE( CUDA ) +ELSEIF ( USE_HIP ) + FIND_PACKAGE( HIP ) + MESSAGE( FATAL_ERROR "STOP" ) +ENDIF() + + +# Configure external packages +IF ( NOT ONLY_BUILD_DOCS ) + CONFIGURE_MPI() # MPI must be before other libraries + CONFIGURE_MIC() + CONFIGURE_NETCDF() + CONFIGURE_SILO() + CONFIGURE_LBPM() + CONFIGURE_TIMER( 0 "${${PROJ}_INSTALL_DIR}/null_timer" ) + CONFIGURE_LINE_COVERAGE() + # Set the external library link list + SET( EXTERNAL_LIBS ${EXTERNAL_LIBS} ${TIMER_LIBS} ) +ENDIF() + + + +# Macro to create 1,2,4 processor tests +MACRO( ADD_LBPM_TEST_1_2_4 EXENAME ${ARGN} ) + ADD_LBPM_TEST( ${EXENAME} ${ARGN} ) + ADD_LBPM_TEST_PARALLEL( ${EXENAME} 2 ${ARGN} ) + ADD_LBPM_TEST_PARALLEL( ${EXENAME} 4 ${ARGN} ) +ENDMACRO() + + +# Add the src directories +IF ( NOT ONLY_BUILD_DOCS ) + BEGIN_PACKAGE_CONFIG( lbpm-wia-library ) + ADD_PACKAGE_SUBDIRECTORY( common ) + ADD_PACKAGE_SUBDIRECTORY( analysis ) + ADD_PACKAGE_SUBDIRECTORY( IO ) + ADD_PACKAGE_SUBDIRECTORY( threadpool ) + ADD_PACKAGE_SUBDIRECTORY( StackTrace ) + ADD_PACKAGE_SUBDIRECTORY( models ) + IF ( USE_CUDA ) + ADD_PACKAGE_SUBDIRECTORY( gpu ) + ELSE() + ADD_PACKAGE_SUBDIRECTORY( cpu ) + ENDIF() + INSTALL_LBPM_TARGET( lbpm-wia-library ) + ADD_SUBDIRECTORY( tests ) + ADD_SUBDIRECTORY( example ) + #ADD_SUBDIRECTORY( workflows ) + INSTALL_PROJ_LIB() +ENDIF() + diff --git a/IO/MeshDatabase.cpp b/IO/MeshDatabase.cpp index 1fad9231..2c03ddde 100644 --- a/IO/MeshDatabase.cpp +++ b/IO/MeshDatabase.cpp @@ -1,7 +1,8 @@ #include "IO/MeshDatabase.h" #include "IO/Mesh.h" +#include "IO/PackData.h" #include "IO/IOHelpers.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Utilities.h" #include @@ -13,8 +14,6 @@ -/**************************************************** -****************************************************/ // MeshType template<> size_t packsize( const IO::MeshType& rhs ) @@ -247,80 +246,76 @@ void DatabaseEntry::read( const std::string& line ) // Gather the mesh databases from all processors inline int tod( int N ) { return (N+7)/sizeof(double); } -std::vector gatherAll( const std::vector& meshes, MPI_Comm comm ) +std::vector gatherAll( const std::vector& meshes, const Utilities::MPI& comm ) { 
- #ifdef USE_MPI - PROFILE_START("gatherAll"); - PROFILE_START("gatherAll-pack",2); - int size = MPI_WORLD_SIZE(); - // First pack the mesh data to local buffers - int localsize = 0; - for (size_t i=0; i data; - pos = 0; - while ( pos < globalsize ) { - MeshDatabase tmp; - unpack(tmp,(char*)&globalbuf[pos]); - pos += tod(packsize(tmp)); - std::map::iterator it = data.find(tmp.name); - if ( it==data.end() ) { - data[tmp.name] = tmp; - } else { - for (size_t i=0; isecond.domains.push_back(tmp.domains[i]); - for (size_t i=0; isecond.variables.push_back(tmp.variables[i]); - it->second.variable_data.insert(tmp.variable_data.begin(),tmp.variable_data.end()); - } - } - for (std::map::iterator it=data.begin(); it!=data.end(); ++it) { - // Get the unique variables - std::set data2(it->second.variables.begin(),it->second.variables.end()); - it->second.variables = std::vector(data2.begin(),data2.end()); - } - // Free temporary memory - delete [] localbuf; - delete [] recvsize; - delete [] disp; - delete [] globalbuf; - // Return the results - std::vector data2(data.size()); - size_t i=0; - for (std::map::iterator it=data.begin(); it!=data.end(); ++it, ++i) - data2[i] = it->second; - PROFILE_STOP("gatherAll-unpack",2); - PROFILE_STOP("gatherAll"); - return data2; - #else + if ( comm.getSize() == 1 ) return meshes; - #endif + PROFILE_START("gatherAll"); + PROFILE_START("gatherAll-pack",2); + int size = comm.getSize(); + // First pack the mesh data to local buffers + int localsize = 0; + for (size_t i=0; i data; + pos = 0; + while ( pos < globalsize ) { + MeshDatabase tmp; + unpack(tmp,(char*)&globalbuf[pos]); + pos += tod(packsize(tmp)); + std::map::iterator it = data.find(tmp.name); + if ( it==data.end() ) { + data[tmp.name] = tmp; + } else { + for (size_t i=0; isecond.domains.push_back(tmp.domains[i]); + for (size_t i=0; isecond.variables.push_back(tmp.variables[i]); + it->second.variable_data.insert(tmp.variable_data.begin(),tmp.variable_data.end()); + } + } + for (auto it=data.begin(); it!=data.end(); ++it) { + // Get the unique variables + std::set data2(it->second.variables.begin(),it->second.variables.end()); + it->second.variables = std::vector(data2.begin(),data2.end()); + } + // Free temporary memory + delete [] localbuf; + delete [] disp; + delete [] globalbuf; + // Return the results + std::vector data2(data.size()); + size_t i=0; + for (std::map::iterator it=data.begin(); it!=data.end(); ++it, ++i) + data2[i] = it->second; + PROFILE_STOP("gatherAll-unpack",2); + PROFILE_STOP("gatherAll"); + return data2; } diff --git a/IO/MeshDatabase.h b/IO/MeshDatabase.h index 9f544925..8e501624 100644 --- a/IO/MeshDatabase.h +++ b/IO/MeshDatabase.h @@ -2,7 +2,7 @@ #define MeshDatabase_INC #include "IO/Mesh.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include #include @@ -70,7 +70,7 @@ public: //! Gather the mesh databases from all processors -std::vector gatherAll( const std::vector& meshes, MPI_Comm comm ); +std::vector gatherAll( const std::vector& meshes, const Utilities::MPI& comm ); //! 
Write the mesh databases to a file diff --git a/IO/PIO.cpp b/IO/PIO.cpp index 6c6ece2d..3c2f3934 100644 --- a/IO/PIO.cpp +++ b/IO/PIO.cpp @@ -1,6 +1,6 @@ #include "IO/PIO.h" #include "common/Utilities.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include #include @@ -36,10 +36,7 @@ static void shutdownFilestream( ) } void Utilities::logOnlyNodeZero( const std::string &filename ) { - int rank = 0; - #ifdef USE_MPI - MPI_Comm_rank( MPI_COMM_WORLD, &rank ); - #endif + int rank = ::Utilities::MPI( MPI_COMM_WORLD ).getRank(); if ( rank == 0 ) logAllNodes(filename,true); } @@ -54,10 +51,7 @@ void Utilities::logAllNodes( const std::string &filename, bool singleStream ) // Open the log stream and redirect output std::string full_filename = filename; if ( !singleStream ) { - int rank = 0; - #ifdef USE_MPI - MPI_Comm_rank( MPI_COMM_WORLD, &rank ); - #endif + int rank = ::Utilities::MPI( MPI_COMM_WORLD ).getRank(); char tmp[100]; sprintf(tmp,".%04i",rank); full_filename += std::string(tmp); diff --git a/IO/PackData.cpp b/IO/PackData.cpp new file mode 100644 index 00000000..f10d9ca7 --- /dev/null +++ b/IO/PackData.cpp @@ -0,0 +1,105 @@ +#include "IO/PackData.h" + +#include + + +/******************************************************** +* Concrete implimentations for packing/unpacking * +********************************************************/ +// unsigned char +template<> +size_t packsize( const unsigned char& rhs ) +{ + return sizeof(unsigned char); +} +template<> +void pack( const unsigned char& rhs, char *buffer ) +{ + memcpy(buffer,&rhs,sizeof(unsigned char)); +} +template<> +void unpack( unsigned char& data, const char *buffer ) +{ + memcpy(&data,buffer,sizeof(unsigned char)); +} +// char +template<> +size_t packsize( const char& rhs ) +{ + return sizeof(char); +} +template<> +void pack( const char& rhs, char *buffer ) +{ + memcpy(buffer,&rhs,sizeof(char)); +} +template<> +void unpack( char& data, const char *buffer ) +{ + memcpy(&data,buffer,sizeof(char)); +} +// int +template<> +size_t packsize( const int& rhs ) +{ + return sizeof(int); +} +template<> +void pack( const int& rhs, char *buffer ) +{ + memcpy(buffer,&rhs,sizeof(int)); +} +template<> +void unpack( int& data, const char *buffer ) +{ + memcpy(&data,buffer,sizeof(int)); +} +// unsigned int +template<> +size_t packsize( const unsigned int& rhs ) +{ + return sizeof(unsigned int); +} +template<> +void pack( const unsigned int& rhs, char *buffer ) +{ + memcpy(buffer,&rhs,sizeof(int)); +} +template<> +void unpack( unsigned int& data, const char *buffer ) +{ + memcpy(&data,buffer,sizeof(int)); +} +// size_t +template<> +size_t packsize( const size_t& rhs ) +{ + return sizeof(size_t); +} +template<> +void pack( const size_t& rhs, char *buffer ) +{ + memcpy(buffer,&rhs,sizeof(size_t)); +} +template<> +void unpack( size_t& data, const char *buffer ) +{ + memcpy(&data,buffer,sizeof(size_t)); +} +// std::string +template<> +size_t packsize( const std::string& rhs ) +{ + return rhs.size()+1; +} +template<> +void pack( const std::string& rhs, char *buffer ) +{ + memcpy(buffer,rhs.c_str(),rhs.size()+1); +} +template<> +void unpack( std::string& data, const char *buffer ) +{ + data = std::string(buffer); +} + diff --git a/IO/PackData.h b/IO/PackData.h new file mode 100644 index 00000000..85326c0b --- /dev/null +++ b/IO/PackData.h @@ -0,0 +1,78 @@ +// This file contains unctions to pack/unpack data structures +#ifndef included_PackData +#define included_PackData + +#include +#include +#include + + +//! 
Template function to return the buffer size required to pack a class +template +size_t packsize( const TYPE& rhs ); + +//! Template function to pack a class to a buffer +template +void pack( const TYPE& rhs, char *buffer ); + +//! Template function to unpack a class from a buffer +template +void unpack( TYPE& data, const char *buffer ); + + +//! Template function to return the buffer size required to pack a std::vector +template +size_t packsize( const std::vector& rhs ); + +//! Template function to pack a class to a buffer +template +void pack( const std::vector& rhs, char *buffer ); + +//! Template function to pack a class to a buffer +template +void unpack( std::vector& data, const char *buffer ); + + +//! Template function to return the buffer size required to pack a std::pair +template +size_t packsize( const std::pair& rhs ); + +//! Template function to pack a class to a buffer +template +void pack( const std::pair& rhs, char *buffer ); + +//! Template function to pack a class to a buffer +template +void unpack( std::pair& data, const char *buffer ); + + +//! Template function to return the buffer size required to pack a std::map +template +size_t packsize( const std::map& rhs ); + +//! Template function to pack a class to a buffer +template +void pack( const std::map& rhs, char *buffer ); + +//! Template function to pack a class to a buffer +template +void unpack( std::map& data, const char *buffer ); + + +//! Template function to return the buffer size required to pack a std::set +template +size_t packsize( const std::set& rhs ); + +//! Template function to pack a class to a buffer +template +void pack( const std::set& rhs, char *buffer ); + +//! Template function to pack a class to a buffer +template +void unpack( std::set& data, const char *buffer ); + + +#include "IO/PackData.hpp" + +#endif + diff --git a/common/MPI_Helpers.hpp b/IO/PackData.hpp similarity index 95% rename from common/MPI_Helpers.hpp rename to IO/PackData.hpp index 85261cf1..006cdf73 100644 --- a/common/MPI_Helpers.hpp +++ b/IO/PackData.hpp @@ -1,8 +1,9 @@ -// This file contains wrappers for MPI routines and functions to pack/unpack data structures -#ifndef MPI_WRAPPERS_HPP -#define MPI_WRAPPERS_HPP +// This file functions to pack/unpack data structures +#ifndef included_PackData_hpp +#define included_PackData_hpp + +#include "IO/PackData.h" -#include "common/MPI_Helpers.h" #include #include #include diff --git a/IO/Writer.cpp b/IO/Writer.cpp index 6581ad42..61c333af 100644 --- a/IO/Writer.cpp +++ b/IO/Writer.cpp @@ -2,7 +2,7 @@ #include "IO/MeshDatabase.h" #include "IO/IOHelpers.h" #include "IO/silo.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Utilities.h" #include @@ -36,7 +36,7 @@ void IO::initialize( const std::string& path, const std::string& format, bool ap global_IO_format = Format::SILO; else ERROR("Unknown format"); - int rank = comm_rank(MPI_COMM_WORLD); + int rank = Utilities::MPI(MPI_COMM_WORLD).getRank(); if ( !append && rank==0 ) { mkdir(path.c_str(),S_IRWXU|S_IRGRP); std::string filename; @@ -55,7 +55,7 @@ void IO::initialize( const std::string& path, const std::string& format, bool ap // Write the mesh data in the original format static std::vector writeMeshesOrigFormat( const std::vector& meshData, const std::string& path ) { - int rank = MPI_WORLD_RANK(); + int rank = Utilities::MPI(MPI_COMM_WORLD).getRank(); std::vector meshes_written; for (size_t i=0; i writeMeshesOrigFormat( const std::vector& meshes_written, cons static std::vector writeMeshesNewFormat( 
const std::vector& meshData, const std::string& path, int format ) { - int rank = MPI_WORLD_RANK(); + int rank = Utilities::MPI(MPI_COMM_WORLD).getRank(); std::vector meshes_written; char filename[100], fullpath[200]; sprintf(filename,"%05i",rank); @@ -419,7 +419,7 @@ static std::vector writeMeshesSilo( const std::vector& meshData, const std::string& path, int format ) { #ifdef USE_SILO - int rank = MPI_WORLD_RANK(); + int rank = Utilities::MPI(MPI_COMM_WORLD).getRank(); std::vector meshes_written; char filename[100], fullpath[200]; sprintf(filename,"%05i.silo",rank); @@ -441,12 +441,12 @@ static std::vector writeMeshesSilo( /**************************************************** * Write the mesh data * ****************************************************/ -void IO::writeData( const std::string& subdir, const std::vector& meshData, MPI_Comm comm ) +void IO::writeData( const std::string& subdir, const std::vector& meshData, const Utilities::MPI& comm ) { if ( global_IO_path.empty() ) IO::initialize( ); PROFILE_START("writeData"); - int rank = comm_rank(comm); + int rank = Utilities::MPI(MPI_COMM_WORLD).getRank(); // Check the meshData before writing for ( const auto& data : meshData ) { if ( !data.check() ) @@ -457,7 +457,7 @@ void IO::writeData( const std::string& subdir, const std::vector meshes_written; if ( global_IO_format == Format::OLD ) { diff --git a/IO/Writer.h b/IO/Writer.h index 710fa0d8..dfc22db8 100644 --- a/IO/Writer.h +++ b/IO/Writer.h @@ -34,7 +34,7 @@ void initialize( const std::string& path="", const std::string& format="silo", b * @param[in] meshData The data to write * @param[in] comm The comm to use for writing (usually MPI_COMM_WORLD or a dup thereof) */ -void writeData( const std::string& subdir, const std::vector& meshData, MPI_Comm comm ); +void writeData( const std::string& subdir, const std::vector& meshData, const Utilities::MPI& comm ); /*! 
@@ -44,7 +44,7 @@ void writeData( const std::string& subdir, const std::vector * @param[in] meshData The data to write * @param[in] comm The comm to use for writing (usually MPI_COMM_WORLD or a dup thereof) */ -inline void writeData( int timestep, const std::vector& meshData, MPI_Comm comm ) +inline void writeData( int timestep, const std::vector& meshData, const Utilities::MPI& comm ) { char subdir[100]; sprintf(subdir,"vis%03i",timestep); diff --git a/IO/netcdf.cpp b/IO/netcdf.cpp index b36bb6d6..e061579a 100644 --- a/IO/netcdf.cpp +++ b/IO/netcdf.cpp @@ -1,6 +1,6 @@ #include "IO/netcdf.h" #include "common/Utilities.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "ProfilerApp.h" @@ -116,7 +116,7 @@ std::string VariableTypeName( VariableType type ) /**************************************************** * Open/close a file * ****************************************************/ -int open( const std::string& filename, FileMode mode, MPI_Comm comm ) +int open( const std::string& filename, FileMode mode, const Utilities::MPI& comm ) { int fid = 0; if ( comm == MPI_COMM_NULL ) { @@ -134,13 +134,13 @@ int open( const std::string& filename, FileMode mode, MPI_Comm comm ) } } else { if ( mode == READ ) { - int err = nc_open_par( filename.c_str(), NC_MPIPOSIX, comm, MPI_INFO_NULL, &fid ); + int err = nc_open_par( filename.c_str(), NC_MPIPOSIX, comm.getCommunicator(), MPI_INFO_NULL, &fid ); CHECK_NC_ERR( err ); } else if ( mode == WRITE ) { - int err = nc_open_par( filename.c_str(), NC_WRITE|NC_MPIPOSIX, comm, MPI_INFO_NULL, &fid ); + int err = nc_open_par( filename.c_str(), NC_WRITE|NC_MPIPOSIX, comm.getCommunicator(), MPI_INFO_NULL, &fid ); CHECK_NC_ERR( err ); } else if ( mode == CREATE ) { - int err = nc_create_par( filename.c_str(), NC_NETCDF4|NC_MPIIO, comm, MPI_INFO_NULL, &fid ); + int err = nc_create_par( filename.c_str(), NC_NETCDF4|NC_MPIIO, comm.getCommunicator(), MPI_INFO_NULL, &fid ); CHECK_NC_ERR( err ); } else { ERROR("Unknown file mode"); @@ -375,7 +375,7 @@ Array getVar( int fid, const std::string& var, const std::vector& sta std::vector var_size = getVarDim( fid, var ); for (int d=0; d<(int)var_size.size(); d++) { if ( start[d]<0 || start[d]+stride[d]*(count[d]-1)>(int)var_size[d] ) { - int rank = comm_rank(MPI_COMM_WORLD); + int rank = Utilities::MPI(MPI_COMM_WORLD).getRank(); char tmp[1000]; sprintf(tmp,"%i: Range exceeded array dimension:\n" " start[%i]=%i, count[%i]=%i, stride[%i]=%i, var_size[%i]=%i", diff --git a/IO/netcdf.h b/IO/netcdf.h index 657747bf..b4559e51 100644 --- a/IO/netcdf.h +++ b/IO/netcdf.h @@ -5,7 +5,7 @@ #include #include "common/Array.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Communication.h" @@ -32,7 +32,7 @@ std::string VariableTypeName( VariableType type ); * @param mode Open the file for reading or writing * @param comm MPI communicator to use (MPI_COMM_WORLD: don't use parallel netcdf) */ -int open( const std::string& filename, FileMode mode, MPI_Comm comm=MPI_COMM_NULL ); +int open( const std::string& filename, FileMode mode, const Utilities::MPI& comm=MPI_COMM_NULL ); /*! 
diff --git a/IO/silo.cpp b/IO/silo.cpp index eece8583..ddf3646a 100644 --- a/IO/silo.cpp +++ b/IO/silo.cpp @@ -1,6 +1,6 @@ #include "IO/silo.h" #include "common/Utilities.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "ProfilerApp.h" diff --git a/IO/silo.h b/IO/silo.h index 4c7081e5..339a5c34 100644 --- a/IO/silo.h +++ b/IO/silo.h @@ -6,7 +6,7 @@ #include #include "common/Array.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Communication.h" diff --git a/IO/silo.hpp b/IO/silo.hpp index 312f32d8..35852004 100644 --- a/IO/silo.hpp +++ b/IO/silo.hpp @@ -3,7 +3,7 @@ #include "IO/silo.h" #include "common/Utilities.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "ProfilerApp.h" diff --git a/analysis/Minkowski.cpp b/analysis/Minkowski.cpp index faac6142..3e3fb35e 100644 --- a/analysis/Minkowski.cpp +++ b/analysis/Minkowski.cpp @@ -4,7 +4,7 @@ #include "common/Domain.h" #include "common/Communication.h" #include "common/Utilities.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "IO/MeshDatabase.h" #include "IO/Reader.h" #include "IO/Writer.h" @@ -109,13 +109,13 @@ void Minkowski::ComputeScalar(const DoubleArray& Field, const double isovalue) // convert X for 2D manifold to 3D object Xi *= 0.5; - MPI_Barrier(Dm->Comm); + Dm->Comm.barrier(); // Phase averages - MPI_Allreduce(&Vi,&Vi_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&Xi,&Xi_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&Ai,&Ai_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&Ji,&Ji_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Barrier(Dm->Comm); + Vi_global = Dm->Comm.sumReduce( Vi ); + Xi_global = Dm->Comm.sumReduce( Xi ); + Ai_global = Dm->Comm.sumReduce( Ai ); + Ji_global = Dm->Comm.sumReduce( Ji ); + Dm->Comm.barrier(); PROFILE_STOP("ComputeScalar"); } @@ -168,7 +168,7 @@ int Minkowski::MeasureConnectedPathway(){ double vF=0.0; n_connected_components = ComputeGlobalBlobIDs(Nx-2,Ny-2,Nz-2,Dm->rank_info,distance,distance,vF,vF,label,Dm->Comm); // int n_connected_components = ComputeGlobalPhaseComponent(Nx-2,Ny-2,Nz-2,Dm->rank_info,const IntArray &PhaseID, int &VALUE, BlobIDArray &GlobalBlobID, Dm->Comm ) - MPI_Barrier(Dm->Comm); + Dm->Comm.barrier(); for (int k=0; kComm, wb.V); - gnb.V=sumReduce( Dm->Comm, nb.V); - gwb.M=sumReduce( Dm->Comm, wb.M); - gnb.M=sumReduce( Dm->Comm, nb.M); - gwb.Px=sumReduce( Dm->Comm, wb.Px); - gwb.Py=sumReduce( Dm->Comm, wb.Py); - gwb.Pz=sumReduce( Dm->Comm, wb.Pz); - gnb.Px=sumReduce( Dm->Comm, nb.Px); - gnb.Py=sumReduce( Dm->Comm, nb.Py); - gnb.Pz=sumReduce( Dm->Comm, nb.Pz); + gwb.V = Dm->Comm.sumReduce( wb.V); + gnb.V = Dm->Comm.sumReduce( nb.V); + gwb.M = Dm->Comm.sumReduce( wb.M); + gnb.M = Dm->Comm.sumReduce( nb.M); + gwb.Px = Dm->Comm.sumReduce( wb.Px); + gwb.Py = Dm->Comm.sumReduce( wb.Py); + gwb.Pz = Dm->Comm.sumReduce( wb.Pz); + gnb.Px = Dm->Comm.sumReduce( nb.Px); + gnb.Py = Dm->Comm.sumReduce( nb.Py); + gnb.Pz = Dm->Comm.sumReduce( nb.Pz); - count_w=sumReduce( Dm->Comm, count_w); - count_n=sumReduce( Dm->Comm, count_n); + count_w = Dm->Comm.sumReduce( count_w); + count_n = Dm->Comm.sumReduce( count_n); if (count_w > 0.0) - gwb.p=sumReduce( Dm->Comm, wb.p) / count_w; + gwb.p = Dm->Comm.sumReduce(wb.p) / count_w; else gwb.p = 0.0; if (count_n > 0.0) - gnb.p=sumReduce( Dm->Comm, nb.p) / count_n; + gnb.p = Dm->Comm.sumReduce( nb.p) / count_n; else gnb.p = 0.0; @@ -444,14 +444,14 @@ void SubPhase::Full(){ nd.X -= nc.X; // compute global entities - gnc.V=sumReduce( Dm->Comm, nc.V); 
- gnc.A=sumReduce( Dm->Comm, nc.A); - gnc.H=sumReduce( Dm->Comm, nc.H); - gnc.X=sumReduce( Dm->Comm, nc.X); - gnd.V=sumReduce( Dm->Comm, nd.V); - gnd.A=sumReduce( Dm->Comm, nd.A); - gnd.H=sumReduce( Dm->Comm, nd.H); - gnd.X=sumReduce( Dm->Comm, nd.X); + gnc.V = Dm->Comm.sumReduce( nc.V ); + gnc.A = Dm->Comm.sumReduce( nc.A ); + gnc.H = Dm->Comm.sumReduce( nc.H ); + gnc.X = Dm->Comm.sumReduce( nc.X ); + gnd.V = Dm->Comm.sumReduce( nd.V ); + gnd.A = Dm->Comm.sumReduce( nd.A ); + gnd.H = Dm->Comm.sumReduce( nd.H ); + gnd.X = Dm->Comm.sumReduce( nd.X ); gnd.Nc = nd.Nc; // wetting for (k=0; kComm, wc.V); - gwc.A=sumReduce( Dm->Comm, wc.A); - gwc.H=sumReduce( Dm->Comm, wc.H); - gwc.X=sumReduce( Dm->Comm, wc.X); - gwd.V=sumReduce( Dm->Comm, wd.V); - gwd.A=sumReduce( Dm->Comm, wd.A); - gwd.H=sumReduce( Dm->Comm, wd.H); - gwd.X=sumReduce( Dm->Comm, wd.X); + gwc.V = Dm->Comm.sumReduce( wc.V ); + gwc.A = Dm->Comm.sumReduce( wc.A ); + gwc.H = Dm->Comm.sumReduce( wc.H ); + gwc.X = Dm->Comm.sumReduce( wc.X ); + gwd.V = Dm->Comm.sumReduce( wd.V ); + gwd.A = Dm->Comm.sumReduce( wd.A ); + gwd.H = Dm->Comm.sumReduce( wd.H ); + gwd.X = Dm->Comm.sumReduce( wd.X ); gwd.Nc = wd.Nc; /* Set up geometric analysis of interface region */ @@ -526,20 +526,20 @@ void SubPhase::Full(){ iwn.A = morph_i->A(); iwn.H = morph_i->H(); iwn.X = morph_i->X(); - giwn.V=sumReduce( Dm->Comm, iwn.V); - giwn.A=sumReduce( Dm->Comm, iwn.A); - giwn.H=sumReduce( Dm->Comm, iwn.H); - giwn.X=sumReduce( Dm->Comm, iwn.X); + giwn.V = Dm->Comm.sumReduce( iwn.V ); + giwn.A = Dm->Comm.sumReduce( iwn.A ); + giwn.H = Dm->Comm.sumReduce( iwn.H ); + giwn.X = Dm->Comm.sumReduce( iwn.X ); // measure only the connected part iwnc.Nc = morph_i->MeasureConnectedPathway(); iwnc.V = morph_i->V(); iwnc.A = morph_i->A(); iwnc.H = morph_i->H(); iwnc.X = morph_i->X(); - giwnc.V=sumReduce( Dm->Comm, iwnc.V); - giwnc.A=sumReduce( Dm->Comm, iwnc.A); - giwnc.H=sumReduce( Dm->Comm, iwnc.H); - giwnc.X=sumReduce( Dm->Comm, iwnc.X); + giwnc.V = Dm->Comm.sumReduce( iwnc.V ); + giwnc.A = Dm->Comm.sumReduce( iwnc.A ); + giwnc.H = Dm->Comm.sumReduce( iwnc.H ); + giwnc.X = Dm->Comm.sumReduce( iwnc.X ); giwnc.Nc = iwnc.Nc; double vol_nc_bulk = 0.0; @@ -630,46 +630,46 @@ void SubPhase::Full(){ } } - gnd.M=sumReduce( Dm->Comm, nd.M); - gnd.Px=sumReduce( Dm->Comm, nd.Px); - gnd.Py=sumReduce( Dm->Comm, nd.Py); - gnd.Pz=sumReduce( Dm->Comm, nd.Pz); - gnd.K=sumReduce( Dm->Comm, nd.K); + gnd.M = Dm->Comm.sumReduce( nd.M ); + gnd.Px = Dm->Comm.sumReduce( nd.Px ); + gnd.Py = Dm->Comm.sumReduce( nd.Py ); + gnd.Pz = Dm->Comm.sumReduce( nd.Pz ); + gnd.K = Dm->Comm.sumReduce( nd.K ); - gwd.M=sumReduce( Dm->Comm, wd.M); - gwd.Px=sumReduce( Dm->Comm, wd.Px); - gwd.Py=sumReduce( Dm->Comm, wd.Py); - gwd.Pz=sumReduce( Dm->Comm, wd.Pz); - gwd.K=sumReduce( Dm->Comm, wd.K); + gwd.M = Dm->Comm.sumReduce( wd.M ); + gwd.Px = Dm->Comm.sumReduce( wd.Px ); + gwd.Py = Dm->Comm.sumReduce( wd.Py ); + gwd.Pz = Dm->Comm.sumReduce( wd.Pz ); + gwd.K = Dm->Comm.sumReduce( wd.K ); - gnc.M=sumReduce( Dm->Comm, nc.M); - gnc.Px=sumReduce( Dm->Comm, nc.Px); - gnc.Py=sumReduce( Dm->Comm, nc.Py); - gnc.Pz=sumReduce( Dm->Comm, nc.Pz); - gnc.K=sumReduce( Dm->Comm, nc.K); + gnc.M = Dm->Comm.sumReduce( nc.M ); + gnc.Px = Dm->Comm.sumReduce( nc.Px ); + gnc.Py = Dm->Comm.sumReduce( nc.Py ); + gnc.Pz = Dm->Comm.sumReduce( nc.Pz ); + gnc.K = Dm->Comm.sumReduce( nc.K ); - gwc.M=sumReduce( Dm->Comm, wc.M); - gwc.Px=sumReduce( Dm->Comm, wc.Px); - gwc.Py=sumReduce( Dm->Comm, wc.Py); - gwc.Pz=sumReduce( Dm->Comm, wc.Pz); - 
gwc.K=sumReduce( Dm->Comm, wc.K); + gwc.M = Dm->Comm.sumReduce( wc.M ); + gwc.Px = Dm->Comm.sumReduce( wc.Px ); + gwc.Py = Dm->Comm.sumReduce( wc.Py ); + gwc.Pz = Dm->Comm.sumReduce( wc.Pz ); + gwc.K = Dm->Comm.sumReduce( wc.K ); - giwn.Mn=sumReduce( Dm->Comm, iwn.Mn); - giwn.Pnx=sumReduce( Dm->Comm, iwn.Pnx); - giwn.Pny=sumReduce( Dm->Comm, iwn.Pny); - giwn.Pnz=sumReduce( Dm->Comm, iwn.Pnz); - giwn.Kn=sumReduce( Dm->Comm, iwn.Kn); - giwn.Mw=sumReduce( Dm->Comm, iwn.Mw); - giwn.Pwx=sumReduce( Dm->Comm, iwn.Pwx); - giwn.Pwy=sumReduce( Dm->Comm, iwn.Pwy); - giwn.Pwz=sumReduce( Dm->Comm, iwn.Pwz); - giwn.Kw=sumReduce( Dm->Comm, iwn.Kw); + giwn.Mn = Dm->Comm.sumReduce( iwn.Mn ); + giwn.Pnx = Dm->Comm.sumReduce( iwn.Pnx ); + giwn.Pny = Dm->Comm.sumReduce( iwn.Pny ); + giwn.Pnz = Dm->Comm.sumReduce( iwn.Pnz ); + giwn.Kn = Dm->Comm.sumReduce( iwn.Kn ); + giwn.Mw = Dm->Comm.sumReduce( iwn.Mw ); + giwn.Pwx = Dm->Comm.sumReduce( iwn.Pwx ); + giwn.Pwy = Dm->Comm.sumReduce( iwn.Pwy ); + giwn.Pwz = Dm->Comm.sumReduce( iwn.Pwz ); + giwn.Kw = Dm->Comm.sumReduce( iwn.Kw ); // pressure averaging - gnc.p=sumReduce( Dm->Comm, nc.p); - gnd.p=sumReduce( Dm->Comm, nd.p); - gwc.p=sumReduce( Dm->Comm, wc.p); - gwd.p=sumReduce( Dm->Comm, wd.p); + gnc.p = Dm->Comm.sumReduce( nc.p ); + gnd.p = Dm->Comm.sumReduce( nd.p ); + gwc.p = Dm->Comm.sumReduce( wc.p ); + gwd.p = Dm->Comm.sumReduce( wd.p ); if (vol_wc_bulk > 0.0) wc.p = wc.p /vol_wc_bulk; @@ -680,10 +680,10 @@ void SubPhase::Full(){ if (vol_nd_bulk > 0.0) nd.p = nd.p /vol_nd_bulk; - vol_wc_bulk=sumReduce( Dm->Comm, vol_wc_bulk); - vol_wd_bulk=sumReduce( Dm->Comm, vol_wd_bulk); - vol_nc_bulk=sumReduce( Dm->Comm, vol_nc_bulk); - vol_nd_bulk=sumReduce( Dm->Comm, vol_nd_bulk); + vol_wc_bulk = Dm->Comm.sumReduce( vol_wc_bulk ); + vol_wd_bulk = Dm->Comm.sumReduce( vol_wd_bulk ); + vol_nc_bulk = Dm->Comm.sumReduce( vol_nc_bulk ); + vol_nd_bulk = Dm->Comm.sumReduce( vol_nd_bulk ); if (vol_wc_bulk > 0.0) gwc.p = gwc.p /vol_wc_bulk; @@ -719,7 +719,7 @@ void SubPhase::AggregateLabels( const std::string& filename ) } } } - MPI_Barrier(Dm->Comm); + Dm->Comm.barrier(); Dm->AggregateLabels( filename ); diff --git a/analysis/SubPhase.h b/analysis/SubPhase.h index 71b87ef0..691c654f 100644 --- a/analysis/SubPhase.h +++ b/analysis/SubPhase.h @@ -12,7 +12,7 @@ #include "analysis/distance.h" #include "analysis/Minkowski.h" #include "common/Utilities.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "IO/MeshDatabase.h" #include "IO/Reader.h" #include "IO/Writer.h" diff --git a/analysis/TwoPhase.cpp b/analysis/TwoPhase.cpp index 9b2e5fd8..ea136758 100644 --- a/analysis/TwoPhase.cpp +++ b/analysis/TwoPhase.cpp @@ -5,7 +5,7 @@ #include "common/Domain.h" #include "common/Communication.h" #include "common/Utilities.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "IO/MeshDatabase.h" #include "IO/Reader.h" #include "IO/Writer.h" @@ -882,7 +882,7 @@ void TwoPhase::ComponentAverages() } } - MPI_Barrier(Dm->Comm); + Dm->Comm.barrier(); if (Dm->rank()==0){ printf("Component averages computed locally -- reducing result... 
\n"); } @@ -895,8 +895,8 @@ void TwoPhase::ComponentAverages() for (int idx=0; idxComm); - MPI_Allreduce(ComponentAverages_NWP.data(),RecvBuffer.data(),BLOB_AVG_COUNT*NumberComponents_NWP, MPI_DOUBLE,MPI_SUM,Dm->Comm); + Dm->Comm.barrier(); + Dm->Comm.sumReduce(ComponentAverages_NWP.data(),RecvBuffer.data(),BLOB_AVG_COUNT*NumberComponents_NWP); // MPI_Reduce(ComponentAverages_NWP.data(),RecvBuffer.data(),BLOB_AVG_COUNT,MPI_DOUBLE,MPI_SUM,0,Dm->Comm); if (Dm->rank()==0){ @@ -993,9 +993,9 @@ void TwoPhase::ComponentAverages() // reduce the wetting phase averages for (int b=0; bComm); + Dm->Comm.barrier(); // MPI_Allreduce(&ComponentAverages_WP(0,b),RecvBuffer.data(),BLOB_AVG_COUNT,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Reduce(&ComponentAverages_WP(0,b),RecvBuffer.data(),BLOB_AVG_COUNT,MPI_DOUBLE,MPI_SUM,0,Dm->Comm); + Dm->Comm.sumReduce(&ComponentAverages_WP(0,b),RecvBuffer.data(),BLOB_AVG_COUNT); for (int idx=0; idxComm); - MPI_Allreduce(&nwp_volume,&nwp_volume_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&wp_volume,&wp_volume_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&awn,&awn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&ans,&ans_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&aws,&aws_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&lwns,&lwns_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&As,&As_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&Jwn,&Jwn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&Kwn,&Kwn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&KGwns,&KGwns_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&KNwns,&KNwns_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&efawns,&efawns_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&wwndnw,&wwndnw_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&wwnsdnwn,&wwnsdnwn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&Jwnwwndnw,&Jwnwwndnw_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + Dm->Comm.barrier(); + nwp_volume_global = Dm->Comm.sumReduce( nwp_volume ); + wp_volume_global = Dm->Comm.sumReduce( wp_volume ); + awn_global = Dm->Comm.sumReduce( awn ); + ans_global = Dm->Comm.sumReduce( ans ); + aws_global = Dm->Comm.sumReduce( aws ); + lwns_global = Dm->Comm.sumReduce( lwns ); + As_global = Dm->Comm.sumReduce( As ); + Jwn_global = Dm->Comm.sumReduce( Jwn ); + Kwn_global = Dm->Comm.sumReduce( Kwn ); + KGwns_global = Dm->Comm.sumReduce( KGwns ); + KNwns_global = Dm->Comm.sumReduce( KNwns ); + efawns_global = Dm->Comm.sumReduce( efawns ); + wwndnw_global = Dm->Comm.sumReduce( wwndnw ); + wwnsdnwn_global = Dm->Comm.sumReduce( wwnsdnwn ); + Jwnwwndnw_global = Dm->Comm.sumReduce( Jwnwwndnw ); // Phase averages - MPI_Allreduce(&vol_w,&vol_w_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&vol_n,&vol_n_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&paw,&paw_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&pan,&pan_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&vaw(0),&vaw_global(0),3,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&van(0),&van_global(0),3,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&vawn(0),&vawn_global(0),3,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&vawns(0),&vawns_global(0),3,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&Gwn(0),&Gwn_global(0),6,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&Gns(0),&Gns_global(0),6,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&Gws(0),&Gws_global(0),6,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&trawn,&trawn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - 
MPI_Allreduce(&trJwn,&trJwn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&trRwn,&trRwn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&euler,&euler_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&An,&An_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&Jn,&Jn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&Kn,&Kn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - - MPI_Barrier(Dm->Comm); + vol_w_global = Dm->Comm.sumReduce( vol_w ); + vol_n_global = Dm->Comm.sumReduce( vol_n ); + paw_global = Dm->Comm.sumReduce( paw ); + pan_global = Dm->Comm.sumReduce( pan ); + vaw_global(0) = Dm->Comm.sumReduce( vaw(0) ); + van_global(0) = Dm->Comm.sumReduce( van(0) ); + vawn_global(0) = Dm->Comm.sumReduce( vawn(0) ); + vawns_global(0) = Dm->Comm.sumReduce( vawns(0) ); + Gwn_global(0) = Dm->Comm.sumReduce( Gwn(0) ); + Gns_global(0) = Dm->Comm.sumReduce( Gns(0) ); + Gws_global(0) = Dm->Comm.sumReduce( Gws(0) ); + trawn_global = Dm->Comm.sumReduce( trawn ); + trJwn_global = Dm->Comm.sumReduce( trJwn ); + trRwn_global = Dm->Comm.sumReduce( trRwn ); + euler_global = Dm->Comm.sumReduce( euler ); + An_global = Dm->Comm.sumReduce( An ); + Jn_global = Dm->Comm.sumReduce( Jn ); + Kn_global = Dm->Comm.sumReduce( Kn ); + Dm->Comm.barrier(); // Normalize the phase averages // (density of both components = 1.0) diff --git a/analysis/TwoPhase.h b/analysis/TwoPhase.h index fddd04e8..4d500a89 100644 --- a/analysis/TwoPhase.h +++ b/analysis/TwoPhase.h @@ -12,7 +12,7 @@ #include "common/Domain.h" #include "common/Communication.h" #include "common/Utilities.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "IO/MeshDatabase.h" #include "IO/Reader.h" #include "IO/Writer.h" diff --git a/analysis/analysis.cpp b/analysis/analysis.cpp index 7587f3c5..4298750e 100644 --- a/analysis/analysis.cpp +++ b/analysis/analysis.cpp @@ -188,7 +188,7 @@ int ComputeLocalPhaseComponent(const IntArray &PhaseID, int &VALUE, BlobIDArray /****************************************************************** * Reorder the global blob ids * ******************************************************************/ -static int ReorderBlobIDs2( BlobIDArray& ID, int N_blobs, int ngx, int ngy, int ngz, MPI_Comm comm ) +static int ReorderBlobIDs2( BlobIDArray& ID, int N_blobs, int ngx, int ngy, int ngz, const Utilities::MPI& comm ) { if ( N_blobs==0 ) return 0; @@ -212,7 +212,7 @@ static int ReorderBlobIDs2( BlobIDArray& ID, int N_blobs, int ngx, int ngy, int } } ASSERT(max_id > map1(N_blobs); int N_blobs2 = 0; for (int i=0; i& N_recv, int64_t *send_buf, std::vector& recv_buf, std::map& remote_map, - MPI_Comm comm ) + const Utilities::MPI& comm ) { std::vector send_req(neighbors.size()); std::vector recv_req(neighbors.size()); - std::vector status(neighbors.size()); - std::map::const_iterator it = map.begin(); + auto it = map.begin(); ASSERT(N_send==(int)map.size()); for (size_t i=0; ifirst; send_buf[2*i+1] = it->second.new_id; } for (size_t i=0; ifirst] = it->second.new_id; } for (size_t i=0; i& remote_map, @@ -304,18 +303,18 @@ static bool updateLocalIds( const std::map& remote_map, return changed; } static int LocalToGlobalIDs( int nx, int ny, int nz, const RankInfoStruct& rank_info, - int nblobs, BlobIDArray& IDs, MPI_Comm comm ) + int nblobs, BlobIDArray& IDs, const Utilities::MPI& comm ) { PROFILE_START("LocalToGlobalIDs",1); const int rank = rank_info.rank[1][1][1]; - int nprocs = comm_size(comm); + int nprocs = comm.getSize(); const int ngx = (IDs.size(0)-nx)/2; const int ngy = (IDs.size(1)-ny)/2; const int 
ngz = (IDs.size(2)-nz)/2; // Get the number of blobs for each rank std::vector N_blobs(nprocs,0); PROFILE_START("LocalToGlobalIDs-Allgather",1); - MPI_Allgather(&nblobs,1,MPI_INT,getPtr(N_blobs),1,MPI_INT,comm); + comm.allGather(nblobs,getPtr(N_blobs)); PROFILE_STOP("LocalToGlobalIDs-Allgather",1); int64_t N_blobs_tot = 0; int offset = 0; @@ -363,13 +362,12 @@ static int LocalToGlobalIDs( int nx, int ny, int nz, const RankInfoStruct& rank_ std::vector N_recv(neighbors.size(),0); std::vector send_req(neighbors.size()); std::vector recv_req(neighbors.size()); - std::vector status(neighbors.size()); for (size_t i=0; i recv_buf(neighbors.size()); @@ -398,8 +396,7 @@ static int LocalToGlobalIDs( int nx, int ny, int nz, const RankInfoStruct& rank_ bool changed = updateLocalIds( remote_map, map ); // Check if we are finished int test = changed ? 1:0; - int result = 0; - MPI_Allreduce(&test,&result,1,MPI_INT,MPI_SUM,comm); + int result = comm.sumReduce( test ); if ( result==0 ) break; } @@ -435,7 +432,7 @@ static int LocalToGlobalIDs( int nx, int ny, int nz, const RankInfoStruct& rank_ } int ComputeGlobalBlobIDs( int nx, int ny, int nz, const RankInfoStruct& rank_info, const DoubleArray& Phase, const DoubleArray& SignDist, double vF, double vS, - BlobIDArray& GlobalBlobID, MPI_Comm comm ) + BlobIDArray& GlobalBlobID, const Utilities::MPI& comm ) { PROFILE_START("ComputeGlobalBlobIDs"); // First compute the local ids @@ -446,7 +443,7 @@ int ComputeGlobalBlobIDs( int nx, int ny, int nz, const RankInfoStruct& rank_inf return nglobal; } int ComputeGlobalPhaseComponent( int nx, int ny, int nz, const RankInfoStruct& rank_info, - const IntArray &PhaseID, int &VALUE, BlobIDArray &GlobalBlobID, MPI_Comm comm ) + const IntArray &PhaseID, int &VALUE, BlobIDArray &GlobalBlobID, const Utilities::MPI& comm ) { PROFILE_START("ComputeGlobalPhaseComponent"); // First compute the local ids @@ -462,37 +459,27 @@ int ComputeGlobalPhaseComponent( int nx, int ny, int nz, const RankInfoStruct& r * Compute the mapping of blob ids between timesteps * ******************************************************************/ typedef std::map > map_type; -template inline MPI_Datatype getMPIType(); -template<> inline MPI_Datatype getMPIType() { return MPI_INT; } -template<> inline MPI_Datatype getMPIType() { - if ( sizeof(int64_t)==sizeof(long int) ) - return MPI_LONG; - else if ( sizeof(int64_t)==sizeof(double) ) - return MPI_DOUBLE; -} template -void gatherSet( std::set& set, MPI_Comm comm ) +void gatherSet( std::set& set, const Utilities::MPI& comm ) { - int nprocs = comm_size(comm); - MPI_Datatype type = getMPIType(); + int nprocs = comm.getSize(); std::vector send_data(set.begin(),set.end()); int send_count = send_data.size(); std::vector recv_count(nprocs,0), recv_disp(nprocs,0); - MPI_Allgather(&send_count,1,MPI_INT,getPtr(recv_count),1,MPI_INT,comm); + comm.allGather( send_count, getPtr(recv_count) ); for (int i=1; i recv_data(recv_disp[nprocs-1]+recv_count[nprocs-1]); - MPI_Allgatherv(getPtr(send_data),send_count,type, - getPtr(recv_data),getPtr(recv_count),getPtr(recv_disp),type,comm); + comm.allGather( getPtr(send_data), send_count, getPtr(recv_data), + getPtr(recv_count), getPtr(recv_disp), true ); for (size_t i=0; i(); + int nprocs = comm.getSize(); std::vector send_data; - for (map_type::const_iterator it=src_map.begin(); it!=src_map.end(); ++it) { + for (auto it=src_map.begin(); it!=src_map.end(); ++it) { int id = it->first; const std::map& src_ids = it->second; send_data.push_back(id); @@ -505,21 +492,21 @@ void 
gatherSrcIDMap( map_type& src_map, MPI_Comm comm ) } int send_count = send_data.size(); std::vector recv_count(nprocs,0), recv_disp(nprocs,0); - MPI_Allgather(&send_count,1,MPI_INT,getPtr(recv_count),1,MPI_INT,comm); + comm.allGather(send_count,getPtr(recv_count)); for (int i=1; i recv_data(recv_disp[nprocs-1]+recv_count[nprocs-1]); - MPI_Allgatherv(getPtr(send_data),send_count,type, - getPtr(recv_data),getPtr(recv_count),getPtr(recv_disp),type,comm); + comm.allGather(getPtr(send_data),send_count, + getPtr(recv_data),getPtr(recv_count),getPtr(recv_disp),true); size_t i=0; src_map.clear(); while ( i < recv_data.size() ) { BlobIDType id = recv_data[i]; size_t count = recv_data[i+1]; i += 2; - std::map& src_ids = src_map[id]; + auto& src_ids = src_map[id]; for (size_t j=0; j::iterator it = src_ids.find(recv_data[i]); + auto it = src_ids.find(recv_data[i]); if ( it == src_ids.end() ) src_ids.insert(std::pair(recv_data[i],recv_data[i+1])); else @@ -538,7 +525,7 @@ void addSrcDstIDs( BlobIDType src_id, map_type& src_map, map_type& dst_map, } } ID_map_struct computeIDMap( int nx, int ny, int nz, - const BlobIDArray& ID1, const BlobIDArray& ID2, MPI_Comm comm ) + const BlobIDArray& ID1, const BlobIDArray& ID2, const Utilities::MPI& comm ) { ASSERT(ID1.size()==ID2.size()); PROFILE_START("computeIDMap"); @@ -780,7 +767,7 @@ void renumberIDs( const std::vector& new_ids, BlobIDArray& IDs ) ******************************************************************/ void writeIDMap( const ID_map_struct& map, long long int timestep, const std::string& filename ) { - int rank = MPI_WORLD_RANK(); + int rank = Utilities::MPI( MPI_COMM_WORLD ).getRank(); if ( rank!=0 ) return; bool empty = map.created.empty() && map.destroyed.empty() && diff --git a/analysis/analysis.h b/analysis/analysis.h index 2ce531b1..ec377995 100644 --- a/analysis/analysis.h +++ b/analysis/analysis.h @@ -58,7 +58,7 @@ int ComputeLocalPhaseComponent( const IntArray &PhaseID, int &VALUE, IntArray &C */ int ComputeGlobalBlobIDs( int nx, int ny, int nz, const RankInfoStruct& rank_info, const DoubleArray& Phase, const DoubleArray& SignDist, double vF, double vS, - BlobIDArray& GlobalBlobID, MPI_Comm comm ); + BlobIDArray& GlobalBlobID, const Utilities::MPI& comm ); /*! @@ -75,7 +75,7 @@ int ComputeGlobalBlobIDs( int nx, int ny, int nz, const RankInfoStruct& rank_inf * @return Return the number of components in the specified phase */ int ComputeGlobalPhaseComponent( int nx, int ny, int nz, const RankInfoStruct& rank_info, - const IntArray &PhaseID, int &VALUE, BlobIDArray &GlobalBlobID, MPI_Comm comm ); + const IntArray &PhaseID, int &VALUE, BlobIDArray &GlobalBlobID, const Utilities::MPI& comm ); /*! @@ -87,7 +87,7 @@ int ComputeGlobalPhaseComponent( int nx, int ny, int nz, const RankInfoStruct& r * @param[in] nz Number of elements in the z-direction * @param[in/out] ID The ids of the blobs */ -void ReorderBlobIDs( BlobIDArray& ID, MPI_Comm comm ); +void ReorderBlobIDs( BlobIDArray& ID, const Utilities::MPI& comm ); typedef std::pair > BlobIDSplitStruct; @@ -120,7 +120,7 @@ struct ID_map_struct { * @param[in] ID1 The blob ids at the first timestep * @param[in] ID2 The blob ids at the second timestep */ -ID_map_struct computeIDMap( int nx, int ny, int nz, const BlobIDArray& ID1, const BlobIDArray& ID2, MPI_Comm comm ); +ID_map_struct computeIDMap( int nx, int ny, int nz, const BlobIDArray& ID1, const BlobIDArray& ID2, const Utilities::MPI& comm ); /*! 
diff --git a/analysis/distance.cpp b/analysis/distance.cpp index e297b435..9c605e1e 100644 --- a/analysis/distance.cpp +++ b/analysis/distance.cpp @@ -176,7 +176,7 @@ void CalcVecDist( Array &d, const Array &ID0, const Domain &Dm, // Update distance double err = calcVecUpdateInterior( d, dx[0], dx[1], dx[2] ); // Check if we are finished - err = maxReduce( Dm.Comm, err ); + err = Dm.Comm.maxReduce( err ); if ( err < tol ) break; } diff --git a/analysis/morphology.cpp b/analysis/morphology.cpp index 05278313..a65cb237 100644 --- a/analysis/morphology.cpp +++ b/analysis/morphology.cpp @@ -58,11 +58,11 @@ double MorphOpen(DoubleArray &SignDist, signed char *id, std::shared_ptr } } } - MPI_Barrier(Dm->Comm); + Dm->Comm.barrier(); // total Global is the number of nodes in the pore-space - MPI_Allreduce(&count,&totalGlobal,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&maxdist,&maxdistGlobal,1,MPI_DOUBLE,MPI_MAX,Dm->Comm); + totalGlobal = Dm->Comm.sumReduce( count ); + maxdistGlobal = Dm->Comm.sumReduce( maxdist ); double volume=double(nprocx*nprocy*nprocz)*double(nx-2)*double(ny-2)*double(nz-2); double volume_fraction=totalGlobal/volume; if (rank==0) printf("Volume fraction for morphological opening: %f \n",volume_fraction); @@ -133,7 +133,6 @@ double MorphOpen(DoubleArray &SignDist, signed char *id, std::shared_ptr double deltaR=0.05; // amount to change the radius in voxel units double Rcrit_old; - double GlobalNumber = 1.f; int imin,jmin,kmin,imax,jmax,kmax; if (ErodeLabel == 1){ @@ -203,41 +202,41 @@ double MorphOpen(DoubleArray &SignDist, signed char *id, std::shared_ptr PackID(Dm->sendList_YZ, Dm->sendCount_YZ ,sendID_YZ, id); //...................................................................................... MPI_Sendrecv(sendID_x,Dm->sendCount_x,MPI_CHAR,Dm->rank_x(),sendtag, - recvID_X,Dm->recvCount_X,MPI_CHAR,Dm->rank_X(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_X,Dm->recvCount_X,MPI_CHAR,Dm->rank_X(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_X,Dm->sendCount_X,MPI_CHAR,Dm->rank_X(),sendtag, - recvID_x,Dm->recvCount_x,MPI_CHAR,Dm->rank_x(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_x,Dm->recvCount_x,MPI_CHAR,Dm->rank_x(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_y,Dm->sendCount_y,MPI_CHAR,Dm->rank_y(),sendtag, - recvID_Y,Dm->recvCount_Y,MPI_CHAR,Dm->rank_Y(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_Y,Dm->recvCount_Y,MPI_CHAR,Dm->rank_Y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Y,Dm->sendCount_Y,MPI_CHAR,Dm->rank_Y(),sendtag, - recvID_y,Dm->recvCount_y,MPI_CHAR,Dm->rank_y(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_y,Dm->recvCount_y,MPI_CHAR,Dm->rank_y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_z,Dm->sendCount_z,MPI_CHAR,Dm->rank_z(),sendtag, - recvID_Z,Dm->recvCount_Z,MPI_CHAR,Dm->rank_Z(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_Z,Dm->recvCount_Z,MPI_CHAR,Dm->rank_Z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Z,Dm->sendCount_Z,MPI_CHAR,Dm->rank_Z(),sendtag, - recvID_z,Dm->recvCount_z,MPI_CHAR,Dm->rank_z(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_z,Dm->recvCount_z,MPI_CHAR,Dm->rank_z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xy,Dm->sendCount_xy,MPI_CHAR,Dm->rank_xy(),sendtag, - recvID_XY,Dm->recvCount_XY,MPI_CHAR,Dm->rank_XY(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + 
recvID_XY,Dm->recvCount_XY,MPI_CHAR,Dm->rank_XY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_XY,Dm->sendCount_XY,MPI_CHAR,Dm->rank_XY(),sendtag, - recvID_xy,Dm->recvCount_xy,MPI_CHAR,Dm->rank_xy(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_xy,Dm->recvCount_xy,MPI_CHAR,Dm->rank_xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Xy,Dm->sendCount_Xy,MPI_CHAR,Dm->rank_Xy(),sendtag, - recvID_xY,Dm->recvCount_xY,MPI_CHAR,Dm->rank_xY(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_xY,Dm->recvCount_xY,MPI_CHAR,Dm->rank_xY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xY,Dm->sendCount_xY,MPI_CHAR,Dm->rank_xY(),sendtag, - recvID_Xy,Dm->recvCount_Xy,MPI_CHAR,Dm->rank_Xy(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_Xy,Dm->recvCount_Xy,MPI_CHAR,Dm->rank_Xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xz,Dm->sendCount_xz,MPI_CHAR,Dm->rank_xz(),sendtag, - recvID_XZ,Dm->recvCount_XZ,MPI_CHAR,Dm->rank_XZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_XZ,Dm->recvCount_XZ,MPI_CHAR,Dm->rank_XZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_XZ,Dm->sendCount_XZ,MPI_CHAR,Dm->rank_XZ(),sendtag, - recvID_xz,Dm->recvCount_xz,MPI_CHAR,Dm->rank_xz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_xz,Dm->recvCount_xz,MPI_CHAR,Dm->rank_xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Xz,Dm->sendCount_Xz,MPI_CHAR,Dm->rank_Xz(),sendtag, - recvID_xZ,Dm->recvCount_xZ,MPI_CHAR,Dm->rank_xZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_xZ,Dm->recvCount_xZ,MPI_CHAR,Dm->rank_xZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xZ,Dm->sendCount_xZ,MPI_CHAR,Dm->rank_xZ(),sendtag, - recvID_Xz,Dm->recvCount_Xz,MPI_CHAR,Dm->rank_Xz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_Xz,Dm->recvCount_Xz,MPI_CHAR,Dm->rank_Xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_yz,Dm->sendCount_yz,MPI_CHAR,Dm->rank_yz(),sendtag, - recvID_YZ,Dm->recvCount_YZ,MPI_CHAR,Dm->rank_YZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_YZ,Dm->recvCount_YZ,MPI_CHAR,Dm->rank_YZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_YZ,Dm->sendCount_YZ,MPI_CHAR,Dm->rank_YZ(),sendtag, - recvID_yz,Dm->recvCount_yz,MPI_CHAR,Dm->rank_yz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_yz,Dm->recvCount_yz,MPI_CHAR,Dm->rank_yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Yz,Dm->sendCount_Yz,MPI_CHAR,Dm->rank_Yz(),sendtag, - recvID_yZ,Dm->recvCount_yZ,MPI_CHAR,Dm->rank_yZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_yZ,Dm->recvCount_yZ,MPI_CHAR,Dm->rank_yZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_yZ,Dm->sendCount_yZ,MPI_CHAR,Dm->rank_yZ(),sendtag, - recvID_Yz,Dm->recvCount_Yz,MPI_CHAR,Dm->rank_Yz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_Yz,Dm->recvCount_Yz,MPI_CHAR,Dm->rank_Yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); //...................................................................................... UnpackID(Dm->recvList_x, Dm->recvCount_x ,recvID_x, id); UnpackID(Dm->recvList_X, Dm->recvCount_X ,recvID_X, id); @@ -259,7 +258,7 @@ double MorphOpen(DoubleArray &SignDist, signed char *id, std::shared_ptr UnpackID(Dm->recvList_YZ, Dm->recvCount_YZ ,recvID_YZ, id); //...................................................................................... 
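// --- Editor's usage sketch (not part of the patch) ---------------------------------
// The halo exchanges in morphology.cpp keep the raw MPI_Sendrecv calls and only unwrap
// the communicator with getCommunicator().  A reduced single-face sketch of that interop
// pattern, assuming only getCommunicator() and barrier() from the wrapper; the buffer
// names, counts, and ranks are placeholders.
#include "common/MPI.h"
#include <mpi.h>

void halo_exchange_example( const Utilities::MPI& comm,
                            char *sendbuf, int sendCount, int rank_send,
                            char *recvbuf, int recvCount, int rank_recv, int tag )
{
    // The wrapper still owns the communicator; raw MPI routines receive it via getCommunicator().
    MPI_Sendrecv( sendbuf, sendCount, MPI_CHAR, rank_send, tag,
                  recvbuf, recvCount, MPI_CHAR, rank_recv, tag,
                  comm.getCommunicator(), MPI_STATUS_IGNORE );
    comm.barrier();
}
// ------------------------------------------------------------------------------------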
- MPI_Allreduce(&LocalNumber,&GlobalNumber,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + //double GlobalNumber = Dm->Comm.sumReduce( LocalNumber ); count = 0.f; for (int k=1; k } } } - MPI_Allreduce(&count,&countGlobal,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + countGlobal = Dm->Comm.sumReduce( count ); void_fraction_new = countGlobal/totalGlobal; void_fraction_diff_new = abs(void_fraction_new-VoidFraction); /* if (rank==0){ @@ -360,11 +359,11 @@ double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptrComm); + Dm->Comm.barrier(); // total Global is the number of nodes in the pore-space - MPI_Allreduce(&count,&totalGlobal,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); - MPI_Allreduce(&maxdist,&maxdistGlobal,1,MPI_DOUBLE,MPI_MAX,Dm->Comm); + totalGlobal = Dm->Comm.sumReduce( count ); + maxdistGlobal = Dm->Comm.sumReduce( maxdist ); double volume=double(nprocx*nprocy*nprocz)*double(nx-2)*double(ny-2)*double(nz-2); double volume_fraction=totalGlobal/volume; if (rank==0) printf("Volume fraction for morphological opening: %f \n",volume_fraction); @@ -434,7 +433,6 @@ double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptrComm); + Dm->Comm.barrier(); FILE *DRAIN = fopen("morphdrain.csv","w"); @@ -509,41 +507,41 @@ double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptrsendList_YZ, Dm->sendCount_YZ ,sendID_YZ, id); //...................................................................................... MPI_Sendrecv(sendID_x,Dm->sendCount_x,MPI_CHAR,Dm->rank_x(),sendtag, - recvID_X,Dm->recvCount_X,MPI_CHAR,Dm->rank_X(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_X,Dm->recvCount_X,MPI_CHAR,Dm->rank_X(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_X,Dm->sendCount_X,MPI_CHAR,Dm->rank_X(),sendtag, - recvID_x,Dm->recvCount_x,MPI_CHAR,Dm->rank_x(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_x,Dm->recvCount_x,MPI_CHAR,Dm->rank_x(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_y,Dm->sendCount_y,MPI_CHAR,Dm->rank_y(),sendtag, - recvID_Y,Dm->recvCount_Y,MPI_CHAR,Dm->rank_Y(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_Y,Dm->recvCount_Y,MPI_CHAR,Dm->rank_Y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Y,Dm->sendCount_Y,MPI_CHAR,Dm->rank_Y(),sendtag, - recvID_y,Dm->recvCount_y,MPI_CHAR,Dm->rank_y(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_y,Dm->recvCount_y,MPI_CHAR,Dm->rank_y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_z,Dm->sendCount_z,MPI_CHAR,Dm->rank_z(),sendtag, - recvID_Z,Dm->recvCount_Z,MPI_CHAR,Dm->rank_Z(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_Z,Dm->recvCount_Z,MPI_CHAR,Dm->rank_Z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Z,Dm->sendCount_Z,MPI_CHAR,Dm->rank_Z(),sendtag, - recvID_z,Dm->recvCount_z,MPI_CHAR,Dm->rank_z(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_z,Dm->recvCount_z,MPI_CHAR,Dm->rank_z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xy,Dm->sendCount_xy,MPI_CHAR,Dm->rank_xy(),sendtag, - recvID_XY,Dm->recvCount_XY,MPI_CHAR,Dm->rank_XY(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_XY,Dm->recvCount_XY,MPI_CHAR,Dm->rank_XY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_XY,Dm->sendCount_XY,MPI_CHAR,Dm->rank_XY(),sendtag, - recvID_xy,Dm->recvCount_xy,MPI_CHAR,Dm->rank_xy(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_xy,Dm->recvCount_xy,MPI_CHAR,Dm->rank_xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); 
MPI_Sendrecv(sendID_Xy,Dm->sendCount_Xy,MPI_CHAR,Dm->rank_Xy(),sendtag, - recvID_xY,Dm->recvCount_xY,MPI_CHAR,Dm->rank_xY(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_xY,Dm->recvCount_xY,MPI_CHAR,Dm->rank_xY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xY,Dm->sendCount_xY,MPI_CHAR,Dm->rank_xY(),sendtag, - recvID_Xy,Dm->recvCount_Xy,MPI_CHAR,Dm->rank_Xy(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_Xy,Dm->recvCount_Xy,MPI_CHAR,Dm->rank_Xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xz,Dm->sendCount_xz,MPI_CHAR,Dm->rank_xz(),sendtag, - recvID_XZ,Dm->recvCount_XZ,MPI_CHAR,Dm->rank_XZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_XZ,Dm->recvCount_XZ,MPI_CHAR,Dm->rank_XZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_XZ,Dm->sendCount_XZ,MPI_CHAR,Dm->rank_XZ(),sendtag, - recvID_xz,Dm->recvCount_xz,MPI_CHAR,Dm->rank_xz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_xz,Dm->recvCount_xz,MPI_CHAR,Dm->rank_xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Xz,Dm->sendCount_Xz,MPI_CHAR,Dm->rank_Xz(),sendtag, - recvID_xZ,Dm->recvCount_xZ,MPI_CHAR,Dm->rank_xZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_xZ,Dm->recvCount_xZ,MPI_CHAR,Dm->rank_xZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xZ,Dm->sendCount_xZ,MPI_CHAR,Dm->rank_xZ(),sendtag, - recvID_Xz,Dm->recvCount_Xz,MPI_CHAR,Dm->rank_Xz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_Xz,Dm->recvCount_Xz,MPI_CHAR,Dm->rank_Xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_yz,Dm->sendCount_yz,MPI_CHAR,Dm->rank_yz(),sendtag, - recvID_YZ,Dm->recvCount_YZ,MPI_CHAR,Dm->rank_YZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_YZ,Dm->recvCount_YZ,MPI_CHAR,Dm->rank_YZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_YZ,Dm->sendCount_YZ,MPI_CHAR,Dm->rank_YZ(),sendtag, - recvID_yz,Dm->recvCount_yz,MPI_CHAR,Dm->rank_yz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_yz,Dm->recvCount_yz,MPI_CHAR,Dm->rank_yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Yz,Dm->sendCount_Yz,MPI_CHAR,Dm->rank_Yz(),sendtag, - recvID_yZ,Dm->recvCount_yZ,MPI_CHAR,Dm->rank_yZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_yZ,Dm->recvCount_yZ,MPI_CHAR,Dm->rank_yZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_yZ,Dm->sendCount_yZ,MPI_CHAR,Dm->rank_yZ(),sendtag, - recvID_Yz,Dm->recvCount_Yz,MPI_CHAR,Dm->rank_Yz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); + recvID_Yz,Dm->recvCount_Yz,MPI_CHAR,Dm->rank_Yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); //...................................................................................... UnpackID(Dm->recvList_x, Dm->recvCount_x ,recvID_x, id); UnpackID(Dm->recvList_X, Dm->recvCount_X ,recvID_X, id); @@ -564,7 +562,7 @@ double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptrrecvList_yZ, Dm->recvCount_yZ ,recvID_yZ, id); UnpackID(Dm->recvList_YZ, Dm->recvCount_YZ ,recvID_YZ, id); //...................................................................................... 
- MPI_Allreduce(&LocalNumber,&GlobalNumber,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + // double GlobalNumber = Dm->Comm.sumReduce( LocalNumber ); for (int k=0; krank_info,phase,SignDist,vF,vS,phase_label,Dm->Comm); - MPI_Barrier(Dm->Comm); + Dm->Comm.barrier(); for (int k=0; kComm); + countGlobal = Dm->Comm.sumReduce( count ); void_fraction_new = countGlobal/totalGlobal; void_fraction_diff_new = abs(void_fraction_new-VoidFraction); if (rank==0){ @@ -702,7 +700,7 @@ double MorphGrow(DoubleArray &BoundaryDist, DoubleArray &Dist, Array &id, } } } - double count_original=sumReduce( Dm->Comm, count); + double count_original = Dm->Comm.sumReduce( count); // Estimate morph_delta double morph_delta = 0.0; @@ -732,8 +730,8 @@ double MorphGrow(DoubleArray &BoundaryDist, DoubleArray &Dist, Array &id, } } } - count=sumReduce( Dm->Comm, count); - MAX_DISPLACEMENT = maxReduce( Dm->Comm, MAX_DISPLACEMENT); + count = Dm->Comm.sumReduce( count ); + MAX_DISPLACEMENT = Dm->Comm.maxReduce( MAX_DISPLACEMENT ); GrowthEstimate = count - count_original; ERROR = fabs((GrowthEstimate-TargetGrowth) /TargetGrowth); @@ -776,7 +774,7 @@ double MorphGrow(DoubleArray &BoundaryDist, DoubleArray &Dist, Array &id, } } } - count=sumReduce( Dm->Comm, count); + count = Dm->Comm.sumReduce( count ); return count; } diff --git a/analysis/runAnalysis.cpp b/analysis/runAnalysis.cpp index 6c76f58b..89451c7b 100644 --- a/analysis/runAnalysis.cpp +++ b/analysis/runAnalysis.cpp @@ -3,7 +3,7 @@ #include "analysis/analysis.h" #include "common/Array.h" #include "common/Communication.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/ScaLBL.h" #include "models/ColorModel.h" @@ -462,7 +462,7 @@ private: /****************************************************************** * MPI comm wrapper for use with analysis * ******************************************************************/ -runAnalysis::commWrapper::commWrapper( int tag_, MPI_Comm comm_, runAnalysis* analysis_ ): +runAnalysis::commWrapper::commWrapper( int tag_, const Utilities::MPI& comm_, runAnalysis* analysis_ ): comm(comm_), tag(tag_), analysis(analysis_) @@ -479,7 +479,7 @@ runAnalysis::commWrapper::~commWrapper() { if ( tag == -1 ) return; - MPI_Barrier( comm ); + comm.barrier(); analysis->d_comm_used[tag] = false; } runAnalysis::commWrapper runAnalysis::getComm( ) @@ -496,10 +496,10 @@ runAnalysis::commWrapper runAnalysis::getComm( ) if ( tag == -1 ) ERROR("Unable to get comm"); } - MPI_Bcast( &tag, 1, MPI_INT, 0, d_comm ); + tag = d_comm.bcast( tag, 0 ); d_comm_used[tag] = true; - if ( d_comms[tag] == MPI_COMM_NULL ) - MPI_Comm_dup( MPI_COMM_WORLD, &d_comms[tag] ); + if ( d_comms[tag].isNull() ) + d_comms[tag] = d_comm.dup(); return commWrapper(tag,d_comms[tag],this); } @@ -507,14 +507,20 @@ runAnalysis::commWrapper runAnalysis::getComm( ) /****************************************************************** * Constructor/Destructors * ******************************************************************/ -runAnalysis::runAnalysis(std::shared_ptr input_db, const RankInfoStruct& rank_info, std::shared_ptr ScaLBL_Comm, std::shared_ptr Dm, - int Np, bool Regular, IntArray Map ): - d_Np( Np ), - d_regular ( Regular), - d_rank_info( rank_info ), - d_Map( Map ), - d_fillData(Dm->Comm,Dm->rank_info,{Dm->Nx-2,Dm->Ny-2,Dm->Nz-2},{1,1,1},0,1), - d_ScaLBL_Comm( ScaLBL_Comm) +runAnalysis::runAnalysis( std::shared_ptr input_db, + const RankInfoStruct& rank_info, + std::shared_ptr ScaLBL_Comm, + std::shared_ptr Dm, + int Np, + bool Regular, + IntArray Map ): + d_Np( Np ), + 
d_regular ( Regular), + d_rank_info( rank_info ), + d_Map( Map ), + d_fillData(Dm->Comm,Dm->rank_info,{Dm->Nx-2,Dm->Ny-2,Dm->Nz-2},{1,1,1},0,1), + d_comm( Utilities::MPI( MPI_COMM_WORLD ).dup() ), + d_ScaLBL_Comm( ScaLBL_Comm) { auto db = input_db->getDatabase( "Analysis" ); @@ -552,7 +558,7 @@ runAnalysis::runAnalysis(std::shared_ptr input_db, const RankInfoStruc d_restartFile = restart_file + "." + rankString; - d_rank = MPI_WORLD_RANK(); + d_rank = d_comm.getRank(); writeIDMap(ID_map_struct(),0,id_map_filename); // Initialize IO for silo IO::initialize("","silo","false"); @@ -621,11 +627,8 @@ runAnalysis::runAnalysis(std::shared_ptr input_db, const RankInfoStruc // Initialize the comms - MPI_Comm_dup(MPI_COMM_WORLD,&d_comm); - for (int i=0; i<1024; i++) { - d_comms[i] = MPI_COMM_NULL; + for (int i=0; i<1024; i++) d_comm_used[i] = false; - } // Initialize the threads int N_threads = db->getWithDefault( "N_threads", 4 ); auto method = db->getWithDefault( "load_balance", "default" ); @@ -635,12 +638,6 @@ runAnalysis::~runAnalysis( ) { // Finish processing analysis finish(); - // Clear internal data - MPI_Comm_free( &d_comm ); - for (int i=0; i<1024; i++) { - if ( d_comms[i] != MPI_COMM_NULL ) - MPI_Comm_free(&d_comms[i]); - } } void runAnalysis::finish( ) { @@ -654,7 +651,7 @@ void runAnalysis::finish( ) d_wait_subphase.reset(); d_wait_restart.reset(); // Syncronize - MPI_Barrier( d_comm ); + d_comm.barrier(); PROFILE_STOP("finish"); } diff --git a/analysis/runAnalysis.h b/analysis/runAnalysis.h index 0bf2f676..3c5bc7f0 100644 --- a/analysis/runAnalysis.h +++ b/analysis/runAnalysis.h @@ -68,10 +68,10 @@ public: class commWrapper { public: - MPI_Comm comm; + Utilities::MPI comm; int tag; runAnalysis *analysis; - commWrapper( int tag, MPI_Comm comm, runAnalysis *analysis ); + commWrapper( int tag, const Utilities::MPI& comm, runAnalysis *analysis ); commWrapper( ) = delete; commWrapper( const commWrapper &rhs ) = delete; commWrapper& operator=( const commWrapper &rhs ) = delete; @@ -100,8 +100,8 @@ private: std::vector d_meshData; fillHalo d_fillData; std::string d_restartFile; - MPI_Comm d_comm; - MPI_Comm d_comms[1024]; + Utilities::MPI d_comm; + Utilities::MPI d_comms[1024]; volatile bool d_comm_used[1024]; std::shared_ptr d_ScaLBL_Comm; diff --git a/analysis/uCT.cpp b/analysis/uCT.cpp index 912f8e85..28d677c1 100644 --- a/analysis/uCT.cpp +++ b/analysis/uCT.cpp @@ -228,8 +228,7 @@ void filter_final( Array& ID, Array& Dist, Array& Mean, Array& Dist1, Array& Dist2 ) { PROFILE_SCOPED(timer,"filter_final"); - int rank; - MPI_Comm_rank(Dm.Comm,&rank); + int rank = Dm.Comm.getRank(); int Nx = Dm.Nx-2; int Ny = Dm.Ny-2; int Nz = Dm.Nz-2; @@ -242,7 +241,7 @@ void filter_final( Array& ID, Array& Dist, float tmp = 0; for (size_t i=0; i(Dist0.length()) ); const float dx1 = 0.3*tmp; const float dx2 = 1.05*dx1; if (rank==0) @@ -285,7 +284,7 @@ void filter_final( Array& ID, Array& Dist, Phase.fill(1); ComputeGlobalBlobIDs( Nx, Ny, Nz, Dm.rank_info, Phase, SignDist, 0, 0, GlobalBlobID, Dm.Comm ); fillInt.fill(GlobalBlobID); - int N_blobs = maxReduce(Dm.Comm,GlobalBlobID.max()+1); + int N_blobs = Dm.Comm.maxReduce(GlobalBlobID.max()+1); std::vector mean(N_blobs,0); std::vector count(N_blobs,0); for (int k=1; k<=Nz; k++) { @@ -321,8 +320,8 @@ void filter_final( Array& ID, Array& Dist, } } } - mean = sumReduce(Dm.Comm,mean); - count = sumReduce(Dm.Comm,count); + mean = Dm.Comm.sumReduce(mean); + count = Dm.Comm.sumReduce(count); for (size_t i=0; i -o ") +set(CMAKE_HIP_CREATE_SHARED_MODULE 
"${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} -o -shared" ) +set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} -o ") + +############################################################################### +# FIND: HIP and associated helper binaries +############################################################################### +# HIP is supported on Linux only +if(UNIX AND NOT APPLE AND NOT CYGWIN) + # Search for HIP installation + if(NOT HIP_ROOT_DIR) + # Search in user specified path first + find_path( + HIP_ROOT_DIR + NAMES hipconfig + PATHS + ENV ROCM_PATH + ENV HIP_PATH + PATH_SUFFIXES bin + DOC "HIP installed location" + NO_DEFAULT_PATH + ) + # Now search in default path + find_path( + HIP_ROOT_DIR + NAMES hipconfig + PATHS + /opt/rocm + /opt/rocm/hip + PATH_SUFFIXES bin + DOC "HIP installed location" + ) + + # Check if we found HIP installation + if(HIP_ROOT_DIR) + # If so, fix the path + string(REGEX REPLACE "[/\\\\]?bin[64]*[/\\\\]?$" "" HIP_ROOT_DIR ${HIP_ROOT_DIR}) + # And push it back to the cache + set(HIP_ROOT_DIR ${HIP_ROOT_DIR} CACHE PATH "HIP installed location" FORCE) + endif() + if(NOT EXISTS ${HIP_ROOT_DIR}) + if(HIP_FIND_REQUIRED) + message(FATAL_ERROR "Specify HIP_ROOT_DIR") + elseif(NOT HIP_FIND_QUIETLY) + message("HIP_ROOT_DIR not found or specified") + endif() + endif() + endif() + + # Find HIPCC executable + find_program( + HIP_HIPCC_EXECUTABLE + NAMES hipcc + PATHS + "${HIP_ROOT_DIR}" + ENV ROCM_PATH + ENV HIP_PATH + /opt/rocm + /opt/rocm/hip + PATH_SUFFIXES bin + NO_DEFAULT_PATH + ) + if(NOT HIP_HIPCC_EXECUTABLE) + # Now search in default paths + find_program(HIP_HIPCC_EXECUTABLE hipcc) + endif() + mark_as_advanced(HIP_HIPCC_EXECUTABLE) + + # Find HIPCONFIG executable + find_program( + HIP_HIPCONFIG_EXECUTABLE + NAMES hipconfig + PATHS + "${HIP_ROOT_DIR}" + ENV ROCM_PATH + ENV HIP_PATH + /opt/rocm + /opt/rocm/hip + PATH_SUFFIXES bin + NO_DEFAULT_PATH + ) + if(NOT HIP_HIPCONFIG_EXECUTABLE) + # Now search in default paths + find_program(HIP_HIPCONFIG_EXECUTABLE hipconfig) + endif() + mark_as_advanced(HIP_HIPCONFIG_EXECUTABLE) + + # Find HIPCC_CMAKE_LINKER_HELPER executable + find_program( + HIP_HIPCC_CMAKE_LINKER_HELPER + NAMES hipcc_cmake_linker_helper + PATHS + "${HIP_ROOT_DIR}" + ENV ROCM_PATH + ENV HIP_PATH + /opt/rocm + /opt/rocm/hip + PATH_SUFFIXES bin + NO_DEFAULT_PATH + ) + if(NOT HIP_HIPCC_CMAKE_LINKER_HELPER) + # Now search in default paths + find_program(HIP_HIPCC_CMAKE_LINKER_HELPER hipcc_cmake_linker_helper) + endif() + mark_as_advanced(HIP_HIPCC_CMAKE_LINKER_HELPER) + + if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_VERSION) + # Compute the version + execute_process( + COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version + OUTPUT_VARIABLE _hip_version + ERROR_VARIABLE _hip_error + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_STRIP_TRAILING_WHITESPACE + ) + if(NOT _hip_error) + set(HIP_VERSION ${_hip_version} CACHE STRING "Version of HIP as computed from hipcc") + else() + set(HIP_VERSION "0.0.0" CACHE STRING "Version of HIP as computed by FindHIP()") + endif() + mark_as_advanced(HIP_VERSION) + endif() + if(HIP_VERSION) + string(REPLACE "." 
";" _hip_version_list "${HIP_VERSION}") + list(GET _hip_version_list 0 HIP_VERSION_MAJOR) + list(GET _hip_version_list 1 HIP_VERSION_MINOR) + list(GET _hip_version_list 2 HIP_VERSION_PATCH) + set(HIP_VERSION_STRING "${HIP_VERSION}") + endif() + + if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_PLATFORM) + # Compute the platform + execute_process( + COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --platform + OUTPUT_VARIABLE _hip_platform + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + set(HIP_PLATFORM ${_hip_platform} CACHE STRING "HIP platform as computed by hipconfig") + mark_as_advanced(HIP_PLATFORM) + endif() +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + HIP + REQUIRED_VARS + HIP_ROOT_DIR + HIP_HIPCC_EXECUTABLE + HIP_HIPCONFIG_EXECUTABLE + HIP_PLATFORM + VERSION_VAR HIP_VERSION + ) + +############################################################################### +# MACRO: Locate helper files +############################################################################### +macro(HIP_FIND_HELPER_FILE _name _extension) + set(_hip_full_name "${_name}.${_extension}") + get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + set(HIP_${_name} "${CMAKE_CURRENT_LIST_DIR}/FindHIP/${_hip_full_name}") + if(NOT EXISTS "${HIP_${_name}}") + set(error_message "${_hip_full_name} not found in ${CMAKE_CURRENT_LIST_DIR}/FindHIP") + if(HIP_FIND_REQUIRED) + message(FATAL_ERROR "${error_message}") + else() + if(NOT HIP_FIND_QUIETLY) + message(STATUS "${error_message}") + endif() + endif() + endif() + # Set this variable as internal, so the user isn't bugged with it. + set(HIP_${_name} ${HIP_${_name}} CACHE INTERNAL "Location of ${_full_name}" FORCE) +endmacro() + +############################################################################### +hip_find_helper_file(run_make2cmake cmake) +hip_find_helper_file(run_hipcc cmake) +############################################################################### + +############################################################################### +# MACRO: Reset compiler flags +############################################################################### +macro(HIP_RESET_FLAGS) + unset(HIP_HIPCC_FLAGS) + unset(HIP_HCC_FLAGS) + unset(HIP_NVCC_FLAGS) + foreach(config ${_hip_configuration_types}) + string(TOUPPER ${config} config_upper) + unset(HIP_HIPCC_FLAGS_${config_upper}) + unset(HIP_HCC_FLAGS_${config_upper}) + unset(HIP_NVCC_FLAGS_${config_upper}) + endforeach() +endmacro() + +############################################################################### +# MACRO: Separate the options from the sources +############################################################################### +macro(HIP_GET_SOURCES_AND_OPTIONS _sources _cmake_options _hipcc_options _hcc_options _nvcc_options) + set(${_sources}) + set(${_cmake_options}) + set(${_hipcc_options}) + set(${_hcc_options}) + set(${_nvcc_options}) + set(_hipcc_found_options FALSE) + set(_hcc_found_options FALSE) + set(_nvcc_found_options FALSE) + foreach(arg ${ARGN}) + if("x${arg}" STREQUAL "xHIPCC_OPTIONS") + set(_hipcc_found_options TRUE) + set(_hcc_found_options FALSE) + set(_nvcc_found_options FALSE) + elseif("x${arg}" STREQUAL "xHCC_OPTIONS") + set(_hipcc_found_options FALSE) + set(_hcc_found_options TRUE) + set(_nvcc_found_options FALSE) + elseif("x${arg}" STREQUAL "xNVCC_OPTIONS") + set(_hipcc_found_options FALSE) + set(_hcc_found_options FALSE) + set(_nvcc_found_options TRUE) + elseif( + "x${arg}" STREQUAL "xEXCLUDE_FROM_ALL" OR + "x${arg}" STREQUAL "xSTATIC" OR + 
"x${arg}" STREQUAL "xSHARED" OR + "x${arg}" STREQUAL "xMODULE" + ) + list(APPEND ${_cmake_options} ${arg}) + else() + if(_hipcc_found_options) + list(APPEND ${_hipcc_options} ${arg}) + elseif(_hcc_found_options) + list(APPEND ${_hcc_options} ${arg}) + elseif(_nvcc_found_options) + list(APPEND ${_nvcc_options} ${arg}) + else() + # Assume this is a file + list(APPEND ${_sources} ${arg}) + endif() + endif() + endforeach() +endmacro() + +############################################################################### +# MACRO: Add include directories to pass to the hipcc command +############################################################################### +set(HIP_HIPCC_INCLUDE_ARGS_USER "") +macro(HIP_INCLUDE_DIRECTORIES) + foreach(dir ${ARGN}) + list(APPEND HIP_HIPCC_INCLUDE_ARGS_USER $<$:-I${dir}>) + endforeach() +endmacro() + +############################################################################### +# FUNCTION: Helper to avoid clashes of files with the same basename but different paths +############################################################################### +function(HIP_COMPUTE_BUILD_PATH path build_path) + # Convert to cmake style paths + file(TO_CMAKE_PATH "${path}" bpath) + if(IS_ABSOLUTE "${bpath}") + string(FIND "${bpath}" "${CMAKE_CURRENT_BINARY_DIR}" _binary_dir_pos) + if(_binary_dir_pos EQUAL 0) + file(RELATIVE_PATH bpath "${CMAKE_CURRENT_BINARY_DIR}" "${bpath}") + else() + file(RELATIVE_PATH bpath "${CMAKE_CURRENT_SOURCE_DIR}" "${bpath}") + endif() + endif() + + # Remove leading / + string(REGEX REPLACE "^[/]+" "" bpath "${bpath}") + # Avoid absolute paths by removing ':' + string(REPLACE ":" "_" bpath "${bpath}") + # Avoid relative paths that go up the tree + string(REPLACE "../" "__/" bpath "${bpath}") + # Avoid spaces + string(REPLACE " " "_" bpath "${bpath}") + # Strip off the filename + get_filename_component(bpath "${bpath}" PATH) + + set(${build_path} "${bpath}" PARENT_SCOPE) +endfunction() + +############################################################################### +# MACRO: Parse OPTIONS from ARGN & set variables prefixed by _option_prefix +############################################################################### +macro(HIP_PARSE_HIPCC_OPTIONS _option_prefix) + set(_hip_found_config) + foreach(arg ${ARGN}) + # Determine if we are dealing with a per-configuration flag + foreach(config ${_hip_configuration_types}) + string(TOUPPER ${config} config_upper) + if(arg STREQUAL "${config_upper}") + set(_hip_found_config _${arg}) + # Clear arg to prevent it from being processed anymore + set(arg) + endif() + endforeach() + if(arg) + list(APPEND ${_option_prefix}${_hip_found_config} "${arg}") + endif() + endforeach() +endmacro() + +############################################################################### +# MACRO: Try and include dependency file if it exists +############################################################################### +macro(HIP_INCLUDE_HIPCC_DEPENDENCIES dependency_file) + set(HIP_HIPCC_DEPEND) + set(HIP_HIPCC_DEPEND_REGENERATE FALSE) + + # Create the dependency file if it doesn't exist + if(NOT EXISTS ${dependency_file}) + file(WRITE ${dependency_file} "# Generated by: FindHIP.cmake. 
Do not edit.\n") + endif() + # Include the dependency file + include(${dependency_file}) + + # Verify the existence of all the included files + if(HIP_HIPCC_DEPEND) + foreach(f ${HIP_HIPCC_DEPEND}) + if(NOT EXISTS ${f}) + # If they aren't there, regenerate the file again + set(HIP_HIPCC_DEPEND_REGENERATE TRUE) + endif() + endforeach() + else() + # No dependencies, so regenerate the file + set(HIP_HIPCC_DEPEND_REGENERATE TRUE) + endif() + + # Regenerate the dependency file if needed + if(HIP_HIPCC_DEPEND_REGENERATE) + set(HIP_HIPCC_DEPEND ${dependency_file}) + file(WRITE ${dependency_file} "# Generated by: FindHIP.cmake. Do not edit.\n") + endif() +endmacro() + +############################################################################### +# MACRO: Prepare cmake commands for the target +############################################################################### +macro(HIP_PREPARE_TARGET_COMMANDS _target _format _generated_files _source_files) + set(_hip_flags "") + string(TOUPPER "${CMAKE_BUILD_TYPE}" _hip_build_configuration) + if(HIP_HOST_COMPILATION_CPP) + set(HIP_C_OR_CXX CXX) + else() + set(HIP_C_OR_CXX C) + endif() + set(generated_extension ${CMAKE_${HIP_C_OR_CXX}_OUTPUT_EXTENSION}) + + # Initialize list of includes with those specified by the user. Append with + # ones specified to cmake directly. + set(HIP_HIPCC_INCLUDE_ARGS ${HIP_HIPCC_INCLUDE_ARGS_USER}) + + # Add the include directories + set(include_directories_generator "$") + list(APPEND HIP_HIPCC_INCLUDE_ARGS "$<$:-I$>") + + get_directory_property(_hip_include_directories INCLUDE_DIRECTORIES) + list(REMOVE_DUPLICATES _hip_include_directories) + if(_hip_include_directories) + foreach(dir ${_hip_include_directories}) + list(APPEND HIP_HIPCC_INCLUDE_ARGS $<$:-I${dir}>) + endforeach() + endif() + + HIP_GET_SOURCES_AND_OPTIONS(_hip_sources _hip_cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) + HIP_PARSE_HIPCC_OPTIONS(HIP_HIPCC_FLAGS ${_hipcc_options}) + HIP_PARSE_HIPCC_OPTIONS(HIP_HCC_FLAGS ${_hcc_options}) + HIP_PARSE_HIPCC_OPTIONS(HIP_NVCC_FLAGS ${_nvcc_options}) + + # Add the compile definitions + set(compile_definition_generator "$") + list(APPEND HIP_HIPCC_FLAGS "$<$:-D$>") + + # Check if we are building shared library. 
+ set(_hip_build_shared_libs FALSE) + list(FIND _hip_cmake_options SHARED _hip_found_SHARED) + list(FIND _hip_cmake_options MODULE _hip_found_MODULE) + if(_hip_found_SHARED GREATER -1 OR _hip_found_MODULE GREATER -1) + set(_hip_build_shared_libs TRUE) + endif() + list(FIND _hip_cmake_options STATIC _hip_found_STATIC) + if(_hip_found_STATIC GREATER -1) + set(_hip_build_shared_libs FALSE) + endif() + + # If we are building a shared library, add extra flags to HIP_HIPCC_FLAGS + if(_hip_build_shared_libs) + list(APPEND HIP_HCC_FLAGS "-fPIC") + list(APPEND HIP_NVCC_FLAGS "--shared -Xcompiler '-fPIC'") + endif() + + # Set host compiler + set(HIP_HOST_COMPILER "${CMAKE_${HIP_C_OR_CXX}_COMPILER}") + + # Set compiler flags + set(_HIP_HOST_FLAGS "set(CMAKE_HOST_FLAGS ${CMAKE_${HIP_C_OR_CXX}_FLAGS})") + set(_HIP_HIPCC_FLAGS "set(HIP_HIPCC_FLAGS ${HIP_HIPCC_FLAGS})") + set(_HIP_HCC_FLAGS "set(HIP_HCC_FLAGS ${HIP_HCC_FLAGS})") + set(_HIP_NVCC_FLAGS "set(HIP_NVCC_FLAGS ${HIP_NVCC_FLAGS})") + foreach(config ${_hip_configuration_types}) + string(TOUPPER ${config} config_upper) + set(_HIP_HOST_FLAGS "${_HIP_HOST_FLAGS}\nset(CMAKE_HOST_FLAGS_${config_upper} ${CMAKE_${HIP_C_OR_CXX}_FLAGS_${config_upper}})") + set(_HIP_HIPCC_FLAGS "${_HIP_HIPCC_FLAGS}\nset(HIP_HIPCC_FLAGS_${config_upper} ${HIP_HIPCC_FLAGS_${config_upper}})") + set(_HIP_HCC_FLAGS "${_HIP_HCC_FLAGS}\nset(HIP_HCC_FLAGS_${config_upper} ${HIP_HCC_FLAGS_${config_upper}})") + set(_HIP_NVCC_FLAGS "${_HIP_NVCC_FLAGS}\nset(HIP_NVCC_FLAGS_${config_upper} ${HIP_NVCC_FLAGS_${config_upper}})") + endforeach() + + # Reset the output variable + set(_hip_generated_files "") + set(_hip_source_files "") + + # Iterate over all arguments and create custom commands for all source files + foreach(file ${ARGN}) + # Ignore any file marked as a HEADER_FILE_ONLY + get_source_file_property(_is_header ${file} HEADER_FILE_ONLY) + # Allow per source file overrides of the format. Also allows compiling non .cu files. 
+ get_source_file_property(_hip_source_format ${file} HIP_SOURCE_PROPERTY_FORMAT) + if((${file} MATCHES "\\.cu$" OR _hip_source_format) AND NOT _is_header) + set(host_flag FALSE) + else() + set(host_flag TRUE) + endif() + + if(NOT host_flag) + # Determine output directory + HIP_COMPUTE_BUILD_PATH("${file}" hip_build_path) + set(hip_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_target}.dir/${hip_build_path}") + + get_filename_component(basename ${file} NAME) + set(generated_file_path "${hip_compile_output_dir}/${CMAKE_CFG_INTDIR}") + set(generated_file_basename "${_target}_generated_${basename}${generated_extension}") + + # Set file names + set(generated_file "${generated_file_path}/${generated_file_basename}") + set(cmake_dependency_file "${hip_compile_output_dir}/${generated_file_basename}.depend") + set(custom_target_script_pregen "${hip_compile_output_dir}/${generated_file_basename}.cmake.pre-gen") + set(custom_target_script "${hip_compile_output_dir}/${generated_file_basename}.cmake") + + # Set properties for object files + set_source_files_properties("${generated_file}" + PROPERTIES + EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked + ) + + # Don't add CMAKE_CURRENT_SOURCE_DIR if the path is already an absolute path + get_filename_component(file_path "${file}" PATH) + if(IS_ABSOLUTE "${file_path}") + set(source_file "${file}") + else() + set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}") + endif() + + # Bring in the dependencies + HIP_INCLUDE_HIPCC_DEPENDENCIES(${cmake_dependency_file}) + + # Configure the build script + configure_file("${HIP_run_hipcc}" "${custom_target_script_pregen}" @ONLY) + file(GENERATE + OUTPUT "${custom_target_script}" + INPUT "${custom_target_script_pregen}" + ) + set(main_dep DEPENDS ${source_file}) + if(CMAKE_GENERATOR MATCHES "Makefiles") + set(verbose_output "$(VERBOSE)") + elseif(HIP_VERBOSE_BUILD) + set(verbose_output ON) + else() + set(verbose_output OFF) + endif() + + # Create up the comment string + file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}") + set(hip_build_comment_string "Building HIPCC object ${generated_file_relative_path}") + + # Build the generated file and dependency file + add_custom_command( + OUTPUT ${generated_file} + # These output files depend on the source_file and the contents of cmake_dependency_file + ${main_dep} + DEPENDS ${HIP_HIPCC_DEPEND} + DEPENDS ${custom_target_script} + # Make sure the output directory exists before trying to write to it. 
+ COMMAND ${CMAKE_COMMAND} -E make_directory "${generated_file_path}" + COMMAND ${CMAKE_COMMAND} ARGS + -D verbose:BOOL=${verbose_output} + -D build_configuration:STRING=${_hip_build_configuration} + -D "generated_file:STRING=${generated_file}" + -P "${custom_target_script}" + WORKING_DIRECTORY "${hip_compile_output_dir}" + COMMENT "${hip_build_comment_string}" + ) + + # Make sure the build system knows the file is generated + set_source_files_properties(${generated_file} PROPERTIES GENERATED TRUE) + list(APPEND _hip_generated_files ${generated_file}) + list(APPEND _hip_source_files ${file}) + endif() + endforeach() + + # Set the return parameter + set(${_generated_files} ${_hip_generated_files}) + set(${_source_files} ${_hip_source_files}) +endmacro() + +############################################################################### +# HIP_ADD_EXECUTABLE +############################################################################### +macro(HIP_ADD_EXECUTABLE hip_target) + # Separate the sources from the options + HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) + HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() + if("x${HCC_HOME}" STREQUAL "x") + set(HCC_HOME "/opt/rocm/hcc") + endif() + set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o ") + add_executable(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) + set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE HIP) +endmacro() + +############################################################################### +# HIP_ADD_LIBRARY +############################################################################### +macro(HIP_ADD_LIBRARY hip_target) + # Separate the sources from the options + HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) + HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} ${_cmake_options} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() + add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) + set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE ${HIP_C_OR_CXX}) +endmacro() + +# vim: ts=4:sw=4:expandtab:smartindent diff --git a/common/Communication.h b/common/Communication.h index 7819a0bb..7c2f8d08 100644 --- a/common/Communication.h +++ b/common/Communication.h @@ -1,7 +1,7 @@ #ifndef COMMUNICATION_H_INC #define COMMUNICATION_H_INC -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Utilities.h" #include "common/Array.h" @@ -38,7 +38,7 @@ struct RankInfoStruct { //! Redistribute domain data (dst may be smaller than the src) template Array redistribute( const RankInfoStruct& src_rank, const Array& src_data, - const RankInfoStruct& dst_rank, std::array dst_size, MPI_Comm comm ); + const RankInfoStruct& dst_rank, std::array dst_size, const Utilities::MPI& comm ); /*! 
@@ -59,7 +59,7 @@ public: * @param[in] fill Fill {faces,edges,corners} * @param[in] periodic Periodic dimensions */ - fillHalo( MPI_Comm comm, const RankInfoStruct& info, + fillHalo( const Utilities::MPI& comm, const RankInfoStruct& info, std::array n, std::array ng, int tag, int depth, std::array fill = {true,true,true}, std::array periodic = {true,true,true} ); @@ -83,7 +83,7 @@ public: private: - MPI_Comm comm; + Utilities::MPI comm; RankInfoStruct info; std::array n, ng; int depth; @@ -93,8 +93,6 @@ private: TYPE *mem; TYPE *send[3][3][3], *recv[3][3][3]; MPI_Request send_req[3][3][3], recv_req[3][3][3]; - size_t N_type; - MPI_Datatype datatype; fillHalo(); // Private empty constructor fillHalo(const fillHalo&); // Private copy constructor fillHalo& operator=(const fillHalo&); // Private assignment operator @@ -136,7 +134,7 @@ void InitializeRanks( const int rank, const int nprocx, const int nprocy, const //*************************************************************************************** -inline void CommunicateSendRecvCounts( MPI_Comm Communicator, int sendtag, int recvtag, +inline void CommunicateSendRecvCounts( const Utilities::MPI& Communicator, int sendtag, int recvtag, int rank_x, int rank_y, int rank_z, int rank_X, int rank_Y, int rank_Z, int rank_xy, int rank_XY, int rank_xY, int rank_Xy, @@ -155,53 +153,53 @@ inline void CommunicateSendRecvCounts( MPI_Comm Communicator, int sendtag, int r { MPI_Request req1[18], req2[18]; MPI_Status stat1[18],stat2[18]; - MPI_Isend(&sendCount_x, 1,MPI_INT,rank_x,sendtag+0,Communicator,&req1[0]); - MPI_Irecv(&recvCount_X, 1,MPI_INT,rank_X,recvtag+0,Communicator,&req2[0]); - MPI_Isend(&sendCount_X, 1,MPI_INT,rank_X,sendtag+1,Communicator,&req1[1]); - MPI_Irecv(&recvCount_x, 1,MPI_INT,rank_x,recvtag+1,Communicator,&req2[1]); - MPI_Isend(&sendCount_y, 1,MPI_INT,rank_y,sendtag+2,Communicator,&req1[2]); - MPI_Irecv(&recvCount_Y, 1,MPI_INT,rank_Y,recvtag+2,Communicator,&req2[2]); - MPI_Isend(&sendCount_Y, 1,MPI_INT,rank_Y,sendtag+3,Communicator,&req1[3]); - MPI_Irecv(&recvCount_y, 1,MPI_INT,rank_y,recvtag+3,Communicator,&req2[3]); - MPI_Isend(&sendCount_z, 1,MPI_INT,rank_z,sendtag+4,Communicator,&req1[4]); - MPI_Irecv(&recvCount_Z, 1,MPI_INT,rank_Z,recvtag+4,Communicator,&req2[4]); - MPI_Isend(&sendCount_Z, 1,MPI_INT,rank_Z,sendtag+5,Communicator,&req1[5]); - MPI_Irecv(&recvCount_z, 1,MPI_INT,rank_z,recvtag+5,Communicator,&req2[5]); + MPI_Isend(&sendCount_x, 1,MPI_INT,rank_x,sendtag+0,Communicator.getCommunicator(),&req1[0]); + MPI_Irecv(&recvCount_X, 1,MPI_INT,rank_X,recvtag+0,Communicator.getCommunicator(),&req2[0]); + MPI_Isend(&sendCount_X, 1,MPI_INT,rank_X,sendtag+1,Communicator.getCommunicator(),&req1[1]); + MPI_Irecv(&recvCount_x, 1,MPI_INT,rank_x,recvtag+1,Communicator.getCommunicator(),&req2[1]); + MPI_Isend(&sendCount_y, 1,MPI_INT,rank_y,sendtag+2,Communicator.getCommunicator(),&req1[2]); + MPI_Irecv(&recvCount_Y, 1,MPI_INT,rank_Y,recvtag+2,Communicator.getCommunicator(),&req2[2]); + MPI_Isend(&sendCount_Y, 1,MPI_INT,rank_Y,sendtag+3,Communicator.getCommunicator(),&req1[3]); + MPI_Irecv(&recvCount_y, 1,MPI_INT,rank_y,recvtag+3,Communicator.getCommunicator(),&req2[3]); + MPI_Isend(&sendCount_z, 1,MPI_INT,rank_z,sendtag+4,Communicator.getCommunicator(),&req1[4]); + MPI_Irecv(&recvCount_Z, 1,MPI_INT,rank_Z,recvtag+4,Communicator.getCommunicator(),&req2[4]); + MPI_Isend(&sendCount_Z, 1,MPI_INT,rank_Z,sendtag+5,Communicator.getCommunicator(),&req1[5]); + MPI_Irecv(&recvCount_z, 
1,MPI_INT,rank_z,recvtag+5,Communicator.getCommunicator(),&req2[5]); - MPI_Isend(&sendCount_xy, 1,MPI_INT,rank_xy,sendtag+6,Communicator,&req1[6]); - MPI_Irecv(&recvCount_XY, 1,MPI_INT,rank_XY,recvtag+6,Communicator,&req2[6]); - MPI_Isend(&sendCount_XY, 1,MPI_INT,rank_XY,sendtag+7,Communicator,&req1[7]); - MPI_Irecv(&recvCount_xy, 1,MPI_INT,rank_xy,recvtag+7,Communicator,&req2[7]); - MPI_Isend(&sendCount_Xy, 1,MPI_INT,rank_Xy,sendtag+8,Communicator,&req1[8]); - MPI_Irecv(&recvCount_xY, 1,MPI_INT,rank_xY,recvtag+8,Communicator,&req2[8]); - MPI_Isend(&sendCount_xY, 1,MPI_INT,rank_xY,sendtag+9,Communicator,&req1[9]); - MPI_Irecv(&recvCount_Xy, 1,MPI_INT,rank_Xy,recvtag+9,Communicator,&req2[9]); + MPI_Isend(&sendCount_xy, 1,MPI_INT,rank_xy,sendtag+6,Communicator.getCommunicator(),&req1[6]); + MPI_Irecv(&recvCount_XY, 1,MPI_INT,rank_XY,recvtag+6,Communicator.getCommunicator(),&req2[6]); + MPI_Isend(&sendCount_XY, 1,MPI_INT,rank_XY,sendtag+7,Communicator.getCommunicator(),&req1[7]); + MPI_Irecv(&recvCount_xy, 1,MPI_INT,rank_xy,recvtag+7,Communicator.getCommunicator(),&req2[7]); + MPI_Isend(&sendCount_Xy, 1,MPI_INT,rank_Xy,sendtag+8,Communicator.getCommunicator(),&req1[8]); + MPI_Irecv(&recvCount_xY, 1,MPI_INT,rank_xY,recvtag+8,Communicator.getCommunicator(),&req2[8]); + MPI_Isend(&sendCount_xY, 1,MPI_INT,rank_xY,sendtag+9,Communicator.getCommunicator(),&req1[9]); + MPI_Irecv(&recvCount_Xy, 1,MPI_INT,rank_Xy,recvtag+9,Communicator.getCommunicator(),&req2[9]); - MPI_Isend(&sendCount_xz, 1,MPI_INT,rank_xz,sendtag+10,Communicator,&req1[10]); - MPI_Irecv(&recvCount_XZ, 1,MPI_INT,rank_XZ,recvtag+10,Communicator,&req2[10]); - MPI_Isend(&sendCount_XZ, 1,MPI_INT,rank_XZ,sendtag+11,Communicator,&req1[11]); - MPI_Irecv(&recvCount_xz, 1,MPI_INT,rank_xz,recvtag+11,Communicator,&req2[11]); - MPI_Isend(&sendCount_Xz, 1,MPI_INT,rank_Xz,sendtag+12,Communicator,&req1[12]); - MPI_Irecv(&recvCount_xZ, 1,MPI_INT,rank_xZ,recvtag+12,Communicator,&req2[12]); - MPI_Isend(&sendCount_xZ, 1,MPI_INT,rank_xZ,sendtag+13,Communicator,&req1[13]); - MPI_Irecv(&recvCount_Xz, 1,MPI_INT,rank_Xz,recvtag+13,Communicator,&req2[13]); + MPI_Isend(&sendCount_xz, 1,MPI_INT,rank_xz,sendtag+10,Communicator.getCommunicator(),&req1[10]); + MPI_Irecv(&recvCount_XZ, 1,MPI_INT,rank_XZ,recvtag+10,Communicator.getCommunicator(),&req2[10]); + MPI_Isend(&sendCount_XZ, 1,MPI_INT,rank_XZ,sendtag+11,Communicator.getCommunicator(),&req1[11]); + MPI_Irecv(&recvCount_xz, 1,MPI_INT,rank_xz,recvtag+11,Communicator.getCommunicator(),&req2[11]); + MPI_Isend(&sendCount_Xz, 1,MPI_INT,rank_Xz,sendtag+12,Communicator.getCommunicator(),&req1[12]); + MPI_Irecv(&recvCount_xZ, 1,MPI_INT,rank_xZ,recvtag+12,Communicator.getCommunicator(),&req2[12]); + MPI_Isend(&sendCount_xZ, 1,MPI_INT,rank_xZ,sendtag+13,Communicator.getCommunicator(),&req1[13]); + MPI_Irecv(&recvCount_Xz, 1,MPI_INT,rank_Xz,recvtag+13,Communicator.getCommunicator(),&req2[13]); - MPI_Isend(&sendCount_yz, 1,MPI_INT,rank_yz,sendtag+14,Communicator,&req1[14]); - MPI_Irecv(&recvCount_YZ, 1,MPI_INT,rank_YZ,recvtag+14,Communicator,&req2[14]); - MPI_Isend(&sendCount_YZ, 1,MPI_INT,rank_YZ,sendtag+15,Communicator,&req1[15]); - MPI_Irecv(&recvCount_yz, 1,MPI_INT,rank_yz,recvtag+15,Communicator,&req2[15]); - MPI_Isend(&sendCount_Yz, 1,MPI_INT,rank_Yz,sendtag+16,Communicator,&req1[16]); - MPI_Irecv(&recvCount_yZ, 1,MPI_INT,rank_yZ,recvtag+16,Communicator,&req2[16]); - MPI_Isend(&sendCount_yZ, 1,MPI_INT,rank_yZ,sendtag+17,Communicator,&req1[17]); - MPI_Irecv(&recvCount_Yz, 
1,MPI_INT,rank_Yz,recvtag+17,Communicator,&req2[17]); + MPI_Isend(&sendCount_yz, 1,MPI_INT,rank_yz,sendtag+14,Communicator.getCommunicator(),&req1[14]); + MPI_Irecv(&recvCount_YZ, 1,MPI_INT,rank_YZ,recvtag+14,Communicator.getCommunicator(),&req2[14]); + MPI_Isend(&sendCount_YZ, 1,MPI_INT,rank_YZ,sendtag+15,Communicator.getCommunicator(),&req1[15]); + MPI_Irecv(&recvCount_yz, 1,MPI_INT,rank_yz,recvtag+15,Communicator.getCommunicator(),&req2[15]); + MPI_Isend(&sendCount_Yz, 1,MPI_INT,rank_Yz,sendtag+16,Communicator.getCommunicator(),&req1[16]); + MPI_Irecv(&recvCount_yZ, 1,MPI_INT,rank_yZ,recvtag+16,Communicator.getCommunicator(),&req2[16]); + MPI_Isend(&sendCount_yZ, 1,MPI_INT,rank_yZ,sendtag+17,Communicator.getCommunicator(),&req1[17]); + MPI_Irecv(&recvCount_Yz, 1,MPI_INT,rank_Yz,recvtag+17,Communicator.getCommunicator(),&req2[17]); MPI_Waitall(18,req1,stat1); MPI_Waitall(18,req2,stat2); - MPI_Barrier(Communicator); + Communicator.barrier(); } //*************************************************************************************** -inline void CommunicateRecvLists( MPI_Comm Communicator, int sendtag, int recvtag, +inline void CommunicateRecvLists( const Utilities::MPI& Communicator, int sendtag, int recvtag, int *sendList_x, int *sendList_y, int *sendList_z, int *sendList_X, int *sendList_Y, int *sendList_Z, int *sendList_xy, int *sendList_XY, int *sendList_xY, int *sendList_Xy, int *sendList_xz, int *sendList_XZ, int *sendList_xZ, int *sendList_Xz, @@ -223,52 +221,52 @@ inline void CommunicateRecvLists( MPI_Comm Communicator, int sendtag, int recvta { MPI_Request req1[18], req2[18]; MPI_Status stat1[18],stat2[18]; - MPI_Isend(sendList_x, sendCount_x,MPI_INT,rank_x,sendtag,Communicator,&req1[0]); - MPI_Irecv(recvList_X, recvCount_X,MPI_INT,rank_X,recvtag,Communicator,&req2[0]); - MPI_Isend(sendList_X, sendCount_X,MPI_INT,rank_X,sendtag,Communicator,&req1[1]); - MPI_Irecv(recvList_x, recvCount_x,MPI_INT,rank_x,recvtag,Communicator,&req2[1]); - MPI_Isend(sendList_y, sendCount_y,MPI_INT,rank_y,sendtag,Communicator,&req1[2]); - MPI_Irecv(recvList_Y, recvCount_Y,MPI_INT,rank_Y,recvtag,Communicator,&req2[2]); - MPI_Isend(sendList_Y, sendCount_Y,MPI_INT,rank_Y,sendtag,Communicator,&req1[3]); - MPI_Irecv(recvList_y, recvCount_y,MPI_INT,rank_y,recvtag,Communicator,&req2[3]); - MPI_Isend(sendList_z, sendCount_z,MPI_INT,rank_z,sendtag,Communicator,&req1[4]); - MPI_Irecv(recvList_Z, recvCount_Z,MPI_INT,rank_Z,recvtag,Communicator,&req2[4]); - MPI_Isend(sendList_Z, sendCount_Z,MPI_INT,rank_Z,sendtag,Communicator,&req1[5]); - MPI_Irecv(recvList_z, recvCount_z,MPI_INT,rank_z,recvtag,Communicator,&req2[5]); + MPI_Isend(sendList_x, sendCount_x,MPI_INT,rank_x,sendtag,Communicator.getCommunicator(),&req1[0]); + MPI_Irecv(recvList_X, recvCount_X,MPI_INT,rank_X,recvtag,Communicator.getCommunicator(),&req2[0]); + MPI_Isend(sendList_X, sendCount_X,MPI_INT,rank_X,sendtag,Communicator.getCommunicator(),&req1[1]); + MPI_Irecv(recvList_x, recvCount_x,MPI_INT,rank_x,recvtag,Communicator.getCommunicator(),&req2[1]); + MPI_Isend(sendList_y, sendCount_y,MPI_INT,rank_y,sendtag,Communicator.getCommunicator(),&req1[2]); + MPI_Irecv(recvList_Y, recvCount_Y,MPI_INT,rank_Y,recvtag,Communicator.getCommunicator(),&req2[2]); + MPI_Isend(sendList_Y, sendCount_Y,MPI_INT,rank_Y,sendtag,Communicator.getCommunicator(),&req1[3]); + MPI_Irecv(recvList_y, recvCount_y,MPI_INT,rank_y,recvtag,Communicator.getCommunicator(),&req2[3]); + MPI_Isend(sendList_z, sendCount_z,MPI_INT,rank_z,sendtag,Communicator.getCommunicator(),&req1[4]); + 
MPI_Irecv(recvList_Z, recvCount_Z,MPI_INT,rank_Z,recvtag,Communicator.getCommunicator(),&req2[4]); + MPI_Isend(sendList_Z, sendCount_Z,MPI_INT,rank_Z,sendtag,Communicator.getCommunicator(),&req1[5]); + MPI_Irecv(recvList_z, recvCount_z,MPI_INT,rank_z,recvtag,Communicator.getCommunicator(),&req2[5]); - MPI_Isend(sendList_xy, sendCount_xy,MPI_INT,rank_xy,sendtag,Communicator,&req1[6]); - MPI_Irecv(recvList_XY, recvCount_XY,MPI_INT,rank_XY,recvtag,Communicator,&req2[6]); - MPI_Isend(sendList_XY, sendCount_XY,MPI_INT,rank_XY,sendtag,Communicator,&req1[7]); - MPI_Irecv(recvList_xy, recvCount_xy,MPI_INT,rank_xy,recvtag,Communicator,&req2[7]); - MPI_Isend(sendList_Xy, sendCount_Xy,MPI_INT,rank_Xy,sendtag,Communicator,&req1[8]); - MPI_Irecv(recvList_xY, recvCount_xY,MPI_INT,rank_xY,recvtag,Communicator,&req2[8]); - MPI_Isend(sendList_xY, sendCount_xY,MPI_INT,rank_xY,sendtag,Communicator,&req1[9]); - MPI_Irecv(recvList_Xy, recvCount_Xy,MPI_INT,rank_Xy,recvtag,Communicator,&req2[9]); + MPI_Isend(sendList_xy, sendCount_xy,MPI_INT,rank_xy,sendtag,Communicator.getCommunicator(),&req1[6]); + MPI_Irecv(recvList_XY, recvCount_XY,MPI_INT,rank_XY,recvtag,Communicator.getCommunicator(),&req2[6]); + MPI_Isend(sendList_XY, sendCount_XY,MPI_INT,rank_XY,sendtag,Communicator.getCommunicator(),&req1[7]); + MPI_Irecv(recvList_xy, recvCount_xy,MPI_INT,rank_xy,recvtag,Communicator.getCommunicator(),&req2[7]); + MPI_Isend(sendList_Xy, sendCount_Xy,MPI_INT,rank_Xy,sendtag,Communicator.getCommunicator(),&req1[8]); + MPI_Irecv(recvList_xY, recvCount_xY,MPI_INT,rank_xY,recvtag,Communicator.getCommunicator(),&req2[8]); + MPI_Isend(sendList_xY, sendCount_xY,MPI_INT,rank_xY,sendtag,Communicator.getCommunicator(),&req1[9]); + MPI_Irecv(recvList_Xy, recvCount_Xy,MPI_INT,rank_Xy,recvtag,Communicator.getCommunicator(),&req2[9]); - MPI_Isend(sendList_xz, sendCount_xz,MPI_INT,rank_xz,sendtag,Communicator,&req1[10]); - MPI_Irecv(recvList_XZ, recvCount_XZ,MPI_INT,rank_XZ,recvtag,Communicator,&req2[10]); - MPI_Isend(sendList_XZ, sendCount_XZ,MPI_INT,rank_XZ,sendtag,Communicator,&req1[11]); - MPI_Irecv(recvList_xz, recvCount_xz,MPI_INT,rank_xz,recvtag,Communicator,&req2[11]); - MPI_Isend(sendList_Xz, sendCount_Xz,MPI_INT,rank_Xz,sendtag,Communicator,&req1[12]); - MPI_Irecv(recvList_xZ, recvCount_xZ,MPI_INT,rank_xZ,recvtag,Communicator,&req2[12]); - MPI_Isend(sendList_xZ, sendCount_xZ,MPI_INT,rank_xZ,sendtag,Communicator,&req1[13]); - MPI_Irecv(recvList_Xz, recvCount_Xz,MPI_INT,rank_Xz,recvtag,Communicator,&req2[13]); + MPI_Isend(sendList_xz, sendCount_xz,MPI_INT,rank_xz,sendtag,Communicator.getCommunicator(),&req1[10]); + MPI_Irecv(recvList_XZ, recvCount_XZ,MPI_INT,rank_XZ,recvtag,Communicator.getCommunicator(),&req2[10]); + MPI_Isend(sendList_XZ, sendCount_XZ,MPI_INT,rank_XZ,sendtag,Communicator.getCommunicator(),&req1[11]); + MPI_Irecv(recvList_xz, recvCount_xz,MPI_INT,rank_xz,recvtag,Communicator.getCommunicator(),&req2[11]); + MPI_Isend(sendList_Xz, sendCount_Xz,MPI_INT,rank_Xz,sendtag,Communicator.getCommunicator(),&req1[12]); + MPI_Irecv(recvList_xZ, recvCount_xZ,MPI_INT,rank_xZ,recvtag,Communicator.getCommunicator(),&req2[12]); + MPI_Isend(sendList_xZ, sendCount_xZ,MPI_INT,rank_xZ,sendtag,Communicator.getCommunicator(),&req1[13]); + MPI_Irecv(recvList_Xz, recvCount_Xz,MPI_INT,rank_Xz,recvtag,Communicator.getCommunicator(),&req2[13]); - MPI_Isend(sendList_yz, sendCount_yz,MPI_INT,rank_yz,sendtag,Communicator,&req1[14]); - MPI_Irecv(recvList_YZ, recvCount_YZ,MPI_INT,rank_YZ,recvtag,Communicator,&req2[14]); - 
MPI_Isend(sendList_YZ, sendCount_YZ,MPI_INT,rank_YZ,sendtag,Communicator,&req1[15]); - MPI_Irecv(recvList_yz, recvCount_yz,MPI_INT,rank_yz,recvtag,Communicator,&req2[15]); - MPI_Isend(sendList_Yz, sendCount_Yz,MPI_INT,rank_Yz,sendtag,Communicator,&req1[16]); - MPI_Irecv(recvList_yZ, recvCount_yZ,MPI_INT,rank_yZ,recvtag,Communicator,&req2[16]); - MPI_Isend(sendList_yZ, sendCount_yZ,MPI_INT,rank_yZ,sendtag,Communicator,&req1[17]); - MPI_Irecv(recvList_Yz, recvCount_Yz,MPI_INT,rank_Yz,recvtag,Communicator,&req2[17]); + MPI_Isend(sendList_yz, sendCount_yz,MPI_INT,rank_yz,sendtag,Communicator.getCommunicator(),&req1[14]); + MPI_Irecv(recvList_YZ, recvCount_YZ,MPI_INT,rank_YZ,recvtag,Communicator.getCommunicator(),&req2[14]); + MPI_Isend(sendList_YZ, sendCount_YZ,MPI_INT,rank_YZ,sendtag,Communicator.getCommunicator(),&req1[15]); + MPI_Irecv(recvList_yz, recvCount_yz,MPI_INT,rank_yz,recvtag,Communicator.getCommunicator(),&req2[15]); + MPI_Isend(sendList_Yz, sendCount_Yz,MPI_INT,rank_Yz,sendtag,Communicator.getCommunicator(),&req1[16]); + MPI_Irecv(recvList_yZ, recvCount_yZ,MPI_INT,rank_yZ,recvtag,Communicator.getCommunicator(),&req2[16]); + MPI_Isend(sendList_yZ, sendCount_yZ,MPI_INT,rank_yZ,sendtag,Communicator.getCommunicator(),&req1[17]); + MPI_Irecv(recvList_Yz, recvCount_Yz,MPI_INT,rank_Yz,recvtag,Communicator.getCommunicator(),&req2[17]); MPI_Waitall(18,req1,stat1); MPI_Waitall(18,req2,stat2); } //*************************************************************************************** -inline void CommunicateMeshHalo(DoubleArray &Mesh, MPI_Comm Communicator, +inline void CommunicateMeshHalo(DoubleArray &Mesh, const Utilities::MPI& Communicator, double *sendbuf_x,double *sendbuf_y,double *sendbuf_z,double *sendbuf_X,double *sendbuf_Y,double *sendbuf_Z, double *sendbuf_xy,double *sendbuf_XY,double *sendbuf_xY,double *sendbuf_Xy, double *sendbuf_xz,double *sendbuf_XZ,double *sendbuf_xZ,double *sendbuf_Xz, @@ -319,41 +317,41 @@ inline void CommunicateMeshHalo(DoubleArray &Mesh, MPI_Comm Communicator, PackMeshData(sendList_YZ, sendCount_YZ ,sendbuf_YZ, MeshData); //...................................................................................... 
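// --- Editor's usage sketch (not part of the patch) ---------------------------------
// CommunicateSendRecvCounts / CommunicateRecvLists above repeat the same non-blocking
// handshake for all 18 neighbours.  A two-neighbour sketch of that pattern, assuming
// only getCommunicator() and barrier() from the wrapper; neighbour ranks, tags, and
// counts are placeholders.
#include "common/MPI.h"
#include <mpi.h>

void exchange_counts_example( const Utilities::MPI& comm, int rank_lo, int rank_hi,
                              int sendCount_lo, int sendCount_hi,
                              int& recvCount_lo, int& recvCount_hi, int tag )
{
    MPI_Request req1[2], req2[2];
    // Post the sends and the matching receives, then wait on both sets.
    MPI_Isend( &sendCount_lo, 1, MPI_INT, rank_lo, tag+0, comm.getCommunicator(), &req1[0] );
    MPI_Irecv( &recvCount_hi, 1, MPI_INT, rank_hi, tag+0, comm.getCommunicator(), &req2[0] );
    MPI_Isend( &sendCount_hi, 1, MPI_INT, rank_hi, tag+1, comm.getCommunicator(), &req1[1] );
    MPI_Irecv( &recvCount_lo, 1, MPI_INT, rank_lo, tag+1, comm.getCommunicator(), &req2[1] );
    MPI_Waitall( 2, req1, MPI_STATUSES_IGNORE );
    MPI_Waitall( 2, req2, MPI_STATUSES_IGNORE );
    comm.barrier();
}
// ------------------------------------------------------------------------------------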
MPI_Sendrecv(sendbuf_x,sendCount_x,MPI_DOUBLE,rank_x,sendtag, - recvbuf_X,recvCount_X,MPI_DOUBLE,rank_X,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_X,recvCount_X,MPI_DOUBLE,rank_X,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_X,sendCount_X,MPI_DOUBLE,rank_X,sendtag, - recvbuf_x,recvCount_x,MPI_DOUBLE,rank_x,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_x,recvCount_x,MPI_DOUBLE,rank_x,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_y,sendCount_y,MPI_DOUBLE,rank_y,sendtag, - recvbuf_Y,recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_Y,recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_Y,sendCount_Y,MPI_DOUBLE,rank_Y,sendtag, - recvbuf_y,recvCount_y,MPI_DOUBLE,rank_y,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_y,recvCount_y,MPI_DOUBLE,rank_y,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_z,sendCount_z,MPI_DOUBLE,rank_z,sendtag, - recvbuf_Z,recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_Z,recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_Z,sendCount_Z,MPI_DOUBLE,rank_Z,sendtag, - recvbuf_z,recvCount_z,MPI_DOUBLE,rank_z,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_z,recvCount_z,MPI_DOUBLE,rank_z,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_xy,sendCount_xy,MPI_DOUBLE,rank_xy,sendtag, - recvbuf_XY,recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_XY,recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_XY,sendCount_XY,MPI_DOUBLE,rank_XY,sendtag, - recvbuf_xy,recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_xy,recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_Xy,sendCount_Xy,MPI_DOUBLE,rank_Xy,sendtag, - recvbuf_xY,recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_xY,recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_xY,sendCount_xY,MPI_DOUBLE,rank_xY,sendtag, - recvbuf_Xy,recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_Xy,recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_xz,sendCount_xz,MPI_DOUBLE,rank_xz,sendtag, - recvbuf_XZ,recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_XZ,recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_XZ,sendCount_XZ,MPI_DOUBLE,rank_XZ,sendtag, - recvbuf_xz,recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_xz,recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_Xz,sendCount_Xz,MPI_DOUBLE,rank_Xz,sendtag, - recvbuf_xZ,recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_xZ,recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_xZ,sendCount_xZ,MPI_DOUBLE,rank_xZ,sendtag, - recvbuf_Xz,recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_Xz,recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); 
MPI_Sendrecv(sendbuf_yz,sendCount_yz,MPI_DOUBLE,rank_yz,sendtag, - recvbuf_YZ,recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_YZ,recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_YZ,sendCount_YZ,MPI_DOUBLE,rank_YZ,sendtag, - recvbuf_yz,recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_yz,recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_Yz,sendCount_Yz,MPI_DOUBLE,rank_Yz,sendtag, - recvbuf_yZ,recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_yZ,recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_yZ,sendCount_yZ,MPI_DOUBLE,rank_yZ,sendtag, - recvbuf_Yz,recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,Communicator,MPI_STATUS_IGNORE); + recvbuf_Yz,recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); //........................................................................................ UnpackMeshData(recvList_x, recvCount_x ,recvbuf_x, MeshData); UnpackMeshData(recvList_X, recvCount_X ,recvbuf_X, MeshData); diff --git a/common/Communication.hpp b/common/Communication.hpp index 33fed3a7..ca310ea5 100644 --- a/common/Communication.hpp +++ b/common/Communication.hpp @@ -2,9 +2,8 @@ #define COMMUNICATION_HPP_INC #include "common/Communication.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Utilities.h" -//#include "ProfilerApp.h" /******************************************************** @@ -12,17 +11,19 @@ ********************************************************/ template Array redistribute( const RankInfoStruct& src_rank, const Array& src_data, - const RankInfoStruct& dst_rank, std::array dst_size, MPI_Comm comm ) + const RankInfoStruct& dst_rank, std::array dst_size, const Utilities::MPI& comm ) { -#ifdef USE_MPI + if ( comm.getSize() == 1 ) { + return src_data.subset( { 0, (size_t) dst_size[0]-1, 0, (size_t) dst_size[1]-1, 0, (size_t) dst_size[2]-1 } ); + } // Get the src size std::array src_size; int size0[3] = { (int) src_data.size(0), (int) src_data.size(1), (int) src_data.size(2) }; - MPI_Allreduce( size0, src_size.data(), 3, MPI_INT, MPI_MAX, comm ); + comm.maxReduce( size0, src_size.data(), 3 ); if ( !src_data.empty() ) ASSERT( src_size[0] == size0[0] && src_size[1] == size0[1] && src_size[2] == size0[2] ); // Check that dst_size matches on all ranks - MPI_Allreduce( dst_size.data(), size0, 3, MPI_INT, MPI_MAX, comm ); + comm.maxReduce( dst_size.data(), size0, 3 ); ASSERT( dst_size[0] == size0[0] && dst_size[1] == size0[1] && dst_size[2] == size0[2] ); // Function to get overlap range auto calcOverlap = []( int i1[3], int i2[3], int j1[3], int j2[3] ) { @@ -60,7 +61,7 @@ Array redistribute( const RankInfoStruct& src_rank, const Array& src } std::vector send_request( send_rank.size() ); for (size_t i=0; i dst_data( dst_size[0], dst_size[1], dst_size[2] ); int i1[3] = { dst_size[0] * dst_rank.ix, dst_size[1] * dst_rank.jy, dst_size[2] * dst_rank.kz }; @@ -75,17 +76,14 @@ Array redistribute( const RankInfoStruct& src_rank, const Array& src continue; int rank = src_rank.getRankForBlock(i,j,k); Array data( index[1] - index[0] + 1, index[3] - index[2] + 1, index[5] - index[4] + 1 ); - MPI_Recv( data.data(), sizeof(TYPE)*data.length(), MPI_BYTE, rank, 5462, comm, MPI_STATUS_IGNORE ); + comm.recv( data.data(), data.length(), rank, 5462 ); 
dst_data.copySubset( index, data ); } } } // Free data - MPI_Waitall( send_request.size(), send_request.data(), MPI_STATUSES_IGNORE ); + comm.waitAll( send_request.size(), send_request.data() ); return dst_data; -#else - return src_data.subset( { 0, dst_size[0]-1, 0, dst_size[1]-1, 0, dst_size[2]-1 ); -#endif } @@ -94,27 +92,11 @@ Array redistribute( const RankInfoStruct& src_rank, const Array& src * Structure to fill halo cells * ********************************************************/ template -fillHalo::fillHalo( MPI_Comm comm_, const RankInfoStruct& info_, +fillHalo::fillHalo( const Utilities::MPI& comm_, const RankInfoStruct& info_, std::array n_, std::array ng_, int tag0, int depth_, std::array fill, std::array periodic ): comm(comm_), info(info_), n(n_), ng(ng_), depth(depth_) { - if ( std::is_same() ) { - N_type = 1; - datatype = MPI_DOUBLE; - } else if ( std::is_same() ) { - N_type = 1; - datatype = MPI_FLOAT; - } else if ( sizeof(TYPE)%sizeof(double)==0 ) { - N_type = sizeof(TYPE) / sizeof(double); - datatype = MPI_DOUBLE; - } else if ( sizeof(TYPE)%sizeof(float)==0 ) { - N_type = sizeof(TYPE) / sizeof(float); - datatype = MPI_FLOAT; - } else { - N_type = sizeof(TYPE); - datatype = MPI_BYTE; - } // Set the fill pattern memset(fill_pattern,0,sizeof(fill_pattern)); if ( fill[0] ) { @@ -251,8 +233,8 @@ void fillHalo::fill( Array& data ) for (int k=0; k<3; k++) { if ( !fill_pattern[i][j][k] ) continue; - MPI_Irecv( recv[i][j][k], N_type*depth2*N_send_recv[i][j][k], datatype, - info.rank[i][j][k], tag[2-i][2-j][2-k], comm, &recv_req[i][j][k] ); + recv_req[i][j][k] = comm.Irecv( recv[i][j][k], depth2*N_send_recv[i][j][k], + info.rank[i][j][k], tag[2-i][2-j][2-k] ); } } } @@ -263,19 +245,18 @@ void fillHalo::fill( Array& data ) if ( !fill_pattern[i][j][k] ) continue; pack( data, i-1, j-1, k-1, send[i][j][k] ); - MPI_Isend( send[i][j][k], N_type*depth2*N_send_recv[i][j][k], datatype, - info.rank[i][j][k], tag[i][j][k], comm, &send_req[i][j][k] ); + send_req[i][j][k] = comm.Isend( send[i][j][k], depth2*N_send_recv[i][j][k], + info.rank[i][j][k], tag[i][j][k] ); } } } // Recv the dst data and unpack (we recive in reverse order to match the sends) - MPI_Status status; for (int i=2; i>=0; i--) { for (int j=2; j>=0; j--) { for (int k=2; k>=0; k--) { if ( !fill_pattern[i][j][k] ) continue; - MPI_Wait(&recv_req[i][j][k],&status); + comm.wait( recv_req[i][j][k] ); unpack( data, i-1, j-1, k-1, recv[i][j][k] ); } } @@ -286,7 +267,7 @@ void fillHalo::fill( Array& data ) for (int k=0; k<3; k++) { if ( !fill_pattern[i][j][k] ) continue; - MPI_Wait(&send_req[i][j][k],&status); + comm.wait( send_req[i][j][k] ); } } } diff --git a/common/Domain.cpp b/common/Domain.cpp index a4959508..58ca099b 100644 --- a/common/Domain.cpp +++ b/common/Domain.cpp @@ -12,7 +12,7 @@ #include "common/Domain.h" #include "common/Array.h" #include "common/Utilities.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Communication.h" // Inline function to read line without a return argument @@ -62,11 +62,10 @@ Domain::Domain( int nx, int ny, int nz, int rnk, int npx, int npy, int npz, NULL_USE( npy ); NULL_USE( npz ); // set up the neighbor ranks - int myrank; - MPI_Comm_rank( Comm, &myrank ); + int myrank = Comm.getRank(); rank_info = RankInfoStruct( myrank, rank_info.nx, rank_info.ny, rank_info.nz ); - MPI_Barrier(Comm); + Comm.barrier(); auto db = std::make_shared( ); db->putScalar( "BC", BC ); @@ -76,10 +75,9 @@ Domain::Domain( int nx, int ny, int nz, int rnk, int npx, int npy, int npz, 
db->putVector( "L", { lx, ly, lz } ); initialize( db ); } -Domain::Domain( std::shared_ptr db, MPI_Comm Communicator): +Domain::Domain( std::shared_ptr db, const Utilities::MPI& Communicator): database(db), Nx(0), Ny(0), Nz(0), Lx(0), Ly(0), Lz(0), Volume(0), BoundaryCondition(0), - Comm(MPI_COMM_NULL), inlet_layers_x(0), inlet_layers_y(0), inlet_layers_z(0), outlet_layers_x(0), outlet_layers_y(0), outlet_layers_z(0), inlet_layers_phase(1),outlet_layers_phase(2), @@ -109,14 +107,13 @@ Domain::Domain( std::shared_ptr db, MPI_Comm Communicator): recvData_xY(NULL), recvData_yZ(NULL), recvData_Xz(NULL), recvData_XY(NULL), recvData_YZ(NULL), recvData_XZ(NULL), id(NULL) { - MPI_Comm_dup(Communicator,&Comm); + Comm = Communicator.dup(); // set up the neighbor ranks - int myrank; - MPI_Comm_rank( Comm, &myrank ); + int myrank = Comm.getRank(); initialize( db ); rank_info = RankInfoStruct( myrank, rank_info.nx, rank_info.ny, rank_info.nz ); - MPI_Barrier(Comm); + Comm.barrier(); } Domain::~Domain() @@ -165,10 +162,6 @@ Domain::~Domain() delete [] recvData_yZ; delete [] recvData_Yz; delete [] recvData_YZ; // Free id delete [] id; - // Free the communicator - if ( Comm != MPI_COMM_WORLD && Comm != MPI_COMM_NULL ) { - MPI_Comm_free(&Comm); - } } void Domain::initialize( std::shared_ptr db ) @@ -219,8 +212,7 @@ void Domain::initialize( std::shared_ptr db ) Ny = ny+2; Nz = nz+2; // Initialize ranks - int myrank; - MPI_Comm_rank( Comm, &myrank ); + int myrank = Comm.getRank(); rank_info = RankInfoStruct(myrank,nproc[0],nproc[1],nproc[2]); // inlet layers only apply to lower part of domain if (rank_info.ix > 0) inlet_layers_x = 0; @@ -239,8 +231,7 @@ void Domain::initialize( std::shared_ptr db ) id = new signed char[N]; memset(id,0,N); BoundaryCondition = d_db->getScalar("BC"); - int nprocs; - MPI_Comm_size( Comm, &nprocs ); + int nprocs = Comm.getSize(); INSIST(nprocs == nproc[0]*nproc[1]*nproc[2],"Fatal error in processor count!"); } @@ -569,7 +560,7 @@ void Domain::Decomp( const std::string& Filename ) } else{ //printf("Sending data to process %i \n", rnk); - MPI_Send(loc_id,N,MPI_CHAR,rnk,15,Comm); + Comm.send(loc_id,N,rnk,15); } // Write the data for this rank data sprintf(LocalRankFilename,"ID.%05i",rnk+rank_offset); @@ -584,9 +575,9 @@ void Domain::Decomp( const std::string& Filename ) else{ // Recieve the subdomain from rank = 0 //printf("Ready to recieve data %i at process %i \n", N,rank); - MPI_Recv(id,N,MPI_CHAR,0,15,Comm,MPI_STATUS_IGNORE); + Comm.recv(id,N,0,15); } - MPI_Barrier(Comm); + Comm.barrier(); } void Domain::AggregateLabels( const std::string& filename ){ @@ -625,7 +616,7 @@ void Domain::AggregateLabels( const std::string& filename ){ } } } - MPI_Barrier(Comm); + Comm.barrier(); // populate the FullID if (rank() == 0){ @@ -651,7 +642,7 @@ void Domain::AggregateLabels( const std::string& filename ){ ipx = (rnk - ipz*npx*npy - ipy*npx); //printf("ipx=%i ipy=%i ipz=%i\n", ipx, ipy, ipz); int tag = 15+rnk; - MPI_Recv(LocalID,local_size,MPI_CHAR,rnk,tag,Comm,MPI_STATUS_IGNORE); + Comm.recv(LocalID,local_size,rnk,tag); for (int k=1; k db, MPI_Comm Communicator); + Domain( std::shared_ptr db, const Utilities::MPI& Communicator); //! 
Obsolete constructor Domain( int nx, int ny, int nz, int rnk, int npx, int npy, int npz, @@ -116,7 +116,7 @@ public: // Public variables (need to create accessors instead) double porosity; RankInfoStruct rank_info; - MPI_Comm Comm; // MPI Communicator for this domain + Utilities::MPI Comm; // MPI Communicator for this domain int BoundaryCondition; diff --git a/common/MPI.I b/common/MPI.I new file mode 100644 index 00000000..8cbc9c09 --- /dev/null +++ b/common/MPI.I @@ -0,0 +1,1143 @@ +// This file contains the default instantiations for templated operations +// Note: Intel compilers need definitions before all default instantions to compile correctly +#ifndef included_MPI_I +#define included_MPI_I + +#include "common/Utilities.h" + +#include + + +#define MPI_CLASS MPI +#define MPI_CLASS_ERROR ERROR +#define MPI_CLASS_ASSERT ASSERT + +#undef NULL_USE +#define NULL_USE( variable ) \ + do { \ + if ( 0 ) { \ + auto static t = (char *) &variable; \ + t++; \ + } \ + } while ( 0 ) + + +namespace Utilities { + + +// Function to test if a type is a std::pair +template +struct is_pair : std::false_type { +}; +template +struct is_pair> : std::true_type { +}; + + +// Function to test if a type can be passed by MPI +template +constexpr typename std::enable_if::value,bool>::type + is_mpi_copyable() +{ + return true; +} +template +constexpr typename std::enable_if::value&&is_pair::value,bool>::type + is_mpi_copyable() +{ + return is_mpi_copyable() && is_mpi_copyable(); +} +template +constexpr typename std::enable_if::value&&!is_pair::value,bool>::type + is_mpi_copyable() +{ + return false; +} + + +/************************************************************************ + * sumReduce * + ************************************************************************/ +template +inline TYPE MPI_CLASS::sumReduce( const TYPE value ) const +{ + if ( comm_size > 1 ) { + TYPE tmp = value; + call_sumReduce( &tmp, 1 ); + return tmp; + } else { + return value; + } +} +template +inline void MPI_CLASS::sumReduce( TYPE *x, const int n ) const +{ + if ( comm_size > 1 ) + call_sumReduce( x, n ); +} +template +inline void MPI_CLASS::sumReduce( const TYPE *x, TYPE *y, const int n ) const +{ + if ( comm_size > 1 ) { + call_sumReduce( x, y, n ); + } else { + for ( int i = 0; i < n; i++ ) + y[i] = x[i]; + } +} +// Define specializations of call_sumReduce(TYPE*, const int) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::call_sumReduce( unsigned char *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( char *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( unsigned int *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( int *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( unsigned long int *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( long int *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( size_t *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( float *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( double *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce>( std::complex *, const int ) const; +#endif +// Default instantiations of call_sumReduce(TYPE*, const int) +template +void MPI_CLASS::call_sumReduce( TYPE *, const int ) const +{ + char message[200]; + sprintf( message, "Default instantion of sumReduce in parallel is not supported (%s)", + typeid( TYPE ).name() ); + MPI_CLASS_ERROR( message ); +} +// Define specializations of call_sumReduce(const 
TYPE*, TYPE*, const int) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::call_sumReduce( + const unsigned char *, unsigned char *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( const char *, char *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( + const unsigned int *, unsigned int *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( const int *, int *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( + const unsigned long int *, unsigned long int *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( const long int *, long int *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( const size_t *, size_t *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( const float *, float *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce( const double *, double *, const int ) const; +template<> +void MPI_CLASS::call_sumReduce>( + const std::complex *, std::complex *, const int ) const; +#endif +// Default instantiations of call_sumReduce(const TYPE*, TYPE*, const int) +template +void MPI_CLASS::call_sumReduce( const TYPE *x, TYPE *y, const int n ) const +{ + NULL_USE( x ); + NULL_USE( y ); + NULL_USE( n ); + char message[200]; + sprintf( message, "Default instantion of sumReduce in parallel is not supported (%s)", + typeid( TYPE ).name() ); + MPI_CLASS_ERROR( message ); +} + + +/************************************************************************ + * minReduce * + ************************************************************************/ +template +inline TYPE MPI_CLASS::minReduce( const TYPE value ) const +{ + if ( comm_size > 1 ) { + TYPE tmp = value; + call_minReduce( &tmp, 1, nullptr ); + return tmp; + } else { + return value; + } +} +template +inline void MPI_CLASS::minReduce( TYPE *x, const int n, int *rank_of_min ) const +{ + if ( comm_size > 1 ) { + call_minReduce( x, n, rank_of_min ); + } else { + if ( rank_of_min != nullptr ) { + for ( int i = 0; i < n; i++ ) + rank_of_min[i] = 0; + } + } +} +template +inline void MPI_CLASS::minReduce( const TYPE *x, TYPE *y, const int n, int *rank_of_min ) const +{ + if ( comm_size > 1 ) { + call_minReduce( x, y, n, rank_of_min ); + } else { + for ( int i = 0; i < n; i++ ) { + y[i] = x[i]; + if ( rank_of_min != nullptr ) + rank_of_min[i] = 0; + } + } +} +// Define specializations of call_minReduce(TYPE*, const int, int*) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::call_minReduce( unsigned char *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( char *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( unsigned int *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( int *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( unsigned long int *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( long int *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( + unsigned long long int *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( long long int *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( size_t *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( float *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( double *, const int, int * ) const; +#endif +// Default instantiations of call_minReduce(TYPE*, const int, int*) +template +void MPI_CLASS::call_minReduce( 
TYPE *, const int, int * ) const +{ + char message[200]; + sprintf( message, "Default instantion of minReduce in parallel is not supported (%s)", + typeid( TYPE ).name() ); + MPI_CLASS_ERROR( message ); +} +// Define specializations of call_minReduce(const TYPE*, TYPE*, const int, int*) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::call_minReduce( + const unsigned char *, unsigned char *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( const char *, char *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( + const unsigned int *, unsigned int *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( const int *, int *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( + const unsigned long int *, unsigned long int *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( const long int *, long int *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( + const unsigned long long int *, unsigned long long int *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( + const long long int *, long long int *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( const size_t *, size_t *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( const float *, float *, const int, int * ) const; +template<> +void MPI_CLASS::call_minReduce( const double *, double *, const int, int * ) const; +#endif +// Default instantiations of call_minReduce(const TYPE*, TYPE*, const int, int*) +template +void MPI_CLASS::call_minReduce( const TYPE *, TYPE *, const int, int * ) const +{ + char message[200]; + sprintf( message, "Default instantion of minReduce in parallel is not supported (%s)", + typeid( TYPE ).name() ); + MPI_CLASS_ERROR( message ); +} + + +/************************************************************************ + * maxReduce * + ************************************************************************/ +template +inline TYPE MPI_CLASS::maxReduce( const TYPE value ) const +{ + if ( comm_size > 1 ) { + TYPE tmp = value; + call_maxReduce( &tmp, 1, nullptr ); + return tmp; + } else { + return value; + } +} +template +inline void MPI_CLASS::maxReduce( TYPE *x, const int n, int *rank_of_max ) const +{ + if ( comm_size > 1 ) { + call_maxReduce( x, n, rank_of_max ); + } else { + if ( rank_of_max != nullptr ) { + for ( int i = 0; i < n; i++ ) + rank_of_max[i] = 0; + } + } +} +template +inline void MPI_CLASS::maxReduce( const TYPE *x, TYPE *y, const int n, int *rank_of_max ) const +{ + if ( comm_size > 1 ) { + call_maxReduce( x, y, n, rank_of_max ); + } else { + for ( int i = 0; i < n; i++ ) { + y[i] = x[i]; + if ( rank_of_max != nullptr ) + rank_of_max[i] = 0; + } + } +} +// Define specializations of call_maxReduce(TYPE*, const int, int*) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::call_maxReduce( unsigned char *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( char *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( unsigned int *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( int *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( unsigned long int *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( long int *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( + unsigned long long int *, const int, int * ) const; +template<> +void 
MPI_CLASS::call_maxReduce( long long int *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( size_t *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( float *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( double *, const int, int * ) const; +#endif +// Default instantiations of call_maxReduce(TYPE*, const int, int*) +template +void MPI_CLASS::call_maxReduce( TYPE *, const int, int * ) const +{ + char message[200]; + sprintf( message, "Default instantion of maxReduce in parallel is not supported (%s)", + typeid( TYPE ).name() ); + MPI_CLASS_ERROR( message ); +} +// Define specializations of call_maxReduce(const TYPE*, TYPE*, const int, int*) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::call_maxReduce( + const unsigned char *, unsigned char *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( const char *, char *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( + const unsigned int *, unsigned int *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( const int *, int *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( + const unsigned long int *, unsigned long int *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( const long int *, long int *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( + const unsigned long long int *, unsigned long long int *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( + const long long int *, long long int *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( const size_t *, size_t *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( const float *, float *, const int, int * ) const; +template<> +void MPI_CLASS::call_maxReduce( const double *, double *, const int, int * ) const; +#endif +// Default instantiations of call_maxReduce(const TYPE*, TYPE*, const int, int*) +template +void MPI_CLASS::call_maxReduce( const TYPE *, TYPE *, const int, int * ) const +{ + char message[200]; + sprintf( message, "Default instantion of maxReduce in parallel is not supported (%s)", + typeid( TYPE ).name() ); + MPI_CLASS_ERROR( message ); +} + + +/************************************************************************ + * bcast * + ************************************************************************/ +// Define specializations of bcast(TYPE*, const int, const int) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::call_bcast( unsigned char *, const int, const int ) const; +template<> +void MPI_CLASS::call_bcast( char *, const int, const int ) const; +template<> +void MPI_CLASS::call_bcast( unsigned int *, const int, const int ) const; +template<> +void MPI_CLASS::call_bcast( int *, const int, const int ) const; +template<> +void MPI_CLASS::call_bcast( float *, const int, const int ) const; +template<> +void MPI_CLASS::call_bcast( double *, const int, const int ) const; +#else +template<> +void MPI_CLASS::call_bcast( char *, const int, const int ) const; +#endif +// Default instantiations of bcast(TYPE*, const int, const int) +template +void MPI_CLASS::call_bcast( TYPE *x, const int n, const int root ) const +{ + static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); + call_bcast( (char *) x, (int) n * sizeof( TYPE ), root ); +} +// Specialization of bcast for std::string +template<> +inline std::string MPI_CLASS::bcast( const 
std::string &value, const int root ) const +{ + if ( comm_size == 1 ) + return value; + int length = static_cast( value.size() ); + call_bcast( &length, 1, root ); + if ( length == 0 ) + return std::string(); + char *str = new char[length + 1]; + if ( root == comm_rank ) { + for ( int i = 0; i < length; i++ ) + str[i] = value[i]; + } + call_bcast( str, length, root ); + str[length] = 0; + std::string result( str ); + delete[] str; + return result; +} +template<> +inline void MPI_CLASS::bcast( std::string *, const int, const int ) const +{ + MPI_CLASS_ERROR( "Cannot bcast an array of strings" ); +} +// Default implimentation of bcast +template +inline TYPE MPI_CLASS::bcast( const TYPE &value, const int root ) const +{ + if ( root >= comm_size ) + MPI_CLASS_ERROR( "root cannot be >= size in bcast" ); + if ( comm_size > 1 ) { + TYPE tmp = value; + call_bcast( &tmp, 1, root ); + return tmp; + } else { + return value; + } +} +template +inline void MPI_CLASS::bcast( TYPE *x, const int n, const int root ) const +{ + if ( root >= comm_size ) + MPI_CLASS_ERROR( "root cannot be >= size in bcast" ); + if ( comm_size > 1 ) + call_bcast( x, n, root ); +} + + +/************************************************************************ + * send * + ************************************************************************/ +// Define specializations of send(const TYPE*, const int, const int, int) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::send( const char *, const int, const int, int ) const; +template<> +void MPI_CLASS::send( const int *, int, const int, int ) const; +template<> +void MPI_CLASS::send( const float *, const int, const int, int ) const; +template<> +void MPI_CLASS::send( const double *, const int, const int, int ) const; +#else +template<> +void MPI_CLASS::send( const char *, const int, const int, int ) const; +#endif +// Default instantiations of send(const TYPE*, const int, const int, int) +template +inline void MPI_CLASS::send( + const TYPE *buf, const int length, const int recv_proc_number, int tag ) const +{ + static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); + send( (const char *) buf, length * sizeof( TYPE ), recv_proc_number, tag ); +} + + +/************************************************************************ + * Isend * + ************************************************************************/ +// Define specializations of Isend(const TYPE*, const int, const int, const int) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +MPI_Request MPI_CLASS::Isend( const char *, const int, const int, const int ) const; +template<> +MPI_Request MPI_CLASS::Isend( const int *, int, const int, const int ) const; +template<> +MPI_Request MPI_CLASS::Isend( const float *, const int, const int, const int ) const; +template<> +MPI_Request MPI_CLASS::Isend( const double *, const int, const int, const int ) const; +#else +template<> +MPI_Request MPI_CLASS::Isend( const char *, const int, const int, const int ) const; +#endif +// Default instantiations of Isend(const TYPE*, const int, const int, const int) +template +inline MPI_Request MPI_CLASS::Isend( + const TYPE *buf, const int length, const int recv_proc_number, const int tag ) const +{ + static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); + return Isend( (const char *) buf, length * sizeof( TYPE ), recv_proc_number, tag ); +} + + +/************************************************************************ + * recv * + 
************************************************************************/ +// Define specializations of recv(TYPE*, int&, const int, const bool, int) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::recv( char *, int &, const int, const bool, int ) const; +template<> +void MPI_CLASS::recv( int *, int &, const int, const bool, int ) const; +template<> +void MPI_CLASS::recv( float *, int &, const int, const bool, int ) const; +template<> +void MPI_CLASS::recv( double *, int &, const int, const bool, int ) const; +#else +template<> +void MPI_CLASS::recv( char *, int &, const int, const bool, int ) const; +#endif +// Default instantiations of recv(TYPE*, int&, const int, const bool, int) +template +inline void MPI_CLASS::recv( + TYPE *buf, int &length, const int send_proc_number, const bool get_length, int tag ) const +{ + static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); + int size = length * sizeof( TYPE ); + recv( (char *) buf, size, send_proc_number, get_length, tag ); + if ( get_length ) { + MPI_CLASS_ASSERT( size % sizeof( TYPE ) == 0 ); + length = size / sizeof( TYPE ); + } +} + + +/************************************************************************ + * Irecv * + ************************************************************************/ +// Define specializations of recv(TYPE*, int&, const int, const bool, int) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +MPI_Request MPI_CLASS::Irecv( char *, const int, const int, const int ) const; +template<> +MPI_Request MPI_CLASS::Irecv( int *, const int, const int, const int ) const; +template<> +MPI_Request MPI_CLASS::Irecv( float *, const int, const int, const int ) const; +template<> +MPI_Request MPI_CLASS::Irecv( double *, const int, const int, const int ) const; +#else +template<> +MPI_Request MPI_CLASS::Irecv( char *, const int, const int, const int ) const; +#endif +// Default instantiations of recv(TYPE*, int&, const int, const bool, int) +template +inline MPI_Request MPI_CLASS::Irecv( + TYPE *buf, const int length, const int send_proc, const int tag ) const +{ + static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); + return Irecv( (char *) buf, length * sizeof( TYPE ), send_proc, tag ); +} + + +/************************************************************************ + * allGather * + ************************************************************************/ +template +std::vector MPI_CLASS::allGather( const TYPE &x ) const +{ + static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); + if ( getSize() <= 1 ) + return std::vector( 1, x ); + std::vector data( getSize() ); + allGather( x, data.data() ); + return data; +} +template +std::vector MPI_CLASS::allGather( const std::vector &x ) const +{ + static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); + if ( getSize() <= 1 ) + return x; + std::vector count = allGather( x.size() ); + std::vector disp( getSize(), 0 ); + size_t N = count[0]; + for ( size_t i = 1; i < count.size(); i++ ) { + disp[i] = disp[i - 1] + count[i - 1]; + N += count[i]; + } + std::vector data( N ); + allGather( x.data(), x.size(), data.data(), count.data(), disp.data(), true ); + return data; +} +// Specialization of MPI_CLASS::allGather for std::string +template<> +inline void MPI_CLASS::allGather( const std::string &x_in, std::string *x_out ) const +{ + // Get the bytes recvied per processor + std::vector recv_cnt( comm_size, 0 ); + allGather( (int) x_in.size() + 1, &recv_cnt[0] ); + std::vector 
recv_disp( comm_size, 0 ); + for ( int i = 1; i < comm_size; i++ ) + recv_disp[i] = recv_disp[i - 1] + recv_cnt[i - 1]; + // Call the vector form of allGather for the char arrays + char *recv_data = new char[recv_disp[comm_size - 1] + recv_cnt[comm_size - 1]]; + allGather( + x_in.c_str(), (int) x_in.size() + 1, recv_data, &recv_cnt[0], &recv_disp[0], true ); + for ( int i = 0; i < comm_size; i++ ) + x_out[i] = std::string( &recv_data[recv_disp[i]] ); + delete[] recv_data; +} +// Default instantiation of MPI_CLASS::allGather +template +inline void MPI_CLASS::allGather( const TYPE &x_in, TYPE *x_out ) const +{ + static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); + if ( comm_size > 1 ) { + // We can use the vector form of allGather with a char array to ge the data we want + call_allGather( x_in, x_out ); + } else { + // Single processor case + x_out[0] = x_in; + } +} +// Specialization of MPI_CLASS::allGather for std::string +template<> +inline int MPI_CLASS::allGather( + const std::string *, const int, std::string *, int *, int *, bool ) const +{ + MPI_CLASS_ERROR( "Cannot allGather an array of strings" ); + return 0; +} +// Define specializations of call_allGather(const TYPE, TYPE*) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::call_allGather( const unsigned char &, unsigned char * ) const; +template<> +void MPI_CLASS::call_allGather( const char &, char * ) const; +template<> +void MPI_CLASS::call_allGather( const unsigned int &, unsigned int * ) const; +template<> +void MPI_CLASS::call_allGather( const int &, int * ) const; +template<> +void MPI_CLASS::call_allGather( + const unsigned long int &, unsigned long int * ) const; +template<> +void MPI_CLASS::call_allGather( const long int &, long int * ) const; +template<> +void MPI_CLASS::call_allGather( const float &, float * ) const; +template<> +void MPI_CLASS::call_allGather( const double &, double * ) const; +#endif +// Default instantiation of MPI_CLASS::allGather +template +int MPI_CLASS::allGather( const TYPE *send_data, const int send_cnt, TYPE *recv_data, int *recv_cnt, + int *recv_disp, bool known_recv ) const +{ + // Check the inputs + if ( known_recv && ( recv_cnt == nullptr || recv_disp == nullptr ) ) + MPI_CLASS_ERROR( "Error calling allGather" ); + // Check if we are dealing with a single processor + if ( comm_size == 1 ) { + if ( send_data == nullptr && send_cnt > 0 ) { + MPI_CLASS_ERROR( "send_data is null" ); + } else if ( !known_recv ) { + // We do not know the recieved sizes + for ( int i = 0; i < send_cnt; i++ ) + recv_data[i] = send_data[i]; + if ( recv_cnt != nullptr ) + recv_cnt[0] = send_cnt; + if ( recv_disp != nullptr ) + recv_disp[0] = 0; + } else { + // We know the recieved sizes + for ( int i = 0; i < send_cnt; i++ ) + recv_data[i + recv_disp[0]] = send_data[i]; + } + return send_cnt; + } + // Get the sizes of the recieved data (if necessary) + int *recv_cnt2 = recv_cnt; + int *recv_disp2 = recv_disp; + if ( !known_recv ) { + if ( recv_cnt == nullptr ) + recv_cnt2 = new int[comm_size]; + if ( recv_disp == nullptr ) + recv_disp2 = new int[comm_size]; + call_allGather( send_cnt, recv_cnt2 ); + recv_disp2[0] = 0; + for ( int i = 1; i < comm_size; i++ ) + recv_disp2[i] = recv_disp2[i - 1] + recv_cnt2[i - 1]; + } + int N_recv = 0; + for ( int i = 0; i < comm_size; i++ ) + N_recv += recv_cnt2[i]; + // Send/recv the data + call_allGather( send_data, send_cnt, recv_data, recv_cnt2, recv_disp2 ); + // Delete any temporary memory + if ( recv_cnt == nullptr ) + delete[] 
recv_cnt2; + if ( recv_disp == nullptr ) + delete[] recv_disp2; + return N_recv; +} +// Default instantiations of call_allGather(const TYPE, TYPE*) +template +void MPI_CLASS::call_allGather( const TYPE &x_in, TYPE *x_out ) const +{ + static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); + allGather( (const char *) &x_in, (int) sizeof( TYPE ), (char *) x_out ); +} +// Define specializations of call_allGather(const TYPE*, int, TYPE*, int*, int*) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::call_allGather( + const unsigned char *, int, unsigned char *, int *, int * ) const; +template<> +void MPI_CLASS::call_allGather( const char *, int, char *, int *, int * ) const; +template<> +void MPI_CLASS::call_allGather( + const unsigned int *, int, unsigned int *, int *, int * ) const; +template<> +void MPI_CLASS::call_allGather( const int *, int, int *, int *, int * ) const; +template<> +void MPI_CLASS::call_allGather( + const unsigned long int *, int, unsigned long int *, int *, int * ) const; +template<> +void MPI_CLASS::call_allGather( const long int *, int, long int *, int *, int * ) const; +template<> +void MPI_CLASS::call_allGather( const float *, int, float *, int *, int * ) const; +template<> +void MPI_CLASS::call_allGather( const double *, int, double *, int *, int * ) const; +#else +template<> +void MPI_CLASS::call_allGather( const char *, int, char *, int *, int * ) const; +#endif +// Default instantiations of int call_allGather(const TYPE*, int, TYPE*, int*) +template +void MPI_CLASS::call_allGather( + const TYPE *x_in, int size_in, TYPE *x_out, int *size_out, int *disp_out ) const +{ + int *size2 = new int[comm_size]; + int *disp2 = new int[comm_size]; + for ( int i = 0; i < comm_size; i++ ) { + size2[i] = size_out[i] * sizeof( TYPE ); + disp2[i] = disp_out[i] * sizeof( TYPE ); + } + static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); + call_allGather( + (const char *) x_in, (int) size_in * sizeof( TYPE ), (char *) x_out, size2, disp2 ); + delete[] size2; + delete[] disp2; +} + + +/************************************************************************ + * setGather * + ************************************************************************/ +template +inline void MPI_CLASS::setGather( std::set &set ) const +{ + std::vector send_buf( set.begin(), set.end() ); + std::vector recv_cnt( this->comm_size, 0 ); + this->allGather( (int) send_buf.size(), &recv_cnt[0] ); + std::vector recv_disp( this->comm_size, 0 ); + for ( int i = 1; i < this->comm_size; i++ ) + recv_disp[i] = recv_disp[i - 1] + recv_cnt[i - 1]; + size_t N_recv_tot = 0; + for ( int i = 0; i < this->comm_size; i++ ) + N_recv_tot += recv_cnt[i]; + if ( N_recv_tot == 0 ) + return; + std::vector recv_buf( N_recv_tot ); + TYPE *send_data = nullptr; + if ( send_buf.size() > 0 ) { + send_data = &send_buf[0]; + } + TYPE *recv_data = &recv_buf[0]; + static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); + this->allGather( + send_data, (int) send_buf.size(), recv_data, &recv_cnt[0], &recv_disp[0], true ); + for ( size_t i = 0; i < recv_buf.size(); i++ ) + set.insert( recv_buf[i] ); +} + + +/************************************************************************ + * mapGather * + ************************************************************************/ +template +inline void MPI_CLASS::mapGather( std::map &map ) const +{ + std::vector send_id; + std::vector send_data; + send_id.reserve( map.size() ); + send_data.reserve( map.size() ); + for ( auto it 
= map.begin(); it != map.end(); ++it ) { + send_id.push_back( it->first ); + send_data.push_back( it->second ); + } + int send_size = (int) send_id.size(); + std::vector recv_cnt( this->comm_size, 0 ); + this->allGather( send_size, &recv_cnt[0] ); + std::vector recv_disp( this->comm_size, 0 ); + for ( int i = 1; i < this->comm_size; i++ ) + recv_disp[i] = recv_disp[i - 1] + recv_cnt[i - 1]; + size_t N_recv_tot = 0; + for ( int i = 0; i < this->comm_size; i++ ) + N_recv_tot += recv_cnt[i]; + if ( N_recv_tot == 0 ) + return; + std::vector recv_id( N_recv_tot ); + std::vector recv_data( N_recv_tot ); + KEY *send_data1 = nullptr; + DATA *send_data2 = nullptr; + if ( send_id.size() > 0 ) { + send_data1 = &send_id[0]; + send_data2 = &send_data[0]; + } + static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); + this->allGather( send_data1, send_size, &recv_id[0], &recv_cnt[0], &recv_disp[0], true ); + this->allGather( + send_data2, send_size, &recv_data[0], &recv_cnt[0], &recv_disp[0], true ); + map = std::map(); + for ( size_t i = 0; i < N_recv_tot; i++ ) + map.insert( std::pair( recv_id[i], recv_data[i] ) ); +} + + +/************************************************************************ + * sumScan * + ************************************************************************/ +template +inline void MPI_CLASS::sumScan( const TYPE *x, TYPE *y, const int n ) const +{ + if ( comm_size > 1 ) { + call_sumScan( x, y, n ); + } else { + for ( int i = 0; i < n; i++ ) + y[i] = x[i]; + } +} +// Define specializations of call_sumScan(const TYPE*, TYPE*, int) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::call_sumScan( const unsigned char *, unsigned char *, int ) const; +template<> +void MPI_CLASS::call_sumScan( const char *, char *, int ) const; +template<> +void MPI_CLASS::call_sumScan( const unsigned int *, unsigned int *, int ) const; +template<> +void MPI_CLASS::call_sumScan( const int *, int *, int ) const; +template<> +void MPI_CLASS::call_sumScan( + const unsigned long int *, unsigned long int *, int ) const; +template<> +void MPI_CLASS::call_sumScan( const long int *, long int *, int ) const; +template<> +void MPI_CLASS::call_sumScan( const size_t *, size_t *, int ) const; +template<> +void MPI_CLASS::call_sumScan( const float *, float *, int ) const; +template<> +void MPI_CLASS::call_sumScan( const double *, double *, int ) const; +template<> +void MPI_CLASS::call_sumScan>( + const std::complex *, std::complex *, int ) const; +#endif +// Default instantiations of call_sumScan(const TYPE*, TYPE*, int) +template +void MPI_CLASS::call_sumScan( const TYPE *, TYPE *, int ) const +{ + char message[200]; + sprintf( message, "Default instantion of sumScan in parallel is not supported (%s)", + typeid( TYPE ).name() ); + MPI_CLASS_ERROR( message ); +} + + +/************************************************************************ + * minScan * + ************************************************************************/ +template +inline void MPI_CLASS::minScan( const TYPE *x, TYPE *y, const int n ) const +{ + if ( comm_size > 1 ) { + call_minScan( x, y, n ); + } else { + for ( int i = 0; i < n; i++ ) + y[i] = x[i]; + } +} +// Define specializations of call_minScan(const TYPE*, TYPE*, int) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::call_minScan( const unsigned char *, unsigned char *, int ) const; +template<> +void MPI_CLASS::call_minScan( const char *, char *, int ) const; +template<> +void MPI_CLASS::call_minScan( const 
unsigned int *, unsigned int *, int ) const; +template<> +void MPI_CLASS::call_minScan( const int *, int *, int ) const; +template<> +void MPI_CLASS::call_minScan( + const unsigned long int *, unsigned long int *, int ) const; +template<> +void MPI_CLASS::call_minScan( const long int *, long int *, int ) const; +template<> +void MPI_CLASS::call_minScan( const size_t *, size_t *, int ) const; +template<> +void MPI_CLASS::call_minScan( const float *, float *, int ) const; +template<> +void MPI_CLASS::call_minScan( const double *, double *, int ) const; +#endif +// Default instantiations of call_minScan(const TYPE*, TYPE*, int) +template +void MPI_CLASS::call_minScan( const TYPE *, TYPE *, int ) const +{ + char message[200]; + sprintf( message, "Default instantion of minScan in parallel is not supported (%s)", + typeid( TYPE ).name() ); + MPI_CLASS_ERROR( message ); +} + + +/************************************************************************ + * maxScan * + ************************************************************************/ +template +inline void MPI_CLASS::maxScan( const TYPE *x, TYPE *y, const int n ) const +{ + if ( comm_size > 1 ) { + call_maxScan( x, y, n ); + } else { + for ( int i = 0; i < n; i++ ) + y[i] = x[i]; + } +} +// Define specializations of call_maxScan(const TYPE*, TYPE*, int) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::call_maxScan( const unsigned char *, unsigned char *, int ) const; +template<> +void MPI_CLASS::call_maxScan( const char *, char *, int ) const; +template<> +void MPI_CLASS::call_maxScan( const unsigned int *, unsigned int *, int ) const; +template<> +void MPI_CLASS::call_maxScan( const int *, int *, int ) const; +template<> +void MPI_CLASS::call_maxScan( + const unsigned long int *, unsigned long int *, int ) const; +template<> +void MPI_CLASS::call_maxScan( const long int *, long int *, int ) const; +template<> +void MPI_CLASS::call_maxScan( const size_t *, size_t *, int ) const; +template<> +void MPI_CLASS::call_maxScan( const float *, float *, int ) const; +template<> +void MPI_CLASS::call_maxScan( const double *, double *, int ) const; +#endif +// Default instantiations of call_maxScan(const TYPE*, TYPE*, int) +template +void MPI_CLASS::call_maxScan( const TYPE *, TYPE *, int ) const +{ + char message[200]; + sprintf( message, "Default instantion of maxReduce in parallel is not supported (%s)", + typeid( TYPE ).name() ); + MPI_CLASS_ERROR( message ); +} + + +/************************************************************************ + * allToAll * + ************************************************************************/ +// Define specializations of allToAll(const int n, const char*, char* ) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::allToAll( + const int n, const unsigned char *, unsigned char * ) const; +template<> +void MPI_CLASS::allToAll( const int n, const char *, char * ) const; +template<> +void MPI_CLASS::allToAll( const int n, const unsigned int *, unsigned int * ) const; +template<> +void MPI_CLASS::allToAll( const int n, const int *, int * ) const; +template<> +void MPI_CLASS::allToAll( + const int n, const unsigned long int *, unsigned long int * ) const; +template<> +void MPI_CLASS::allToAll( const int n, const long int *, long int * ) const; +template<> +void MPI_CLASS::allToAll( const int n, const float *, float * ) const; +template<> +void MPI_CLASS::allToAll( const int n, const double *, double * ) const; +#endif +// Default instantiations of allToAll(const 
int n, const char*, char* ) +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template +void MPI_CLASS::allToAll( const int n, const TYPE *send_data, TYPE *recv_data ) const +{ + static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); + allToAll( n * sizeof( TYPE ), (char *) send_data, (char *) recv_data ); +} +#else +template +void MPI_CLASS::allToAll( const int n, const TYPE *send_data, TYPE *recv_data ) const +{ + if ( comm_size != 1 ) + MPI_CLASS_ERROR( "Invalid size for allToAll" ); + for ( int i = 0; i < n; i++ ) + recv_data[i] = send_data[i]; +} +#endif + + +/************************************************************************ + * allToAll * + ************************************************************************/ +template +int MPI_CLASS::allToAll( const TYPE *send_data, const int send_cnt[], const int send_disp[], + TYPE *recv_data, int *recv_cnt, int *recv_disp, bool known_recv ) const +{ + int N_recieved = 0; + if ( comm_size == 1 ) { + // Special case for single-processor communicators + if ( known_recv ) { + if ( recv_cnt[0] != send_cnt[0] && send_cnt[0] > 0 ) + MPI_CLASS_ERROR( "Single processor send/recv are different sizes" ); + } else { + if ( recv_cnt != nullptr ) + recv_cnt[0] = send_cnt[0]; + if ( recv_disp != nullptr ) + recv_disp[0] = send_disp[0]; + } + for ( int i = 0; i < send_cnt[0]; i++ ) + recv_data[i + recv_disp[0]] = send_data[i + send_disp[0]]; + N_recieved = send_cnt[0]; + } else if ( known_recv ) { + // The recieve sizes are known + MPI_CLASS_ASSERT( recv_cnt != nullptr && recv_disp != nullptr ); + call_allToAll( send_data, send_cnt, send_disp, recv_data, recv_cnt, recv_disp ); + for ( int i = 0; i < comm_size; i++ ) + N_recieved += recv_cnt[i]; + } else { + // The recieve sizes are not known, we need to communicate that information first + int *recv_cnt2 = recv_cnt; + int *recv_disp2 = recv_disp; + if ( recv_cnt == nullptr ) + recv_cnt2 = new int[comm_size]; + if ( recv_disp == nullptr ) + recv_disp2 = new int[comm_size]; + // Communicate the size we will be recieving from each processor + allToAll( 1, send_cnt, recv_cnt2 ); + recv_disp2[0] = 0; + for ( int i = 1; i < comm_size; i++ ) + recv_disp2[i] = recv_disp2[i - 1] + recv_cnt2[i - 1]; + // Send the data + call_allToAll( send_data, send_cnt, send_disp, recv_data, recv_cnt2, recv_disp2 ); + for ( int i = 0; i < comm_size; i++ ) + N_recieved += recv_cnt2[i]; + if ( recv_cnt == nullptr ) + delete[] recv_cnt2; + if ( recv_disp == nullptr ) + delete[] recv_disp2; + } + return N_recieved; +} +// Define specializations of call_allToAll +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::call_allToAll( const unsigned char *, const int *, const int *, + unsigned char *, const int *, const int * ) const; +template<> +void MPI_CLASS::call_allToAll( + const char *, const int *, const int *, char *, const int *, const int * ) const; +template<> +void MPI_CLASS::call_allToAll( const unsigned int *, const int *, const int *, + unsigned int *, const int *, const int * ) const; +template<> +void MPI_CLASS::call_allToAll( + const int *, const int *, const int *, int *, const int *, const int * ) const; +template<> +void MPI_CLASS::call_allToAll( const unsigned long int *, const int *, + const int *, unsigned long int *, const int *, const int * ) const; +template<> +void MPI_CLASS::call_allToAll( + const long int *, const int *, const int *, long int *, const int *, const int * ) const; +template<> +void MPI_CLASS::call_allToAll( + const float *, const int *, const int 
*, float *, const int *, const int * ) const; +template<> +void MPI_CLASS::call_allToAll( + const double *, const int *, const int *, double *, const int *, const int * ) const; +#else +template<> +void MPI_CLASS::call_allToAll( + const char *, const int *, const int *, char *, const int *, const int * ) const; +#endif +// Default instantiations of call_allToAll +template +void MPI_CLASS::call_allToAll( const TYPE *send_data, const int send_cnt[], const int send_disp[], + TYPE *recv_data, const int *recv_cnt, const int *recv_disp ) const +{ + int *send_cnt2 = new int[comm_size]; + int *recv_cnt2 = new int[comm_size]; + int *send_disp2 = new int[comm_size]; + int *recv_disp2 = new int[comm_size]; + for ( int i = 0; i < comm_size; i++ ) { + send_cnt2[i] = send_cnt[i] * sizeof( TYPE ); + send_disp2[i] = send_disp[i] * sizeof( TYPE ); + recv_cnt2[i] = recv_cnt[i] * sizeof( TYPE ); + recv_disp2[i] = recv_disp[i] * sizeof( TYPE ); + } + static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); + call_allToAll( + (char *) send_data, send_cnt2, send_disp2, (char *) recv_data, recv_cnt2, recv_disp2 ); + delete[] send_cnt2; + delete[] recv_cnt2; + delete[] send_disp2; + delete[] recv_disp2; +} + + +} // namespace Utilities + +#endif diff --git a/common/MPI.cpp b/common/MPI.cpp new file mode 100644 index 00000000..d20c1af2 --- /dev/null +++ b/common/MPI.cpp @@ -0,0 +1,3758 @@ +// This file impliments a wrapper class for MPI functions + +#include "common/MPI.h" +#include "common/Utilities.h" + +#include "ProfilerApp.h" +#include "StackTrace/ErrorHandlers.h" +#include "StackTrace/StackTrace.h" + +// Include all other headers +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +// Include OS specific headers +#undef USE_WINDOWS +#undef USE_LINUX +#undef USE_MAC +#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 ) +// We are using windows +#define USE_WINDOWS +#include +#include +#define sched_yield() Sleep( 0 ) +#elif defined( __APPLE__ ) +// Using MAC +#define USE_MAC +#include +#elif defined( __linux ) || defined( __unix ) || defined( __posix ) +// We are using linux +#define USE_LINUX +#include +#include +#else +#error Unknown OS +#endif + + +// Convience defines +#define MPI_ERROR ERROR +#define MPI_ASSERT ASSERT +#define MPI_INSIST INSIST +#define MPI_WARNING WARNING +#define MPI_CLASS_COMM_NULL MPI_COMM_NULL +#define MPI_CLASS_COMM_SELF MPI_COMM_SELF +#define MPI_CLASS_COMM_WORLD MPI_COMM_WORLD + + +// Global variable to track create new unique comms (dup and split) +#ifndef USE_MPI +MPI_Comm uniqueGlobalComm = 11; +#endif + + +#if defined( USE_SAMRAI ) && defined( USE_PETSC ) && !defined( USE_MPI ) +int MPI_REQUEST_NULL = 3; +int MPI_ERR_IN_STATUS = 4; +#endif + + +namespace Utilities { + + +// Some special structs to work with MPI +#ifdef USE_MPI +struct IntIntStruct { + int j; + int i; +}; +struct LongIntStruct { + long int j; + int i; +}; +struct FloatIntStruct { + float f; + int i; +}; +struct DoubleIntStruct { + double d; + int i; +}; +#endif + + +// Initialized the static member variables +volatile unsigned int MPI_CLASS::N_MPI_Comm_created = 0; +volatile unsigned int MPI_CLASS::N_MPI_Comm_destroyed = 0; +short MPI_CLASS::profile_level = 127; + + +// Define a type for use with size_t +#ifdef USE_MPI +static MPI_Datatype MPI_SIZE_T = 0x0; +static MPI_Datatype getSizeTDataType() +{ + int size_int, size_long, size_longlong, size_longlong2; + MPI_Type_size( MPI_UNSIGNED, &size_int ); + 
MPI_Type_size( MPI_UNSIGNED_LONG, &size_long ); + MPI_Type_size( MPI_UNSIGNED_LONG_LONG, &size_longlong ); + MPI_Type_size( MPI_LONG_LONG_INT, &size_longlong2 ); + if ( sizeof( size_t ) == size_int ) { + return MPI_UNSIGNED; + } else if ( sizeof( size_t ) == size_long ) { + return MPI_UNSIGNED_LONG; + } else if ( sizeof( size_t ) == size_longlong ) { + return MPI_UNSIGNED_LONG_LONG; + } else if ( sizeof( size_t ) == size_longlong2 ) { + MPI_WARNING( "Using signed long long datatype for size_t in MPI" ); + return MPI_LONG_LONG_INT; // Note: this is not unsigned + } else { + MPI_ERROR( "No suitable datatype found" ); + } + return 0; +} +#endif + + +// Static data for asyncronous communication without MPI +// Note: these routines may not be thread-safe yet +#ifndef USE_MPI +static const int mpi_max_tag = 0x003FFFFF; +struct Isendrecv_struct { + const char *data; // Pointer to data + int status; // Status: 1-sending, 2-recieving +}; +std::map global_isendrecv_list; +static MPI_Request getRequest( MPI_Comm comm, int tag ) +{ + MPI_ASSERT( tag >= 0 && tag <= mpi_max_tag ); + // Use hashing function: 2^64*0.5*(sqrt(5)-1) + uint64_t a = static_cast( comm ) * 0x9E3779B97F4A7C15; + uint64_t b = static_cast( tag ) * 0x9E3779B97F4A7C15; + uint64_t hash = a ^ b; + MPI_Request request; + memcpy( &request, &hash, sizeof( MPI_Request ) ); + return request; +} +#endif + + +// Check the mpi error code +#ifdef USE_MPI +inline void check_MPI( int error ) +{ + if ( error != MPI_SUCCESS ) + MPI_ERROR( "Error calling MPI routine" ); +} +#endif + + +/****************************************************************** + * Some helper functions to convert between signed/unsigned types * + ******************************************************************/ +DISABLE_WARNINGS +static inline constexpr unsigned int offset_int() +{ + return ~static_cast( std::numeric_limits::min() ) + 1; +} +static inline constexpr unsigned long int offset_long() +{ + return ~static_cast( std::numeric_limits::min() ) + 1; +} +static inline constexpr unsigned long long int offset_long_long() +{ + return ~static_cast( std::numeric_limits::min() ) + 1; +} +ENABLE_WARNINGS +static inline unsigned int signed_to_unsigned( int x ) +{ + const auto offset = offset_int(); + return ( x >= 0 ) ? static_cast( x ) + offset : + offset - static_cast( -x ); +} +static inline unsigned long int signed_to_unsigned( long int x ) +{ + const auto offset = offset_long(); + return ( x >= 0 ) ? static_cast( x ) + offset : + offset - static_cast( -x ); +} +static inline unsigned long long int signed_to_unsigned( long long int x ) +{ + const auto offset = offset_long_long(); + return ( x >= 0 ) ? static_cast( x ) + offset : + offset - static_cast( -x ); +} +static inline int unsigned_to_signed( unsigned int x ) +{ + const auto offset = offset_int(); + return ( x >= offset ) ? static_cast( x - offset ) : -static_cast( offset - x ); +} +static inline long int unsigned_to_signed( unsigned long int x ) +{ + const auto offset = offset_long(); + return ( x >= offset ) ? static_cast( x - offset ) : + -static_cast( offset - x ); +} +static inline long long int unsigned_to_signed( unsigned long long int x ) +{ + const auto offset = offset_long_long(); + return ( x >= offset ) ? 
static_cast( x - offset ) : + -static_cast( offset - x ); +} + + +/************************************************************************ + * Get the MPI version * + ************************************************************************/ +std::array MPI_CLASS::version() +{ +#ifdef USE_MPI + int MPI_version; + int MPI_subversion; + MPI_Get_version( &MPI_version, &MPI_subversion ); + return { MPI_version, MPI_subversion }; +#else + return { 0, 0 }; +#endif +} +std::string MPI_CLASS::info() +{ +#ifdef USE_MPI +#if MPI_VERSION >= 3 + int MPI_version_length = 0; + char MPI_version_string[MPI_MAX_LIBRARY_VERSION_STRING]; + MPI_Get_library_version( MPI_version_string, &MPI_version_length ); + if ( MPI_version_length > 0 ) { + std::string MPI_info( MPI_version_string, MPI_version_length ); + size_t pos = MPI_info.find( '\n' ); + while ( pos != std::string::npos ) { + MPI_info.insert( pos + 1, " " ); + pos = MPI_info.find( '\n', pos + 1 ); + } + return MPI_info; + } +#endif + auto tmp = version(); + return std::to_string( tmp[0] ) + "." + std::to_string( tmp[0] ); +#else + return std::string(); +#endif +} + + +/************************************************************************ + * Functions to get/set the process affinities * + ************************************************************************/ +int MPI_CLASS::getNumberOfProcessors() { return std::thread::hardware_concurrency(); } +std::vector MPI_CLASS::getProcessAffinity() +{ + std::vector procs; +#ifdef USE_LINUX + cpu_set_t mask; + int error = sched_getaffinity( getpid(), sizeof( cpu_set_t ), &mask ); + if ( error != 0 ) + MPI_ERROR( "Error getting process affinity" ); + for ( int i = 0; i < (int) sizeof( cpu_set_t ) * CHAR_BIT; i++ ) { + if ( CPU_ISSET( i, &mask ) ) + procs.push_back( i ); + } +#elif defined( USE_MAC ) + // MAC does not support getting or setting the affinity + printf( "Warning: MAC does not support getting the process affinity\n" ); + procs.clear(); +#elif defined( USE_WINDOWS ) + HANDLE hProc = GetCurrentProcess(); + size_t procMask; + size_t sysMask; + PDWORD_PTR procMaskPtr = reinterpret_cast( &procMask ); + PDWORD_PTR sysMaskPtr = reinterpret_cast( &sysMask ); + GetProcessAffinityMask( hProc, procMaskPtr, sysMaskPtr ); + for ( int i = 0; i < (int) sizeof( size_t ) * CHAR_BIT; i++ ) { + if ( ( procMask & 0x1 ) != 0 ) + procs.push_back( i ); + procMask >>= 1; + } +#else +#error Unknown OS +#endif + return procs; +} +void MPI_CLASS::setProcessAffinity( const std::vector &procs ) +{ +#ifdef USE_LINUX + cpu_set_t mask; + CPU_ZERO( &mask ); + for ( auto cpu : procs ) + CPU_SET( cpu, &mask ); + int error = sched_setaffinity( getpid(), sizeof( cpu_set_t ), &mask ); + if ( error != 0 ) + MPI_ERROR( "Error setting process affinity" ); +#elif defined( USE_MAC ) + // MAC does not support getting or setting the affinity + NULL_USE( procs ); +#elif defined( USE_WINDOWS ) + DWORD mask = 0; + for ( size_t i = 0; i < procs.size(); i++ ) + mask |= ( (DWORD) 1 ) << procs[i]; + HANDLE hProc = GetCurrentProcess(); + SetProcessAffinityMask( hProc, mask ); +#else +#error Unknown OS +#endif +} + + +/************************************************************************ + * Function to check if MPI is active * + ************************************************************************/ +bool MPI_CLASS::MPI_active() +{ +#ifdef USE_MPI + int initialized = 0, finalized = 0; + MPI_Initialized( &initialized ); + MPI_Finalized( &finalized ); + return initialized != 0 && finalized == 0; +#else + return true; +#endif +} 
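As a usage sketch (not part of the patch): the static query helpers above can be called before any point-to-point or collective communication is issued. The snippet below assumes the wrapper class is exposed as Utilities::MPI (the MPI_CLASS macro) through common/MPI.h.

    #include "common/MPI.h"
    #include <cstdio>

    // Report which MPI runtime the wrapper sees, but only if MPI is initialized
    // and not yet finalized.
    void reportMpiEnvironment()
    {
        if ( !Utilities::MPI::MPI_active() )
            return;
        auto ver = Utilities::MPI::version();   // { major, minor }
        std::printf( "MPI %d.%d\n%s\n", ver[0], ver[1], Utilities::MPI::info().c_str() );
    }
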
+MPI_CLASS::ThreadSupport MPI_CLASS::queryThreadSupport() +{ +#ifdef USE_MPI + int provided = 0; + MPI_Query_thread( &provided ); + if ( provided == MPI_THREAD_SINGLE ) + return ThreadSupport::SINGLE; + if ( provided == MPI_THREAD_FUNNELED ) + return ThreadSupport::FUNNELED; + if ( provided == MPI_THREAD_SERIALIZED ) + return ThreadSupport::SERIALIZED; + if ( provided == MPI_THREAD_MULTIPLE ) + return ThreadSupport::MULTIPLE; + return ThreadSupport::SINGLE; +#else + return ThreadSupport::MULTIPLE; +#endif +} + + +/************************************************************************ + * Function to perform a load balance of the given processes * + ************************************************************************/ +void MPI_CLASS::balanceProcesses( const MPI_CLASS &globalComm, const int method, + const std::vector &procs, const int N_min_in, const int N_max_in ) +{ + // Build the list of processors to use + std::vector cpus = procs; + if ( cpus.empty() ) { + for ( int i = 0; i < getNumberOfProcessors(); i++ ) + cpus.push_back( i ); + } + // Handle the "easy cases" + if ( method == 1 ) { + // Trivial case where we do not need any communication + setProcessAffinity( cpus ); + return; + } + // Get the sub-communicator for the current node + MPI_CLASS nodeComm = globalComm.splitByNode(); + int N_min = std::min( std::max( N_min_in, 1 ), cpus.size() ); + int N_max = N_max_in; + if ( N_max == -1 ) + N_max = cpus.size(); + N_max = std::min( N_max, cpus.size() ); + MPI_ASSERT( N_max >= N_min ); + // Perform the load balance within the node + if ( method == 2 ) { + int N_proc = cpus.size() / nodeComm.getSize(); + N_proc = std::max( N_proc, N_min ); + N_proc = std::min( N_proc, N_max ); + std::vector cpus2( N_proc, -1 ); + for ( int i = 0; i < N_proc; i++ ) + cpus2[i] = cpus[( nodeComm.getRank() * N_proc + i ) % cpus.size()]; + setProcessAffinity( cpus2 ); + } else { + MPI_ERROR( "Unknown method for load balance" ); + } +} + + +/************************************************************************ + * Empty constructor * + ************************************************************************/ +MPI_CLASS::MPI_CLASS() +{ +// Initialize the data members to a defaul communicator of self +#ifdef USE_MPI + communicator = MPI_COMM_NULL; + d_maxTag = 0x7FFFFFFF; +#else + communicator = MPI_CLASS_COMM_NULL; + d_maxTag = mpi_max_tag; +#endif + d_ranks = nullptr; + d_count = nullptr; + d_manage = false; + comm_rank = 0; + comm_size = 1; + d_isNull = true; + d_currentTag = nullptr; + d_call_abort = true; + tmp_alignment = -1; +} + + +/************************************************************************ + * Empty deconstructor * + ************************************************************************/ +MPI_CLASS::~MPI_CLASS() { reset(); } +void MPI_CLASS::reset() +{ + // Decrement the count if used + int count = -1; + if ( d_count != nullptr ) + count = --( *d_count ); + if ( count == 0 ) { + // We are holding that last reference to the MPI_Comm object, we need to free it + if ( d_manage ) { +#ifdef USE_MPI + MPI_Comm_set_errhandler( communicator, MPI_ERRORS_ARE_FATAL ); + int err = MPI_Comm_free( &communicator ); + if ( err != MPI_SUCCESS ) + MPI_ERROR( "Problem free'ing MPI_Comm object" ); + communicator = MPI_CLASS_COMM_NULL; + ++N_MPI_Comm_destroyed; +#endif + } + if ( d_ranks != nullptr ) + delete[] d_ranks; + delete d_count; + } + if ( d_currentTag == nullptr ) { + // No tag index + } else if ( d_currentTag[1] > 1 ) { + --( d_currentTag[1] ); + } else { + delete[] d_currentTag; + } + 
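+    // Finally, clear the members so the object no longer references a communicator.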
d_manage = false; + d_count = nullptr; + d_ranks = nullptr; + comm_rank = 0; + comm_size = 1; + d_maxTag = 0; + d_isNull = true; + d_currentTag = nullptr; + d_call_abort = true; +} + + +/************************************************************************ + * Copy constructors * + ************************************************************************/ +MPI_CLASS::MPI_CLASS( const MPI_CLASS &comm ) + : communicator( comm.communicator ), + d_isNull( comm.d_isNull ), + d_manage( comm.d_manage ), + comm_rank( comm.comm_rank ), + comm_size( comm.comm_size ), + d_ranks( comm.d_ranks ), + d_maxTag( comm.d_maxTag ), + d_currentTag( comm.d_currentTag ) +{ + // Initialize the data members to the existing comm object + if ( d_currentTag != nullptr ) + ++d_currentTag[1]; + d_call_abort = comm.d_call_abort; + // Set and increment the count + d_count = comm.d_count; + if ( d_count != nullptr ) + ++( *d_count ); + tmp_alignment = -1; +} +MPI_CLASS::MPI_CLASS( MPI_CLASS &&rhs ) : MPI_CLASS() +{ + std::swap( communicator, rhs.communicator ); + std::swap( d_isNull, rhs.d_isNull ); + std::swap( d_manage, rhs.d_manage ); + std::swap( d_call_abort, rhs.d_call_abort ); + std::swap( profile_level, rhs.profile_level ); + std::swap( comm_rank, rhs.comm_rank ); + std::swap( comm_size, rhs.comm_size ); + std::swap( d_ranks, rhs.d_ranks ); + std::swap( d_maxTag, rhs.d_maxTag ); + std::swap( d_currentTag, rhs.d_currentTag ); + std::swap( d_count, rhs.d_count ); + std::swap( tmp_alignment, rhs.tmp_alignment ); +} + + +/************************************************************************ + * Assignment operators * + ************************************************************************/ +MPI_CLASS &MPI_CLASS::operator=( const MPI_CLASS &comm ) +{ + if ( this == &comm ) // protect against invalid self-assignment + return *this; + // Destroy the previous object + this->reset(); + // Initialize the data members to the existing object + this->communicator = comm.communicator; + this->comm_rank = comm.comm_rank; + this->comm_size = comm.comm_size; + this->d_ranks = comm.d_ranks; + this->d_isNull = comm.d_isNull; + this->d_manage = comm.d_manage; + this->d_maxTag = comm.d_maxTag; + this->d_call_abort = comm.d_call_abort; + this->d_currentTag = comm.d_currentTag; + if ( this->d_currentTag != nullptr ) + ++( this->d_currentTag[1] ); + // Set and increment the count + this->d_count = comm.d_count; + if ( this->d_count != nullptr ) + ++( *d_count ); + this->tmp_alignment = -1; + return *this; +} +MPI_CLASS &MPI_CLASS::operator=( MPI_CLASS &&rhs ) +{ + if ( this == &rhs ) // protect against invalid self-assignment + return *this; + std::swap( communicator, rhs.communicator ); + std::swap( d_isNull, rhs.d_isNull ); + std::swap( d_manage, rhs.d_manage ); + std::swap( d_call_abort, rhs.d_call_abort ); + std::swap( profile_level, rhs.profile_level ); + std::swap( comm_rank, rhs.comm_rank ); + std::swap( comm_size, rhs.comm_size ); + std::swap( d_ranks, rhs.d_ranks ); + std::swap( d_maxTag, rhs.d_maxTag ); + std::swap( d_currentTag, rhs.d_currentTag ); + std::swap( d_count, rhs.d_count ); + std::swap( tmp_alignment, rhs.tmp_alignment ); + return *this; +} + + +/************************************************************************ + * Constructor from existing MPI communicator * + ************************************************************************/ +int d_global_currentTag_world1[2] = { 1, 1 }; +int d_global_currentTag_world2[2] = { 1, 1 }; +int d_global_currentTag_self[2] = { 1, 1 }; +#ifdef USE_MPI +std::atomic_int 
d_global_count_world1 = { 1 }; +std::atomic_int d_global_count_world2 = { 1 }; +std::atomic_int d_global_count_self = { 1 }; +#endif +MPI_CLASS::MPI_CLASS( MPI_Comm comm, bool manage ) +{ + d_count = nullptr; + d_ranks = nullptr; + d_manage = false; + tmp_alignment = -1; + // Check if we are using our version of comm_world + if ( comm == MPI_CLASS_COMM_WORLD ) { + communicator = MPI_COMM_WORLD; + } else if ( comm == MPI_CLASS_COMM_SELF ) { + communicator = MPI_COMM_SELF; + } else if ( comm == MPI_CLASS_COMM_NULL ) { + communicator = MPI_COMM_NULL; + } else { + communicator = comm; + } +#ifdef USE_MPI + // We are using MPI, use the MPI communicator to initialize the data + if ( communicator != MPI_COMM_NULL ) { + // Set the MPI_SIZE_T datatype if it has not been set + if ( MPI_SIZE_T == 0x0 ) + MPI_SIZE_T = getSizeTDataType(); + // Attach the error handler + StackTrace::setMPIErrorHandler( communicator ); + // Get the communicator properties + MPI_Comm_rank( communicator, &comm_rank ); + MPI_Comm_size( communicator, &comm_size ); + int flag, *val; + int ierr = MPI_Comm_get_attr( communicator, MPI_TAG_UB, &val, &flag ); + MPI_ASSERT( ierr == MPI_SUCCESS ); + if ( flag == 0 ) { + d_maxTag = 0x7FFFFFFF; // The tag is not a valid attribute (set to 2^31-1) + } else { + d_maxTag = *val; + if ( d_maxTag < 0 ) { + d_maxTag = 0x7FFFFFFF; + } // The maximum tag is > a signed int (set to 2^31-1) + MPI_INSIST( d_maxTag >= 0x7FFF, "maximum tag size is < MPI standard" ); + } + } else { + comm_rank = 1; + comm_size = 0; + d_maxTag = 0x7FFFFFFF; + } + d_isNull = communicator == MPI_COMM_NULL; + if ( manage && communicator != MPI_COMM_NULL && communicator != MPI_COMM_SELF && + communicator != MPI_COMM_WORLD ) + d_manage = true; + // Create the count (Note: we do not need to worry about thread safety) + if ( communicator == MPI_CLASS_COMM_WORLD ) { + d_count = &d_global_count_world1; + ++( *d_count ); + } else if ( communicator == MPI_COMM_WORLD ) { + d_count = &d_global_count_world2; + ++( *d_count ); + } else if ( communicator == MPI_COMM_SELF ) { + d_count = &d_global_count_self; + ++( *d_count ); + } else if ( communicator == MPI_COMM_NULL ) { + d_count = nullptr; + } else { + d_count = new std::atomic_int; + *d_count = 1; + } + if ( d_manage ) + ++N_MPI_Comm_created; + // Create d_ranks + if ( comm_size > 1 ) { + d_ranks = new int[comm_size]; + d_ranks[0] = -1; + } +#else + // We are not using MPI, intialize based on the communicator + NULL_USE( manage ); + comm_rank = 0; + comm_size = 1; + d_maxTag = mpi_max_tag; + d_isNull = communicator == MPI_COMM_NULL; + if ( d_isNull ) + comm_size = 0; +#endif + if ( communicator == MPI_CLASS_COMM_WORLD ) { + d_currentTag = d_global_currentTag_world1; + ++( this->d_currentTag[1] ); + } else if ( communicator == MPI_COMM_WORLD ) { + d_currentTag = d_global_currentTag_world2; + ++( this->d_currentTag[1] ); + } else if ( communicator == MPI_COMM_SELF ) { + d_currentTag = d_global_currentTag_self; + ++( this->d_currentTag[1] ); + } else if ( communicator == MPI_COMM_NULL ) { + d_currentTag = nullptr; + } else { + d_currentTag = new int[2]; + d_currentTag[0] = ( d_maxTag <= 0x10000 ) ? 
1 : 0x1FFF; + d_currentTag[1] = 1; + } + d_call_abort = true; +} + + +/************************************************************************ + * Return the ranks of the communicator in the global comm * + ************************************************************************/ +std::vector MPI_CLASS::globalRanks() const +{ + // Get my global rank if it has not been set + static int myGlobalRank = -1; + if ( myGlobalRank == -1 ) { +#ifdef USE_MPI + if ( MPI_active() ) + MPI_Comm_rank( MPI_CLASS_COMM_WORLD, &myGlobalRank ); +#else + myGlobalRank = 0; +#endif + } + // Check if we are dealing with a serial or null communicator + if ( comm_size == 1 ) + return std::vector( 1, myGlobalRank ); + if ( d_ranks == nullptr || communicator == MPI_COMM_NULL ) + return std::vector(); + // Fill d_ranks if necessary + if ( d_ranks[0] == -1 ) { + if ( communicator == MPI_CLASS_COMM_WORLD ) { + for ( int i = 0; i < comm_size; i++ ) + d_ranks[i] = i; + } else { + + MPI_ASSERT( myGlobalRank != -1 ); + this->allGather( myGlobalRank, d_ranks ); + } + } + // Return d_ranks + return std::vector( d_ranks, d_ranks + comm_size ); +} + + +/************************************************************************ + * Generate a random number * + ************************************************************************/ +size_t MPI_CLASS::rand() const +{ + size_t val = 0; + if ( getRank() == 0 ) { + static std::random_device rd; + static std::mt19937 gen( rd() ); + static std::uniform_int_distribution dist; + val = dist( gen ); + } + val = bcast( val, 0 ); + return val; +} + + +/************************************************************************ + * Intersect two communicators * + ************************************************************************/ +#ifdef USE_MPI +static inline void MPI_Group_free2( MPI_Group *group ) +{ + if ( *group != MPI_GROUP_EMPTY ) { + // MPICH is fine with free'ing an empty group, OpenMPI crashes + MPI_Group_free( group ); + } +} +MPI_CLASS MPI_CLASS::intersect( const MPI_CLASS &comm1, const MPI_CLASS &comm2 ) +{ + MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY; + if ( !comm1.isNull() ) { + MPI_Group_free2( &group1 ); + MPI_Comm_group( comm1.communicator, &group1 ); + } + if ( !comm2.isNull() ) { + MPI_Group_free2( &group2 ); + MPI_Comm_group( comm2.communicator, &group2 ); + } + MPI_Group group12; + MPI_Group_intersection( group1, group2, &group12 ); + int compare1, compare2; + MPI_Group_compare( group1, group12, &compare1 ); + MPI_Group_compare( group2, group12, &compare2 ); + MPI_CLASS new_comm( MPI_CLASS_COMM_NULL ); + int size; + MPI_Group_size( group12, &size ); + if ( compare1 != MPI_UNEQUAL && size != 0 ) { + // The intersection matches comm1 + new_comm = comm1; + } else if ( compare2 != MPI_UNEQUAL && size != 0 ) { + // The intersection matches comm2 + new_comm = comm2; + } else if ( comm1.isNull() ) { + // comm1 is null, we can return safely (comm1 is needed for communication) + } else { + // The intersection is smaller than comm1 or comm2 + // Check if the new comm is nullptr for all processors + int max_size = 0; + MPI_Allreduce( &size, &max_size, 1, MPI_INT, MPI_MAX, comm1.communicator ); + if ( max_size == 0 ) { + // We are dealing with completely disjoint sets + new_comm = MPI_CLASS( MPI_CLASS_COMM_NULL, false ); + } else { + // Create the new comm + // Note: OpenMPI crashes if the intersection group is EMPTY for any processors + // We will set it to SELF for the EMPTY processors, then create a nullptr comm later + if ( group12 == MPI_GROUP_EMPTY ) { + 
MPI_Group_free2( &group12 ); + MPI_Comm_group( MPI_COMM_SELF, &group12 ); + } + MPI_Comm new_MPI_comm; + MPI_Comm_create( comm1.communicator, group12, &new_MPI_comm ); + if ( size > 0 ) { + // This is the valid case where we create a new intersection comm + new_comm = MPI_CLASS( new_MPI_comm, true ); + } else { + // We actually want a null comm for this communicator + new_comm = MPI_CLASS( MPI_CLASS_COMM_NULL, false ); + MPI_Comm_free( &new_MPI_comm ); + } + } + } + MPI_Group_free2( &group1 ); + MPI_Group_free2( &group2 ); + MPI_Group_free2( &group12 ); + return new_comm; +} +#else +MPI_CLASS MPI_CLASS::intersect( const MPI_CLASS &comm1, const MPI_CLASS &comm2 ) +{ + if ( comm1.isNull() || comm2.isNull() ) + return MPI_CLASS( MPI_CLASS_COMM_NULL, false ); + MPI_ASSERT( comm1.comm_size == 1 && comm2.comm_size == 1 ); + return comm1; +} +#endif + + +/************************************************************************ + * Split a comm * + ************************************************************************/ +MPI_CLASS MPI_CLASS::split( int color, int key ) const +{ + if ( d_isNull ) { + return MPI_CLASS( MPI_CLASS_COMM_NULL ); + } else if ( comm_size == 1 ) { + if ( color == -1 ) + return MPI_CLASS( MPI_CLASS_COMM_NULL ); + return dup(); + } + MPI_Comm new_MPI_comm = MPI_CLASS_COMM_NULL; +#ifdef USE_MPI + // USE MPI to split the communicator + if ( color == -1 ) { + check_MPI( MPI_Comm_split( communicator, MPI_UNDEFINED, key, &new_MPI_comm ) ); + } else { + check_MPI( MPI_Comm_split( communicator, color, key, &new_MPI_comm ) ); + } +#endif + // Create the new object + NULL_USE( key ); + MPI_CLASS new_comm( new_MPI_comm, true ); + new_comm.d_call_abort = d_call_abort; + return new_comm; +} +MPI_CLASS MPI_CLASS::splitByNode( int key ) const +{ + // Check if we are dealing with a single processor (trivial case) + if ( comm_size == 1 ) + return this->split( 0, 0 ); + // Get the node name + std::string name = MPI_CLASS::getNodeName(); + // Gather the names from all ranks + std::vector list( comm_size ); + allGather( name, &list[0] ); + // Create the colors + std::vector color( comm_size, -1 ); + color[0] = 0; + for ( int i = 1; i < comm_size; i++ ) { + const std::string tmp1 = list[i]; + for ( int j = 0; j < i; j++ ) { + const std::string tmp2 = list[j]; + if ( tmp1 == tmp2 ) { + color[i] = color[j]; + break; + } + color[i] = color[i - 1] + 1; + } + } + MPI_CLASS new_comm = this->split( color[comm_rank], key ); + return new_comm; +} + + +/************************************************************************ + * Duplicate an exisiting comm object * + ************************************************************************/ +MPI_CLASS MPI_CLASS::dup() const +{ + if ( d_isNull ) + return MPI_CLASS( MPI_CLASS_COMM_NULL ); + MPI_Comm new_MPI_comm = communicator; +#if defined( USE_MPI ) || defined( USE_PETSC ) + // USE MPI to duplicate the communicator + MPI_Comm_dup( communicator, &new_MPI_comm ); +#else + new_MPI_comm = uniqueGlobalComm; + uniqueGlobalComm++; +#endif + // Create the new comm object + MPI_CLASS new_comm( new_MPI_comm, true ); + new_comm.d_isNull = d_isNull; + new_comm.d_call_abort = d_call_abort; + return new_comm; +} + + +/************************************************************************ + * Get the node name * + ************************************************************************/ +std::string MPI_CLASS::getNodeName() +{ +#ifdef USE_MPI + int length; + char name[MPI_MAX_PROCESSOR_NAME + 1]; + memset( name, 0, MPI_MAX_PROCESSOR_NAME + 1 ); + 
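+    // Query the processor (node) name; the buffer was zeroed above, so the result
+    // is null-terminated even if the MPI library does not pad it.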
MPI_Get_processor_name( name, &length ); + return std::string( name ); +#else + return "Node0"; +#endif +} + + +/************************************************************************ + * Overload operator == * + ************************************************************************/ +bool MPI_CLASS::operator==( const MPI_CLASS &comm ) const +{ + return communicator == comm.communicator; +} + + +/************************************************************************ + * Overload operator != * + ************************************************************************/ +bool MPI_CLASS::operator!=( const MPI_CLASS &comm ) const +{ + return communicator != comm.communicator; +} + + +/************************************************************************ + * Overload operator < * + ************************************************************************/ +bool MPI_CLASS::operator<( const MPI_CLASS &comm ) const +{ + MPI_ASSERT( !this->d_isNull && !comm.d_isNull ); + bool flag = true; + // First check if either communicator is NULL + if ( this->d_isNull ) + return false; + if ( comm.d_isNull ) + flag = false; + // Use compare to check if the comms are equal + if ( compare( comm ) != 0 ) + return false; + // Check that the size of the other communicator is > the current communicator size + if ( comm_size >= comm.comm_size ) + flag = false; +// Check the union of the communicator groups +// this is < comm iff this group is a subgroup of comm's group +#ifdef USE_MPI + MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY, group12 = MPI_GROUP_EMPTY; + if ( !d_isNull ) + MPI_Comm_group( communicator, &group1 ); + if ( !comm.d_isNull ) + MPI_Comm_group( comm.communicator, &group2 ); + MPI_Group_union( group1, group2, &group12 ); + int compare; + MPI_Group_compare( group2, group12, &compare ); + if ( compare == MPI_UNEQUAL ) + flag = false; + MPI_Group_free( &group1 ); + MPI_Group_free( &group2 ); + MPI_Group_free( &group12 ); +#endif + // Perform a global reduce of the flag (equivalent to all operation) + return allReduce( flag ); +} + + +/************************************************************************ + * Overload operator <= * + ************************************************************************/ +bool MPI_CLASS::operator<=( const MPI_CLASS &comm ) const +{ + MPI_ASSERT( !this->d_isNull && !comm.d_isNull ); + bool flag = true; + // First check if either communicator is NULL + if ( this->d_isNull ) + return false; + if ( comm.d_isNull ) + flag = false; +#ifdef USE_MPI + int world_size = 0; + MPI_Comm_size( MPI_COMM_WORLD, &world_size ); + if ( comm.getSize() == world_size ) + return true; + if ( getSize() == 1 && !comm.d_isNull ) + return true; +#endif + // Use compare to check if the comms are equal + if ( compare( comm ) != 0 ) + return true; + // Check that the size of the other communicator is > the current communicator size + // this is <= comm iff this group is a subgroup of comm's group + if ( comm_size > comm.comm_size ) + flag = false; +// Check the unnion of the communicator groups +#ifdef USE_MPI + MPI_Group group1, group2, group12; + MPI_Comm_group( communicator, &group1 ); + MPI_Comm_group( comm.communicator, &group2 ); + MPI_Group_union( group1, group2, &group12 ); + int compare; + MPI_Group_compare( group2, group12, &compare ); + if ( compare == MPI_UNEQUAL ) + flag = false; + MPI_Group_free( &group1 ); + MPI_Group_free( &group2 ); + MPI_Group_free( &group12 ); +#endif + // Perform a global reduce of the flag (equivalent to all operation) + return 
allReduce( flag ); +} + + +/************************************************************************ + * Overload operator > * + ************************************************************************/ +bool MPI_CLASS::operator>( const MPI_CLASS &comm ) const +{ + bool flag = true; + // First check if either communicator is NULL + if ( this->d_isNull ) + return false; + if ( comm.d_isNull ) + flag = false; + // Use compare to check if the comms are equal + if ( compare( comm ) != 0 ) + return false; + // Check that the size of the other communicator is > the current communicator size + if ( comm_size <= comm.comm_size ) + flag = false; +// Check the unnion of the communicator groups +// this is > comm iff comm's group is a subgroup of this group +#ifdef USE_MPI + MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY, group12 = MPI_GROUP_EMPTY; + if ( !d_isNull ) + MPI_Comm_group( communicator, &group1 ); + if ( !comm.d_isNull ) + MPI_Comm_group( comm.communicator, &group2 ); + MPI_Group_union( group1, group2, &group12 ); + int compare; + MPI_Group_compare( group1, group12, &compare ); + if ( compare == MPI_UNEQUAL ) + flag = false; + MPI_Group_free( &group1 ); + MPI_Group_free( &group2 ); + MPI_Group_free( &group12 ); +#endif + // Perform a global reduce of the flag (equivalent to all operation) + return allReduce( flag ); +} + + +/************************************************************************ + * Overload operator >= * + ************************************************************************/ +bool MPI_CLASS::operator>=( const MPI_CLASS &comm ) const +{ + bool flag = true; + // First check if either communicator is NULL + if ( this->d_isNull ) + return false; + if ( comm.d_isNull ) + flag = false; +#ifdef USE_MPI + int world_size = 0; + MPI_Comm_size( MPI_COMM_WORLD, &world_size ); + if ( getSize() == world_size ) + return true; + if ( comm.getSize() == 1 && !comm.d_isNull ) + return true; +#endif + // Use compare to check if the comms are equal + if ( compare( comm ) != 0 ) + return true; + // Check that the size of the other communicator is > the current communicator size + if ( comm_size < comm.comm_size ) + flag = false; +// Check the unnion of the communicator groups +// this is >= comm iff comm's group is a subgroup of this group +#ifdef USE_MPI + MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY, group12 = MPI_GROUP_EMPTY; + if ( !d_isNull ) + MPI_Comm_group( communicator, &group1 ); + if ( !comm.d_isNull ) + MPI_Comm_group( comm.communicator, &group2 ); + MPI_Group_union( group1, group2, &group12 ); + int compare; + MPI_Group_compare( group1, group12, &compare ); + if ( compare == MPI_UNEQUAL ) + flag = false; + MPI_Group_free( &group1 ); + MPI_Group_free( &group2 ); + MPI_Group_free( &group12 ); +#endif + // Perform a global reduce of the flag (equivalent to all operation) + return allReduce( flag ); +} + + +/************************************************************************ + * Compare two comm objects * + ************************************************************************/ +int MPI_CLASS::compare( const MPI_CLASS &comm ) const +{ + if ( communicator == comm.communicator ) + return 1; +#ifdef USE_MPI + if ( d_isNull || comm.d_isNull ) + return 0; + int result; + check_MPI( MPI_Comm_compare( communicator, comm.communicator, &result ) ); + if ( result == MPI_IDENT ) + return 2; + else if ( result == MPI_CONGRUENT ) + return 3; + else if ( result == MPI_SIMILAR ) + return 4; + else if ( result == MPI_UNEQUAL ) + return 0; + MPI_ERROR( 
"Unknown results from comm compare" ); +#else + if ( comm.communicator == MPI_COMM_NULL || communicator == MPI_COMM_NULL ) + return 0; + else + return 3; +#endif + return 0; +} + + +/************************************************************************ + * Abort the program. * + ************************************************************************/ +void MPI_CLASS::setCallAbortInSerialInsteadOfExit( bool flag ) { d_call_abort = flag; } +void MPI_CLASS::abort() const +{ +#ifdef USE_MPI + MPI_Comm comm = communicator; + if ( comm == MPI_COMM_NULL ) + comm = MPI_COMM_WORLD; + if ( !MPI_active() ) { + // MPI is not availible + exit( -1 ); + } else if ( comm_size > 1 ) { + MPI_Abort( comm, -1 ); + } else if ( d_call_abort ) { + MPI_Abort( comm, -1 ); + } else { + exit( -1 ); + } +#else + exit( -1 ); +#endif +} + + +/************************************************************************ + * newTag * + ************************************************************************/ +int MPI_CLASS::newTag() +{ +#ifdef USE_MPI + // Syncronize the processes to ensure all ranks enter this call + // Needed so the count will match + barrier(); + // Return and increment the tag + int tag = ( *d_currentTag )++; + MPI_INSIST( tag <= d_maxTag, "Maximum number of tags exceeded\n" ); + return tag; +#else + static int globalCurrentTag = 1; + return globalCurrentTag++; +#endif +} + + +/************************************************************************ + * allReduce * + ************************************************************************/ +bool MPI_CLASS::allReduce( const bool value ) const +{ + bool ret = value; + if ( comm_size > 1 ) { +#ifdef USE_MPI + MPI_Allreduce( + (void *) &value, (void *) &ret, 1, MPI_UNSIGNED_CHAR, MPI_MIN, communicator ); +#else + MPI_ERROR( "This shouldn't be possible" ); +#endif + } + return ret; +} + + +/************************************************************************ + * anyReduce * + ************************************************************************/ +bool MPI_CLASS::anyReduce( const bool value ) const +{ + bool ret = value; + if ( comm_size > 1 ) { +#ifdef USE_MPI + MPI_Allreduce( + (void *) &value, (void *) &ret, 1, MPI_UNSIGNED_CHAR, MPI_MAX, communicator ); +#else + MPI_ERROR( "This shouldn't be possible" ); +#endif + } + return ret; +} + + +/************************************************************************ + * call_sumReduce * + * Note: these specializations are only called when using MPI. 
* + ************************************************************************/ +#ifdef USE_MPI +// unsigned char +template<> +void MPI_CLASS::call_sumReduce( + const unsigned char *send, unsigned char *recv, const int n ) const +{ + PROFILE_START( "sumReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_SUM, communicator ); + PROFILE_STOP( "sumReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_sumReduce( unsigned char *x, const int n ) const +{ + PROFILE_START( "sumReduce2", profile_level ); + auto send = x; + auto recv = new unsigned char[n]; + MPI_Allreduce( send, recv, n, MPI_UNSIGNED_CHAR, MPI_SUM, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "sumReduce2", profile_level ); +} +// char +template<> +void MPI_CLASS::call_sumReduce( const char *send, char *recv, const int n ) const +{ + PROFILE_START( "sumReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_SUM, communicator ); + PROFILE_STOP( "sumReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_sumReduce( char *x, const int n ) const +{ + PROFILE_START( "sumReduce2", profile_level ); + auto send = x; + auto recv = new char[n]; + MPI_Allreduce( send, recv, n, MPI_SIGNED_CHAR, MPI_SUM, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "sumReduce2", profile_level ); +} +// unsigned int +template<> +void MPI_CLASS::call_sumReduce( + const unsigned int *send, unsigned int *recv, const int n ) const +{ + PROFILE_START( "sumReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_SUM, communicator ); + PROFILE_STOP( "sumReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_sumReduce( unsigned int *x, const int n ) const +{ + PROFILE_START( "sumReduce2", profile_level ); + auto send = x; + auto recv = new unsigned int[n]; + MPI_Allreduce( send, recv, n, MPI_UNSIGNED, MPI_SUM, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "sumReduce2", profile_level ); +} +// int +template<> +void MPI_CLASS::call_sumReduce( const int *send, int *recv, const int n ) const +{ + PROFILE_START( "sumReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_INT, MPI_SUM, communicator ); + PROFILE_STOP( "sumReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_sumReduce( int *x, const int n ) const +{ + PROFILE_START( "sumReduce2", profile_level ); + auto send = x; + auto recv = new int[n]; + MPI_Allreduce( send, recv, n, MPI_INT, MPI_SUM, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "sumReduce2", profile_level ); +} +// long int +template<> +void MPI_CLASS::call_sumReduce( const long int *send, long int *recv, const int n ) const +{ + PROFILE_START( "sumReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_LONG, MPI_SUM, communicator ); + PROFILE_STOP( "sumReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_sumReduce( long int *x, const int n ) const +{ + PROFILE_START( "sumReduce2", profile_level ); + auto send = x; + auto recv = new long int[n]; + MPI_Allreduce( send, recv, n, MPI_LONG, MPI_SUM, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "sumReduce2", profile_level ); +} +// unsigned long int +template<> +void MPI_CLASS::call_sumReduce( + const unsigned long *send, 
unsigned long *recv, const int n ) const +{ + PROFILE_START( "sumReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_SUM, communicator ); + PROFILE_STOP( "sumReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_sumReduce( unsigned long *x, const int n ) const +{ + PROFILE_START( "sumReduce2", profile_level ); + auto send = x; + auto recv = new unsigned long int[n]; + MPI_Allreduce( send, recv, n, MPI_UNSIGNED_LONG, MPI_SUM, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "sumReduce2", profile_level ); +} +// size_t +#ifdef USE_WINDOWS +template<> +void MPI_CLASS::call_sumReduce( const size_t *send, size_t *recv, const int n ) const +{ + MPI_ASSERT( MPI_SIZE_T != 0 ); + PROFILE_START( "sumReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_SIZE_T, MPI_SUM, communicator ); + PROFILE_STOP( "sumReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_sumReduce( size_t *x, const int n ) const +{ + MPI_ASSERT( MPI_SIZE_T != 0 ); + PROFILE_START( "sumReduce2", profile_level ); + auto send = x; + auto recv = new size_t[n]; + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_SIZE_T, MPI_SUM, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "sumReduce2", profile_level ); +} +#endif +// float +template<> +void MPI_CLASS::call_sumReduce( const float *send, float *recv, const int n ) const +{ + PROFILE_START( "sumReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_FLOAT, MPI_SUM, communicator ); + PROFILE_STOP( "sumReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_sumReduce( float *x, const int n ) const +{ + PROFILE_START( "sumReduce2", profile_level ); + auto send = x; + auto recv = new float[n]; + MPI_Allreduce( send, recv, n, MPI_FLOAT, MPI_SUM, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "sumReduce2", profile_level ); +} +// double +template<> +void MPI_CLASS::call_sumReduce( const double *send, double *recv, const int n ) const +{ + PROFILE_START( "sumReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_DOUBLE, MPI_SUM, communicator ); + PROFILE_STOP( "sumReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_sumReduce( double *x, const int n ) const +{ + PROFILE_START( "sumReduce2", profile_level ); + auto send = x; + auto recv = new double[n]; + MPI_Allreduce( send, recv, n, MPI_DOUBLE, MPI_SUM, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "sumReduce2", profile_level ); +} +// std::complex +template<> +void MPI_CLASS::call_sumReduce>( + const std::complex *x, std::complex *y, const int n ) const +{ + PROFILE_START( "sumReduce1", profile_level ); + auto send = new double[2 * n]; + auto recv = new double[2 * n]; + for ( int i = 0; i < n; i++ ) { + send[2 * i + 0] = real( x[i] ); + send[2 * i + 1] = imag( x[i] ); + } + MPI_Allreduce( (void *) send, (void *) recv, 2 * n, MPI_DOUBLE, MPI_SUM, communicator ); + for ( int i = 0; i < n; i++ ) + y[i] = std::complex( recv[2 * i + 0], recv[2 * i + 1] ); + delete[] send; + delete[] recv; + PROFILE_STOP( "sumReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_sumReduce>( std::complex *x, const int n ) const +{ + PROFILE_START( "sumReduce2", profile_level ); + auto send = new double[2 * n]; + auto recv = new double[2 * n]; + for ( int i = 0; i < n; i++ ) { + send[2 * 
i + 0] = real( x[i] ); + send[2 * i + 1] = imag( x[i] ); + } + MPI_Allreduce( send, recv, 2 * n, MPI_DOUBLE, MPI_SUM, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = std::complex( recv[2 * i + 0], recv[2 * i + 1] ); + delete[] send; + delete[] recv; + PROFILE_STOP( "sumReduce2", profile_level ); +} +#endif + + +/************************************************************************ + * call_minReduce * + * Note: these specializations are only called when using MPI. * + ************************************************************************/ +#ifdef USE_MPI +// unsigned char +template<> +void MPI_CLASS::call_minReduce( + const unsigned char *send, unsigned char *recv, const int n, int *comm_rank_of_min ) const +{ + if ( comm_rank_of_min == nullptr ) { + PROFILE_START( "minReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_MIN, communicator ); + PROFILE_STOP( "minReduce1", profile_level ); + } else { + auto tmp = new int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = send[i]; + call_minReduce( tmp, n, comm_rank_of_min ); + for ( int i = 0; i < n; i++ ) + recv[i] = static_cast( tmp[i] ); + delete[] tmp; + } +} +template<> +void MPI_CLASS::call_minReduce( + unsigned char *x, const int n, int *comm_rank_of_min ) const +{ + if ( comm_rank_of_min == nullptr ) { + PROFILE_START( "minReduce2", profile_level ); + auto send = x; + auto recv = new unsigned char[n]; + MPI_Allreduce( send, recv, n, MPI_UNSIGNED_CHAR, MPI_MIN, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "minReduce2", profile_level ); + } else { + auto tmp = new int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = x[i]; + call_minReduce( tmp, n, comm_rank_of_min ); + for ( int i = 0; i < n; i++ ) + x[i] = static_cast( tmp[i] ); + delete[] tmp; + } +} +// char +template<> +void MPI_CLASS::call_minReduce( + const char *send, char *recv, const int n, int *comm_rank_of_min ) const +{ + if ( comm_rank_of_min == nullptr ) { + PROFILE_START( "minReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_MIN, communicator ); + PROFILE_STOP( "minReduce1", profile_level ); + } else { + auto tmp = new int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = send[i]; + call_minReduce( tmp, n, comm_rank_of_min ); + for ( int i = 0; i < n; i++ ) + recv[i] = static_cast( tmp[i] ); + delete[] tmp; + } +} +template<> +void MPI_CLASS::call_minReduce( char *x, const int n, int *comm_rank_of_min ) const +{ + if ( comm_rank_of_min == nullptr ) { + PROFILE_START( "minReduce2", profile_level ); + auto send = x; + auto recv = new char[n]; + MPI_Allreduce( send, recv, n, MPI_SIGNED_CHAR, MPI_MIN, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "minReduce2", profile_level ); + } else { + auto tmp = new int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = x[i]; + call_minReduce( tmp, n, comm_rank_of_min ); + for ( int i = 0; i < n; i++ ) + x[i] = static_cast( tmp[i] ); + delete[] tmp; + } +} +// unsigned int +template<> +void MPI_CLASS::call_minReduce( + const unsigned int *send, unsigned int *recv, const int n, int *comm_rank_of_min ) const +{ + if ( comm_rank_of_min == nullptr ) { + PROFILE_START( "minReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_MIN, communicator ); + PROFILE_STOP( "minReduce1", profile_level ); + } else { + auto tmp = new int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = unsigned_to_signed( send[i] ); + 
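+        // Rank tracking requested: reduce in signed space because MPI provides no
+        // unsigned MINLOC pair type; unsigned_to_signed/signed_to_unsigned are
+        // order-preserving shifts, so the location of the minimum is unchanged.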
call_minReduce( tmp, n, comm_rank_of_min ); + for ( int i = 0; i < n; i++ ) + recv[i] = signed_to_unsigned( tmp[i] ); + delete[] tmp; + } +} +template<> +void MPI_CLASS::call_minReduce( + unsigned int *x, const int n, int *comm_rank_of_min ) const +{ + if ( comm_rank_of_min == nullptr ) { + PROFILE_START( "minReduce2", profile_level ); + auto send = x; + auto recv = new unsigned int[n]; + MPI_Allreduce( send, recv, n, MPI_UNSIGNED, MPI_MIN, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "minReduce2", profile_level ); + } else { + auto tmp = new int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = unsigned_to_signed( x[i] ); + call_minReduce( tmp, n, comm_rank_of_min ); + for ( int i = 0; i < n; i++ ) + x[i] = signed_to_unsigned( tmp[i] ); + delete[] tmp; + } +} +// int +template<> +void MPI_CLASS::call_minReduce( + const int *x, int *y, const int n, int *comm_rank_of_min ) const +{ + PROFILE_START( "minReduce1", profile_level ); + if ( comm_rank_of_min == nullptr ) { + MPI_Allreduce( (void *) x, (void *) y, n, MPI_INT, MPI_MIN, communicator ); + } else { + auto recv = new IntIntStruct[n]; + auto send = new IntIntStruct[n]; + for ( int i = 0; i < n; ++i ) { + send[i].j = x[i]; + send[i].i = comm_rank; + } + MPI_Allreduce( send, recv, n, MPI_2INT, MPI_MINLOC, communicator ); + for ( int i = 0; i < n; ++i ) { + y[i] = recv[i].j; + comm_rank_of_min[i] = recv[i].i; + } + delete[] recv; + delete[] send; + } + PROFILE_STOP( "minReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_minReduce( int *x, const int n, int *comm_rank_of_min ) const +{ + PROFILE_START( "minReduce2", profile_level ); + if ( comm_rank_of_min == nullptr ) { + auto send = x; + auto recv = new int[n]; + MPI_Allreduce( send, recv, n, MPI_INT, MPI_MIN, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + } else { + auto recv = new IntIntStruct[n]; + auto send = new IntIntStruct[n]; + for ( int i = 0; i < n; ++i ) { + send[i].j = x[i]; + send[i].i = comm_rank; + } + MPI_Allreduce( send, recv, n, MPI_2INT, MPI_MINLOC, communicator ); + for ( int i = 0; i < n; ++i ) { + x[i] = recv[i].j; + comm_rank_of_min[i] = recv[i].i; + } + delete[] recv; + delete[] send; + } + PROFILE_STOP( "minReduce2", profile_level ); +} +// unsigned long int +template<> +void MPI_CLASS::call_minReduce( const unsigned long int *send, + unsigned long int *recv, const int n, int *comm_rank_of_min ) const +{ + if ( comm_rank_of_min == nullptr ) { + PROFILE_START( "minReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_MIN, communicator ); + PROFILE_STOP( "minReduce1", profile_level ); + } else { + auto tmp = new long int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = unsigned_to_signed( send[i] ); + call_minReduce( tmp, n, comm_rank_of_min ); + for ( int i = 0; i < n; i++ ) + recv[i] = signed_to_unsigned( tmp[i] ); + delete[] tmp; + } +} +template<> +void MPI_CLASS::call_minReduce( + unsigned long int *x, const int n, int *comm_rank_of_min ) const +{ + if ( comm_rank_of_min == nullptr ) { + PROFILE_START( "minReduce2", profile_level ); + auto send = x; + auto recv = new unsigned long int[n]; + MPI_Allreduce( send, recv, n, MPI_UNSIGNED_LONG, MPI_MIN, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "minReduce2", profile_level ); + } else { + auto tmp = new long int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = unsigned_to_signed( x[i] ); + call_minReduce( tmp, n, 
comm_rank_of_min ); + for ( int i = 0; i < n; i++ ) + x[i] = signed_to_unsigned( tmp[i] ); + delete[] tmp; + } +} +// long int +template<> +void MPI_CLASS::call_minReduce( + const long int *x, long int *y, const int n, int *comm_rank_of_min ) const +{ + PROFILE_START( "minReduce1", profile_level ); + if ( comm_rank_of_min == nullptr ) { + MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG, MPI_MIN, communicator ); + } else { + auto recv = new LongIntStruct[n]; + auto send = new LongIntStruct[n]; + for ( int i = 0; i < n; ++i ) { + send[i].j = x[i]; + send[i].i = comm_rank; + } + MPI_Allreduce( send, recv, n, MPI_LONG_INT, MPI_MINLOC, communicator ); + for ( int i = 0; i < n; ++i ) { + y[i] = recv[i].j; + comm_rank_of_min[i] = recv[i].i; + } + delete[] recv; + delete[] send; + } + PROFILE_STOP( "minReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_minReduce( long int *x, const int n, int *comm_rank_of_min ) const +{ + PROFILE_START( "minReduce2", profile_level ); + if ( comm_rank_of_min == nullptr ) { + auto send = x; + auto recv = new long int[n]; + MPI_Allreduce( send, recv, n, MPI_LONG, MPI_MIN, communicator ); + for ( long int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + } else { + auto recv = new LongIntStruct[n]; + auto send = new LongIntStruct[n]; + for ( int i = 0; i < n; ++i ) { + send[i].j = x[i]; + send[i].i = comm_rank; + } + MPI_Allreduce( send, recv, n, MPI_LONG_INT, MPI_MINLOC, communicator ); + for ( int i = 0; i < n; ++i ) { + x[i] = recv[i].j; + comm_rank_of_min[i] = recv[i].i; + } + delete[] recv; + delete[] send; + } + PROFILE_STOP( "minReduce2", profile_level ); +} +// unsigned long long int +template<> +void MPI_CLASS::call_minReduce( const unsigned long long int *send, + unsigned long long int *recv, const int n, int *comm_rank_of_min ) const +{ + PROFILE_START( "minReduce1", profile_level ); + if ( comm_rank_of_min == nullptr ) { + auto x = new long long int[n]; + auto y = new long long int[n]; + for ( int i = 0; i < n; i++ ) + x[i] = unsigned_to_signed( send[i] ); + MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG_LONG_INT, MPI_MIN, communicator ); + for ( int i = 0; i < n; i++ ) + recv[i] = signed_to_unsigned( y[i] ); + delete[] x; + delete[] y; + } else { + printf( "minReduce will use double\n" ); + auto tmp = new double[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = static_cast( send[i] ); + call_minReduce( tmp, n, comm_rank_of_min ); + for ( int i = 0; i < n; i++ ) + recv[i] = static_cast( tmp[i] ); + delete[] tmp; + } + PROFILE_STOP( "minReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_minReduce( + unsigned long long int *x, const int n, int *comm_rank_of_min ) const +{ + auto recv = new unsigned long long int[n]; + call_minReduce( x, recv, n, comm_rank_of_min ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; +} +// long long int +template<> +void MPI_CLASS::call_minReduce( + const long long int *x, long long int *y, const int n, int *comm_rank_of_min ) const +{ + PROFILE_START( "minReduce1", profile_level ); + if ( comm_rank_of_min == nullptr ) { + MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG_LONG_INT, MPI_MIN, communicator ); + } else { + printf( "minReduce will use double\n" ); + auto tmp = new double[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = static_cast( x[i] ); + call_minReduce( tmp, n, comm_rank_of_min ); + for ( int i = 0; i < n; i++ ) + y[i] = static_cast( tmp[i] ); + delete[] tmp; + } + PROFILE_STOP( "minReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_minReduce( + 
long long int *x, const int n, int *comm_rank_of_min ) const
+{
+    auto recv = new long long int[n];
+    call_minReduce( x, recv, n, comm_rank_of_min );
+    for ( int i = 0; i < n; i++ )
+        x[i] = recv[i];
+    delete[] recv;
+}
+// float
+template<>
+void MPI_CLASS::call_minReduce(
+    const float *x, float *y, const int n, int *comm_rank_of_min ) const
+{
+    PROFILE_START( "minReduce1", profile_level );
+    if ( comm_rank_of_min == nullptr ) {
+        MPI_Allreduce( (void *) x, (void *) y, n, MPI_FLOAT, MPI_MIN, communicator );
+    } else {
+        auto recv = new FloatIntStruct[n];
+        auto send = new FloatIntStruct[n];
+        for ( int i = 0; i < n; ++i ) {
+            send[i].f = x[i];
+            send[i].i = comm_rank;
+        }
+        MPI_Allreduce( send, recv, n, MPI_FLOAT_INT, MPI_MINLOC, communicator );
+        for ( int i = 0; i < n; ++i ) {
+            y[i] = recv[i].f;
+            comm_rank_of_min[i] = recv[i].i;
+        }
+        delete[] recv;
+        delete[] send;
+    }
+    PROFILE_STOP( "minReduce1", profile_level );
+}
+template<>
+void MPI_CLASS::call_minReduce( float *x, const int n, int *comm_rank_of_min ) const
+{
+    PROFILE_START( "minReduce2", profile_level );
+    if ( comm_rank_of_min == nullptr ) {
+        auto send = x;
+        auto recv = new float[n];
+        MPI_Allreduce( send, recv, n, MPI_FLOAT, MPI_MIN, communicator );
+        for ( int i = 0; i < n; i++ )
+            x[i] = recv[i];
+        delete[] recv;
+    } else {
+        auto recv = new FloatIntStruct[n];
+        auto send = new FloatIntStruct[n];
+        for ( int i = 0; i < n; ++i ) {
+            send[i].f = x[i];
+            send[i].i = comm_rank;
+        }
+        MPI_Allreduce( send, recv, n, MPI_FLOAT_INT, MPI_MINLOC, communicator );
+        for ( int i = 0; i < n; ++i ) {
+            x[i] = recv[i].f;
+            comm_rank_of_min[i] = recv[i].i;
+        }
+        delete[] recv;
+        delete[] send;
+    }
+    PROFILE_STOP( "minReduce2", profile_level );
+}
+// double
+template<>
+void MPI_CLASS::call_minReduce(
+    const double *x, double *y, const int n, int *comm_rank_of_min ) const
+{
+    PROFILE_START( "minReduce1", profile_level );
+    if ( comm_rank_of_min == nullptr ) {
+        MPI_Allreduce( (void *) x, (void *) y, n, MPI_DOUBLE, MPI_MIN, communicator );
+    } else {
+        auto recv = new DoubleIntStruct[n];
+        auto send = new DoubleIntStruct[n];
+        for ( int i = 0; i < n; ++i ) {
+            send[i].d = x[i];
+            send[i].i = comm_rank;
+        }
+        MPI_Allreduce( send, recv, n, MPI_DOUBLE_INT, MPI_MINLOC, communicator );
+        for ( int i = 0; i < n; ++i ) {
+            y[i] = recv[i].d;
+            comm_rank_of_min[i] = recv[i].i;
+        }
+        delete[] recv;
+        delete[] send;
+    }
+    PROFILE_STOP( "minReduce1", profile_level );
+}
+template<>
+void MPI_CLASS::call_minReduce( double *x, const int n, int *comm_rank_of_min ) const
+{
+    PROFILE_START( "minReduce2", profile_level );
+    if ( comm_rank_of_min == nullptr ) {
+        auto send = x;
+        auto recv = new double[n];
+        MPI_Allreduce( send, recv, n, MPI_DOUBLE, MPI_MIN, communicator );
+        for ( int i = 0; i < n; i++ )
+            x[i] = recv[i];
+        delete[] recv;
+    } else {
+        auto recv = new DoubleIntStruct[n];
+        auto send = new DoubleIntStruct[n];
+        for ( int i = 0; i < n; ++i ) {
+            send[i].d = x[i];
+            send[i].i = comm_rank;
+        }
+        MPI_Allreduce( send, recv, n, MPI_DOUBLE_INT, MPI_MINLOC, communicator );
+        for ( int i = 0; i < n; ++i ) {
+            x[i] = recv[i].d;
+            comm_rank_of_min[i] = recv[i].i;
+        }
+        delete[] recv;
+        delete[] send;
+    }
+    PROFILE_STOP( "minReduce2", profile_level );
+}
+#endif
+
+
+/************************************************************************
+ * call_maxReduce                                                       *
+ * Note: these specializations are only called when using MPI.
* + ************************************************************************/ +#ifdef USE_MPI +// unsigned char +template<> +void MPI_CLASS::call_maxReduce( + const unsigned char *send, unsigned char *recv, const int n, int *comm_rank_of_max ) const +{ + if ( comm_rank_of_max == nullptr ) { + PROFILE_START( "maxReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_MAX, communicator ); + PROFILE_STOP( "maxReduce1", profile_level ); + } else { + auto tmp = new int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = send[i]; + call_maxReduce( tmp, n, comm_rank_of_max ); + for ( int i = 0; i < n; i++ ) + recv[i] = static_cast( tmp[i] ); + delete[] tmp; + } +} +template<> +void MPI_CLASS::call_maxReduce( + unsigned char *x, const int n, int *comm_rank_of_max ) const +{ + if ( comm_rank_of_max == nullptr ) { + PROFILE_START( "maxReduce2", profile_level ); + auto send = x; + auto recv = new unsigned char[n]; + MPI_Allreduce( send, recv, n, MPI_UNSIGNED_CHAR, MPI_MAX, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "maxReduce2", profile_level ); + } else { + auto tmp = new int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = x[i]; + call_maxReduce( tmp, n, comm_rank_of_max ); + for ( int i = 0; i < n; i++ ) + x[i] = static_cast( tmp[i] ); + delete[] tmp; + } +} +// char +template<> +void MPI_CLASS::call_maxReduce( + const char *send, char *recv, const int n, int *comm_rank_of_max ) const +{ + if ( comm_rank_of_max == nullptr ) { + PROFILE_START( "maxReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_MAX, communicator ); + PROFILE_STOP( "maxReduce1", profile_level ); + } else { + auto tmp = new int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = send[i]; + call_maxReduce( tmp, n, comm_rank_of_max ); + for ( int i = 0; i < n; i++ ) + recv[i] = static_cast( tmp[i] ); + delete[] tmp; + } +} +template<> +void MPI_CLASS::call_maxReduce( char *x, const int n, int *comm_rank_of_max ) const +{ + if ( comm_rank_of_max == nullptr ) { + PROFILE_START( "maxReduce2", profile_level ); + auto send = x; + auto recv = new char[n]; + MPI_Allreduce( send, recv, n, MPI_SIGNED_CHAR, MPI_MAX, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "maxReduce2", profile_level ); + } else { + auto tmp = new int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = x[i]; + call_maxReduce( tmp, n, comm_rank_of_max ); + for ( int i = 0; i < n; i++ ) + x[i] = static_cast( tmp[i] ); + delete[] tmp; + } +} +// unsigned int +template<> +void MPI_CLASS::call_maxReduce( + const unsigned int *send, unsigned int *recv, const int n, int *comm_rank_of_max ) const +{ + if ( comm_rank_of_max == nullptr ) { + PROFILE_START( "maxReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_MAX, communicator ); + PROFILE_STOP( "maxReduce1", profile_level ); + } else { + auto tmp = new int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = unsigned_to_signed( send[i] ); + call_maxReduce( tmp, n, comm_rank_of_max ); + for ( int i = 0; i < n; i++ ) + recv[i] = signed_to_unsigned( tmp[i] ); + delete[] tmp; + } +} +template<> +void MPI_CLASS::call_maxReduce( + unsigned int *x, const int n, int *comm_rank_of_max ) const +{ + if ( comm_rank_of_max == nullptr ) { + PROFILE_START( "maxReduce2", profile_level ); + auto send = x; + auto recv = new unsigned int[n]; + MPI_Allreduce( send, recv, n, MPI_UNSIGNED, MPI_MAX, communicator ); + for ( int i = 0; 
i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "maxReduce2", profile_level ); + } else { + auto tmp = new int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = unsigned_to_signed( x[i] ); + call_maxReduce( tmp, n, comm_rank_of_max ); + for ( int i = 0; i < n; i++ ) + x[i] = signed_to_unsigned( tmp[i] ); + delete[] tmp; + } +} +// int +template<> +void MPI_CLASS::call_maxReduce( + const int *x, int *y, const int n, int *comm_rank_of_max ) const +{ + PROFILE_START( "maxReduce1", profile_level ); + if ( comm_rank_of_max == nullptr ) { + MPI_Allreduce( (void *) x, (void *) y, n, MPI_INT, MPI_MAX, communicator ); + } else { + auto recv = new IntIntStruct[n]; + auto send = new IntIntStruct[n]; + for ( int i = 0; i < n; ++i ) { + send[i].j = x[i]; + send[i].i = comm_rank; + } + MPI_Allreduce( send, recv, n, MPI_2INT, MPI_MAXLOC, communicator ); + for ( int i = 0; i < n; ++i ) { + y[i] = recv[i].j; + comm_rank_of_max[i] = recv[i].i; + } + delete[] recv; + delete[] send; + } + PROFILE_STOP( "maxReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_maxReduce( int *x, const int n, int *comm_rank_of_max ) const +{ + PROFILE_START( "maxReduce2", profile_level ); + if ( comm_rank_of_max == nullptr ) { + int *send = x; + auto recv = new int[n]; + MPI_Allreduce( send, recv, n, MPI_INT, MPI_MAX, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + } else { + auto recv = new IntIntStruct[n]; + auto send = new IntIntStruct[n]; + for ( int i = 0; i < n; ++i ) { + send[i].j = x[i]; + send[i].i = comm_rank; + } + MPI_Allreduce( send, recv, n, MPI_2INT, MPI_MAXLOC, communicator ); + for ( int i = 0; i < n; ++i ) { + x[i] = recv[i].j; + comm_rank_of_max[i] = recv[i].i; + } + delete[] recv; + delete[] send; + } + PROFILE_STOP( "maxReduce2", profile_level ); +} +// long int +template<> +void MPI_CLASS::call_maxReduce( + const long int *x, long int *y, const int n, int *comm_rank_of_max ) const +{ + PROFILE_START( "maxReduce1", profile_level ); + if ( comm_rank_of_max == nullptr ) { + MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG, MPI_MAX, communicator ); + } else { + auto recv = new LongIntStruct[n]; + auto send = new LongIntStruct[n]; + for ( int i = 0; i < n; ++i ) { + send[i].j = x[i]; + send[i].i = comm_rank; + } + MPI_Allreduce( send, recv, n, MPI_LONG_INT, MPI_MAXLOC, communicator ); + for ( int i = 0; i < n; ++i ) { + y[i] = recv[i].j; + comm_rank_of_max[i] = recv[i].i; + } + delete[] recv; + delete[] send; + } + PROFILE_STOP( "maxReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_maxReduce( long int *x, const int n, int *comm_rank_of_max ) const +{ + PROFILE_START( "maxReduce2", profile_level ); + if ( comm_rank_of_max == nullptr ) { + auto send = x; + auto recv = new long int[n]; + MPI_Allreduce( send, recv, n, MPI_LONG, MPI_MAX, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + } else { + auto recv = new LongIntStruct[n]; + auto send = new LongIntStruct[n]; + for ( int i = 0; i < n; ++i ) { + send[i].j = x[i]; + send[i].i = comm_rank; + } + MPI_Allreduce( send, recv, n, MPI_LONG_INT, MPI_MAXLOC, communicator ); + for ( int i = 0; i < n; ++i ) { + x[i] = recv[i].j; + comm_rank_of_max[i] = recv[i].i; + } + delete[] recv; + delete[] send; + } + PROFILE_STOP( "maxReduce2", profile_level ); +} +// unsigned long int +template<> +void MPI_CLASS::call_maxReduce( const unsigned long int *send, + unsigned long int *recv, const int n, int *comm_rank_of_max ) const +{ + if ( comm_rank_of_max == nullptr ) { + 
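+        // No rank tracking requested: reduce directly using the native unsigned type.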
PROFILE_START( "maxReduce1", profile_level ); + MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_MAX, communicator ); + PROFILE_STOP( "maxReduce1", profile_level ); + } else { + auto tmp = new long int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = unsigned_to_signed( send[i] ); + call_maxReduce( tmp, n, comm_rank_of_max ); + for ( int i = 0; i < n; i++ ) + recv[i] = signed_to_unsigned( tmp[i] ); + delete[] tmp; + } +} +template<> +void MPI_CLASS::call_maxReduce( + unsigned long int *x, const int n, int *comm_rank_of_max ) const +{ + if ( comm_rank_of_max == nullptr ) { + PROFILE_START( "maxReduce2", profile_level ); + auto send = x; + auto recv = new unsigned long int[n]; + MPI_Allreduce( send, recv, n, MPI_UNSIGNED_LONG, MPI_MAX, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + PROFILE_STOP( "maxReduce2", profile_level ); + } else { + auto tmp = new long int[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = unsigned_to_signed( x[i] ); + call_maxReduce( tmp, n, comm_rank_of_max ); + for ( int i = 0; i < n; i++ ) + x[i] = signed_to_unsigned( tmp[i] ); + delete[] tmp; + } +} +// unsigned long long int +template<> +void MPI_CLASS::call_maxReduce( const unsigned long long int *send, + unsigned long long int *recv, const int n, int *comm_rank_of_max ) const +{ + PROFILE_START( "maxReduce1", profile_level ); + if ( comm_rank_of_max == nullptr ) { + auto x = new long long int[n]; + auto y = new long long int[n]; + for ( int i = 0; i < n; i++ ) + x[i] = unsigned_to_signed( send[i] ); + MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG_LONG_INT, MPI_MAX, communicator ); + for ( int i = 0; i < n; i++ ) + recv[i] = signed_to_unsigned( y[i] ); + delete[] x; + delete[] y; + } else { + printf( "maxReduce will use double\n" ); + auto tmp = new double[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = static_cast( send[i] ); + call_maxReduce( tmp, n, comm_rank_of_max ); + for ( int i = 0; i < n; i++ ) + recv[i] = static_cast( tmp[i] ); + delete[] tmp; + } + PROFILE_STOP( "maxReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_maxReduce( + unsigned long long int *x, const int n, int *comm_rank_of_max ) const +{ + auto recv = new unsigned long long int[n]; + call_maxReduce( x, recv, n, comm_rank_of_max ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; +} +// long long int +template<> +void MPI_CLASS::call_maxReduce( + const long long int *x, long long int *y, const int n, int *comm_rank_of_max ) const +{ + PROFILE_START( "maxReduce1", profile_level ); + if ( comm_rank_of_max == nullptr ) { + MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG_LONG_INT, MPI_MAX, communicator ); + } else { + printf( "maxReduce will use double\n" ); + auto tmp = new double[n]; + for ( int i = 0; i < n; i++ ) + tmp[i] = static_cast( x[i] ); + call_maxReduce( tmp, n, comm_rank_of_max ); + for ( int i = 0; i < n; i++ ) + y[i] = static_cast( tmp[i] ); + delete[] tmp; + } + PROFILE_STOP( "maxReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_maxReduce( + long long int *x, const int n, int *comm_rank_of_max ) const +{ + auto recv = new long long int[n]; + call_maxReduce( x, recv, n, comm_rank_of_max ); + for ( int i = 0; i < n; i++ ) + x[i] = signed_to_unsigned( recv[i] ); + delete[] recv; +} +// float +template<> +void MPI_CLASS::call_maxReduce( + const float *x, float *y, const int n, int *comm_rank_of_max ) const +{ + PROFILE_START( "maxReduce1", profile_level ); + if ( comm_rank_of_max == nullptr ) { + MPI_Allreduce( (void *) x, 
(void *) y, n, MPI_FLOAT, MPI_MAX, communicator ); + } else { + auto recv = new FloatIntStruct[n]; + auto send = new FloatIntStruct[n]; + for ( int i = 0; i < n; ++i ) { + send[i].f = x[i]; + send[i].i = comm_rank; + } + MPI_Allreduce( send, recv, n, MPI_FLOAT_INT, MPI_MAXLOC, communicator ); + for ( int i = 0; i < n; ++i ) { + y[i] = recv[i].f; + comm_rank_of_max[i] = recv[i].i; + } + delete[] recv; + delete[] send; + } + PROFILE_STOP( "maxReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_maxReduce( float *x, const int n, int *comm_rank_of_max ) const +{ + PROFILE_START( "maxReduce2", profile_level ); + if ( comm_rank_of_max == nullptr ) { + auto send = x; + auto recv = new float[n]; + MPI_Allreduce( send, recv, n, MPI_FLOAT, MPI_MAX, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + } else { + auto recv = new FloatIntStruct[n]; + auto send = new FloatIntStruct[n]; + for ( int i = 0; i < n; ++i ) { + send[i].f = x[i]; + send[i].i = comm_rank; + } + MPI_Allreduce( send, recv, n, MPI_FLOAT_INT, MPI_MAXLOC, communicator ); + for ( int i = 0; i < n; ++i ) { + x[i] = recv[i].f; + comm_rank_of_max[i] = recv[i].i; + } + delete[] recv; + delete[] send; + } + PROFILE_STOP( "maxReduce2", profile_level ); +} +// double +template<> +void MPI_CLASS::call_maxReduce( + const double *x, double *y, const int n, int *comm_rank_of_max ) const +{ + PROFILE_START( "maxReduce1", profile_level ); + if ( comm_rank_of_max == nullptr ) { + MPI_Allreduce( (void *) x, (void *) y, n, MPI_DOUBLE, MPI_MAX, communicator ); + } else { + auto recv = new DoubleIntStruct[n]; + auto send = new DoubleIntStruct[n]; + for ( int i = 0; i < n; ++i ) { + send[i].d = x[i]; + send[i].i = comm_rank; + } + MPI_Allreduce( send, recv, n, MPI_DOUBLE_INT, MPI_MAXLOC, communicator ); + for ( int i = 0; i < n; ++i ) { + y[i] = recv[i].d; + comm_rank_of_max[i] = recv[i].i; + } + delete[] recv; + delete[] send; + } + PROFILE_STOP( "maxReduce1", profile_level ); +} +template<> +void MPI_CLASS::call_maxReduce( double *x, const int n, int *comm_rank_of_max ) const +{ + PROFILE_START( "maxReduce2", profile_level ); + if ( comm_rank_of_max == nullptr ) { + auto send = x; + auto recv = new double[n]; + MPI_Allreduce( send, recv, n, MPI_DOUBLE, MPI_MAX, communicator ); + for ( int i = 0; i < n; i++ ) + x[i] = recv[i]; + delete[] recv; + } else { + auto recv = new DoubleIntStruct[n]; + auto send = new DoubleIntStruct[n]; + for ( int i = 0; i < n; ++i ) { + send[i].d = x[i]; + send[i].i = comm_rank; + } + MPI_Allreduce( send, recv, n, MPI_DOUBLE_INT, MPI_MAXLOC, communicator ); + for ( int i = 0; i < n; ++i ) { + x[i] = recv[i].d; + comm_rank_of_max[i] = recv[i].i; + } + delete[] recv; + delete[] send; + } + PROFILE_STOP( "maxReduce2", profile_level ); +} +#endif + + +/************************************************************************ + * bcast * + * Note: these specializations are only called when using MPI. 
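The rank-of-max branches above rely on MPI's paired value/rank reductions (MPI_2INT, MPI_LONG_INT, MPI_FLOAT_INT, MPI_DOUBLE_INT with MPI_MAXLOC). As a point of reference, a minimal standalone sketch of that pattern is shown here; it is separate from the patch itself and the per-rank value is purely illustrative:

// Standalone illustration of the MPI_MAXLOC pattern used by the specializations above.
#include <mpi.h>
#include <cstdio>

int main( int argc, char **argv )
{
    MPI_Init( &argc, &argv );
    int rank;
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    struct { double d; int i; } in, out;   // layout matches MPI_DOUBLE_INT
    in.d = 1.0 * rank * rank;              // per-rank value (illustrative only)
    in.i = rank;                           // rank carried through the reduction
    MPI_Allreduce( &in, &out, 1, MPI_DOUBLE_INT, MPI_MAXLOC, MPI_COMM_WORLD );
    if ( rank == 0 )
        printf( "max = %f on rank %d\n", out.d, out.i );
    MPI_Finalize();
    return 0;
}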
* + ************************************************************************/ +#ifdef USE_MPI +// char +template<> +void MPI_CLASS::call_bcast( unsigned char *x, const int n, const int root ) const +{ + PROFILE_START( "bcast", profile_level ); + MPI_Bcast( x, n, MPI_UNSIGNED_CHAR, root, communicator ); + PROFILE_STOP( "bcast", profile_level ); +} +template<> +void MPI_CLASS::call_bcast( char *x, const int n, const int root ) const +{ + PROFILE_START( "bcast", profile_level ); + MPI_Bcast( x, n, MPI_CHAR, root, communicator ); + PROFILE_STOP( "bcast", profile_level ); +} +// int +template<> +void MPI_CLASS::call_bcast( unsigned int *x, const int n, const int root ) const +{ + PROFILE_START( "bcast", profile_level ); + MPI_Bcast( x, n, MPI_UNSIGNED, root, communicator ); + PROFILE_STOP( "bcast", profile_level ); +} +template<> +void MPI_CLASS::call_bcast( int *x, const int n, const int root ) const +{ + PROFILE_START( "bcast", profile_level ); + MPI_Bcast( x, n, MPI_INT, root, communicator ); + PROFILE_STOP( "bcast", profile_level ); +} +// float +template<> +void MPI_CLASS::call_bcast( float *x, const int n, const int root ) const +{ + PROFILE_START( "bcast", profile_level ); + MPI_Bcast( x, n, MPI_FLOAT, root, communicator ); + PROFILE_STOP( "bcast", profile_level ); +} +// double +template<> +void MPI_CLASS::call_bcast( double *x, const int n, const int root ) const +{ + PROFILE_START( "bcast", profile_level ); + MPI_Bcast( x, n, MPI_DOUBLE, root, communicator ); + PROFILE_STOP( "bcast", profile_level ); +} +#else +// We need a concrete instantiation of bcast(x,n,root); +template<> +void MPI_CLASS::call_bcast( char *, const int, const int ) const +{ +} +#endif + + +/************************************************************************ + * Perform a global barrier across all processors. * + ************************************************************************/ +void MPI_CLASS::barrier() const +{ +#ifdef USE_MPI + MPI_Barrier( communicator ); +#endif +} + + +/************************************************************************ + * Send data array to another processor. * + * Note: these specializations are only called when using MPI. * + ************************************************************************/ +#ifdef USE_MPI +// char +template<> +void MPI_CLASS::send( + const char *buf, const int length, const int recv_proc_number, int tag ) const +{ + // Set the tag to 0 if it is < 0 + tag = ( tag >= 0 ) ? tag : 0; + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + // Send the data + PROFILE_START( "send", profile_level ); + MPI_Send( (void *) buf, length, MPI_CHAR, recv_proc_number, tag, communicator ); + PROFILE_STOP( "send", profile_level ); +} +// int +template<> +void MPI_CLASS::send( + const int *buf, const int length, const int recv_proc_number, int tag ) const +{ + // Set the tag to 0 if it is < 0 + tag = ( tag >= 0 ) ? tag : 0; + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + // Send the data + PROFILE_START( "send", profile_level ); + MPI_Send( (void *) buf, length, MPI_INT, recv_proc_number, tag, communicator ); + PROFILE_STOP( "send", profile_level ); +} +// float +template<> +void MPI_CLASS::send( + const float *buf, const int length, const int recv_proc_number, int tag ) const +{ + // Set the tag to 0 if it is < 0 + tag = ( tag >= 0 ) ? 
tag : 0;
+    MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
+    // Send the data
+    PROFILE_START( "send", profile_level );
+    MPI_Send( (void *) buf, length, MPI_FLOAT, recv_proc_number, tag, communicator );
+    PROFILE_STOP( "send", profile_level );
+}
+// double
+template<>
+void MPI_CLASS::send(
+    const double *buf, const int length, const int recv_proc_number, int tag ) const
+{
+    // Set the tag to 0 if it is < 0
+    tag = ( tag >= 0 ) ? tag : 0;
+    MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
+    // Send the data
+    PROFILE_START( "send", profile_level );
+    MPI_Send( (void *) buf, length, MPI_DOUBLE, recv_proc_number, tag, communicator );
+    PROFILE_STOP( "send", profile_level );
+}
+#else
+// We need a concrete instantiation of send for use without MPI
+template<>
+void MPI_CLASS::send( const char *buf, const int length, const int, int tag ) const
+{
+    MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
+    MPI_INSIST( tag >= 0, "tag must be >= 0" );
+    PROFILE_START( "send", profile_level );
+    auto id = getRequest( communicator, tag );
+    auto it = global_isendrecv_list.find( id );
+    MPI_INSIST( it != global_isendrecv_list.end(),
+        "send must be paired with a previous call to irecv in serial" );
+    MPI_ASSERT( it->second.status == 2 );
+    memcpy( (char *) it->second.data, buf, length );
+    global_isendrecv_list.erase( it );
+    PROFILE_STOP( "send", profile_level );
+}
+#endif
+
+
+/************************************************************************
+ * Non-blocking send data array to another processor.                    *
+ * Note: these specializations are only called when using MPI.           *
+ ************************************************************************/
+#ifdef USE_MPI
+// char
+template<>
+MPI_Request MPI_CLASS::Isend(
+    const char *buf, const int length, const int recv_proc, const int tag ) const
+{
+    MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
+    MPI_INSIST( tag >= 0, "tag must be >= 0" );
+    MPI_Request request;
+    PROFILE_START( "Isend", profile_level );
+    MPI_Isend( (void *) buf, length, MPI_CHAR, recv_proc, tag, communicator, &request );
+    PROFILE_STOP( "Isend", profile_level );
+    return request;
+}
+// int
+template<>
+MPI_Request MPI_CLASS::Isend(
+    const int *buf, const int length, const int recv_proc, const int tag ) const
+{
+    MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
+    MPI_INSIST( tag >= 0, "tag must be >= 0" );
+    MPI_Request request;
+    PROFILE_START( "Isend", profile_level );
+    MPI_Isend( (void *) buf, length, MPI_INT, recv_proc, tag, communicator, &request );
+    PROFILE_STOP( "Isend", profile_level );
+    return request;
+}
+// float
+template<>
+MPI_Request MPI_CLASS::Isend(
+    const float *buf, const int length, const int recv_proc, const int tag ) const
+{
+    MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
+    MPI_INSIST( tag >= 0, "tag must be >= 0" );
+    MPI_Request request;
+    PROFILE_START( "Isend", profile_level );
+    MPI_Isend( (void *) buf, length, MPI_FLOAT, recv_proc, tag, communicator, &request );
+    PROFILE_STOP( "Isend", profile_level );
+    return request;
+}
+// double
+template<>
+MPI_Request MPI_CLASS::Isend(
+    const double *buf, const int length, const int recv_proc, const int tag ) const
+{
+    MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" );
+    MPI_INSIST( tag >= 0, "tag must be >= 0" );
+    MPI_Request request;
+    PROFILE_START( "Isend", profile_level );
+    MPI_Isend( (void *) buf, length, MPI_DOUBLE, recv_proc, tag, communicator, &request );
+    PROFILE_STOP( "Isend", profile_level );
+    return
request; +} +#else +// We need a concrete instantiation of send for use without mpi +template<> +MPI_Request MPI_CLASS::Isend( + const char *buf, const int length, const int, const int tag ) const +{ + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + MPI_INSIST( tag >= 0, "tag must be >= 0" ); + PROFILE_START( "Isend", profile_level ); + auto id = getRequest( communicator, tag ); + auto it = global_isendrecv_list.find( id ); + if ( it == global_isendrecv_list.end() ) { + // We are calling isend first + Isendrecv_struct data; + data.data = buf; + data.status = 1; + global_isendrecv_list.insert( std::pair( id, data ) ); + } else { + // We called irecv first + MPI_ASSERT( it->second.status == 2 ); + memcpy( (char *) it->second.data, buf, length ); + global_isendrecv_list.erase( it ); + } + PROFILE_STOP( "Isend", profile_level ); + return id; +} +#endif + + +/************************************************************************ + * Send byte array to another processor. * + ************************************************************************/ +void MPI_CLASS::sendBytes( + const void *buf, const int number_bytes, const int recv_proc_number, int tag ) const +{ + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + MPI_INSIST( tag >= 0, "tag must be >= 0" ); + send( (const char *) buf, number_bytes, recv_proc_number, tag ); +} + + +/************************************************************************ + * Non-blocking send byte array to another processor. * + ************************************************************************/ +MPI_Request MPI_CLASS::IsendBytes( + const void *buf, const int number_bytes, const int recv_proc, const int tag ) const +{ + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + MPI_INSIST( tag >= 0, "tag must be >= 0" ); + return Isend( (const char *) buf, number_bytes, recv_proc, tag ); +} + + +/************************************************************************ + * Recieve data array to another processor. * + * Note: these specializations are only called when using MPI. * + ************************************************************************/ +#ifdef USE_MPI +// char +template<> +void MPI_CLASS::recv( + char *buf, int &length, const int send_proc_number, const bool get_length, int tag ) const +{ + // Set the tag to 0 if it is < 0 + tag = ( tag >= 0 ) ? tag : 0; + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + PROFILE_START( "recv", profile_level ); + // Get the recieve length if necessary + if ( get_length ) { + int bytes = this->probe( send_proc_number, tag ); + int recv_length = bytes / sizeof( char ); + MPI_INSIST( length >= recv_length, "Recived length is larger than allocated array" ); + length = recv_length; + } + // Send the data + MPI_Status status; + MPI_Recv( (void *) buf, length, MPI_CHAR, send_proc_number, tag, communicator, &status ); + PROFILE_STOP( "recv", profile_level ); +} +// int +template<> +void MPI_CLASS::recv( + int *buf, int &length, const int send_proc_number, const bool get_length, int tag ) const +{ + // Set the tag to 0 if it is < 0 + tag = ( tag >= 0 ) ? 
tag : 0; + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + PROFILE_START( "recv", profile_level ); + // Get the recieve length if necessary + if ( get_length ) { + int bytes = this->probe( send_proc_number, tag ); + int recv_length = bytes / sizeof( int ); + MPI_INSIST( length >= recv_length, "Recived length is larger than allocated array" ); + length = recv_length; + } + // Send the data + MPI_Status status; + MPI_Recv( (void *) buf, length, MPI_INT, send_proc_number, tag, communicator, &status ); + PROFILE_STOP( "recv", profile_level ); +} +// float +template<> +void MPI_CLASS::recv( + float *buf, int &length, const int send_proc_number, const bool get_length, int tag ) const +{ + // Set the tag to 0 if it is < 0 + tag = ( tag >= 0 ) ? tag : 0; + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + PROFILE_START( "recv", profile_level ); + // Get the recieve length if necessary + if ( get_length ) { + int bytes = this->probe( send_proc_number, tag ); + int recv_length = bytes / sizeof( float ); + MPI_INSIST( length >= recv_length, "Recived length is larger than allocated array" ); + length = recv_length; + } + // Send the data + MPI_Status status; + MPI_Recv( (void *) buf, length, MPI_FLOAT, send_proc_number, tag, communicator, &status ); + PROFILE_STOP( "recv", profile_level ); +} +// double +template<> +void MPI_CLASS::recv( + double *buf, int &length, const int send_proc_number, const bool get_length, int tag ) const +{ + // Set the tag to 0 if it is < 0 + tag = ( tag >= 0 ) ? tag : 0; + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + PROFILE_START( "recv", profile_level ); + // Get the recieve length if necessary + if ( get_length ) { + int bytes = this->probe( send_proc_number, tag ); + int recv_length = bytes / sizeof( double ); + MPI_INSIST( length >= recv_length, "Recived length is larger than allocated array" ); + length = recv_length; + } + // Send the data + MPI_Status status; + MPI_Recv( (void *) buf, length, MPI_DOUBLE, send_proc_number, tag, communicator, &status ); + PROFILE_STOP( "recv", profile_level ); +} +#else +// We need a concrete instantiation of recv for use without mpi +template<> +void MPI_CLASS::recv( char *buf, int &length, const int, const bool, int tag ) const +{ + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + MPI_INSIST( tag >= 0, "tag must be >= 0" ); + PROFILE_START( "recv", profile_level ); + auto id = getRequest( communicator, tag ); + auto it = global_isendrecv_list.find( id ); + MPI_INSIST( it != global_isendrecv_list.end(), + "recv must be paired with a previous call to isend in serial" ); + MPI_ASSERT( it->second.status == 1 ); + memcpy( buf, it->second.data, length ); + global_isendrecv_list.erase( it ); + PROFILE_STOP( "recv", profile_level ); +} +#endif + + +/************************************************************************ + * Non-blocking recieve data array to another processor. * + * Note: these specializations are only called when using MPI. 
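A short usage sketch for the blocking send/recv specializations above. Here `comm` stands for a Utilities::MPI instance spanning at least two ranks; getRank() and the example function name are assumptions (the accessor is declared elsewhere in MPI.h, not in this hunk):

#include <vector>
#include "common/MPI.h"

// Hypothetical helper: rank 0 sends 100 doubles to rank 1, which sizes the
// receive by probing (get_length = true) as implemented in the recv code above.
void example_send_recv( const Utilities::MPI &comm )
{
    std::vector<double> data( 100, 3.14 );
    const int tag = 17;
    if ( comm.getRank() == 0 ) {
        comm.send( data.data(), 100, 1, tag );       // blocking send to rank 1
    } else if ( comm.getRank() == 1 ) {
        int length = 100;                             // capacity of the receive buffer
        // get_length=true probes first and shrinks `length` to the actual message size
        comm.recv( data.data(), length, 0, true, tag );
    }
}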
* + ************************************************************************/ +#ifdef USE_MPI +// char +template<> +MPI_Request MPI_CLASS::Irecv( + char *buf, const int length, const int send_proc, const int tag ) const +{ + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + MPI_INSIST( tag >= 0, "tag must be >= 0" ); + MPI_Request request; + PROFILE_START( "Irecv", profile_level ); + MPI_Irecv( (void *) buf, length, MPI_CHAR, send_proc, tag, communicator, &request ); + PROFILE_STOP( "Irecv", profile_level ); + return request; +} +// int +template<> +MPI_Request MPI_CLASS::Irecv( + int *buf, const int length, const int send_proc, const int tag ) const +{ + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + MPI_INSIST( tag >= 0, "tag must be >= 0" ); + MPI_Request request; + PROFILE_START( "Irecv", profile_level ); + MPI_Irecv( (void *) buf, length, MPI_INT, send_proc, tag, communicator, &request ); + PROFILE_STOP( "Irecv", profile_level ); + return request; +} +// float +template<> +MPI_Request MPI_CLASS::Irecv( + float *buf, const int length, const int send_proc, const int tag ) const +{ + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + MPI_INSIST( tag >= 0, "tag must be >= 0" ); + MPI_Request request; + PROFILE_START( "Irecv", profile_level ); + MPI_Irecv( (void *) buf, length, MPI_FLOAT, send_proc, tag, communicator, &request ); + PROFILE_STOP( "Irecv", profile_level ); + return request; +} +// double +template<> +MPI_Request MPI_CLASS::Irecv( + double *buf, const int length, const int send_proc, const int tag ) const +{ + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + MPI_INSIST( tag >= 0, "tag must be >= 0" ); + MPI_Request request; + PROFILE_START( "Irecv", profile_level ); + MPI_Irecv( (void *) buf, length, MPI_DOUBLE, send_proc, tag, communicator, &request ); + PROFILE_STOP( "Irecv", profile_level ); + return request; +} +#else +// We need a concrete instantiation of irecv for use without mpi +template<> +MPI_Request MPI_CLASS::Irecv( char *buf, const int length, const int, const int tag ) const +{ + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + MPI_INSIST( tag >= 0, "tag must be >= 0" ); + PROFILE_START( "Irecv", profile_level ); + auto id = getRequest( communicator, tag ); + auto it = global_isendrecv_list.find( id ); + if ( it == global_isendrecv_list.end() ) { + // We are calling Irecv first + Isendrecv_struct data; + data.data = buf; + data.status = 2; + global_isendrecv_list.insert( std::pair( id, data ) ); + } else { + // We called Isend first + MPI_ASSERT( it->second.status == 1 ); + memcpy( buf, it->second.data, length ); + global_isendrecv_list.erase( it ); + } + PROFILE_STOP( "Irecv", profile_level ); + return id; +} +#endif + + +/************************************************************************ + * Recieve byte array to another processor. * + ************************************************************************/ +void MPI_CLASS::recvBytes( void *buf, int &number_bytes, const int send_proc, int tag ) const +{ + recv( (char *) buf, number_bytes, send_proc, false, tag ); +} + + +/************************************************************************ + * Recieve byte array to another processor. 
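A hedged sketch of the non-blocking pattern the Isend/Irecv specializations support, paired with waitAll() defined further down in this file; getRank(), getSize(), and the helper name are assumptions rather than part of the patch:

#include <vector>
#include "common/MPI.h"

// Hypothetical ring shift: receive from the previous rank, send to the next one.
void example_ring_exchange( Utilities::MPI &comm )
{
    const int n = 64, tag = 3;
    std::vector<int> out( n, comm.getRank() ), in( n );
    const int next = ( comm.getRank() + 1 ) % comm.getSize();
    const int prev = ( comm.getRank() + comm.getSize() - 1 ) % comm.getSize();
    MPI_Request req[2];
    req[0] = comm.Irecv( in.data(), n, prev, tag );   // post the receive first
    req[1] = comm.Isend( out.data(), n, next, tag );
    comm.waitAll( 2, req );                           // wait functions appear later in this file
}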
* + ************************************************************************/ +MPI_Request MPI_CLASS::IrecvBytes( + void *buf, const int number_bytes, const int send_proc, const int tag ) const +{ + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + MPI_INSIST( tag >= 0, "tag must be >= 0" ); + return Irecv( (char *) buf, number_bytes, send_proc, tag ); +} + + +/************************************************************************ + * allGather * + * Note: these specializations are only called when using MPI. * + ************************************************************************/ +#ifdef USE_MPI +// unsigned char +template<> +void MPI_CLASS::call_allGather( + const unsigned char &x_in, unsigned char *x_out ) const +{ + PROFILE_START( "allGather", profile_level ); + MPI_Allgather( + (void *) &x_in, 1, MPI_UNSIGNED_CHAR, (void *) x_out, 1, MPI_UNSIGNED_CHAR, communicator ); + PROFILE_STOP( "allGather", profile_level ); +} +template<> +void MPI_CLASS::call_allGather( const unsigned char *x_in, int size_in, + unsigned char *x_out, int *size_out, int *disp_out ) const +{ + PROFILE_START( "allGatherv", profile_level ); + MPI_Allgatherv( (void *) x_in, size_in, MPI_CHAR, (void *) x_out, size_out, disp_out, MPI_CHAR, + communicator ); + PROFILE_STOP( "allGatherv", profile_level ); +} +// char +template<> +void MPI_CLASS::call_allGather( const char &x_in, char *x_out ) const +{ + PROFILE_START( "allGather", profile_level ); + MPI_Allgather( (void *) &x_in, 1, MPI_CHAR, (void *) x_out, 1, MPI_CHAR, communicator ); + PROFILE_STOP( "allGather", profile_level ); +} +template<> +void MPI_CLASS::call_allGather( + const char *x_in, int size_in, char *x_out, int *size_out, int *disp_out ) const +{ + PROFILE_START( "allGatherv", profile_level ); + MPI_Allgatherv( (void *) x_in, size_in, MPI_CHAR, (void *) x_out, size_out, disp_out, MPI_CHAR, + communicator ); + PROFILE_STOP( "allGatherv", profile_level ); +} +// unsigned int +template<> +void MPI_CLASS::call_allGather( const unsigned int &x_in, unsigned int *x_out ) const +{ + PROFILE_START( "allGather", profile_level ); + MPI_Allgather( (void *) &x_in, 1, MPI_UNSIGNED, (void *) x_out, 1, MPI_UNSIGNED, communicator ); + PROFILE_STOP( "allGather", profile_level ); +} +template<> +void MPI_CLASS::call_allGather( + const unsigned int *x_in, int size_in, unsigned int *x_out, int *size_out, int *disp_out ) const +{ + PROFILE_START( "allGatherv", profile_level ); + MPI_Allgatherv( (void *) x_in, size_in, MPI_UNSIGNED, (void *) x_out, size_out, disp_out, + MPI_UNSIGNED, communicator ); + PROFILE_STOP( "allGatherv", profile_level ); +} +// int +template<> +void MPI_CLASS::call_allGather( const int &x_in, int *x_out ) const +{ + PROFILE_START( "allGather", profile_level ); + MPI_Allgather( (void *) &x_in, 1, MPI_INT, (void *) x_out, 1, MPI_INT, communicator ); + PROFILE_STOP( "allGather", profile_level ); +} +template<> +void MPI_CLASS::call_allGather( + const int *x_in, int size_in, int *x_out, int *size_out, int *disp_out ) const +{ + PROFILE_START( "allGatherv", profile_level ); + MPI_Allgatherv( (void *) x_in, size_in, MPI_INT, (void *) x_out, size_out, disp_out, MPI_INT, + communicator ); + PROFILE_STOP( "allGatherv", profile_level ); +} +// unsigned long int +template<> +void MPI_CLASS::call_allGather( + const unsigned long int &x_in, unsigned long int *x_out ) const +{ + PROFILE_START( "allGather", profile_level ); + MPI_Allgather( + (void *) &x_in, 1, MPI_UNSIGNED_LONG, (void *) x_out, 1, MPI_UNSIGNED_LONG, communicator ); + 
PROFILE_STOP( "allGather", profile_level ); +} +template<> +void MPI_CLASS::call_allGather( const unsigned long int *x_in, int size_in, + unsigned long int *x_out, int *size_out, int *disp_out ) const +{ + PROFILE_START( "allGatherv", profile_level ); + MPI_Allgatherv( (void *) x_in, size_in, MPI_UNSIGNED_LONG, (void *) x_out, size_out, disp_out, + MPI_UNSIGNED_LONG, communicator ); + PROFILE_STOP( "allGatherv", profile_level ); +} +// long int +template<> +void MPI_CLASS::call_allGather( const long int &x_in, long int *x_out ) const +{ + PROFILE_START( "allGather", profile_level ); + MPI_Allgather( (void *) &x_in, 1, MPI_LONG, (void *) x_out, 1, MPI_LONG, communicator ); + PROFILE_STOP( "allGather", profile_level ); +} +template<> +void MPI_CLASS::call_allGather( + const long int *x_in, int size_in, long int *x_out, int *size_out, int *disp_out ) const +{ + PROFILE_START( "allGatherv", profile_level ); + MPI_Allgatherv( (void *) x_in, size_in, MPI_LONG, (void *) x_out, size_out, disp_out, MPI_LONG, + communicator ); + PROFILE_STOP( "allGatherv", profile_level ); +} +// float +template<> +void MPI_CLASS::call_allGather( const float &x_in, float *x_out ) const +{ + PROFILE_START( "allGather", profile_level ); + MPI_Allgather( (void *) &x_in, 1, MPI_FLOAT, (void *) x_out, 1, MPI_FLOAT, communicator ); + PROFILE_STOP( "allGather", profile_level ); +} +template<> +void MPI_CLASS::call_allGather( + const float *x_in, int size_in, float *x_out, int *size_out, int *disp_out ) const +{ + PROFILE_START( "allGatherv", profile_level ); + MPI_Allgatherv( (void *) x_in, size_in, MPI_FLOAT, (void *) x_out, size_out, disp_out, + MPI_FLOAT, communicator ); + PROFILE_STOP( "allGatherv", profile_level ); +} +// double +template<> +void MPI_CLASS::call_allGather( const double &x_in, double *x_out ) const +{ + PROFILE_START( "allGather", profile_level ); + MPI_Allgather( (void *) &x_in, 1, MPI_DOUBLE, (void *) x_out, 1, MPI_DOUBLE, communicator ); + PROFILE_STOP( "allGather", profile_level ); +} +template<> +void MPI_CLASS::call_allGather( + const double *x_in, int size_in, double *x_out, int *size_out, int *disp_out ) const +{ + PROFILE_START( "allGatherv", profile_level ); + MPI_Allgatherv( (void *) x_in, size_in, MPI_DOUBLE, (void *) x_out, size_out, disp_out, + MPI_DOUBLE, communicator ); + PROFILE_STOP( "allGatherv", profile_level ); +} +#else +// We need a concrete instantiation of call_allGather(x_in,size_in,x_out,size_out) +template<> +void MPI_CLASS::call_allGather( const char *, int, char *, int *, int * ) const +{ + MPI_ERROR( "Internal error in communicator (allGather) " ); +} +#endif + + +/************************************************************************ + * allToAll * + * Note: these specializations are only called when using MPI. 
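The variable-size gathers above expect the caller to supply per-rank counts (size_out) and displacements (disp_out). The following standalone sketch shows how those arrays are normally built with raw MPI calls rather than the wrapper, assuming a double payload:

#include <mpi.h>
#include <vector>

// Gather the per-rank counts first, then form displacements as a prefix sum.
std::vector<double> example_allgatherv( const std::vector<double> &x_local )
{
    int nprocs;
    MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
    int n_local = static_cast<int>( x_local.size() );
    std::vector<int> size_out( nprocs ), disp_out( nprocs, 0 );
    MPI_Allgather( &n_local, 1, MPI_INT, size_out.data(), 1, MPI_INT, MPI_COMM_WORLD );
    for ( int i = 1; i < nprocs; i++ )
        disp_out[i] = disp_out[i - 1] + size_out[i - 1];          // prefix sum of counts
    std::vector<double> x_all( disp_out[nprocs - 1] + size_out[nprocs - 1] );
    MPI_Allgatherv( x_local.data(), n_local, MPI_DOUBLE, x_all.data(),
        size_out.data(), disp_out.data(), MPI_DOUBLE, MPI_COMM_WORLD );
    return x_all;
}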
* + ************************************************************************/ +#ifdef USE_MPI +template<> +void MPI_CLASS::allToAll( + const int n, const unsigned char *send, unsigned char *recv ) const +{ + PROFILE_START( "allToAll", profile_level ); + MPI_Alltoall( + (void *) send, n, MPI_UNSIGNED_CHAR, (void *) recv, n, MPI_UNSIGNED_CHAR, communicator ); + PROFILE_STOP( "allToAll", profile_level ); +} +template<> +void MPI_CLASS::allToAll( const int n, const char *send, char *recv ) const +{ + PROFILE_START( "allToAll", profile_level ); + MPI_Alltoall( (void *) send, n, MPI_CHAR, (void *) recv, n, MPI_CHAR, communicator ); + PROFILE_STOP( "allToAll", profile_level ); +} +template<> +void MPI_CLASS::allToAll( + const int n, const unsigned int *send, unsigned int *recv ) const +{ + PROFILE_START( "allToAll", profile_level ); + MPI_Alltoall( (void *) send, n, MPI_UNSIGNED, (void *) recv, n, MPI_UNSIGNED, communicator ); + PROFILE_STOP( "allToAll", profile_level ); +} +template<> +void MPI_CLASS::allToAll( const int n, const int *send, int *recv ) const +{ + PROFILE_START( "allToAll", profile_level ); + MPI_Alltoall( (void *) send, n, MPI_INT, (void *) recv, n, MPI_INT, communicator ); + PROFILE_STOP( "allToAll", profile_level ); +} +template<> +void MPI_CLASS::allToAll( + const int n, const unsigned long int *send, unsigned long int *recv ) const +{ + PROFILE_START( "allToAll", profile_level ); + MPI_Alltoall( + (void *) send, n, MPI_UNSIGNED_LONG, (void *) recv, n, MPI_UNSIGNED_LONG, communicator ); + PROFILE_STOP( "allToAll", profile_level ); +} +template<> +void MPI_CLASS::allToAll( const int n, const long int *send, long int *recv ) const +{ + PROFILE_START( "allToAll", profile_level ); + MPI_Alltoall( (void *) send, n, MPI_LONG, (void *) recv, n, MPI_LONG, communicator ); + PROFILE_STOP( "allToAll", profile_level ); +} +template<> +void MPI_CLASS::allToAll( const int n, const float *send, float *recv ) const +{ + PROFILE_START( "allToAll", profile_level ); + MPI_Alltoall( (void *) send, n, MPI_FLOAT, (void *) recv, n, MPI_FLOAT, communicator ); + PROFILE_STOP( "allToAll", profile_level ); +} +template<> +void MPI_CLASS::allToAll( const int n, const double *send, double *recv ) const +{ + PROFILE_START( "allToAll", profile_level ); + MPI_Alltoall( (void *) send, n, MPI_DOUBLE, (void *) recv, n, MPI_DOUBLE, communicator ); + PROFILE_STOP( "allToAll", profile_level ); +} +#endif + + +/************************************************************************ + * call_allToAll * + * Note: these specializations are only called when using MPI. 
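For the fixed-size allToAll specializations above, each rank contributes n entries for every destination, so both buffers hold n times the communicator size. A brief usage sketch follows; getRank()/getSize() and the helper name are assumed accessors not shown in this hunk:

#include <vector>
#include "common/MPI.h"

// Hypothetical helper: after the call, block i of recv holds the n values sent by rank i.
void example_alltoall( const Utilities::MPI &comm )
{
    const int n = 4;
    std::vector<double> send( n * comm.getSize(), double( comm.getRank() ) );
    std::vector<double> recv( n * comm.getSize() );
    comm.allToAll( n, send.data(), recv.data() );
}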
* + ************************************************************************/ +#ifdef USE_MPI +// unsigned char +template<> +void MPI_CLASS::call_allToAll( const unsigned char *send_data, const int send_cnt[], + const int send_disp[], unsigned char *recv_data, const int *recv_cnt, + const int *recv_disp ) const +{ + PROFILE_START( "allToAllv", profile_level ); + MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_UNSIGNED_CHAR, + (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_UNSIGNED_CHAR, communicator ); + PROFILE_STOP( "allToAllv", profile_level ); +} +// char +template<> +void MPI_CLASS::call_allToAll( const char *send_data, const int send_cnt[], + const int send_disp[], char *recv_data, const int *recv_cnt, const int *recv_disp ) const +{ + PROFILE_START( "allToAllv", profile_level ); + MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_CHAR, + (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_CHAR, communicator ); + PROFILE_STOP( "allToAllv", profile_level ); +} +// unsigned int +template<> +void MPI_CLASS::call_allToAll( const unsigned int *send_data, const int send_cnt[], + const int send_disp[], unsigned int *recv_data, const int *recv_cnt, + const int *recv_disp ) const +{ + PROFILE_START( "allToAllv", profile_level ); + MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_UNSIGNED, + (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_UNSIGNED, communicator ); + PROFILE_STOP( "allToAllv", profile_level ); +} +// int +template<> +void MPI_CLASS::call_allToAll( const int *send_data, const int send_cnt[], + const int send_disp[], int *recv_data, const int *recv_cnt, const int *recv_disp ) const +{ + PROFILE_START( "allToAllv", profile_level ); + MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_INT, + (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_INT, communicator ); + PROFILE_STOP( "allToAllv", profile_level ); +} +// unsigned long int +template<> +void MPI_CLASS::call_allToAll( const unsigned long int *send_data, + const int send_cnt[], const int send_disp[], unsigned long int *recv_data, const int *recv_cnt, + const int *recv_disp ) const +{ + PROFILE_START( "allToAllv", profile_level ); + MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_UNSIGNED_LONG, + (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_UNSIGNED_LONG, communicator ); + PROFILE_STOP( "allToAllv", profile_level ); +} +// long int +template<> +void MPI_CLASS::call_allToAll( const long int *send_data, const int send_cnt[], + const int send_disp[], long int *recv_data, const int *recv_cnt, const int *recv_disp ) const +{ + PROFILE_START( "allToAllv", profile_level ); + MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_LONG, + (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_LONG, communicator ); + PROFILE_STOP( "allToAllv", profile_level ); +} +// float +template<> +void MPI_CLASS::call_allToAll( const float *send_data, const int send_cnt[], + const int send_disp[], float *recv_data, const int *recv_cnt, const int *recv_disp ) const +{ + PROFILE_START( "allToAllv", profile_level ); + MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_FLOAT, + (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_FLOAT, communicator ); + PROFILE_STOP( "allToAllv", profile_level ); +} +// double +template<> +void MPI_CLASS::call_allToAll( const double *send_data, const int send_cnt[], 
+ const int send_disp[], double *recv_data, const int *recv_cnt, const int *recv_disp ) const +{ + PROFILE_START( "allToAllv", profile_level ); + MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_DOUBLE, + (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_DOUBLE, communicator ); + PROFILE_STOP( "allToAllv", profile_level ); +} +#else +// Default instatiation of unsigned char +template<> +void MPI_CLASS::call_allToAll( + const char *, const int[], const int[], char *, const int *, const int * ) const +{ + MPI_ERROR( "Should not reach this point" ); +} +#endif + + +/************************************************************************ + * call_sumScan * + * Note: these specializations are only called when using MPI. * + ************************************************************************/ +#ifdef USE_MPI +// unsigned char +template<> +void MPI_CLASS::call_sumScan( + const unsigned char *send, unsigned char *recv, int n ) const +{ + PROFILE_START( "sumScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_SUM, communicator ); + PROFILE_STOP( "sumScan", profile_level ); +} +// char +template<> +void MPI_CLASS::call_sumScan( const char *send, char *recv, int n ) const +{ + PROFILE_START( "sumScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_SUM, communicator ); + PROFILE_STOP( "sumScan", profile_level ); +} +// unsigned int +template<> +void MPI_CLASS::call_sumScan( + const unsigned int *send, unsigned int *recv, int n ) const +{ + PROFILE_START( "sumScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_SUM, communicator ); + PROFILE_STOP( "sumScan", profile_level ); +} +// int +template<> +void MPI_CLASS::call_sumScan( const int *send, int *recv, int n ) const +{ + PROFILE_START( "sumScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_INT, MPI_SUM, communicator ); + PROFILE_STOP( "sumScan", profile_level ); +} +// long int +template<> +void MPI_CLASS::call_sumScan( const long int *send, long int *recv, int n ) const +{ + PROFILE_START( "sumScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_LONG, MPI_SUM, communicator ); + PROFILE_STOP( "sumScan", profile_level ); +} +// unsigned long int +template<> +void MPI_CLASS::call_sumScan( + const unsigned long *send, unsigned long *recv, int n ) const +{ + PROFILE_START( "sumScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_SUM, communicator ); + PROFILE_STOP( "sumScan", profile_level ); +} +// size_t +#ifdef USE_WINDOWS +template<> +void MPI_CLASS::call_sumScan( const size_t *send, size_t *recv, int n ) const +{ + MPI_ASSERT( MPI_SIZE_T != 0 ); + PROFILE_START( "sumScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_SIZE_T, MPI_SUM, communicator ); + PROFILE_STOP( "sumScan", profile_level ); +} +#endif +// float +template<> +void MPI_CLASS::call_sumScan( const float *send, float *recv, int n ) const +{ + PROFILE_START( "sumScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_FLOAT, MPI_SUM, communicator ); + PROFILE_STOP( "sumScan", profile_level ); +} +// double +template<> +void MPI_CLASS::call_sumScan( const double *send, double *recv, int n ) const +{ + PROFILE_START( "sumScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_DOUBLE, MPI_SUM, communicator ); + PROFILE_STOP( "sumScan", profile_level ); +} +// std::complex +template<> +void 
MPI_CLASS::call_sumScan>( + const std::complex *x, std::complex *y, int n ) const +{ + auto send = new double[2 * n]; + auto recv = new double[2 * n]; + for ( int i = 0; i < n; i++ ) { + send[2 * i + 0] = real( x[i] ); + send[2 * i + 1] = imag( x[i] ); + } + MPI_Scan( (void *) send, (void *) recv, 2 * n, MPI_DOUBLE, MPI_SUM, communicator ); + for ( int i = 0; i < n; i++ ) + y[i] = std::complex( recv[2 * i + 0], recv[2 * i + 1] ); + delete[] send; + delete[] recv; +} +#endif + + +/************************************************************************ + * call_minScan * + * Note: these specializations are only called when using MPI. * + ************************************************************************/ +#ifdef USE_MPI +// unsigned char +template<> +void MPI_CLASS::call_minScan( + const unsigned char *send, unsigned char *recv, int n ) const +{ + PROFILE_START( "minScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_MIN, communicator ); + PROFILE_STOP( "minScan", profile_level ); +} +// char +template<> +void MPI_CLASS::call_minScan( const char *send, char *recv, int n ) const +{ + PROFILE_START( "minScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_MIN, communicator ); + PROFILE_STOP( "minScan", profile_level ); +} +// unsigned int +template<> +void MPI_CLASS::call_minScan( + const unsigned int *send, unsigned int *recv, int n ) const +{ + PROFILE_START( "minScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_MIN, communicator ); + PROFILE_STOP( "minScan", profile_level ); +} +// int +template<> +void MPI_CLASS::call_minScan( const int *send, int *recv, int n ) const +{ + PROFILE_START( "minScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_INT, MPI_MIN, communicator ); + PROFILE_STOP( "minScan", profile_level ); +} +// unsigned long int +template<> +void MPI_CLASS::call_minScan( + const unsigned long int *send, unsigned long int *recv, int n ) const +{ + PROFILE_START( "minScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_MIN, communicator ); + PROFILE_STOP( "minScan", profile_level ); +} +// long int +template<> +void MPI_CLASS::call_minScan( const long int *send, long int *recv, int n ) const +{ + PROFILE_START( "minScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_LONG, MPI_MIN, communicator ); + PROFILE_STOP( "minScan", profile_level ); +} +// size_t +#ifdef USE_WINDOWS +template<> +void MPI_CLASS::call_minScan( const size_t *send, size_t *recv, int n ) const +{ + MPI_ASSERT( MPI_SIZE_T != 0 ); + PROFILE_START( "minScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_SIZE_T, MPI_MIN, communicator ); + PROFILE_STOP( "minScan", profile_level ); +} +#endif +// float +template<> +void MPI_CLASS::call_minScan( const float *send, float *recv, int n ) const +{ + PROFILE_START( "minScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_FLOAT, MPI_MIN, communicator ); + PROFILE_STOP( "minScan", profile_level ); +} +// double +template<> +void MPI_CLASS::call_minScan( const double *send, double *recv, int n ) const +{ + PROFILE_START( "minScan", profile_level ); + MPI_Scan( (void *) send, (void *) recv, n, MPI_DOUBLE, MPI_MIN, communicator ); + PROFILE_STOP( "minScan", profile_level ); +} +#endif + + +/************************************************************************ + * call_maxScan * + * Note: these specializations are only called when using 
MPI. *
+ ************************************************************************/
+#ifdef USE_MPI
+// unsigned char
+template<>
+void MPI_CLASS::call_maxScan(
+    const unsigned char *send, unsigned char *recv, int n ) const
+{
+    PROFILE_START( "maxScan", profile_level );
+    MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_MAX, communicator );
+    PROFILE_STOP( "maxScan", profile_level );
+}
+// char
+template<>
+void MPI_CLASS::call_maxScan( const char *send, char *recv, int n ) const
+{
+    PROFILE_START( "maxScan", profile_level );
+    MPI_Scan( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_MAX, communicator );
+    PROFILE_STOP( "maxScan", profile_level );
+}
+// unsigned int
+template<>
+void MPI_CLASS::call_maxScan(
+    const unsigned int *send, unsigned int *recv, int n ) const
+{
+    PROFILE_START( "maxScan", profile_level );
+    MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_MAX, communicator );
+    PROFILE_STOP( "maxScan", profile_level );
+}
+// int
+template<>
+void MPI_CLASS::call_maxScan( const int *send, int *recv, int n ) const
+{
+    PROFILE_START( "maxScan", profile_level );
+    MPI_Scan( (void *) send, (void *) recv, n, MPI_INT, MPI_MAX, communicator );
+    PROFILE_STOP( "maxScan", profile_level );
+}
+// long int
+template<>
+void MPI_CLASS::call_maxScan( const long int *send, long int *recv, int n ) const
+{
+    PROFILE_START( "maxScan", profile_level );
+    MPI_Scan( (void *) send, (void *) recv, n, MPI_LONG, MPI_MAX, communicator );
+    PROFILE_STOP( "maxScan", profile_level );
+}
+// unsigned long int
+template<>
+void MPI_CLASS::call_maxScan(
+    const unsigned long int *send, unsigned long int *recv, int n ) const
+{
+    PROFILE_START( "maxScan", profile_level );
+    MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_MAX, communicator );
+    PROFILE_STOP( "maxScan", profile_level );
+}
+// size_t
+#ifdef USE_WINDOWS
+template<>
+void MPI_CLASS::call_maxScan( const size_t *send, size_t *recv, int n ) const
+{
+    MPI_ASSERT( MPI_SIZE_T != 0 );
+    PROFILE_START( "maxScan", profile_level );
+    MPI_Scan( (void *) send, (void *) recv, n, MPI_SIZE_T, MPI_MAX, communicator );
+    PROFILE_STOP( "maxScan", profile_level );
+}
+#endif
+// float
+template<>
+void MPI_CLASS::call_maxScan( const float *send, float *recv, int n ) const
+{
+    PROFILE_START( "maxScan", profile_level );
+    MPI_Scan( (void *) send, (void *) recv, n, MPI_FLOAT, MPI_MAX, communicator );
+    PROFILE_STOP( "maxScan", profile_level );
+}
+// double
+template<>
+void MPI_CLASS::call_maxScan( const double *send, double *recv, int n ) const
+{
+    PROFILE_START( "maxScan", profile_level );
+    MPI_Scan( (void *) send, (void *) recv, n, MPI_DOUBLE, MPI_MAX, communicator );
+    PROFILE_STOP( "maxScan", profile_level );
+}
+#endif
+
+
+/************************************************************************
+ * Communicate ranks for communication                                   *
+ ************************************************************************/
+std::vector<int> MPI_CLASS::commRanks( const std::vector<int> &ranks ) const
+{
+#ifdef USE_MPI
+    // Get a byte array with the ranks to communicate
+    auto data1 = new char[comm_size];
+    auto data2 = new char[comm_size];
+    memset( data1, 0, comm_size );
+    memset( data2, 0, comm_size );
+    for ( auto &rank : ranks )
+        data1[rank] = 1;
+    MPI_Alltoall( data1, 1, MPI_CHAR, data2, 1, MPI_CHAR, communicator );
+    int N = 0;
+    for ( int i = 0; i < comm_size; i++ )
+        N += data2[i];
+    std::vector<int> ranks_out;
+    ranks_out.reserve( N );
+    for ( int i = 0; i < comm_size; i++ ) {
+        if ( data2[i] )
+
ranks_out.push_back( i ); + } + delete[] data1; + delete[] data2; + return ranks_out; +#else + return ranks; +#endif +} + + +/************************************************************************ + * Wait functions * + ************************************************************************/ +#ifdef USE_MPI +void MPI_CLASS::wait( MPI_Request request ) +{ + PROFILE_START( "wait", profile_level ); + MPI_Status status; + int flag = 0; + int err = MPI_Test( &request, &flag, &status ); + MPI_ASSERT( err == MPI_SUCCESS ); // Check that the first call is valid + while ( !flag ) { + // Put the current thread to sleep to allow other threads to run + sched_yield(); + // Check if the request has finished + MPI_Test( &request, &flag, &status ); + } + PROFILE_STOP( "wait", profile_level ); +} +int MPI_CLASS::waitAny( int count, MPI_Request *request ) +{ + if ( count == 0 ) + return -1; + PROFILE_START( "waitAny", profile_level ); + int index = -1; + int flag = 0; + auto status = new MPI_Status[count]; + int err = MPI_Testany( count, request, &index, &flag, status ); + MPI_ASSERT( err == MPI_SUCCESS ); // Check that the first call is valid + while ( !flag ) { + // Put the current thread to sleep to allow other threads to run + sched_yield(); + // Check if the request has finished + MPI_Testany( count, request, &index, &flag, status ); + } + MPI_ASSERT( index >= 0 ); // Check that the index is valid + delete[] status; + PROFILE_STOP( "waitAny", profile_level ); + return index; +} +void MPI_CLASS::waitAll( int count, MPI_Request *request ) +{ + if ( count == 0 ) + return; + PROFILE_START( "waitAll", profile_level ); + int flag = 0; + auto status = new MPI_Status[count]; + int err = MPI_Testall( count, request, &flag, status ); + MPI_ASSERT( err == MPI_SUCCESS ); // Check that the first call is valid + while ( !flag ) { + // Put the current thread to sleep to allow other threads to run + sched_yield(); + // Check if the request has finished + MPI_Testall( count, request, &flag, status ); + } + PROFILE_STOP( "waitAll", profile_level ); + delete[] status; +} +std::vector MPI_CLASS::waitSome( int count, MPI_Request *request ) +{ + if ( count == 0 ) + return std::vector(); + PROFILE_START( "waitSome", profile_level ); + std::vector indicies( count, -1 ); + auto *status = new MPI_Status[count]; + int outcount = 0; + int err = MPI_Testsome( count, request, &outcount, &indicies[0], status ); + MPI_ASSERT( err == MPI_SUCCESS ); // Check that the first call is valid + MPI_ASSERT( outcount != MPI_UNDEFINED ); // Check that the first call is valid + while ( outcount == 0 ) { + // Put the current thread to sleep to allow other threads to run + sched_yield(); + // Check if the request has finished + MPI_Testsome( count, request, &outcount, &indicies[0], status ); + } + indicies.resize( outcount ); + delete[] status; + PROFILE_STOP( "waitSome", profile_level ); + return indicies; +} +#else +void MPI_CLASS::wait( MPI_Request request ) +{ + PROFILE_START( "wait", profile_level ); + while ( 1 ) { + // Check if the request is in our list + if ( global_isendrecv_list.find( request ) == global_isendrecv_list.end() ) + break; + // Put the current thread to sleep to allow other threads to run + sched_yield(); + } + PROFILE_STOP( "wait", profile_level ); +} +int MPI_CLASS::waitAny( int count, MPI_Request *request ) +{ + if ( count == 0 ) + return -1; + PROFILE_START( "waitAny", profile_level ); + int index = 0; + while ( 1 ) { + // Check if the request is in our list + bool found_any = false; + for ( int i = 0; i < count; i++ 
) { + if ( global_isendrecv_list.find( request[i] ) == global_isendrecv_list.end() ) { + found_any = true; + index = i; + } + } + if ( found_any ) + break; + // Put the current thread to sleep to allow other threads to run + sched_yield(); + } + PROFILE_STOP( "waitAny", profile_level ); + return index; +} +void MPI_CLASS::waitAll( int count, MPI_Request *request ) +{ + if ( count == 0 ) + return; + PROFILE_START( "waitAll", profile_level ); + while ( 1 ) { + // Check if the request is in our list + bool found_all = true; + for ( int i = 0; i < count; i++ ) { + if ( global_isendrecv_list.find( request[i] ) != global_isendrecv_list.end() ) + found_all = false; + } + if ( found_all ) + break; + // Put the current thread to sleep to allow other threads to run + sched_yield(); + } + PROFILE_STOP( "waitAll", profile_level ); +} +std::vector MPI_CLASS::waitSome( int count, MPI_Request *request ) +{ + if ( count == 0 ) + return std::vector(); + PROFILE_START( "waitSome", profile_level ); + std::vector indicies; + while ( 1 ) { + // Check if the request is in our list + for ( int i = 0; i < count; i++ ) { + if ( global_isendrecv_list.find( request[i] ) == global_isendrecv_list.end() ) + indicies.push_back( i ); + } + if ( !indicies.empty() ) + break; + // Put the current thread to sleep to allow other threads to run + sched_yield(); + } + PROFILE_STOP( "waitSome", profile_level ); + return indicies; +} +#endif + + +/************************************************************************ + * Probe functions * + ************************************************************************/ +#ifdef USE_MPI +int MPI_CLASS::Iprobe( int source, int tag ) const +{ + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + MPI_INSIST( tag >= 0, "tag must be >= 0" ); + MPI_Status status; + int flag = 0; + MPI_Iprobe( source, tag, communicator, &flag, &status ); + if ( flag == 0 ) + return -1; + int count; + MPI_Get_count( &status, MPI_BYTE, &count ); + MPI_ASSERT( count >= 0 ); + return count; +} +int MPI_CLASS::probe( int source, int tag ) const +{ + MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); + MPI_INSIST( tag >= 0, "tag must be >= 0" ); + MPI_Status status; + MPI_Probe( source, tag, communicator, &status ); + int count; + MPI_Get_count( &status, MPI_BYTE, &count ); + MPI_ASSERT( count >= 0 ); + return count; +} +#else +int MPI_CLASS::Iprobe( int, int ) const +{ + MPI_ERROR( "Not implimented for serial codes (Iprobe)" ); + return 0; +} +int MPI_CLASS::probe( int, int ) const +{ + MPI_ERROR( "Not implimented for serial codes (probe)" ); + return 0; +} +#endif + + +/************************************************************************ + * Timer functions * + ************************************************************************/ +#ifdef USE_MPI +double MPI_CLASS::time() { return MPI_Wtime(); } +double MPI_CLASS::tick() { return MPI_Wtick(); } +#else +double MPI_CLASS::time() +{ + auto t = std::chrono::system_clock::now(); + auto ns = std::chrono::duration_cast( t.time_since_epoch() ); + return 1e-9 * ns.count(); +} +double MPI_CLASS::tick() +{ + auto period = std::chrono::system_clock::period(); + return static_cast( period.num ) / static_cast( period.den ); +} +#endif + + +/************************************************************************ + * Serialize a block of code across MPI processes * + ************************************************************************/ +void MPI_CLASS::serializeStart() +{ +#ifdef USE_MPI + using namespace std::chrono_literals; + if ( comm_rank == 
0 ) { + // Start rank 0 immediately + } else { + // Wait for a message from the previous rank + MPI_Request request; + MPI_Status status; + int flag = false, buf = 0; + MPI_Irecv( &buf, 1, MPI_INT, comm_rank - 1, 5627, MPI_COMM_WORLD, &request ); + while ( !flag ) { + MPI_Test( &request, &flag, &status ); + std::this_thread::sleep_for( 50ms ); + } + } +#endif +} +void MPI_CLASS::serializeStop() +{ +#ifdef USE_MPI + using namespace std::chrono_literals; + if ( comm_rank < comm_size - 1 ) { + // Send flag to next rank + MPI_Send( &comm_rank, 1, MPI_INT, comm_rank + 1, 5627, MPI_COMM_WORLD ); + // Wait for final finished flag + int flag = false, buf = 0; + MPI_Request request; + MPI_Status status; + MPI_Irecv( &buf, 1, MPI_INT, comm_size - 1, 5627, MPI_COMM_WORLD, &request ); + while ( !flag ) { + MPI_Test( &request, &flag, &status ); + std::this_thread::sleep_for( 50ms ); + } + } else { + // Send final flag to all ranks + for ( int i = 0; i < comm_size - 1; i++ ) + MPI_Send( &comm_rank, 1, MPI_INT, i, 5627, MPI_COMM_WORLD ); + } +#endif +} + + +/**************************************************************************** + * Function to start/stop MPI * + ****************************************************************************/ +#ifdef USE_EXT_MPI +static bool called_MPI_Init = false; +#endif +bool MPI_CLASS::MPI_Active() +{ +#ifdef USE_EXT_MPI + int MPI_initialized, MPI_finialized; + MPI_Initialized( &MPI_initialized ); + MPI_Finalized( &MPI_finialized ); + return MPI_initialized != 0 && MPI_finialized == 0; +#else + return false; +#endif +} +void MPI_CLASS::start_MPI( int argc, char *argv[], int profile_level ) +{ + changeProfileLevel( profile_level ); + NULL_USE( argc ); + NULL_USE( argv ); +#ifdef USE_EXT_MPI + if ( MPI_Active() ) { + called_MPI_Init = false; + } else { + int provided; + int result = MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &provided ); + if ( result != MPI_SUCCESS ) + MPI_ERROR( "Unable to initialize MPI" ); + if ( provided < MPI_THREAD_MULTIPLE ) + std::cerr << "Warning: Failed to start MPI with MPI_THREAD_MULTIPLE\n"; + called_MPI_Init = true; + } +#endif +} +void MPI_CLASS::stop_MPI() +{ +#ifdef USE_EXT_MPI + int finalized; + MPI_Finalized( &finalized ); + if ( called_MPI_Init && !finalized ) { + MPI_Barrier( MPI_COMM_WORLD ); + MPI_Finalize(); + called_MPI_Init = true; + } +#endif +} + + +} // namespace Utilities + diff --git a/common/MPI.h b/common/MPI.h new file mode 100644 index 00000000..e3fd3e13 --- /dev/null +++ b/common/MPI.h @@ -0,0 +1,1152 @@ +// This file includes a wrapper class for MPI functions +// Note this is a modified version of the MPI class for the Advanced Multi-Physics Package +// Used with permission + +/* + +Copyright (c) 2012 UT-Battelle, LLC + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: +Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +Collection of administrative costs for redistribution of the source code or binary form is allowed. However, collection of a royalty or other fee in excess of good faith amount for cost recovery for such redistribution is prohibited. 
+ +*/ + +#ifndef included_LBPM_MPI +#define included_LBPM_MPI + + +#include +#include +#include +#include +#include +#include +#include + + +// Include mpi.h (or define MPI objects) +// clang-format off +#ifdef USE_MPI + #include "mpi.h" +#else + typedef int MPI_Comm; + typedef int MPI_Request; + typedef int MPI_Status; + typedef void *MPI_Errhandler; + enum MPI_TYPES { MPI_INT, MPI_FLOAT, MPI_DOUBLE }; + #define MPI_COMM_WORLD ( (MPI_Comm) 0xF4000010 ) + #define MPI_COMM_SELF ( (MPI_Comm) 0xF4000001 ) + #define MPI_COMM_NULL ( (MPI_Comm) 0xF4000000 ) +#endif +// clang-format on + + +namespace Utilities { + + +/** + * \class MPI + * + * @brief Provides C++ wrapper around MPI routines. + * + * Class MPI groups common MPI routines into one globally-accessible + * location. It provides small, simple routines that are common in MPI code. + * In some cases, the calling syntax has been simplified for convenience. + * Moreover, there is no reason to include the preprocessor ifdef/endif + * guards around these calls, since the MPI libraries are not called in + * these routines if the MPI libraries are not being used (e.g., when + * writing serial code). + * Note: Many of the communication routines are templated on type. When using + * unknown types the reduce calls will fail, the send and gather calls should + * succeed provided that the size of the data type object is a fixed size on + * all processors. sizeof(type) must be the same for all elements and processors. + */ +class MPI final +{ +public: + enum class ThreadSupport : int { SINGLE, FUNNELED, SERIALIZED, MULTIPLE }; + +public: // Constructors + /** + *\brief Is MPI active + *\details This returns true if MPI is initailized and not finalized + */ + static bool MPI_active(); + + /** + *\brief Empty constructor + *\details This creates an empty constructor that does not contain an MPI communicator. + */ + MPI(); + + + //! Empty destructor + ~MPI(); + + + /** + * \brief Constructor from existing MPI communicator + * \details This constructor creates a new communicator from an existing MPI communicator. + * This does not create a new internal MPI_Comm, but uses the existing comm. + * Note that by default, this will not free the MPI_Comm object and the user is + * responsible + * for free'ing the MPI_Comm when it is no longer used. This behavior is controlled by the + * optional manage argument. + * \param comm Existing MPI communicator + * \param manage Do we want to manage the comm (free the MPI_Comm when this object leaves + * scope) + */ + MPI( MPI_Comm comm, bool manage = false ); + + + /** + * \brief Constructor from existing communicator + * \details This constructor creates a new communicator from an existing communicator. + * This does not create a new internal MPI_Comm, but uses the existing comm. + * \param comm Existing communicator + */ + MPI( const MPI &comm ); + + + /*! + * Move constructor + * @param rhs Communicator to copy + */ + MPI( MPI &&rhs ); + + + /** + * \brief Assignment operator + * \details This operator overloads the assignment to correctly copy an communicator + * \param comm Existing MPI object + */ + MPI &operator=( const MPI &comm ); + + + /*! + * Move assignment operator + * @param rhs Communicator to copy + */ + MPI &operator=( MPI &&rhs ); + + + /** + * \brief Reset the object + * \details This resets the object to the empty state without an MPI_Comm + */ + void reset(); + + +public: // Member functions + /** + * \brief Get the node name + * \details This function returns a unique name for each node. 
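A small usage sketch for the constructors documented above, assuming the wrapper is used as declared here; wrapping an existing MPI_Comm does not transfer ownership unless manage is set:

// Sketch only: barrier() is defined in common/MPI.cpp and is safe with or without MPI.
Utilities::MPI comm( MPI_COMM_WORLD );   // uses MPI_COMM_WORLD, never frees it (manage=false)
Utilities::MPI copy = comm;              // copies refer to the same underlying communicator
comm.barrier();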
+ * It is a wrapper for MPI_Get_processor_name. + */ + static std::string getNodeName(); + + + //! Function to return the number of processors available + static int getNumberOfProcessors(); + + + //! Function to return the affinity of the current process + static std::vector getProcessAffinity(); + + + //! Function to set the affinity of the current process + static void setProcessAffinity( const std::vector &procs ); + + + /** + * \brief Load balance the processes within a node + * \details This function will redistribute the processes within a node using the + * process affinities to achieve the desired load balance. + * Note: this is a global operation on the given comm, and it is STRONGLY + * recommended to use COMM_WORLD. + * \param comm The communicator to use (Default is COMM_WORLD) + * \param method The desired load balance method to use: + * 1: Adjust the affinities so all processes share the given processors. + * This effectively allows the OS to handle the load balancing + * by migrating the processes as necessary. This is recommended + * for most users and use cases. (default) + * 2: Adjust the affinities so that the fewest number of processes overlap. + * This will try to give each process a unique set of processors while + * ensuring that each process has at least N_min processes. + * \param procs An optional list of processors to use. By default, setting this to an + * empty vector will use all available processors on the given node. + * \param N_min The minimum number of processors for any process (-1 indicates all available + * processors). + * \param N_max The maximum number of processors for any process (-1 indicates all available + * processors). + * + */ + static void balanceProcesses( const MPI &comm = MPI( MPI_COMM_WORLD ), const int method = 1, + const std::vector &procs = std::vector(), const int N_min = 1, + const int N_max = -1 ); + + + //! Query the level of thread support + static ThreadSupport queryThreadSupport(); + + + /** + * \brief Generate a random number + * \details This generates a random number that is consistent across the comm + */ + size_t rand() const; + + + /** + * \brief Split an existing communicator + * \details This creates a new communicator by splitting an existing communicator. + * See MPI_Comm_split for information on how the underlying split will occur. + * Note: the underlying MPI_Comm object will be free'd automatically when it is no longer + * used by any MPI objects. + * \param color Control of subset assignment (nonnegative integer). + * Processes with the same color are in the same new communicator . + * -1: processor will not be a member of any object (NULL object will be returned) + * \param key Control of rank assignment (integer). + * Note that, for a fixed color, the keys need not be unique. The processes will + * be sorted + * in ascending order according to this key, then all the processes in a given + * color will + * have the relative rank order as they did in their parent group. (See + * MPI_Comm_split) + */ + MPI split( int color, int key = -1 ) const; + + + /** + * \brief Split an existing communicator by node + * \details This creates a new communicator by splitting an existing communicator + * by the node. This will result in a separate MPI_Comm for each physical node. + * Internally this will use MPI_Get_processor_name to identify the nodes. + * Note: the underlying MPI_Comm object will be free'd automatically when it is no longer + * used by any MPI objects) + * \param key Control of rank assignment (integer). 
+ * Note that, for a fixed color, the keys need not be unique. The processes will + * be sorted + * in ascending order according to this key, then all the processes in a given + * color will + * have the relative rank order as they did in their parent group. (See + * MPI_Comm_split) + */ + MPI splitByNode( int key = -1 ) const; + + + /** + * \brief Duplicate an existing communicator + * \details This creates a new communicator by duplicating an existing communicator. + * The resulting communicator will exist over the same processes, but have a different + * context. + * Note: the underlying MPI_Comm object will be free'd automatically when it is no longer + * used by any MPI objects. + */ + MPI dup() const; + + + /** + * \brief Create a communicator from the intersection of two communicators + * \details This creates a new communicator by intersecting two existing communicators. + * Any processors that do not contain the both communicators will receive a NULL communicator. + * There are 3 possible cases: + * The communicators are disjoint (a null communicator will be returned on all processors). + * One communicator is a sub communicator of another. This will require communication on + * the smaller communicator only. + * The communicators partially overlap. This will require communication on the first + * communicator. + */ + static MPI intersect( const MPI &comm1, const MPI &comm2 ); + + + /** + * Check if the current communicator is NULL + */ + bool isNull() const { return d_isNull; } + + + /** + * \brief Return the global ranks for the comm + * \details This returns a vector which contains the global ranks for each + * member of the communicator. The global ranks are defined according to WORLD comm. + */ + std::vector globalRanks() const; + + + /** + * Get the current MPI communicator. + * Note: The underlying MPI_Comm object may be free'd by the object when it is no + * longer used by any communicators. If the user has made a copy using the + * getCommunicator routine, then it may be free'd without user knowledge. The + * user is responsible for checking if the communicator is valid, or keeping a + * copy of the communicator that provided the MPI_Communicator. + */ + const MPI_Comm &getCommunicator() const { return communicator; } + + + /** + * \brief Overload operator == + * \details Overload operator comm1 == comm2. Two MPI objects are == if they share the same + * communicator. + * Note: this is a local operation. + */ + bool operator==( const MPI & ) const; + + + /** + * \brief Overload operator != + * \details Overload operator comm1 != comm2. Two MPI objects are != if they + * do not share the same communicator. + * Note: this is a local operation. + */ + bool operator!=( const MPI & ) const; + + + /** + * \brief Overload operator < + * \details Overload operator comm1 < comm2. One MPI object is < another iff all the + * processors in the first object are also in the second. Additionally, the second + * object must contain at least one processor that is not in the first object. + * This is a collective operation, based on the first communicator. + * As a result all processors on the first communicator will return the same value, + * while any processors that are not on the first communicator will return an unknown value. + * Additionally, all processors on the first object MUST call this routine and will be + * synchronized through this call (there is an internalallReduce). 
+ */ + bool operator<( const MPI & ) const; + + + /** + * \brief Overload operator <= + * \details Overload operator comm1 <= comm2. One MPI object is <= another iff all the + * processors in the first object are also in the second. This is a collective operation, + * based on the first communicator. As a result all processors on the first communicator + * will return the same value, while any processors that are not on the first communicator + * will return an unknown value. Additionally, all processors on the first object MUST + * call this routine and will be synchronized through this call (there is an internal + * allReduce). + */ + bool operator<=( const MPI & ) const; + + + /** + * \brief Overload operator > + * \details Overload operator comm1 > comm2. One MPI object is > another iff all the + * processors in the second object are also in the first. Additionally, the first object + * must contain at least one processor that is not in the second object. + * This is a collective operation, based on the first communicator. + * As a result all processors on the first communicator will return the same value, + * while any processors that are not on the first communicator will return an unknown value. + * Additionally, all processors on the first object MUST call this routine and will be + * synchronized through this call (there is an internal allReduce). + */ + bool operator>( const MPI & ) const; + + + /** + * \brief Overload operator >= + * \details Overload operator comm1 >= comm2. One MPI object is > another iff all the + * processors in the second object are also in the first. Additionally, the first object + * must contain at least one processor that is not in the second object. + * This is a collective operation, based on the first communicator. + * As a result all processors on the first communicator will return the same value, while any + * processors that are not on the first communicator will return an unknown value. + * Additionally, all processors on the first object MUST call this routine and will be + * synchronized through this call (there is an internal allReduce). + */ + bool operator>=( const MPI & ) const; + + + /** + * \brief Compare to another communicator + * \details This compares the current communicator to another communicator. + * This returns 1 if the two communicators are equal (they share the same MPI communicator), + * 2 if the contexts and groups are the same, 3 if different contexts but identical groups, + * 4 if different contexts but similar groups, and 0 otherwise. + * Note: this is a local operation. + */ + int compare( const MPI & ) const; + + + /** + * Return the processor rank (identifier) from 0 through the number of + * processors minus one. + */ + int getRank() const { return comm_rank; } + + + /** + * Return the number of processors. + */ + int getSize() const { return comm_size; } + + + /** + * Return the maximum tag + */ + int maxTag() const { return d_maxTag; } + + + /** + * \brief Return a new tag + * \details This routine will return an unused tag for communication. + * Note that this tag may match a user tag, but this function will + * not return two duplicate tags. This is a global operation. + */ + int newTag(); + + + /** + * Call MPI_Abort or exit depending on whether running with one or more + * processes and value set by function above, if called. The default is + * to call exit(-1) if running with one processor and to call MPI_Abort() + * otherwise. This function avoids having to guard abort calls in + * application code. 
+ */ + void abort() const; + + + /** + * Set boolean flag indicating whether exit or abort is called when running + * with one processor. Calling this function influences the behavior of + * calls to abort(). By default, the flag is true meaning that + * abort() will be called. Passing false means exit(-1) will be called. + */ + void setCallAbortInSerialInsteadOfExit( bool flag = true ); + + + /** + * \brief Boolean all reduce + * \details This function performs a boolean all reduce across all processors. + * It returns true iff all processor are true; + * \param value The input value for the all reduce + */ + bool allReduce( const bool value ) const; + + + /** + * \brief Boolean any reduce + * \details This function performs a boolean any reduce across all processors. + * It returns true if any processor is true; + * \param value The input value for the all reduce + */ + bool anyReduce( const bool value ) const; + + + /** + * \brief Sum Reduce + * \details This function performs a sum all reduce across all processor. + * It returns the sum across all processors; + * \param value The input value for the all reduce + */ + template + type sumReduce( const type value ) const; + + + /** + * \brief Sum Reduce + * \details Perform an array sum Reduce across all nodes. Each + * processor contributes an array of values, and the + * element-wise sum is returned in the same array. + * \param x The input/output array for the reduce + * \param n The number of values in the array (must match on all nodes) + */ + template + void sumReduce( type *x, const int n = 1 ) const; + + + /** + * \brief Sum Reduce + * \details Perform an array sum Reduce across all nodes. Each + * processor contributes an array of values, and the + * element-wise sum is returned in the same array. + * \param x The input array for the reduce + * \param y The output array for the reduce + * \param n The number of values in the array (must match on all nodes) + */ + template + void sumReduce( const type *x, type *y, const int n = 1 ) const; + + + /** + * \brief Min Reduce + * \details This function performs a min all reduce across all processor. + * It returns the minimum value across all processors; + * \param value The input value for the all reduce + */ + template + type minReduce( const type value ) const; + + + /** + * \brief Sum Reduce + * \details Perform an array min Reduce across all nodes. Each + * processor contributes an array of values, and the + * element-wise minimum is returned in the same array. + * + * If a 'rank_of_min' argument is provided, it will set the array to the + * rank of process holding the minimum value. Like the double argument, + * the size of the supplied 'rank_of_min' array should be n. + * \param x The input/output array for the reduce + * \param n The number of values in the array (must match on all nodes) + * \param rank_of_min Optional array indicating the rank of the processor containing the + * minimum value + */ + template + void minReduce( type *x, const int n = 1, int *rank_of_min = nullptr ) const; + + + /** + * \brief Sum Reduce + * \details Perform an array min Reduce across all nodes. Each + * processor contributes an array of values, and the + * element-wise minimum is returned in the same array. + * + * If a 'rank_of_min' argument is provided, it will set the array to the + * rank of process holding the minimum value. Like the double argument, + * the size of the supplied 'rank_of_min' array should be n. 
+ * \param x The input array for the reduce + * \param y The output array for the reduce + * \param n The number of values in the array (must match on all nodes) + * \param rank_of_min Optional array indicating the rank of the processor containing the + * minimum value + */ + template + void minReduce( const type *x, type *y, const int n = 1, int *rank_of_min = nullptr ) const; + + + /** + * \brief Max Reduce + * \details This function performs a max all reduce across all processor. + * It returns the maximum value across all processors; + * \param value The input value for the all reduce + */ + template + type maxReduce( const type value ) const; + + + /** + * \brief Sum Reduce + * \details Perform an array max Reduce across all nodes. Each + * processor contributes an array of values, and the + * element-wise maximum is returned in the same array. + * + * If a 'rank_of_min' argument is provided, it will set the array to the + * rank of process holding the minimum value. Like the double argument, + * the size of the supplied 'rank_of_min' array should be n. + * \param x The input/output array for the reduce + * \param n The number of values in the array (must match on all nodes) + * \param rank_of_max Optional array indicating the rank of the processor containing the + * minimum value + */ + template + void maxReduce( type *x, const int n = 1, int *rank_of_max = nullptr ) const; + + + /** + * \brief Sum Reduce + * \details Perform an array max Reduce across all nodes. Each + * processor contributes an array of values, and the + * element-wise maximum is returned in the same array. + * + * If a 'rank_of_min' argument is provided, it will set the array to the + * rank of process holding the minimum value. Like the double argument, + * the size of the supplied 'rank_of_min' array should be n. + * \param x The input array for the reduce + * \param y The output array for the reduce + * \param n The number of values in the array (must match on all nodes) + * \param rank_of_max Optional array indicating the rank of the processor containing the + * minimum value + */ + template + void maxReduce( const type *x, type *y, const int n = 1, int *rank_of_max = nullptr ) const; + + + /** + * \brief Scan Sum Reduce + * \details Computes the sum scan (partial reductions) of data on a collection of processes. + * See MPI_Scan for more information. + * \param x The input array for the scan + * \param y The output array for the scan + * \param n The number of values in the array (must match on all nodes) + */ + template + void sumScan( const type *x, type *y, const int n = 1 ) const; + + + /** + * \brief Scan Min Reduce + * \details Computes the min scan (partial reductions) of data on a collection of processes. + * See MPI_Scan for more information. + * \param x The input array for the scan + * \param y The output array for the scan + * \param n The number of values in the array (must match on all nodes) + */ + template + void minScan( const type *x, type *y, const int n = 1 ) const; + + + /** + * \brief Scan Max Reduce + * \details Computes the max scan (partial reductions) of data on a collection of processes. + * See MPI_Scan for more information. 
+ * \param x The input array for the scan + * \param y The output array for the scan + * \param n The number of values in the array (must match on all nodes) + */ + template + void maxScan( const type *x, type *y, const int n = 1 ) const; + + + /** + * \brief Broadcast + * \details This function broadcasts a value from root to all processors + * \param value The input value for the broadcast. + * \param root The processor performing the broadcast + */ + template + type bcast( const type &value, const int root ) const; + + + /** + * \brief Broadcast + * \details This function broadcasts an array from root to all processors + * \param value The input/output array for the broadcast + * \param n The number of values in the array (must match on all nodes) + * \param root The processor performing the broadcast + */ + template + void bcast( type *value, const int n, const int root ) const; + + + /** + * Perform a global barrier across all processors. + */ + void barrier() const; + + + /*! + * @brief This function sends an MPI message with an array to another processor. + * + * If the receiving processor knows in advance the length + * of the array, use "send_length = false;" otherwise, + * this processor will first send the length of the array, + * then send the data. This call must be paired with a + * matching call to recv. + * + * @param buf Pointer to array buffer with length integers. + * @param length Number of integers in buf that we want to send. + * @param recv Receiving processor number. + * @param tag Optional integer argument specifying an integer tag + * to be sent with this message. Default tag is 0. + * The matching recv must share this tag. + */ + template + void send( const type *buf, const int length, const int recv, int tag = 0 ) const; + + + /*! + * @brief This function sends an MPI message with an array of bytes + * (MPI_BYTES) to receiving_proc_number. + * + * This call must be paired with a matching call to recvBytes. + * + * @param buf Void pointer to an array of number_bytes bytes to send. + * @param N_bytes Integer number of bytes to send. + * @param recv Receiving processor number. + * @param tag Optional integer argument specifying an integer tag + * to be sent with this message. Default tag is 0. + * The matching recv must share this tag. + */ + void sendBytes( const void *buf, const int N_bytes, const int recv, int tag = 0 ) const; + + + /*! + * @brief This function sends an MPI message with an array + * to another processor using a non-blocking call. + * The receiving processor must know the length of the array. + * This call must be paired with a matching call to Irecv. + * + * @param buf Pointer to array buffer with length integers. + * @param length Number of integers in buf that we want to send. + * @param recv_proc Receiving processor number. + * @param tag Integer argument specifying an integer tag + * to be sent with this message. + */ + template + MPI_Request Isend( + const type *buf, const int length, const int recv_proc, const int tag ) const; + + + /*! + * @brief This function sends an MPI message with an array of bytes + * (MPI_BYTES) to receiving_proc_number using a non-blocking call. + * The receiving processor must know the number of bytes to receive. + * This call must be paired with a matching call to IrecvBytes. + * + * @param buf Void pointer to an array of number_bytes bytes to send. + * @param N_bytes Integer number of bytes to send. + * @param recv_proc Receiving processor number. 
+ * @param tag Integer argument specifying an integer tag + * to be sent with this message. + */ + MPI_Request IsendBytes( + const void *buf, const int N_bytes, const int recv_proc, const int tag ) const; + + + /*! + * @brief This function receives an MPI message with a data + * array from another processor. + * + * If this processor knows in advance the length of the array, + * use "get_length = false;" otherwise we will get the return size. + * This call must be paired with a matching call to send. + * + * @param buf Pointer to integer array buffer with capacity of length integers. + * @param length If get_length==true: The number of elements to be received, otherwise + * the maximum number of values that can be stored in buf. + * On output the number of received elements. + * @param send Processor number of sender. + * @param tag Optional integer argument specifying a tag which must be matched + * by the tag of the incoming message. Default tag is 0. + */ + template + inline void recv( type *buf, int length, const int send, int tag ) const + { + int length2 = length; + recv( buf, length2, send, false, tag ); + } + + + /*! + * @brief This function receives an MPI message with a data + * array from another processor. + * + * If this processor knows in advance the length of the array, + * use "get_length = false;" otherwise we will get the return size. + * This call must be paired with a matching call to send. + * + * @param buf Pointer to integer array buffer with capacity of length integers. + * @param length If get_length==true: The number of elements to be received, otherwise + * the maximum number of values that can be stored in buf. + * On output the number of received elements. + * @param send Processor number of sender. + * @param get_length Optional boolean argument specifying if we first + * need to check the message size to get the size of the array. + * Default value is true. + * @param tag Optional integer argument specifying a tag which must be matched + * by the tag of the incoming message. Default tag is 0. + */ + template + void recv( type *buf, int &length, const int send, const bool get_length, int tag ) const; + + + /*! + * @brief This function receives an MPI message with an array of + * max size number_bytes (MPI_BYTES) from any processor. + * + * This call must be paired with a matching call to sendBytes. + * + * @param buf Void pointer to a buffer of size number_bytes bytes. + * @param N_bytes Integer number specifying size of buf in bytes. + * @param send Integer number specifying size of buf in bytes. + * @param tag Optional integer argument specifying a tag which + * must be matched by the tag of the incoming message. Default + * tag is 0. + */ + void recvBytes( void *buf, int &N_bytes, const int send, int tag = 0 ) const; + + + /*! + * @brief This function receives an MPI message with a data + * array from another processor using a non-blocking call. + * + * @param buf Pointer to integer array buffer with capacity of length integers. + * @param length Maximum number of values that can be stored in buf. + * @param send_proc Processor number of sender. + * @param tag Optional integer argument specifying a tag which must + * be matched by the tag of the incoming message. + */ + template + MPI_Request Irecv( type *buf, const int length, const int send_proc, const int tag ) const; + + + /*! + * @brief This function receives an MPI message with an array of + * max size number_bytes (MPI_BYTES) from any processor. 
+ * + * This call must be paired with a matching call to sendBytes. + * + * @param buf Void pointer to a buffer of size number_bytes bytes. + * @param N_bytes Integer number specifying size of buf in bytes. + * @param send_proc Processor number of sender. + * @param tag Integer argument specifying a tag which must + * be matched by the tag of the incoming message. + */ + MPI_Request IrecvBytes( + void *buf, const int N_bytes, const int send_proc, const int tag ) const; + + + /*! + * Each processor sends every other processor a single value. + * @param[in] x Input value for allGather + * @return Output array for allGather + */ + template + std::vector allGather( const type &x ) const; + + + /*! + * Each processor sends every other processor an array + * @param[in] x Input array for allGather + * @return Output array for allGather + */ + template + std::vector allGather( const std::vector &x_in ) const; + + + /*! + * Each processor sends every other processor a single value. + * The x_out array should be preallocated to a length equal + * to the number of processors. + * @param x_in Input value for allGather + * @param x_out Output array for allGather (must be preallocated to the size of the + * communicator) + */ + template + void allGather( const type &x_in, type *x_out ) const; + + + /*! + * Each processor sends an array of data to all other processors. + * Each processor receives the values from all processors and gathers them + * to a single array. If successful, the total number of received + * elements will be returned. + * @param send_data Input array + * @param send_cnt The number of values to send + * @param recv_data Output array of received values + * @param recv_cnt The number of values to receive from each processor (N). + * If known, this should be provided as an input. Otherwise + * it is an optional output that will return the number of + * received values from each processor. + * @param recv_disp The displacement (relative to the start of the array) + * from which to store the data received from processor i. + * If known, this should be provided as an input. Otherwise + * it is an optional output that will return the starting location + * (relative to the start of the array) for the received data from + * processor i. + * @param known_recv Are the received counts and displacements known. + * If the received sizes are known, then they must be provided, + * and an extra communication step is not necessary. If the received + * sizes are not known, then an extra communication step will occur + * internally + * and the sizes and displacements will be returned (if desired). + */ + template + int allGather( const type *send_data, const int send_cnt, type *recv_data, + int *recv_cnt = nullptr, int *recv_disp = nullptr, bool known_recv = false ) const; + + + /*! + * This function combines sets from different processors to create a single master set + * @param set Input/Output std::set for the gather. + */ + template + void setGather( std::set &set ) const; + + + /*! + * This function combines std::maps from different processors to create a single master std::map + * If two or more ranks share the same key, the lowest rank will be used + * @param map Input/Output std::map for the gather. + */ + template + void mapGather( std::map &map ) const; + + + /*! + * Each processor sends an array of n values to each processor. + * Each processor sends an array of n values to each processor. 
+ * The jth block of data is sent from processor i to processor j and placed + * in the ith block on the receiving processor. In the variable + * description, N is the size of the communicator. Note that this is a + * blocking global communication. + * @param n The number of elements in each data block to send. + * @param send_data Input array (nxN) + * @param recv_data Output array of received values (nxN) + */ + template + void allToAll( const int n, const type *send_data, type *recv_data ) const; + + + /*! + * Each processor sends an array of data to the different processors. + * Each processor may send any size array to any processor. In the variable + * description, N is the size of the communicator. Note that this is a + * blocking global communication. If successful, the total number of received + * elements will be returned. + * @param send_data Input array + * @param send_cnt The number of values to send to each processor (N) + * @param send_disp The displacement (relative to the start of the array) + * from which to send to processor i + * @param recv_data Output array of received values + * @param recv_cnt The number of values to receive from each processor (N). + * If known, this should be provided as an input. Otherwise + * it is an optional output that will return the number of + * received values from each processor. + * @param recv_disp The displacement (relative to the start of the array) + * from which to send to processor i. + * If known, this should be provided as an input. Otherwise + * it is an optional output that will return the starting location + * (relative to the start of the array) for the received data from + * processor i. + * @param known_recv Are the received counts and displacements known. + * If the received sizes are known, then they must be provided, + * and an extra communication step is not necessary. If the received + * sizes are not know, then an extra communication step will occur + * internally + * and the sizes and displacements will be returned (if desired). + */ + template + int allToAll( const type *send_data, const int send_cnt[], const int send_disp[], + type *recv_data, int *recv_cnt = nullptr, int *recv_disp = nullptr, + bool known_recv = false ) const; + + + /*! + * \brief Send a list of proccesor ids to communicate + * \details This function communicates a list of proccesors to communicate. + * Given a list of ranks that we want to send/receieve data to/from, this routine + * will communicate that set to the other ranks returning the list of processors + * that want to communication with the current rank. + * Note: this routine will involved global communication + * \param ranks List of ranks that the current rank wants to communicate with + * \return List of ranks that want to communicate with the current processor + */ + std::vector commRanks( const std::vector &ranks ) const; + + + /*! + * \brief Wait for a communication to finish + * \details Wait for a communication to finish. + * Note: this does not require a communicator. + * \param request Communication request to wait for (returned for Isend or Irecv) + */ + static void wait( MPI_Request request ); + + + /*! + * \brief Wait for any communication to finish. + * \details This function waits for any of the given communication requests to finish. + * It returns the index of the communication request that finished. + * Note: this does not require a communicator. 
+ * \param count Number of communications to check + * \param request Array of communication requests to wait for (returned for Isend or Irecv) + */ + static int waitAny( int count, MPI_Request *request ); + + + /*! + * \brief Wait for all communications to finish. + * \details This function waits for all of the given communication requests to finish. + * Note: this does not require a communicator. + * \param count Number of communications to check + * \param request Array of communication requests to wait for (returned for Isend or Irecv) + */ + static void waitAll( int count, MPI_Request *request ); + + + /*! + * \brief Wait for some communications to finish. + * \details This function waits for one (or more) communications to finish. + * It returns an array of the indicies that have finished. + * Note: this does not require a communicator. + * \param count Number of communications to check + * \param request Array of communication requests to wait for (returned for Isend or Irecv) + */ + static std::vector waitSome( int count, MPI_Request *request ); + + + /*! + * \brief Nonblocking test for a message + * \details This function performs a non-blocking test for a message. + * It will return the number of bytes in the message if a message with + * the specified source and tag (on the current communicator) is available. + * Otherwise it will return -1. + * \param source source rank (-1: any source) + * \param tag tag (-1: any tag) + */ + int Iprobe( int source = -1, int tag = -1 ) const; + + + /*! + * \brief Blocking test for a message + * \details This function performs a blocking test for a message. + * It will return the number of bytes in the message when a message with + * the specified source and tag (on the current communicator) is available + * \param source source rank (-1: any source) + * \param tag tag (-1: any tag) + */ + int probe( int source = -1, int tag = -1 ) const; + + + /*! + * \brief Start a serial region + * \details This function will serialize MPI processes so that they run + * one at a time. A call to serializeStart must be followed by a call + * to serializeStop after the commands to be executed. + * Note: the ranks will be run in order. + */ + void serializeStart(); + + + /*! + * \brief Stop a serial region + * \details Stop a serial region. See serializeStart for more information. + */ + void serializeStop(); + + + /*! + * \brief Elapsed time + * \details This function returns the elapsed time on the calling processor + * since an arbitrary point in the past (seconds). It is a wrapper to MPI_Wtime. + * See "tick" for the timer resolution in seconds. + * The time may or may not be synchronized across processors depending on the MPI + * implementation. Refer to MPI documentation for the desired platform for more information. + */ + static double time(); + + + /*! + * \brief Timer resolution + * \details This function returns the timer resolution used by "time" + */ + static double tick(); + + + /*! + * \brief Change the level of the internal timers + * \details This function changes the level of the timers used to profile MPI + * \param level New level of the timers + */ + static void changeProfileLevel( int level ) { profile_level = level; } + + + //! Return the total number of MPI_Comm objects that have been created + static size_t MPI_Comm_created() { return N_MPI_Comm_created; } + + //! Return the total number of MPI_Comm objects that have been destroyed + static size_t MPI_Comm_destroyed() { return N_MPI_Comm_destroyed; } + + //! 
Return details about MPI + static std::string info(); + + //! Return the MPI version number { major, minor } + static std::array version(); + + //! Check if MPI is active + static bool MPI_Active(); + + //! Start MPI + static void start_MPI( int argc_in, char *argv_in[], int profile_level = 0 ); + + //! Stop MPI + static void stop_MPI(); + + +private: // Private helper functions for templated MPI operations; + template + void call_sumReduce( type *x, const int n = 1 ) const; + template + void call_sumReduce( const type *x, type *y, const int n = 1 ) const; + template + void call_minReduce( type *x, const int n = 1, int *rank_of_min = nullptr ) const; + template + void call_minReduce( + const type *x, type *y, const int n = 1, int *rank_of_min = nullptr ) const; + template + void call_maxReduce( type *x, const int n = 1, int *rank_of_max = nullptr ) const; + template + void call_maxReduce( + const type *x, type *y, const int n = 1, int *rank_of_max = nullptr ) const; + template + void call_bcast( type *x, const int n, const int root ) const; + template + void call_allGather( const type &x_in, type *x_out ) const; + template + void call_allGather( + const type *x_in, int size_in, type *x_out, int *size_out, int *disp_out ) const; + template + void call_sumScan( const type *x, type *y, int n = 1 ) const; + template + void call_minScan( const type *x, type *y, int n = 1 ) const; + template + void call_maxScan( const type *x, type *y, int n = 1 ) const; + template + void call_allToAll( const type *send_data, const int send_cnt[], const int send_disp[], + type *recv_data, const int *recv_cnt, const int *recv_disp ) const; + + +private: // data members + // The internal MPI communicator + MPI_Comm communicator; + + // Is the communicator NULL + bool d_isNull; + + // Do we want to manage this communicator + bool d_manage; + + // Do we want to call MPI_abort instead of exit + bool d_call_abort; + + // The level for the profiles of MPI + static short profile_level; + + // The rank and size of the communicator + int comm_rank, comm_size; + + // The ranks of the comm in the global comm + mutable int *volatile d_ranks; + + // Some attributes + int d_maxTag; + int *volatile d_currentTag; + + /* How many objects share the same underlying MPI communicator. + * When the count goes to 0, the MPI comm will be free'd (assuming it was created + * by an communicator). This may not be perfect, but is likely to be good enough. + * Note that for thread safety, any access to this variable should be blocked for thread safety. + * The value of count MUST be volatile to ensure the correct value is always used. + */ + std::atomic_int *volatile d_count; + + // Add a variable for data alignment (necessary for some Intel builds) + double tmp_alignment; + + /* We want to keep track of how many MPI_Comm objects we have created over time. + * Like the count, for thread safety this should be blocked, however the most likely error + * caused by not blocking is a slight error in the MPI count. Since this is just for reference + * we do not need to block (recognizing that the value may not be 100% accurate). 
+ */ + static volatile unsigned int N_MPI_Comm_created; + static volatile unsigned int N_MPI_Comm_destroyed; +}; + + +} // namespace Utilities + + +// Include the default instantiations +// \cond HIDDEN_SYMBOLS +#include "common/MPI.I" +// \endcond + + +#endif diff --git a/common/MPI_Helpers.cpp b/common/MPI_Helpers.cpp deleted file mode 100644 index 736a2f02..00000000 --- a/common/MPI_Helpers.cpp +++ /dev/null @@ -1,266 +0,0 @@ -#include "common/MPI_Helpers.h" -#include "common/Utilities.h" - - -/******************************************************** -* Return the MPI data type * -********************************************************/ -template<> MPI_Datatype getMPItype() { - return MPI_CHAR; -} -template<> MPI_Datatype getMPItype() { - return MPI_UNSIGNED_CHAR; -} -template<> MPI_Datatype getMPItype() { - return MPI_INT; -} -template<> MPI_Datatype getMPItype() { - return MPI_LONG; -} -template<> MPI_Datatype getMPItype() { - return MPI_UNSIGNED_LONG; -} -template<> MPI_Datatype getMPItype() { - return MPI_LONG_LONG; -} -template<> MPI_Datatype getMPItype() { - return MPI_FLOAT; -} -template<> MPI_Datatype getMPItype() { - return MPI_DOUBLE; -} - - -/******************************************************** -* Concrete implimentations for packing/unpacking * -********************************************************/ -// unsigned char -template<> -size_t packsize( const unsigned char& ) -{ - return sizeof(unsigned char); -} -template<> -void pack( const unsigned char& rhs, char *buffer ) -{ - memcpy(buffer,&rhs,sizeof(unsigned char)); -} -template<> -void unpack( unsigned char& data, const char *buffer ) -{ - memcpy(&data,buffer,sizeof(unsigned char)); -} -// char -template<> -size_t packsize( const char& ) -{ - return sizeof(char); -} -template<> -void pack( const char& rhs, char *buffer ) -{ - memcpy(buffer,&rhs,sizeof(char)); -} -template<> -void unpack( char& data, const char *buffer ) -{ - memcpy(&data,buffer,sizeof(char)); -} -// int -template<> -size_t packsize( const int& ) -{ - return sizeof(int); -} -template<> -void pack( const int& rhs, char *buffer ) -{ - memcpy(buffer,&rhs,sizeof(int)); -} -template<> -void unpack( int& data, const char *buffer ) -{ - memcpy(&data,buffer,sizeof(int)); -} -// unsigned int -template<> -size_t packsize( const unsigned int& ) -{ - return sizeof(unsigned int); -} -template<> -void pack( const unsigned int& rhs, char *buffer ) -{ - memcpy(buffer,&rhs,sizeof(int)); -} -template<> -void unpack( unsigned int& data, const char *buffer ) -{ - memcpy(&data,buffer,sizeof(int)); -} -// size_t -template<> -size_t packsize( const size_t& ) -{ - return sizeof(size_t); -} -template<> -void pack( const size_t& rhs, char *buffer ) -{ - memcpy(buffer,&rhs,sizeof(size_t)); -} -template<> -void unpack( size_t& data, const char *buffer ) -{ - memcpy(&data,buffer,sizeof(size_t)); -} -// std::string -template<> -size_t packsize( const std::string& rhs ) -{ - return rhs.size()+1; -} -template<> -void pack( const std::string& rhs, char *buffer ) -{ - memcpy(buffer,rhs.c_str(),rhs.size()+1); -} -template<> -void unpack( std::string& data, const char *buffer ) -{ - data = std::string(buffer); -} - - -/******************************************************** -* Fake MPI routines * -********************************************************/ -#ifndef USE_MPI -int MPI_Init(int*,char***) -{ - return 0; -} -int MPI_Init_thread(int*,char***, int required, int *provided ) -{ - *provided = required; - return 0; -} -int MPI_Finalize() -{ - return 0; -} -int MPI_Comm_size( 
MPI_Comm, int *size ) -{ - *size = 1; - return 0; -} -int MPI_Comm_rank( MPI_Comm, int *rank ) -{ - *rank = 0; - return 0; -} -int MPI_Barrier( MPI_Comm ) -{ - return 0; -} -int MPI_Waitall( int, MPI_Request[], MPI_Status[] ) -{ - return 0; -} -int MPI_Wait( MPI_Request*, MPI_Status* ) -{ - return 0; -} -int MPI_Bcast( void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm ) -{ - return 0; -} -int MPI_Send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, - MPI_Comm comm) -{ - ERROR("Not implimented yet"); - return 0; -} -int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, - MPI_Comm comm, MPI_Status *status) -{ - ERROR("Not implimented yet"); - return 0; -} -int MPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, - MPI_Comm comm, MPI_Request *request) -{ - ERROR("Not implimented yet"); - return 0; -} -int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source, - int tag, MPI_Comm comm, MPI_Request *request) -{ - ERROR("Not implimented yet"); - return 0; -} -int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, - MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) -{ - ERROR("Not implimented yet"); - return 0; -} -int MPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, - void *recvbuf, int recvcount, MPI_Datatype recvtype, - MPI_Comm comm) -{ - ERROR("Not implimented yet"); - return 0; -} -int MPI_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, - void *recvbuf, const int *recvcounts, const int *displs, - MPI_Datatype recvtype, MPI_Comm comm) -{ - ERROR("Not implimented yet"); - return 0; -} -int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, - int dest, int sendtag, - void *recvbuf, int recvcount, MPI_Datatype recvtype, - int source, int recvtag, - MPI_Comm comm, MPI_Status *status) -{ - ERROR("Not implimented yet"); - return 0; -} -int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, - MPI_Op op, int root, MPI_Comm comm) -{ - ERROR("Not implimented yet"); - return 0; -} -int MPI_Comm_group(MPI_Comm comm, MPI_Group *group) -{ - ERROR("Not implimented yet"); - return 0; -} -int MPI_Comm_create(MPI_Comm comm, MPI_Group group, MPI_Comm *newcomm) -{ - ERROR("Not implimented yet"); - return 0; -} -int MPI_Comm_dup(MPI_Comm comm, MPI_Comm *newcomm) -{ - *newcomm = comm; - return 0; -} -double MPI_Wtime( void ) -{ - return 0.0; -} -int MPI_Comm_free(MPI_Comm *group) -{ - return 0; -} -int MPI_Group_free(MPI_Group *group) -{ - return 0; -} -#endif - - diff --git a/common/MPI_Helpers.h b/common/MPI_Helpers.h deleted file mode 100644 index 1d20318e..00000000 --- a/common/MPI_Helpers.h +++ /dev/null @@ -1,239 +0,0 @@ -// This file contains wrappers for MPI routines and functions to pack/unpack data structures -#ifndef MPI_WRAPPERS_INC -#define MPI_WRAPPERS_INC - -#include -#include -#include -#include - -#ifdef USE_MPI - // Inlcude MPI - #include "mpi.h" -#else - // Create fake MPI types - typedef int MPI_Comm; - typedef int MPI_Request; - typedef int MPI_Status; - #define MPI_COMM_WORLD 0 - #define MPI_COMM_SELF 0 - #define MPI_COMM_NULL -1 - #define MPI_GROUP_NULL -2 - #define MPI_STATUS_IGNORE NULL - enum MPI_Datatype { MPI_LOGICAL, MPI_CHAR, MPI_UNSIGNED_CHAR, MPI_INT, - MPI_UNSIGNED, MPI_LONG, MPI_UNSIGNED_LONG, MPI_LONG_LONG, MPI_FLOAT, MPI_DOUBLE }; - enum MPI_Op { MPI_MIN, MPI_MAX, MPI_SUM }; - typedef int MPI_Group; - #define MPI_THREAD_SINGLE 0 - #define MPI_THREAD_FUNNELED 1 - 
#define MPI_THREAD_SERIALIZED 2 - #define MPI_THREAD_MULTIPLE 3 - // Fake MPI functions - int MPI_Init(int*,char***); - int MPI_Init_thread( int *argc, char ***argv, int required, int *provided ); - int MPI_Finalize(); - int MPI_Comm_size( MPI_Comm, int *size ); - int MPI_Comm_rank( MPI_Comm, int *rank ); - int MPI_Barrier(MPI_Comm); - int MPI_Wait(MPI_Request*,MPI_Status*); - int MPI_Waitall(int,MPI_Request[],MPI_Status[]); - int MPI_Bcast(void*,int,MPI_Datatype,int,MPI_Comm); - int MPI_Send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, - MPI_Comm comm); - int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, - MPI_Comm comm, MPI_Status *status); - int MPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, - MPI_Comm comm, MPI_Request *request); - int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source, - int tag, MPI_Comm comm, MPI_Request *request); - int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, - MPI_Datatype datatype, MPI_Op op, MPI_Comm comm); - int MPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, - void *recvbuf, int recvcount, MPI_Datatype recvtype, - MPI_Comm comm); - int MPI_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, - void *recvbuf, const int *recvcounts, const int *displs, - MPI_Datatype recvtype, MPI_Comm comm); - int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, - int dest, int sendtag, - void *recvbuf, int recvcount, MPI_Datatype recvtype, - int source, int recvtag, - MPI_Comm comm, MPI_Status *status); - int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, - MPI_Op op, int root, MPI_Comm comm); - double MPI_Wtime( void ); - int MPI_Comm_group(MPI_Comm comm, MPI_Group *group); - int MPI_Comm_create(MPI_Comm comm, MPI_Group group, MPI_Comm *newcomm); - int MPI_Comm_free(MPI_Comm *group); - int MPI_Group_free(MPI_Group *group); - int MPI_Comm_dup(MPI_Comm comm, MPI_Comm *newcomm); -#endif - - -//! Get the size of the MPI_Comm -// Note: this is a thread and interrupt safe function -inline int comm_size( MPI_Comm comm ) { - int size = 1; - MPI_Comm_size( comm, &size ); - return size; -} - - -//! Get the rank of the MPI_Comm -// Note: this is a thread and interrupt safe function -inline int comm_rank( MPI_Comm comm ) { - int rank = 1; - MPI_Comm_rank( comm, &rank ); - return rank; -} - - -//! Get the size of MPI_COMM_WORLD -inline int MPI_WORLD_SIZE( ) { - return comm_size( MPI_COMM_WORLD ); -} - -//! Get the size of MPI_COMM_WORLD -inline int MPI_WORLD_RANK( ) { - return comm_rank( MPI_COMM_WORLD ); -} - -//! Return the appropriate MPI datatype for a class -template -MPI_Datatype getMPItype(); - - -//! Template function to return the buffer size required to pack a class -template -size_t packsize( const TYPE& rhs ); - -//! Template function to pack a class to a buffer -template -void pack( const TYPE& rhs, char *buffer ); - -//! Template function to unpack a class from a buffer -template -void unpack( TYPE& data, const char *buffer ); - - -//! Template function to return the buffer size required to pack a std::vector -template -size_t packsize( const std::vector& rhs ); - -//! Template function to pack a class to a buffer -template -void pack( const std::vector& rhs, char *buffer ); - -//! Template function to pack a class to a buffer -template -void unpack( std::vector& data, const char *buffer ); - - -//! 
Template function to return the buffer size required to pack a std::pair -template -size_t packsize( const std::pair& rhs ); - -//! Template function to pack a class to a buffer -template -void pack( const std::pair& rhs, char *buffer ); - -//! Template function to pack a class to a buffer -template -void unpack( std::pair& data, const char *buffer ); - - -//! Template function to return the buffer size required to pack a std::map -template -size_t packsize( const std::map& rhs ); - -//! Template function to pack a class to a buffer -template -void pack( const std::map& rhs, char *buffer ); - -//! Template function to pack a class to a buffer -template -void unpack( std::map& data, const char *buffer ); - - -//! Template function to return the buffer size required to pack a std::set -template -size_t packsize( const std::set& rhs ); - -//! Template function to pack a class to a buffer -template -void pack( const std::set& rhs, char *buffer ); - -//! Template function to pack a class to a buffer -template -void unpack( std::set& data, const char *buffer ); - - - -// Helper functions -inline double sumReduce( MPI_Comm comm, double x ) -{ - double y = 0; - MPI_Allreduce(&x,&y,1,MPI_DOUBLE,MPI_SUM,comm); - return y; -} -inline float sumReduce( MPI_Comm comm, float x ) -{ - float y = 0; - MPI_Allreduce(&x,&y,1,MPI_FLOAT,MPI_SUM,comm); - return y; -} -inline int sumReduce( MPI_Comm comm, int x ) -{ - int y = 0; - MPI_Allreduce(&x,&y,1,MPI_INT,MPI_SUM,comm); - return y; -} -inline long long sumReduce( MPI_Comm comm, long long x ) -{ - long long y = 0; - MPI_Allreduce(&x,&y,1,MPI_LONG_LONG,MPI_SUM,comm); - return y; -} -inline bool sumReduce( MPI_Comm comm, bool x ) -{ - int y = sumReduce( comm, x?1:0 ); - return y>0; -} -inline std::vector sumReduce( MPI_Comm comm, const std::vector& x ) -{ - auto y = x; - MPI_Allreduce(x.data(),y.data(),x.size(),MPI_FLOAT,MPI_SUM,comm); - return y; -} -inline std::vector sumReduce( MPI_Comm comm, const std::vector& x ) -{ - auto y = x; - MPI_Allreduce(x.data(),y.data(),x.size(),MPI_INT,MPI_SUM,comm); - return y; -} -inline double maxReduce( MPI_Comm comm, double x ) -{ - double y = 0; - MPI_Allreduce(&x,&y,1,MPI_DOUBLE,MPI_MAX,comm); - return y; -} -inline float maxReduce( MPI_Comm comm, float x ) -{ - float y = 0; - MPI_Allreduce(&x,&y,1,MPI_FLOAT,MPI_MAX,comm); - return y; -} -inline int maxReduce( MPI_Comm comm, int x ) -{ - int y = 0; - MPI_Allreduce(&x,&y,1,MPI_INT,MPI_MAX,comm); - return y; -} - - -#endif - - -#include "common/MPI_Helpers.hpp" - - diff --git a/common/ReadMicroCT.cpp b/common/ReadMicroCT.cpp index 79ef241e..2209e712 100644 --- a/common/ReadMicroCT.cpp +++ b/common/ReadMicroCT.cpp @@ -64,11 +64,11 @@ Array readMicroCT( const std::string& filename ) // Read the compressed micro CT data and distribute -Array readMicroCT( const Database& domain, MPI_Comm comm ) +Array readMicroCT( const Database& domain, const Utilities::MPI& comm ) { // Get the local problem info auto n = domain.getVector( "n" ); - int rank = comm_rank(MPI_COMM_WORLD); + int rank = comm.getRank(); auto nproc = domain.getVector( "nproc" ); RankInfoStruct rankInfo( rank, nproc[0], nproc[1], nproc[2] ); diff --git a/common/ReadMicroCT.h b/common/ReadMicroCT.h index f232740e..c8acc379 100644 --- a/common/ReadMicroCT.h +++ b/common/ReadMicroCT.h @@ -5,11 +5,12 @@ #include "common/Array.h" #include "common/Communication.h" #include "common/Database.h" +#include "common/MPI.h" Array readMicroCT( const std::string& filename ); -Array readMicroCT( const Database& domain, MPI_Comm comm 
); +Array readMicroCT( const Database& domain, const Utilities::MPI& comm ); #endif diff --git a/common/ScaLBL.cpp b/common/ScaLBL.cpp index e8a75994..6f2966e7 100644 --- a/common/ScaLBL.cpp +++ b/common/ScaLBL.cpp @@ -5,9 +5,7 @@ ScaLBL_Communicator::ScaLBL_Communicator(std::shared_ptr Dm){ Lock=false; // unlock the communicator //...................................................................................... // Create a separate copy of the communicator for the device - //MPI_Comm_group(Dm->Comm,&Group); - //MPI_Comm_create(Dm->Comm,Group,&MPI_COMM_SCALBL); - MPI_Comm_dup(Dm->Comm,&MPI_COMM_SCALBL); + MPI_COMM_SCALBL = Dm->Comm.dup(); //...................................................................................... // Copy the domain size and communication information directly from Dm Nx = Dm->Nx; @@ -215,7 +213,7 @@ ScaLBL_Communicator::ScaLBL_Communicator(std::shared_ptr Dm){ ScaLBL_CopyToZeroCopy(dvcRecvList_Yz,Dm->recvList_Yz,recvCount_Yz*sizeof(int)); //...................................................................................... - MPI_Barrier(MPI_COMM_SCALBL); + MPI_COMM_SCALBL.barrier(); //................................................................................... // Set up the recieve distribution lists @@ -288,7 +286,7 @@ ScaLBL_Communicator::ScaLBL_Communicator(std::shared_ptr Dm){ //................................................................................... //...................................................................................... - MPI_Barrier(MPI_COMM_SCALBL); + MPI_COMM_SCALBL.barrier(); ScaLBL_DeviceBarrier(); //...................................................................................... SendCount = sendCount_x+sendCount_X+sendCount_y+sendCount_Y+sendCount_z+sendCount_Z+ @@ -869,8 +867,8 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(12,dvcSendList_x,3*sendCount_x,sendCount_x,sendbuf_x,dist,N); ScaLBL_D3Q19_Pack(14,dvcSendList_x,4*sendCount_x,sendCount_x,sendbuf_x,dist,N); - MPI_Isend(sendbuf_x, 5*sendCount_x,MPI_DOUBLE,rank_x,sendtag,MPI_COMM_SCALBL,&req1[0]); - MPI_Irecv(recvbuf_X, 5*recvCount_X,MPI_DOUBLE,rank_X,recvtag,MPI_COMM_SCALBL,&req2[0]); + req1[0] = MPI_COMM_SCALBL.Isend(sendbuf_x, 5*sendCount_x,rank_x,sendtag); + req2[0] = MPI_COMM_SCALBL.Irecv(recvbuf_X, 5*recvCount_X,rank_X,recvtag); //...Packing for X face(1,7,9,11,13)................................ ScaLBL_D3Q19_Pack(1,dvcSendList_X,0,sendCount_X,sendbuf_X,dist,N); ScaLBL_D3Q19_Pack(7,dvcSendList_X,sendCount_X,sendCount_X,sendbuf_X,dist,N); @@ -878,8 +876,8 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(11,dvcSendList_X,3*sendCount_X,sendCount_X,sendbuf_X,dist,N); ScaLBL_D3Q19_Pack(13,dvcSendList_X,4*sendCount_X,sendCount_X,sendbuf_X,dist,N); - MPI_Isend(sendbuf_X, 5*sendCount_X,MPI_DOUBLE,rank_X,sendtag,MPI_COMM_SCALBL,&req1[1]); - MPI_Irecv(recvbuf_x, 5*recvCount_x,MPI_DOUBLE,rank_x,recvtag,MPI_COMM_SCALBL,&req2[1]); + req1[1] = MPI_COMM_SCALBL.Isend(sendbuf_X, 5*sendCount_X,rank_X,sendtag); + req2[1] = MPI_COMM_SCALBL.Irecv(recvbuf_x, 5*recvCount_x,rank_x,recvtag); //...Packing for y face(4,8,9,16,18)................................. 
ScaLBL_D3Q19_Pack(4,dvcSendList_y,0,sendCount_y,sendbuf_y,dist,N); ScaLBL_D3Q19_Pack(8,dvcSendList_y,sendCount_y,sendCount_y,sendbuf_y,dist,N); @@ -887,8 +885,8 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(16,dvcSendList_y,3*sendCount_y,sendCount_y,sendbuf_y,dist,N); ScaLBL_D3Q19_Pack(18,dvcSendList_y,4*sendCount_y,sendCount_y,sendbuf_y,dist,N); - MPI_Isend(sendbuf_y, 5*sendCount_y,MPI_DOUBLE,rank_y,sendtag,MPI_COMM_SCALBL,&req1[2]); - MPI_Irecv(recvbuf_Y, 5*recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,MPI_COMM_SCALBL,&req2[2]); + req1[2] = MPI_COMM_SCALBL.Isend(sendbuf_y, 5*sendCount_y,rank_y,sendtag); + req2[2] = MPI_COMM_SCALBL.Irecv(recvbuf_Y, 5*recvCount_Y,rank_Y,recvtag); //...Packing for Y face(3,7,10,15,17)................................. ScaLBL_D3Q19_Pack(3,dvcSendList_Y,0,sendCount_Y,sendbuf_Y,dist,N); ScaLBL_D3Q19_Pack(7,dvcSendList_Y,sendCount_Y,sendCount_Y,sendbuf_Y,dist,N); @@ -896,8 +894,8 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(15,dvcSendList_Y,3*sendCount_Y,sendCount_Y,sendbuf_Y,dist,N); ScaLBL_D3Q19_Pack(17,dvcSendList_Y,4*sendCount_Y,sendCount_Y,sendbuf_Y,dist,N); - MPI_Isend(sendbuf_Y, 5*sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,MPI_COMM_SCALBL,&req1[3]); - MPI_Irecv(recvbuf_y, 5*recvCount_y,MPI_DOUBLE,rank_y,recvtag,MPI_COMM_SCALBL,&req2[3]); + req1[3] = MPI_COMM_SCALBL.Isend(sendbuf_Y, 5*sendCount_Y,rank_Y,sendtag); + req2[3] = MPI_COMM_SCALBL.Irecv(recvbuf_y, 5*recvCount_y,rank_y,recvtag); //...Packing for z face(6,12,13,16,17)................................ ScaLBL_D3Q19_Pack(6,dvcSendList_z,0,sendCount_z,sendbuf_z,dist,N); ScaLBL_D3Q19_Pack(12,dvcSendList_z,sendCount_z,sendCount_z,sendbuf_z,dist,N); @@ -905,8 +903,8 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(16,dvcSendList_z,3*sendCount_z,sendCount_z,sendbuf_z,dist,N); ScaLBL_D3Q19_Pack(17,dvcSendList_z,4*sendCount_z,sendCount_z,sendbuf_z,dist,N); - MPI_Isend(sendbuf_z, 5*sendCount_z,MPI_DOUBLE,rank_z,sendtag,MPI_COMM_SCALBL,&req1[4]); - MPI_Irecv(recvbuf_Z, 5*recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,MPI_COMM_SCALBL,&req2[4]); + req1[4] = MPI_COMM_SCALBL.Isend(sendbuf_z, 5*sendCount_z,rank_z,sendtag); + req2[4] = MPI_COMM_SCALBL.Irecv(recvbuf_Z, 5*recvCount_Z,rank_Z,recvtag); //...Packing for Z face(5,11,14,15,18)................................ ScaLBL_D3Q19_Pack(5,dvcSendList_Z,0,sendCount_Z,sendbuf_Z,dist,N); @@ -915,57 +913,57 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(15,dvcSendList_Z,3*sendCount_Z,sendCount_Z,sendbuf_Z,dist,N); ScaLBL_D3Q19_Pack(18,dvcSendList_Z,4*sendCount_Z,sendCount_Z,sendbuf_Z,dist,N); - MPI_Isend(sendbuf_Z, 5*sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,MPI_COMM_SCALBL,&req1[5]); - MPI_Irecv(recvbuf_z, 5*recvCount_z,MPI_DOUBLE,rank_z,recvtag,MPI_COMM_SCALBL,&req2[5]); + req1[5] = MPI_COMM_SCALBL.Isend(sendbuf_Z, 5*sendCount_Z,rank_Z,sendtag); + req2[5] = MPI_COMM_SCALBL.Irecv(recvbuf_z, 5*recvCount_z,rank_z,recvtag); //...Pack the xy edge (8)................................ ScaLBL_D3Q19_Pack(8,dvcSendList_xy,0,sendCount_xy,sendbuf_xy,dist,N); - MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_xy,sendtag,MPI_COMM_SCALBL,&req1[6]); - MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,MPI_COMM_SCALBL,&req2[6]); + req1[6] = MPI_COMM_SCALBL.Isend(sendbuf_xy, sendCount_xy,rank_xy,sendtag); + req2[6] = MPI_COMM_SCALBL.Irecv(recvbuf_XY, recvCount_XY,rank_XY,recvtag); //...Pack the Xy edge (9)................................ 
ScaLBL_D3Q19_Pack(9,dvcSendList_Xy,0,sendCount_Xy,sendbuf_Xy,dist,N); - MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_Xy,sendtag,MPI_COMM_SCALBL,&req1[8]); - MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,MPI_COMM_SCALBL,&req2[8]); + req1[8] = MPI_COMM_SCALBL.Isend(sendbuf_Xy, sendCount_Xy,rank_Xy,sendtag); + req2[8] = MPI_COMM_SCALBL.Irecv(recvbuf_xY, recvCount_xY,rank_xY,recvtag); //...Pack the xY edge (10)................................ ScaLBL_D3Q19_Pack(10,dvcSendList_xY,0,sendCount_xY,sendbuf_xY,dist,N); - MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_xY,sendtag,MPI_COMM_SCALBL,&req1[9]); - MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,MPI_COMM_SCALBL,&req2[9]); + req1[9] = MPI_COMM_SCALBL.Isend(sendbuf_xY, sendCount_xY,rank_xY,sendtag); + req2[9] = MPI_COMM_SCALBL.Irecv(recvbuf_Xy, recvCount_Xy,rank_Xy,recvtag); //...Pack the XY edge (7)................................ ScaLBL_D3Q19_Pack(7,dvcSendList_XY,0,sendCount_XY,sendbuf_XY,dist,N); - MPI_Isend(sendbuf_XY, sendCount_XY,MPI_DOUBLE,rank_XY,sendtag,MPI_COMM_SCALBL,&req1[7]); - MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,MPI_COMM_SCALBL,&req2[7]); + req1[7] = MPI_COMM_SCALBL.Isend(sendbuf_XY, sendCount_XY,rank_XY,sendtag); + req2[7] = MPI_COMM_SCALBL.Irecv(recvbuf_xy, recvCount_xy,rank_xy,recvtag); //...Pack the xz edge (12)................................ ScaLBL_D3Q19_Pack(12,dvcSendList_xz,0,sendCount_xz,sendbuf_xz,dist,N); - MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_xz,sendtag,MPI_COMM_SCALBL,&req1[10]); - MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,MPI_COMM_SCALBL,&req2[10]); + req1[10] = MPI_COMM_SCALBL.Isend(sendbuf_xz, sendCount_xz,rank_xz,sendtag); + req2[10] = MPI_COMM_SCALBL.Irecv(recvbuf_XZ, recvCount_XZ,rank_XZ,recvtag); //...Pack the xZ edge (14)................................ ScaLBL_D3Q19_Pack(14,dvcSendList_xZ,0,sendCount_xZ,sendbuf_xZ,dist,N); - MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_xZ,sendtag,MPI_COMM_SCALBL,&req1[13]); - MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,MPI_COMM_SCALBL,&req2[13]); + req1[13] = MPI_COMM_SCALBL.Isend(sendbuf_xZ, sendCount_xZ,rank_xZ,sendtag); + req2[13] = MPI_COMM_SCALBL.Irecv(recvbuf_Xz, recvCount_Xz,rank_Xz,recvtag); //...Pack the Xz edge (13)................................ ScaLBL_D3Q19_Pack(13,dvcSendList_Xz,0,sendCount_Xz,sendbuf_Xz,dist,N); - MPI_Isend(sendbuf_Xz, sendCount_Xz,MPI_DOUBLE,rank_Xz,sendtag,MPI_COMM_SCALBL,&req1[12]); - MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,MPI_COMM_SCALBL,&req2[12]); + req1[12] = MPI_COMM_SCALBL.Isend(sendbuf_Xz, sendCount_Xz,rank_Xz,sendtag); + req2[12] = MPI_COMM_SCALBL.Irecv(recvbuf_xZ, recvCount_xZ,rank_xZ,recvtag); //...Pack the XZ edge (11)................................ ScaLBL_D3Q19_Pack(11,dvcSendList_XZ,0,sendCount_XZ,sendbuf_XZ,dist,N); - MPI_Isend(sendbuf_XZ, sendCount_XZ,MPI_DOUBLE,rank_XZ,sendtag,MPI_COMM_SCALBL,&req1[11]); - MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,MPI_COMM_SCALBL,&req2[11]); + req1[11] = MPI_COMM_SCALBL.Isend(sendbuf_XZ, sendCount_XZ,rank_XZ,sendtag); + req2[11] = MPI_COMM_SCALBL.Irecv(recvbuf_xz, recvCount_xz,rank_xz,recvtag); //...Pack the yz edge (16)................................ 
ScaLBL_D3Q19_Pack(16,dvcSendList_yz,0,sendCount_yz,sendbuf_yz,dist,N); - MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_yz,sendtag,MPI_COMM_SCALBL,&req1[14]); - MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,MPI_COMM_SCALBL,&req2[14]); + req1[14] = MPI_COMM_SCALBL.Isend(sendbuf_yz, sendCount_yz,rank_yz,sendtag); + req2[14] = MPI_COMM_SCALBL.Irecv(recvbuf_YZ, recvCount_YZ,rank_YZ,recvtag); //...Pack the yZ edge (18)................................ ScaLBL_D3Q19_Pack(18,dvcSendList_yZ,0,sendCount_yZ,sendbuf_yZ,dist,N); - MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_yZ,sendtag,MPI_COMM_SCALBL,&req1[17]); - MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,MPI_COMM_SCALBL,&req2[17]); + req1[17] = MPI_COMM_SCALBL.Isend(sendbuf_yZ, sendCount_yZ,rank_yZ,sendtag); + req2[17] = MPI_COMM_SCALBL.Irecv(recvbuf_Yz, recvCount_Yz,rank_Yz,recvtag); //...Pack the Yz edge (17)................................ ScaLBL_D3Q19_Pack(17,dvcSendList_Yz,0,sendCount_Yz,sendbuf_Yz,dist,N); - MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_Yz,sendtag,MPI_COMM_SCALBL,&req1[16]); - MPI_Irecv(recvbuf_yZ, recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,MPI_COMM_SCALBL,&req2[16]); + req1[16] = MPI_COMM_SCALBL.Isend(sendbuf_Yz, sendCount_Yz,rank_Yz,sendtag); + req2[16] = MPI_COMM_SCALBL.Irecv(recvbuf_yZ, recvCount_yZ,rank_yZ,recvtag); //...Pack the YZ edge (15)................................ ScaLBL_D3Q19_Pack(15,dvcSendList_YZ,0,sendCount_YZ,sendbuf_YZ,dist,N); - MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_YZ,sendtag,MPI_COMM_SCALBL,&req1[15]); - MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,MPI_COMM_SCALBL,&req2[15]); + req1[15] = MPI_COMM_SCALBL.Isend(sendbuf_YZ, sendCount_YZ,rank_YZ,sendtag); + req2[15] = MPI_COMM_SCALBL.Irecv(recvbuf_yz, recvCount_yz,rank_yz,recvtag); //................................................................................... } @@ -975,8 +973,8 @@ void ScaLBL_Communicator::RecvD3Q19AA(double *dist){ // NOTE: the center distribution f0 must NOT be at the start of feven, provide offset to start of f2 //................................................................................... // Wait for completion of D3Q19 communication - MPI_Waitall(18,req1,stat1); - MPI_Waitall(18,req2,stat2); + MPI_COMM_SCALBL.waitAll(18,req1); + MPI_COMM_SCALBL.waitAll(18,req2); ScaLBL_DeviceBarrier(); //................................................................................... @@ -1059,8 +1057,8 @@ void ScaLBL_Communicator::RecvGrad(double *phi, double *grad){ // Recieves halo and incorporates into D3Q19 based stencil gradient computation //................................................................................... // Wait for completion of D3Q19 communication - MPI_Waitall(18,req1,stat1); - MPI_Waitall(18,req2,stat2); + MPI_COMM_SCALBL.waitAll(18,req1); + MPI_COMM_SCALBL.waitAll(18,req2); ScaLBL_DeviceBarrier(); //................................................................................... 
@@ -1153,36 +1151,36 @@ void ScaLBL_Communicator::BiSendD3Q7AA(double *Aq, double *Bq){ ScaLBL_D3Q19_Pack(2,dvcSendList_x,0,sendCount_x,sendbuf_x,Aq,N); ScaLBL_D3Q19_Pack(2,dvcSendList_x,sendCount_x,sendCount_x,sendbuf_x,Bq,N); - MPI_Isend(sendbuf_x, 2*sendCount_x,MPI_DOUBLE,rank_x,sendtag,MPI_COMM_SCALBL,&req1[0]); - MPI_Irecv(recvbuf_X, 2*recvCount_X,MPI_DOUBLE,rank_X,recvtag,MPI_COMM_SCALBL,&req2[0]); + req1[0] = MPI_COMM_SCALBL.Isend(sendbuf_x, 2*sendCount_x,rank_x,sendtag); + req2[0] = MPI_COMM_SCALBL.Irecv(recvbuf_X, 2*recvCount_X,rank_X,recvtag); //...Packing for X face(1,7,9,11,13)................................ ScaLBL_D3Q19_Pack(1,dvcSendList_X,0,sendCount_X,sendbuf_X,Aq,N); ScaLBL_D3Q19_Pack(1,dvcSendList_X,sendCount_X,sendCount_X,sendbuf_X,Bq,N); - MPI_Isend(sendbuf_X, 2*sendCount_X,MPI_DOUBLE,rank_X,sendtag,MPI_COMM_SCALBL,&req1[1]); - MPI_Irecv(recvbuf_x, 2*recvCount_x,MPI_DOUBLE,rank_x,recvtag,MPI_COMM_SCALBL,&req2[1]); + req1[1] = MPI_COMM_SCALBL.Isend(sendbuf_X, 2*sendCount_X,rank_X,sendtag); + req2[1] = MPI_COMM_SCALBL.Irecv(recvbuf_x, 2*recvCount_x,rank_x,recvtag); //...Packing for y face(4,8,9,16,18)................................. ScaLBL_D3Q19_Pack(4,dvcSendList_y,0,sendCount_y,sendbuf_y,Aq,N); ScaLBL_D3Q19_Pack(4,dvcSendList_y,sendCount_y,sendCount_y,sendbuf_y,Bq,N); - MPI_Isend(sendbuf_y, 2*sendCount_y,MPI_DOUBLE,rank_y,sendtag,MPI_COMM_SCALBL,&req1[2]); - MPI_Irecv(recvbuf_Y, 2*recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,MPI_COMM_SCALBL,&req2[2]); + req1[2] = MPI_COMM_SCALBL.Isend(sendbuf_y, 2*sendCount_y,rank_y,sendtag); + req2[2] = MPI_COMM_SCALBL.Irecv(recvbuf_Y, 2*recvCount_Y,rank_Y,recvtag); //...Packing for Y face(3,7,10,15,17)................................. ScaLBL_D3Q19_Pack(3,dvcSendList_Y,0,sendCount_Y,sendbuf_Y,Aq,N); ScaLBL_D3Q19_Pack(3,dvcSendList_Y,sendCount_Y,sendCount_Y,sendbuf_Y,Bq,N); - MPI_Isend(sendbuf_Y, 2*sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,MPI_COMM_SCALBL,&req1[3]); - MPI_Irecv(recvbuf_y, 2*recvCount_y,MPI_DOUBLE,rank_y,recvtag,MPI_COMM_SCALBL,&req2[3]); + req1[3] = MPI_COMM_SCALBL.Isend(sendbuf_Y, 2*sendCount_Y,rank_Y,sendtag); + req2[3] = MPI_COMM_SCALBL.Irecv(recvbuf_y, 2*recvCount_y,rank_y,recvtag); //...Packing for z face(6,12,13,16,17)................................ ScaLBL_D3Q19_Pack(6,dvcSendList_z,0,sendCount_z,sendbuf_z,Aq,N); ScaLBL_D3Q19_Pack(6,dvcSendList_z,sendCount_z,sendCount_z,sendbuf_z,Bq,N); - MPI_Isend(sendbuf_z, 2*sendCount_z,MPI_DOUBLE,rank_z,sendtag,MPI_COMM_SCALBL,&req1[4]); - MPI_Irecv(recvbuf_Z, 2*recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,MPI_COMM_SCALBL,&req2[4]); + req1[4] = MPI_COMM_SCALBL.Isend(sendbuf_z, 2*sendCount_z,rank_z,sendtag); + req2[4] = MPI_COMM_SCALBL.Irecv(recvbuf_Z, 2*recvCount_Z,rank_Z,recvtag); //...Packing for Z face(5,11,14,15,18)................................ ScaLBL_D3Q19_Pack(5,dvcSendList_Z,0,sendCount_Z,sendbuf_Z,Aq,N); @@ -1190,8 +1188,8 @@ void ScaLBL_Communicator::BiSendD3Q7AA(double *Aq, double *Bq){ //................................................................................... 
// Send all the distributions - MPI_Isend(sendbuf_Z, 2*sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,MPI_COMM_SCALBL,&req1[5]); - MPI_Irecv(recvbuf_z, 2*recvCount_z,MPI_DOUBLE,rank_z,recvtag,MPI_COMM_SCALBL,&req2[5]); + req1[5] = MPI_COMM_SCALBL.Isend(sendbuf_Z, 2*sendCount_Z,rank_Z,sendtag); + req2[5] = MPI_COMM_SCALBL.Irecv(recvbuf_z, 2*recvCount_z,rank_z,recvtag); } @@ -1201,8 +1199,8 @@ void ScaLBL_Communicator::BiRecvD3Q7AA(double *Aq, double *Bq){ // NOTE: the center distribution f0 must NOT be at the start of feven, provide offset to start of f2 //................................................................................... // Wait for completion of D3Q19 communication - MPI_Waitall(6,req1,stat1); - MPI_Waitall(6,req2,stat2); + MPI_COMM_SCALBL.waitAll(6,req1); + MPI_COMM_SCALBL.waitAll(6,req2); ScaLBL_DeviceBarrier(); //................................................................................... @@ -1293,18 +1291,18 @@ void ScaLBL_Communicator::TriSendD3Q7AA(double *Aq, double *Bq, double *Cq){ //................................................................................... // Send all the distributions - MPI_Isend(sendbuf_x, 3*sendCount_x,MPI_DOUBLE,rank_x,sendtag,MPI_COMM_SCALBL,&req1[0]); - MPI_Irecv(recvbuf_X, 3*recvCount_X,MPI_DOUBLE,rank_X,recvtag,MPI_COMM_SCALBL,&req2[0]); - MPI_Isend(sendbuf_X, 3*sendCount_X,MPI_DOUBLE,rank_X,sendtag,MPI_COMM_SCALBL,&req1[1]); - MPI_Irecv(recvbuf_x, 3*recvCount_x,MPI_DOUBLE,rank_x,recvtag,MPI_COMM_SCALBL,&req2[1]); - MPI_Isend(sendbuf_y, 3*sendCount_y,MPI_DOUBLE,rank_y,sendtag,MPI_COMM_SCALBL,&req1[2]); - MPI_Irecv(recvbuf_Y, 3*recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,MPI_COMM_SCALBL,&req2[2]); - MPI_Isend(sendbuf_Y, 3*sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,MPI_COMM_SCALBL,&req1[3]); - MPI_Irecv(recvbuf_y, 3*recvCount_y,MPI_DOUBLE,rank_y,recvtag,MPI_COMM_SCALBL,&req2[3]); - MPI_Isend(sendbuf_z, 3*sendCount_z,MPI_DOUBLE,rank_z,sendtag,MPI_COMM_SCALBL,&req1[4]); - MPI_Irecv(recvbuf_Z, 3*recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,MPI_COMM_SCALBL,&req2[4]); - MPI_Isend(sendbuf_Z, 3*sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,MPI_COMM_SCALBL,&req1[5]); - MPI_Irecv(recvbuf_z, 3*recvCount_z,MPI_DOUBLE,rank_z,recvtag,MPI_COMM_SCALBL,&req2[5]); + req1[0] = MPI_COMM_SCALBL.Isend(sendbuf_x, 3*sendCount_x,rank_x,sendtag); + req2[0] = MPI_COMM_SCALBL.Irecv(recvbuf_X, 3*recvCount_X,rank_X,recvtag); + req1[1] = MPI_COMM_SCALBL.Isend(sendbuf_X, 3*sendCount_X,rank_X,sendtag); + req2[1] = MPI_COMM_SCALBL.Irecv(recvbuf_x, 3*recvCount_x,rank_x,recvtag); + req1[2] = MPI_COMM_SCALBL.Isend(sendbuf_y, 3*sendCount_y,rank_y,sendtag); + req2[2] = MPI_COMM_SCALBL.Irecv(recvbuf_Y, 3*recvCount_Y,rank_Y,recvtag); + req1[3] = MPI_COMM_SCALBL.Isend(sendbuf_Y, 3*sendCount_Y,rank_Y,sendtag); + req2[3] = MPI_COMM_SCALBL.Irecv(recvbuf_y, 3*recvCount_y,rank_y,recvtag); + req1[4] = MPI_COMM_SCALBL.Isend(sendbuf_z, 3*sendCount_z,rank_z,sendtag); + req2[4] = MPI_COMM_SCALBL.Irecv(recvbuf_Z, 3*recvCount_Z,rank_Z,recvtag); + req1[5] = MPI_COMM_SCALBL.Isend(sendbuf_Z, 3*sendCount_Z,rank_Z,sendtag); + req2[5] = MPI_COMM_SCALBL.Irecv(recvbuf_z, 3*recvCount_z,rank_z,recvtag); } @@ -1314,8 +1312,8 @@ void ScaLBL_Communicator::TriRecvD3Q7AA(double *Aq, double *Bq, double *Cq){ // NOTE: the center distribution f0 must NOT be at the start of feven, provide offset to start of f2 //................................................................................... 
// Wait for completion of D3Q19 communication - MPI_Waitall(6,req1,stat1); - MPI_Waitall(6,req2,stat2); + MPI_COMM_SCALBL.waitAll(6,req1); + MPI_COMM_SCALBL.waitAll(6,req2); ScaLBL_DeviceBarrier(); //................................................................................... @@ -1409,49 +1407,49 @@ void ScaLBL_Communicator::SendHalo(double *data){ // Send / Recv all the phase indcator field values //................................................................................... - MPI_Isend(sendbuf_x, sendCount_x,MPI_DOUBLE,rank_x,sendtag,MPI_COMM_SCALBL,&req1[0]); - MPI_Irecv(recvbuf_X, recvCount_X,MPI_DOUBLE,rank_X,recvtag,MPI_COMM_SCALBL,&req2[0]); - MPI_Isend(sendbuf_X, sendCount_X,MPI_DOUBLE,rank_X,sendtag,MPI_COMM_SCALBL,&req1[1]); - MPI_Irecv(recvbuf_x, recvCount_x,MPI_DOUBLE,rank_x,recvtag,MPI_COMM_SCALBL,&req2[1]); - MPI_Isend(sendbuf_y, sendCount_y,MPI_DOUBLE,rank_y,sendtag,MPI_COMM_SCALBL,&req1[2]); - MPI_Irecv(recvbuf_Y, recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,MPI_COMM_SCALBL,&req2[2]); - MPI_Isend(sendbuf_Y, sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,MPI_COMM_SCALBL,&req1[3]); - MPI_Irecv(recvbuf_y, recvCount_y,MPI_DOUBLE,rank_y,recvtag,MPI_COMM_SCALBL,&req2[3]); - MPI_Isend(sendbuf_z, sendCount_z,MPI_DOUBLE,rank_z,sendtag,MPI_COMM_SCALBL,&req1[4]); - MPI_Irecv(recvbuf_Z, recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,MPI_COMM_SCALBL,&req2[4]); - MPI_Isend(sendbuf_Z, sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,MPI_COMM_SCALBL,&req1[5]); - MPI_Irecv(recvbuf_z, recvCount_z,MPI_DOUBLE,rank_z,recvtag,MPI_COMM_SCALBL,&req2[5]); - MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_xy,sendtag,MPI_COMM_SCALBL,&req1[6]); - MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,MPI_COMM_SCALBL,&req2[6]); - MPI_Isend(sendbuf_XY, sendCount_XY,MPI_DOUBLE,rank_XY,sendtag,MPI_COMM_SCALBL,&req1[7]); - MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,MPI_COMM_SCALBL,&req2[7]); - MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_Xy,sendtag,MPI_COMM_SCALBL,&req1[8]); - MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,MPI_COMM_SCALBL,&req2[8]); - MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_xY,sendtag,MPI_COMM_SCALBL,&req1[9]); - MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,MPI_COMM_SCALBL,&req2[9]); - MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_xz,sendtag,MPI_COMM_SCALBL,&req1[10]); - MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,MPI_COMM_SCALBL,&req2[10]); - MPI_Isend(sendbuf_XZ, sendCount_XZ,MPI_DOUBLE,rank_XZ,sendtag,MPI_COMM_SCALBL,&req1[11]); - MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,MPI_COMM_SCALBL,&req2[11]); - MPI_Isend(sendbuf_Xz, sendCount_Xz,MPI_DOUBLE,rank_Xz,sendtag,MPI_COMM_SCALBL,&req1[12]); - MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,MPI_COMM_SCALBL,&req2[12]); - MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_xZ,sendtag,MPI_COMM_SCALBL,&req1[13]); - MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,MPI_COMM_SCALBL,&req2[13]); - MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_yz,sendtag,MPI_COMM_SCALBL,&req1[14]); - MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,MPI_COMM_SCALBL,&req2[14]); - MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_YZ,sendtag,MPI_COMM_SCALBL,&req1[15]); - MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,MPI_COMM_SCALBL,&req2[15]); - MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_Yz,sendtag,MPI_COMM_SCALBL,&req1[16]); - MPI_Irecv(recvbuf_yZ, 
recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,MPI_COMM_SCALBL,&req2[16]); - MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_yZ,sendtag,MPI_COMM_SCALBL,&req1[17]); - MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,MPI_COMM_SCALBL,&req2[17]); + req1[0] = MPI_COMM_SCALBL.Isend(sendbuf_x, sendCount_x,rank_x,sendtag); + req2[0] = MPI_COMM_SCALBL.Irecv(recvbuf_X, recvCount_X,rank_X,recvtag); + req1[1] = MPI_COMM_SCALBL.Isend(sendbuf_X, sendCount_X,rank_X,sendtag); + req2[1] = MPI_COMM_SCALBL.Irecv(recvbuf_x, recvCount_x,rank_x,recvtag); + req1[2] = MPI_COMM_SCALBL.Isend(sendbuf_y, sendCount_y,rank_y,sendtag); + req2[2] = MPI_COMM_SCALBL.Irecv(recvbuf_Y, recvCount_Y,rank_Y,recvtag); + req1[3] = MPI_COMM_SCALBL.Isend(sendbuf_Y, sendCount_Y,rank_Y,sendtag); + req2[3] = MPI_COMM_SCALBL.Irecv(recvbuf_y, recvCount_y,rank_y,recvtag); + req1[4] = MPI_COMM_SCALBL.Isend(sendbuf_z, sendCount_z,rank_z,sendtag); + req2[4] = MPI_COMM_SCALBL.Irecv(recvbuf_Z, recvCount_Z,rank_Z,recvtag); + req1[5] = MPI_COMM_SCALBL.Isend(sendbuf_Z, sendCount_Z,rank_Z,sendtag); + req2[5] = MPI_COMM_SCALBL.Irecv(recvbuf_z, recvCount_z,rank_z,recvtag); + req1[6] = MPI_COMM_SCALBL.Isend(sendbuf_xy, sendCount_xy,rank_xy,sendtag); + req2[6] = MPI_COMM_SCALBL.Irecv(recvbuf_XY, recvCount_XY,rank_XY,recvtag); + req1[7] = MPI_COMM_SCALBL.Isend(sendbuf_XY, sendCount_XY,rank_XY,sendtag); + req2[7] = MPI_COMM_SCALBL.Irecv(recvbuf_xy, recvCount_xy,rank_xy,recvtag); + req1[8] = MPI_COMM_SCALBL.Isend(sendbuf_Xy, sendCount_Xy,rank_Xy,sendtag); + req2[8] = MPI_COMM_SCALBL.Irecv(recvbuf_xY, recvCount_xY,rank_xY,recvtag); + req1[9] = MPI_COMM_SCALBL.Isend(sendbuf_xY, sendCount_xY,rank_xY,sendtag); + req2[9] = MPI_COMM_SCALBL.Irecv(recvbuf_Xy, recvCount_Xy,rank_Xy,recvtag); + req1[10] = MPI_COMM_SCALBL.Isend(sendbuf_xz, sendCount_xz,rank_xz,sendtag); + req2[10] = MPI_COMM_SCALBL.Irecv(recvbuf_XZ, recvCount_XZ,rank_XZ,recvtag); + req1[11] = MPI_COMM_SCALBL.Isend(sendbuf_XZ, sendCount_XZ,rank_XZ,sendtag); + req2[11] = MPI_COMM_SCALBL.Irecv(recvbuf_xz, recvCount_xz,rank_xz,recvtag); + req1[12] = MPI_COMM_SCALBL.Isend(sendbuf_Xz, sendCount_Xz,rank_Xz,sendtag); + req2[12] = MPI_COMM_SCALBL.Irecv(recvbuf_xZ, recvCount_xZ,rank_xZ,recvtag); + req1[13] = MPI_COMM_SCALBL.Isend(sendbuf_xZ, sendCount_xZ,rank_xZ,sendtag); + req2[13] = MPI_COMM_SCALBL.Irecv(recvbuf_Xz, recvCount_Xz,rank_Xz,recvtag); + req1[14] = MPI_COMM_SCALBL.Isend(sendbuf_yz, sendCount_yz,rank_yz,sendtag); + req2[14] = MPI_COMM_SCALBL.Irecv(recvbuf_YZ, recvCount_YZ,rank_YZ,recvtag); + req1[15] = MPI_COMM_SCALBL.Isend(sendbuf_YZ, sendCount_YZ,rank_YZ,sendtag); + req2[15] = MPI_COMM_SCALBL.Irecv(recvbuf_yz, recvCount_yz,rank_yz,recvtag); + req1[16] = MPI_COMM_SCALBL.Isend(sendbuf_Yz, sendCount_Yz,rank_Yz,sendtag); + req2[16] = MPI_COMM_SCALBL.Irecv(recvbuf_yZ, recvCount_yZ,rank_yZ,recvtag); + req1[17] = MPI_COMM_SCALBL.Isend(sendbuf_yZ, sendCount_yZ,rank_yZ,sendtag); + req2[17] = MPI_COMM_SCALBL.Irecv(recvbuf_Yz, recvCount_Yz,rank_Yz,recvtag); //................................................................................... } void ScaLBL_Communicator::RecvHalo(double *data){ //................................................................................... - MPI_Waitall(18,req1,stat1); - MPI_Waitall(18,req2,stat2); + MPI_COMM_SCALBL.waitAll(18,req1); + MPI_COMM_SCALBL.waitAll(18,req2); ScaLBL_DeviceBarrier(); //................................................................................... 
//................................................................................... @@ -1564,7 +1562,7 @@ double ScaLBL_Communicator::D3Q19_Flux_BC_z(int *neighborList, double *fq, doubl LocInletArea = double(sendCount_z); else LocInletArea = 0.f; - MPI_Allreduce(&LocInletArea,&InletArea,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_SCALBL); + InletArea = MPI_COMM_SCALBL.sumReduce( LocInletArea ); //printf("Inlet area = %f \n", InletArea); // Set the flux BC @@ -1573,7 +1571,7 @@ double ScaLBL_Communicator::D3Q19_Flux_BC_z(int *neighborList, double *fq, doubl if (kproc == 0) locsum = ScaLBL_D3Q19_AAeven_Flux_BC_z(dvcSendList_z, fq, flux, InletArea, sendCount_z, N); - MPI_Allreduce(&locsum,&sum,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_SCALBL); + sum = MPI_COMM_SCALBL.sumReduce( locsum ); din = flux/InletArea + sum; //if (rank==0) printf("computed din (even) =%f \n",din); if (kproc == 0) @@ -1583,7 +1581,7 @@ double ScaLBL_Communicator::D3Q19_Flux_BC_z(int *neighborList, double *fq, doubl if (kproc == 0) locsum = ScaLBL_D3Q19_AAodd_Flux_BC_z(neighborList, dvcSendList_z, fq, flux, InletArea, sendCount_z, N); - MPI_Allreduce(&locsum,&sum,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_SCALBL); + sum = MPI_COMM_SCALBL.sumReduce( locsum ); din = flux/InletArea + sum; //if (rank==0) printf("computed din (odd)=%f \n",din); diff --git a/common/ScaLBL.h b/common/ScaLBL.h index a50ab7ed..78896d3f 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -207,9 +207,8 @@ private: // Give the object it's own MPI communicator RankInfoStruct rank_info; MPI_Group Group; // Group of processors associated with this domain - MPI_Comm MPI_COMM_SCALBL; // MPI Communicator for this domain + Utilities::MPI MPI_COMM_SCALBL; // MPI Communicator for this domain MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; //...................................................................................... // MPI ranks for all 18 neighbors //...................................................................................... 
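Editor's note: the hunks above apply one mechanical substitution throughout ScaLBL.cpp and ScaLBL.h: raw MPI_Isend/MPI_Irecv/MPI_Waitall and MPI_Allreduce calls on MPI_COMM_SCALBL become calls on the Utilities::MPI wrapper, whose Isend/Irecv return the MPI_Request directly and whose waitAll/sumReduce hide the MPI_Status arrays and datatype arguments. The sketch below restates that pattern in isolation, assuming only the wrapper methods that appear in this patch (Isend, Irecv, waitAll, sumReduce); the exchangePair helper and its buffer/rank names are illustrative and not part of the repository.

#include "common/MPI.h"

// Post a non-blocking send/receive pair and wait for both to complete,
// mirroring the per-face exchanges in SendD3Q19AA/RecvD3Q19AA above.
void exchangePair(Utilities::MPI &comm,
                  double *sendbuf, int sendCount, int rank_send,
                  double *recvbuf, int recvCount, int rank_recv,
                  int sendtag, int recvtag)
{
    MPI_Request req[2];
    req[0] = comm.Isend(sendbuf, sendCount, rank_send, sendtag); // post the send
    req[1] = comm.Irecv(recvbuf, recvCount, rank_recv, recvtag); // post the matching receive
    comm.waitAll(2, req); // completes both requests; no MPI_Status bookkeeping needed
}

// Reductions follow the same style, e.g. replacing
//   MPI_Allreduce(&loc, &glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_SCALBL);
// with
//   double glob = comm.sumReduce(loc);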
diff --git a/common/SpherePack.cpp b/common/SpherePack.cpp index a7246b72..18057653 100644 --- a/common/SpherePack.cpp +++ b/common/SpherePack.cpp @@ -9,7 +9,6 @@ #include "common/Array.h" #include "common/Utilities.h" -#include "common/MPI_Helpers.h" #include "common/Communication.h" #include "common/Database.h" #include "common/SpherePack.h" diff --git a/common/SpherePack.h b/common/SpherePack.h index 5075b289..56284a40 100644 --- a/common/SpherePack.h +++ b/common/SpherePack.h @@ -12,7 +12,6 @@ #include "common/Array.h" #include "common/Utilities.h" -#include "common/MPI_Helpers.h" #include "common/Communication.h" #include "common/Database.h" diff --git a/common/UnitTest.cpp b/common/UnitTest.cpp index b995fa68..aeb9026e 100755 --- a/common/UnitTest.cpp +++ b/common/UnitTest.cpp @@ -14,44 +14,49 @@ /******************************************************************** * Constructor/Destructor * ********************************************************************/ -UnitTest::UnitTest() +UnitTest::UnitTest() : d_verbose( false ), d_comm( MPI_COMM_SELF ) { -#ifdef USE_MPI - comm = MPI_COMM_WORLD; -#endif + if ( Utilities::MPI::MPI_active() ) + d_comm = MPI_COMM_WORLD; } UnitTest::~UnitTest() { reset(); } void UnitTest::reset() { - mutex.lock(); + d_mutex.lock(); // Clear the data forcing a reallocation - std::vector().swap( pass_messages ); - std::vector().swap( fail_messages ); - std::vector().swap( expected_fail_messages ); - mutex.unlock(); + std::vector().swap( d_pass ); + std::vector().swap( d_fail ); + std::vector().swap( d_expected ); + d_mutex.unlock(); } /******************************************************************** * Add a pass, fail, expected failure message in a thread-safe way * ********************************************************************/ -void UnitTest::passes( const std::string &in ) +void UnitTest::passes( std::string in ) { - mutex.lock(); - pass_messages.push_back( in ); - mutex.unlock(); + d_mutex.lock(); + if ( d_verbose ) + printf( "UnitTest: %i passes: %s\n", d_comm.getRank(), in.data() ); + d_pass.emplace_back( std::move( in ) ); + d_mutex.unlock(); } -void UnitTest::failure( const std::string &in ) +void UnitTest::failure( std::string in ) { - mutex.lock(); - fail_messages.push_back( in ); - mutex.unlock(); + d_mutex.lock(); + if ( d_verbose ) + printf( "UnitTest: %i failed: %s\n", d_comm.getRank(), in.data() ); + d_fail.emplace_back( std::move( in ) ); + d_mutex.unlock(); } -void UnitTest::expected_failure( const std::string &in ) +void UnitTest::expected_failure( std::string in ) { - mutex.lock(); - expected_fail_messages.push_back( in ); - mutex.unlock(); + d_mutex.lock(); + if ( d_verbose ) + printf( "UnitTest: %i expected_failure: %s\n", d_comm.getRank(), in.data() ); + d_expected.emplace_back( std::move( in ) ); + d_mutex.unlock(); } @@ -59,23 +64,6 @@ void UnitTest::expected_failure( const std::string &in ) * Print a global report * * Note: only rank 0 will print, all messages will be aggregated * ********************************************************************/ -inline std::vector UnitTest::allGather( int value ) const -{ - int size = getSize(); - std::vector data( size, value ); -#ifdef USE_MPI - if ( size > 1 ) - MPI_Allgather( &value, 1, MPI_INT, data.data(), 1, MPI_INT, comm ); -#endif - return data; -} -inline void UnitTest::barrier() const -{ -#ifdef USE_MPI - if ( getSize() > 1 ) - MPI_Barrier( comm ); -#endif -} static inline void print_messages( const std::vector> &messages ) { if ( messages.size() > 1 ) { @@ -93,28 +81,27 @@ 
static inline void print_messages( const std::vector> & } void UnitTest::report( const int level0 ) const { - mutex.lock(); - int size = getSize(); - int rank = getRank(); + d_mutex.lock(); + int size = d_comm.getSize(); + int rank = d_comm.getRank(); + // Give all processors a chance to print any remaining messages + d_comm.barrier(); + Utilities::sleep_ms( 10 ); // Broadcast the print level from rank 0 - int level = level0; -#ifdef USE_MPI - if ( getSize() > 1 ) - MPI_Bcast( &level, 1, MPI_INT, 0, comm ); -#endif + int level = d_comm.bcast( level0, 0 ); if ( level < 0 || level > 2 ) ERROR( "Invalid print level" ); // Perform a global all gather to get the number of failures per processor - auto N_pass = allGather( pass_messages.size() ); - auto N_fail = allGather( fail_messages.size() ); - auto N_expected_fail = allGather( expected_fail_messages.size() ); - int N_pass_tot = 0; - int N_fail_tot = 0; - int N_expected_fail_tot = 0; + auto N_pass = d_comm.allGather( d_pass.size() ); + auto N_fail = d_comm.allGather( d_fail.size() ); + auto N_expected = d_comm.allGather( d_expected.size() ); + int N_pass_tot = 0; + int N_fail_tot = 0; + int N_expected_tot = 0; for ( int i = 0; i < size; i++ ) { N_pass_tot += N_pass[i]; N_fail_tot += N_fail[i]; - N_expected_fail_tot += N_expected_fail[i]; + N_expected_tot += N_expected[i]; } // Send all messages to rank 0 (if needed) std::vector> pass_messages_rank( size ); @@ -122,13 +109,13 @@ void UnitTest::report( const int level0 ) const std::vector> expected_fail_rank( size ); // Get the pass messages if ( ( level == 1 && N_pass_tot <= 20 ) || level == 2 ) - pass_messages_rank = UnitTest::gatherMessages( pass_messages, 1 ); + pass_messages_rank = UnitTest::gatherMessages( d_pass, 1 ); // Get the fail messages if ( level == 1 || level == 2 ) - fail_messages_rank = UnitTest::gatherMessages( fail_messages, 2 ); + fail_messages_rank = UnitTest::gatherMessages( d_fail, 2 ); // Get the expected_fail messages - if ( ( level == 1 && N_expected_fail_tot <= 50 ) || level == 2 ) - expected_fail_rank = UnitTest::gatherMessages( expected_fail_messages, 2 ); + if ( ( level == 1 && N_expected_tot <= 50 ) || level == 2 ) + expected_fail_rank = UnitTest::gatherMessages( d_expected, 2 ); // Print the results of all messages (only rank 0 will print) if ( rank == 0 ) { pout << std::endl; @@ -174,31 +161,31 @@ void UnitTest::report( const int level0 ) const pout << std::endl; // Print the tests that expected failed pout << "Tests expected failed" << std::endl; - if ( level == 0 || ( level == 1 && N_expected_fail_tot > 50 ) ) { + if ( level == 0 || ( level == 1 && N_expected_tot > 50 ) ) { // We want to print a summary if ( size > 8 ) { // Print 1 summary for all processors printp( " %i tests expected failed (use report level 2 for more detail)\n", - N_expected_fail_tot ); + N_expected_tot ); } else { // Print a summary for each processor for ( int i = 0; i < size; i++ ) printp( " %i tests expected failed (proc %i) (use report level 2 for more " "detail)\n", - N_expected_fail[i], i ); + N_expected[i], i ); } } else { // We want to print all messages for ( int i = 0; i < size; i++ ) - ASSERT( (int) expected_fail_rank[i].size() == N_expected_fail[i] ); + ASSERT( (int) expected_fail_rank[i].size() == N_expected[i] ); print_messages( expected_fail_rank ); } pout << std::endl; } // Add a barrier to synchronize all processors (rank 0 is much slower) - barrier(); + d_comm.barrier(); Utilities::sleep_ms( 10 ); // Need a brief pause to allow any printing to finish - mutex.unlock(); + 
d_mutex.unlock(); } @@ -208,8 +195,8 @@ void UnitTest::report( const int level0 ) const std::vector> UnitTest::gatherMessages( const std::vector &local_messages, int tag ) const { - const int rank = getRank(); - const int size = getSize(); + const int rank = d_comm.getRank(); + const int size = d_comm.getSize(); std::vector> messages( size ); if ( rank == 0 ) { // Rank 0 should receive all messages @@ -233,7 +220,6 @@ std::vector> UnitTest::gatherMessages( void UnitTest::pack_message_stream( const std::vector &messages, const int rank, const int tag ) const { -#ifdef USE_MPI // Get the size of the messages auto N_messages = (int) messages.size(); auto *msg_size = new int[N_messages]; @@ -254,18 +240,11 @@ void UnitTest::pack_message_stream( k += msg_size[i]; } // Send the message stream (using a non-blocking send) - MPI_Request request; - MPI_Isend( data, size_data, MPI_CHAR, rank, tag, comm, &request ); + auto request = d_comm.Isend( data, size_data, rank, tag ); // Wait for the communication to send and free the temporary memory - MPI_Status status; - MPI_Wait( &request, &status ); + d_comm.wait( request ); delete[] data; delete[] msg_size; -#else - NULL_USE( messages ); - NULL_USE( rank ); - NULL_USE( tag ); -#endif } @@ -274,20 +253,15 @@ void UnitTest::pack_message_stream( ********************************************************************/ std::vector UnitTest::unpack_message_stream( const int rank, const int tag ) const { -#ifdef USE_MPI // Probe the message to get the message size - MPI_Status status; - MPI_Probe( rank, tag, comm, &status ); - int size_data = -1; - MPI_Get_count( &status, MPI_BYTE, &size_data ); + int size_data = d_comm.probe( rank, tag ); ASSERT( size_data >= 0 ); // Allocate memory to receive the data auto *data = new char[size_data]; // receive the data (using a non-blocking receive) - MPI_Request request; - MPI_Irecv( data, size_data, MPI_CHAR, rank, tag, comm, &request ); + auto request = d_comm.Irecv( data, size_data, rank, tag ); // Wait for the communication to be received - MPI_Wait( &request, &status ); + d_comm.wait( request ); // Unpack the message stream int N_messages = 0; memcpy( &N_messages, data, sizeof( int ) ); @@ -303,77 +277,16 @@ std::vector UnitTest::unpack_message_stream( const int rank, const messages[i] = std::string( &data[k], msg_size[i] ); k += msg_size[i]; } + // Delete the temporary memory delete[] data; return messages; -#else - NULL_USE( rank ); - NULL_USE( tag ); - return std::vector(); -#endif } /******************************************************************** * Other functions * ********************************************************************/ -int UnitTest::getRank() const -{ - int rank = 0; -#ifdef USE_MPI - int flag = 0; - MPI_Initialized( &flag ); - if ( flag ) - MPI_Comm_rank( comm, &rank ); -#endif - return rank; -} -int UnitTest::getSize() const -{ - int size = 1; -#ifdef USE_MPI - int flag = 0; - MPI_Initialized( &flag ); - if ( flag ) - MPI_Comm_size( comm, &size ); -#endif - return size; -} -size_t UnitTest::NumPassGlobal() const -{ - size_t num = pass_messages.size(); -#ifdef USE_MPI - if ( getSize() > 1 ) { - auto send = static_cast( num ); - int sum = 0; - MPI_Allreduce( &send, &sum, 1, MPI_INT, MPI_SUM, comm ); - num = static_cast( sum ); - } -#endif - return num; -} -size_t UnitTest::NumFailGlobal() const -{ - size_t num = fail_messages.size(); -#ifdef USE_MPI - if ( getSize() > 1 ) { - auto send = static_cast( num ); - int sum = 0; - MPI_Allreduce( &send, &sum, 1, MPI_INT, MPI_SUM, comm ); - num = 
static_cast( sum ); - } -#endif - return num; -} -size_t UnitTest::NumExpectedFailGlobal() const -{ - size_t num = expected_fail_messages.size(); -#ifdef USE_MPI - if ( getSize() > 1 ) { - auto send = static_cast( num ); - int sum = 0; - MPI_Allreduce( &send, &sum, 1, MPI_INT, MPI_SUM, comm ); - num = static_cast( sum ); - } -#endif - return num; -} +size_t UnitTest::NumPassGlobal() const { return d_comm.sumReduce( d_pass.size() ); } +size_t UnitTest::NumFailGlobal() const { return d_comm.sumReduce( d_fail.size() ); } +size_t UnitTest::NumExpectedFailGlobal() const { return d_comm.sumReduce( d_expected.size() ); } + diff --git a/common/UnitTest.h b/common/UnitTest.h index 80503d19..9d452747 100755 --- a/common/UnitTest.h +++ b/common/UnitTest.h @@ -1,13 +1,11 @@ #ifndef included_UnitTest #define included_UnitTest +#include "common/MPI.h" + #include -#include #include #include -#ifdef USE_MPI -#include "mpi.h" -#endif /*! @@ -28,47 +26,47 @@ * \endcode */ -class UnitTest +class UnitTest final { public: //! Constructor UnitTest(); //! Destructor - virtual ~UnitTest(); + ~UnitTest(); + + // Copy constructor + UnitTest( const UnitTest & ) = delete; + + // Assignment operator + UnitTest &operator=( const UnitTest & ) = delete; //! Indicate a passed test (thread-safe) - virtual void passes( const std::string &in ); + void passes( std::string in ); //! Indicate a failed test (thread-safe) - virtual void failure( const std::string &in ); + void failure( std::string in ); //! Indicate an expected failed test (thread-safe) - virtual void expected_failure( const std::string &in ); + void expected_failure( std::string in ); //! Return the number of passed tests locally - virtual size_t NumPassLocal() const { return pass_messages.size(); } + inline size_t NumPassLocal() const { return d_pass.size(); } //! Return the number of failed tests locally - virtual size_t NumFailLocal() const { return fail_messages.size(); } + inline size_t NumFailLocal() const { return d_fail.size(); } //! Return the number of expected failed tests locally - virtual size_t NumExpectedFailLocal() const { return expected_fail_messages.size(); } + inline size_t NumExpectedFailLocal() const { return d_expected.size(); } //! Return the number of passed tests locally - virtual size_t NumPassGlobal() const; + size_t NumPassGlobal() const; //! Return the number of failed tests locally - virtual size_t NumFailGlobal() const; + size_t NumFailGlobal() const; //! Return the number of expected failed tests locally - virtual size_t NumExpectedFailGlobal() const; - - //! Return the rank of the current processor - int getRank() const; - - //! Return the number of processors - int getSize() const; + size_t NumExpectedFailGlobal() const; /*! * Print a report of the passed and failed tests. @@ -77,29 +75,28 @@ public: * to print correctly). * @param level Optional integer specifying the level of reporting (default: 1) * 0: Report the number of tests passed, failed, and expected failures. - * 1: Report the number of passed tests (if <=20) or the number passed - * otherwise, report all failures, report the number of expected - * failed tests (if <=50) or the number passed otherwise. + * 1: Report the passed tests (if <=20) or number passed, + * Report all failures, + * Report the expected failed tests (if <=50) or the number passed. * 2: Report all passed, failed, and expected failed tests. */ - virtual void report( const int level = 1 ) const; + void report( const int level = 1 ) const; //! 
Clear the messages void reset(); -protected: - std::vector pass_messages; - std::vector fail_messages; - std::vector expected_fail_messages; - mutable std::mutex mutex; -#ifdef USE_MPI - MPI_Comm comm; -#endif + //! Make the unit test operator verbose? + void verbose( bool verbose = true ) { d_verbose = verbose; } private: - // Make the copy constructor private - UnitTest( const UnitTest & ) {} + std::vector d_pass; + std::vector d_fail; + std::vector d_expected; + bool d_verbose; + mutable std::mutex d_mutex; + Utilities::MPI d_comm; +private: // Function to pack the messages into a single data stream and send to the given processor // Note: This function does not return until the message stream has been sent void pack_message_stream( @@ -109,9 +106,7 @@ private: // Note: This function does not return until the message stream has been received std::vector unpack_message_stream( const int rank, const int tag ) const; - // Helper functions - inline void barrier() const; - inline std::vector allGather( int value ) const; + // Gather the messages inline std::vector> gatherMessages( const std::vector &local_messages, int tag ) const; }; diff --git a/common/UtilityMacros.h b/common/UtilityMacros.h index bfac172f..2c374ef1 100644 --- a/common/UtilityMacros.h +++ b/common/UtilityMacros.h @@ -143,35 +143,43 @@ * Be sure to follow with ENABLE_WARNINGS */ // clang-format off -#ifdef DISABLE_WARNINGS - // Macros previously defined -#elif defined( USING_MSVC ) +#ifndef DISABLE_WARNINGS +#if defined( USING_MSVC ) #define DISABLE_WARNINGS __pragma( warning( push, 0 ) ) #define ENABLE_WARNINGS __pragma( warning( pop ) ) #elif defined( USING_CLANG ) #define DISABLE_WARNINGS \ - _Pragma( "clang diagnostic push" ) _Pragma( "clang diagnostic ignored \"-Wall\"" ) \ + _Pragma( "clang diagnostic push" ) \ + _Pragma( "clang diagnostic ignored \"-Wall\"" ) \ _Pragma( "clang diagnostic ignored \"-Wextra\"" ) \ _Pragma( "clang diagnostic ignored \"-Wunused-private-field\"" ) \ - _Pragma( "clang diagnostic ignored \"-Wmismatched-new-delete\"" ) + _Pragma( "clang diagnostic ignored \"-Wdeprecated-declarations\"" ) \ + _Pragma( "clang diagnostic ignored \"-Winteger-overflow\"" ) #define ENABLE_WARNINGS _Pragma( "clang diagnostic pop" ) #elif defined( USING_GCC ) - // Note: We cannot disable the -Wliteral-suffix message with this macro because the - // pragma command cannot suppress warnings from the C++ preprocessor. See gcc bug #53431. 
#define DISABLE_WARNINGS \ - _Pragma( "GCC diagnostic push" ) _Pragma( "GCC diagnostic ignored \"-Wall\"" ) \ + _Pragma( "GCC diagnostic push" ) \ + _Pragma( "GCC diagnostic ignored \"-Wpragmas\"" ) \ + _Pragma( "GCC diagnostic ignored \"-Wall\"" ) \ _Pragma( "GCC diagnostic ignored \"-Wextra\"" ) \ - _Pragma( "GCC diagnostic ignored \"-Wpragmas\"" ) \ + _Pragma( "GCC diagnostic ignored \"-Wpedantic\"" ) \ _Pragma( "GCC diagnostic ignored \"-Wunused-local-typedefs\"" ) \ _Pragma( "GCC diagnostic ignored \"-Woverloaded-virtual\"" ) \ _Pragma( "GCC diagnostic ignored \"-Wunused-parameter\"" ) \ - _Pragma( "GCC diagnostic ignored \"-Warray-bounds\"" ) \ + _Pragma( "GCC diagnostic ignored \"-Wdeprecated-declarations\"" ) \ + _Pragma( "GCC diagnostic ignored \"-Wvirtual-move-assign\"" ) \ + _Pragma( "GCC diagnostic ignored \"-Wunused-function\"" ) \ + _Pragma( "GCC diagnostic ignored \"-Woverflow\"" ) \ + _Pragma( "GCC diagnostic ignored \"-Wunused-variable\"" ) \ + _Pragma( "GCC diagnostic ignored \"-Wignored-qualifiers\"" ) \ + _Pragma( "GCC diagnostic ignored \"-Wenum-compare\"" ) \ _Pragma( "GCC diagnostic ignored \"-Wterminate\"" ) #define ENABLE_WARNINGS _Pragma( "GCC diagnostic pop" ) #else #define DISABLE_WARNINGS #define ENABLE_WARNINGS #endif +#endif // clang-format on diff --git a/cpu/BGK.cpp b/cpu/BGK.cpp index 436ab381..bccc5b77 100644 --- a/cpu/BGK.cpp +++ b/cpu/BGK.cpp @@ -1,5 +1,4 @@ extern "C" void ScaLBL_D3Q19_AAeven_BGK(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){ - int n; // conserved momemnts double rho,ux,uy,uz,uu; // non-conserved moments @@ -111,14 +110,12 @@ extern "C" void ScaLBL_D3Q19_AAeven_BGK(double *dist, int start, int finish, int } extern "C" void ScaLBL_D3Q19_AAodd_BGK(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){ - int n; // conserved momemnts double rho,ux,uy,uz,uu; // non-conserved moments double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18; int nr1,nr2,nr3,nr4,nr5,nr6,nr7,nr8,nr9,nr10,nr11,nr12,nr13,nr14,nr15,nr16,nr17,nr18; - int nread; for (int n=start; n 0 ){ // Get the density value (Streaming already performed) - Na = Den[n]; - Nb = Den[N+n]; + double Na = Den[n]; + double Nb = Den[N+n]; Phi[n] = (Na-Nb)/(Na+Nb); } } - //................................................................... 
} extern "C" void ScaLBL_SetSlice_z(double *Phi, double value, int Nx, int Ny, int Nz, int Slice){ - int n; - for (n=Slice*Nx*Ny; n<(Slice+1)*Nx*Ny; n++){ + for (int n=Slice*Nx*Ny; n<(Slice+1)*Nx*Ny; n++){ Phi[n] = value; } } @@ -1255,7 +1246,7 @@ extern "C" void ScaLBL_D3Q19_AAeven_Color(int *Map, double *dist, double *Aq, do double *Vel, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, double Fx, double Fy, double Fz, int strideY, int strideZ, int start, int finish, int Np){ - int ijk,nn,n; + int ijk,nn; double fq; // conserved momemnts double rho,jx,jy,jz; @@ -1838,7 +1829,7 @@ extern "C" void ScaLBL_D3Q19_AAodd_Color(int *neighborList, int *Map, double *di double *Phi, double *Vel, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, double Fx, double Fy, double Fz, int strideY, int strideZ, int start, int finish, int Np){ - int n,nn,ijk,nread; + int nn,ijk,nread; int nr1,nr2,nr3,nr4,nr5,nr6; int nr7,nr8,nr9,nr10; int nr11,nr12,nr13,nr14; @@ -2492,7 +2483,7 @@ extern "C" void ScaLBL_D3Q19_AAodd_Color(int *neighborList, int *Map, double *di extern "C" void ScaLBL_D3Q7_AAodd_PhaseField(int *neighborList, int *Map, double *Aq, double *Bq, double *Den, double *Phi, int start, int finish, int Np){ - int idx,n,nread; + int idx, nread; double fq,nA,nB; for (int n=start; n #include -ScaLBL_ColorModel::ScaLBL_ColorModel(int RANK, int NP, MPI_Comm COMM): +ScaLBL_ColorModel::ScaLBL_ColorModel(int RANK, int NP, const Utilities::MPI& COMM): rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tauA(0),tauB(0),rhoA(0),rhoB(0),alpha(0),beta(0), Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0),inletA(0),inletB(0),outletA(0),outletB(0), Nx(0),Ny(0),Nz(0),N(0),Np(0),nprocx(0),nprocy(0),nprocz(0),BoundaryCondition(0),Lx(0),Ly(0),Lz(0),comm(COMM) @@ -167,9 +167,9 @@ void ScaLBL_ColorModel::SetDomain(){ for (int i=0; iid[i] = 1; // initialize this way //Averages = std::shared_ptr ( new TwoPhase(Dm) ); // TwoPhase analysis object Averages = std::shared_ptr ( new SubPhase(Dm) ); // TwoPhase analysis object - MPI_Barrier(comm); + comm.barrier(); Dm->CommInit(); - MPI_Barrier(comm); + comm.barrier(); // Read domain parameters rank = Dm->rank(); nprocx = Dm->nprocx(); @@ -292,7 +292,7 @@ void ScaLBL_ColorModel::AssignComponentLabels(double *phase) for (int i=0; iid[i] = Mask->id[i]; for (size_t idx=0; idxComm, label_count[idx]); + label_count_global[idx] = Dm->Comm.sumReduce( label_count[idx] ); if (rank==0){ printf("Component labels: %lu \n",NLABELS); @@ -333,7 +333,7 @@ void ScaLBL_ColorModel::Create(){ Map.resize(Nx,Ny,Nz); Map.fill(-2); auto neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Mask->id,Np); - MPI_Barrier(comm); + comm.barrier(); //........................................................................... // MAIN VARIABLES ALLOCATED HERE @@ -465,7 +465,7 @@ void ScaLBL_ColorModel::Initialize(){ ScaLBL_CopyToDevice(Phi,cPhi,N*sizeof(double)); ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); } if (rank==0) printf ("Initializing phase field \n"); @@ -651,7 +651,7 @@ void ScaLBL_ColorModel::Run(){ //.......create and start timer............ double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); starttime = MPI_Wtime(); //......................................... 
@@ -700,7 +700,8 @@ void ScaLBL_ColorModel::Run(){ } ScaLBL_D3Q19_AAodd_Color(NeighborList, dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); + comm.barrier(); // *************EVEN TIMESTEP************* timestep++; @@ -735,10 +736,10 @@ void ScaLBL_ColorModel::Run(){ } ScaLBL_D3Q19_AAeven_Color(dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); + comm.barrier(); //************************************************************************ - MPI_Barrier(comm); PROFILE_STOP("Update"); if (rank==0 && timestep%analysis_interval == 0 && BoundaryCondition > 0){ @@ -979,7 +980,7 @@ void ScaLBL_ColorModel::Run(){ //morph_delta *= (-1.0); REVERSE_FLOW_DIRECTION = false; } - MPI_Barrier(comm); + comm.barrier(); } morph_timesteps += analysis_interval; } @@ -989,7 +990,7 @@ void ScaLBL_ColorModel::Run(){ PROFILE_SAVE("lbpm_color_simulator",1); //************************************************************************ ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep @@ -1034,17 +1035,17 @@ double ScaLBL_ColorModel::ImageInit(std::string Filename){ } } - Count=sumReduce( Dm->Comm, Count); - PoreCount=sumReduce( Dm->Comm, PoreCount); + Count = Dm->Comm.sumReduce( Count ); + PoreCount = Dm->Comm.sumReduce( PoreCount ); if (rank==0) printf(" new saturation: %f (%f / %f) \n", Count / PoreCount, Count, PoreCount); ScaLBL_CopyToDevice(Phi, PhaseLabel, Nx*Ny*Nz*sizeof(double)); - MPI_Barrier(comm); + comm.barrier(); ScaLBL_D3Q19_Init(fq, Np); ScaLBL_PhaseField_Init(dvcMap, Phi, Den, Aq, Bq, 0, ScaLBL_Comm->LastExterior(), Np); ScaLBL_PhaseField_Init(dvcMap, Phi, Den, Aq, Bq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np); - MPI_Barrier(comm); + comm.barrier(); ScaLBL_CopyToHost(Averages->Phi.data(),Phi,Nx*Ny*Nz*sizeof(double)); @@ -1076,7 +1077,7 @@ double ScaLBL_ColorModel::MorphOpenConnected(double target_volume_change){ BlobIDstruct new_index; double vF=0.0; double vS=0.0; ComputeGlobalBlobIDs(nx-2,ny-2,nz-2,Dm->rank_info,phase,Averages->SDs,vF,vS,phase_label,Dm->Comm); - MPI_Barrier(Dm->Comm); + Dm->Comm.barrier(); long long count_connected=0; long long count_porespace=0; @@ -1098,9 +1099,9 @@ double ScaLBL_ColorModel::MorphOpenConnected(double target_volume_change){ } } } - count_connected=sumReduce( Dm->Comm, count_connected); - count_porespace=sumReduce( Dm->Comm, count_porespace); - count_water=sumReduce( Dm->Comm, count_water); + count_connected = Dm->Comm.sumReduce( count_connected); + count_porespace = Dm->Comm.sumReduce( count_porespace); + count_water = Dm->Comm.sumReduce( count_water); for (int k=0; kComm, count_morphopen); + count_morphopen = Dm->Comm.sumReduce( count_morphopen); volume_change = double(count_morphopen - count_connected); if (rank==0) printf(" opening of connected oil %f \n",volume_change/count_connected); @@ -1278,8 +1279,8 @@ double ScaLBL_ColorModel::SeedPhaseField(const double seed_water_in_oil){ mass_loss += random_value*seed_water_in_oil; } - count= sumReduce( Dm->Comm, count); - mass_loss= sumReduce( Dm->Comm, mass_loss); + count = Dm->Comm.sumReduce( count ); + mass_loss = 
Dm->Comm.sumReduce( mass_loss ); if (rank == 0) printf("Remove mass %f from %f voxels \n",mass_loss,count); // Need to initialize Aq, Bq, Den, Phi directly @@ -1316,7 +1317,7 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta } } } - double volume_initial = sumReduce( Dm->Comm, count); + double volume_initial = Dm->Comm.sumReduce( count); /* sprintf(LocalRankFilename,"phi_initial.%05i.raw",rank); FILE *INPUT = fopen(LocalRankFilename,"wb"); @@ -1326,7 +1327,7 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta // 2. Identify connected components of phase field -> phase_label BlobIDstruct new_index; ComputeGlobalBlobIDs(Nx-2,Ny-2,Nz-2,rank_info,phase,Averages->SDs,vF,vS,phase_label,comm); - MPI_Barrier(comm); + comm.barrier(); // only operate on component "0" count = 0.0; @@ -1348,8 +1349,8 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta } } } - double volume_connected = sumReduce( Dm->Comm, count); - second_biggest = sumReduce( Dm->Comm, second_biggest); + double volume_connected = Dm->Comm.sumReduce( count ); + second_biggest = Dm->Comm.sumReduce( second_biggest ); /*int reach_x, reach_y, reach_z; for (int k=0; kComm, count); + double volume_final = Dm->Comm.sumReduce( count ); delta_volume = (volume_final-volume_initial); if (rank == 0) printf("MorphInit: change fluid volume fraction by %f \n", delta_volume/volume_initial); diff --git a/models/ColorModel.h b/models/ColorModel.h index a3b3a124..c52f04c3 100644 --- a/models/ColorModel.h +++ b/models/ColorModel.h @@ -12,13 +12,13 @@ Implementation of color lattice boltzmann model #include "common/Communication.h" #include "analysis/TwoPhase.h" #include "analysis/runAnalysis.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "ProfilerApp.h" #include "threadpool/thread_pool.h" class ScaLBL_ColorModel{ public: - ScaLBL_ColorModel(int RANK, int NP, MPI_Comm COMM); + ScaLBL_ColorModel(int RANK, int NP, const Utilities::MPI& COMM); ~ScaLBL_ColorModel(); // functions in they should be run @@ -68,7 +68,7 @@ public: double *Pressure; private: - MPI_Comm comm; + Utilities::MPI comm; int dist_mem_size; int neighborSize; diff --git a/models/DFHModel.cpp b/models/DFHModel.cpp index 4eb03bea..ced5853f 100644 --- a/models/DFHModel.cpp +++ b/models/DFHModel.cpp @@ -3,7 +3,7 @@ color lattice boltzmann model */ #include "models/DFHModel.h" -ScaLBL_DFHModel::ScaLBL_DFHModel(int RANK, int NP, MPI_Comm COMM): +ScaLBL_DFHModel::ScaLBL_DFHModel(int RANK, int NP, const Utilities::MPI& COMM): rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tauA(0),tauB(0),rhoA(0),rhoB(0),alpha(0),beta(0), Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0),inletA(0),inletB(0),outletA(0),outletB(0), Nx(0),Ny(0),Nz(0),N(0),Np(0),nprocx(0),nprocy(0),nprocz(0),BoundaryCondition(0),Lx(0),Ly(0),Lz(0),comm(COMM) @@ -100,16 +100,16 @@ void ScaLBL_DFHModel::ReadParams(string filename){ } void ScaLBL_DFHModel::SetDomain(){ - Dm = std::shared_ptr(new Domain(domain_db,comm)); // full domain for analysis - Mask = std::shared_ptr(new Domain(domain_db,comm)); // mask domain removes immobile phases + Dm = std::make_shared(domain_db,comm); // full domain for analysis + Mask = std::make_shared(domain_db,comm); // mask domain removes immobile phases Nx+=2; Ny+=2; Nz += 2; N = Nx*Ny*Nz; id = new char [N]; - for (int i=0; iid[i] = 1; // initialize this way - Averages = std::shared_ptr ( new TwoPhase(Dm) ); // TwoPhase analysis object - MPI_Barrier(comm); + for (int i=0; iid[i] = 1; 
// initialize this way + Averages = std::make_shared( Dm ); // TwoPhase analysis object + comm.barrier(); Dm->CommInit(); - MPI_Barrier(comm); + comm.barrier(); rank = Dm->rank(); } @@ -131,7 +131,7 @@ void ScaLBL_DFHModel::ReadInput(){ sprintf(LocalRankString,"%05d",rank); sprintf(LocalRankFilename,"%s%s","SignDist.",LocalRankString); ReadBinaryFile(LocalRankFilename, Averages->SDs.data(), N); - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; } @@ -206,7 +206,7 @@ void ScaLBL_DFHModel::Create(){ Map.resize(Nx,Ny,Nz); Map.fill(-2); auto neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Mask->id,Np); - MPI_Barrier(comm); + comm.barrier(); //........................................................................... // MAIN VARIABLES ALLOCATED HERE @@ -424,7 +424,7 @@ void ScaLBL_DFHModel::Initialize(){ } } } - MPI_Allreduce(&count_wet,&count_wet_global,1,MPI_DOUBLE,MPI_SUM,comm); + count_wet_global = comm.sumReduce( count_wet ); if (rank==0) printf("Wetting phase volume fraction =%f \n",count_wet_global/double(Nx*Ny*Nz*nprocs)); // initialize phi based on PhaseLabel (include solid component labels) ScaLBL_CopyToDevice(Phi, PhaseLabel, Np*sizeof(double)); @@ -446,7 +446,7 @@ void ScaLBL_DFHModel::Initialize(){ timestep=0; } } - MPI_Bcast(×tep,1,MPI_INT,0,comm); + comm.bcast(×tep,1,0); // Read in the restart file to CPU buffers double *cPhi = new double[Np]; double *cDist = new double[19*Np]; @@ -468,7 +468,7 @@ void ScaLBL_DFHModel::Initialize(){ ScaLBL_DeviceBarrier(); delete [] cPhi; delete [] cDist; - MPI_Barrier(comm); + comm.barrier(); } if (rank==0) printf ("Initializing phase field \n"); @@ -486,7 +486,7 @@ void ScaLBL_DFHModel::Run(){ //.......create and start timer............ double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); starttime = MPI_Wtime(); //......................................... 
//************ MAIN ITERATION LOOP ***************************************/ @@ -532,7 +532,8 @@ void ScaLBL_DFHModel::Run(){ } ScaLBL_D3Q19_AAodd_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, SolidPotential, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); + comm.barrier(); // *************EVEN TIMESTEP************* timestep++; @@ -568,9 +569,9 @@ void ScaLBL_DFHModel::Run(){ } ScaLBL_D3Q19_AAeven_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, SolidPotential, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); + comm.barrier(); //************************************************************************ - MPI_Barrier(comm); PROFILE_STOP("Update"); // Run the analysis @@ -581,7 +582,7 @@ void ScaLBL_DFHModel::Run(){ PROFILE_SAVE("lbpm_color_simulator",1); //************************************************************************ ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep diff --git a/models/DFHModel.h b/models/DFHModel.h index 883ec6f8..00e6e6b3 100644 --- a/models/DFHModel.h +++ b/models/DFHModel.h @@ -12,13 +12,13 @@ Implementation of color lattice boltzmann model #include "common/Communication.h" #include "analysis/TwoPhase.h" #include "analysis/runAnalysis.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "ProfilerApp.h" #include "threadpool/thread_pool.h" class ScaLBL_DFHModel{ public: - ScaLBL_DFHModel(int RANK, int NP, MPI_Comm COMM); + ScaLBL_DFHModel(int RANK, int NP, const Utilities::MPI& COMM); ~ScaLBL_DFHModel(); // functions in they should be run @@ -66,7 +66,7 @@ public: double *Pressure; private: - MPI_Comm comm; + Utilities::MPI comm; int dist_mem_size; int neighborSize; diff --git a/models/MRTModel.cpp b/models/MRTModel.cpp index 9ba733ae..d9b8069d 100644 --- a/models/MRTModel.cpp +++ b/models/MRTModel.cpp @@ -4,7 +4,7 @@ #include "models/MRTModel.h" #include "analysis/distance.h" -ScaLBL_MRTModel::ScaLBL_MRTModel(int RANK, int NP, MPI_Comm COMM): +ScaLBL_MRTModel::ScaLBL_MRTModel(int RANK, int NP, const Utilities::MPI& COMM): rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tau(0), Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0),mu(0), Nx(0),Ny(0),Nz(0),N(0),Np(0),nprocx(0),nprocy(0),nprocz(0),BoundaryCondition(0),Lx(0),Ly(0),Lz(0),comm(COMM) @@ -82,9 +82,9 @@ void ScaLBL_MRTModel::SetDomain(){ for (int i=0; iid[i] = 1; // initialize this way //Averages = std::shared_ptr ( new TwoPhase(Dm) ); // TwoPhase analysis object - MPI_Barrier(comm); + comm.barrier(); Dm->CommInit(); - MPI_Barrier(comm); + comm.barrier(); rank = Dm->rank(); nprocx = Dm->nprocx(); @@ -152,7 +152,7 @@ void ScaLBL_MRTModel::Create(){ Map.resize(Nx,Ny,Nz); Map.fill(-2); auto neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Mask->id,Np); - MPI_Barrier(comm); + comm.barrier(); //........................................................................... // MAIN VARIABLES ALLOCATED HERE //........................................................................... 
@@ -171,7 +171,7 @@ void ScaLBL_MRTModel::Create(){ if (rank==0) printf ("Setting up device map and neighbor list \n"); // copy the neighbor list ScaLBL_CopyToDevice(NeighborList, neighborList, neighborSize); - MPI_Barrier(comm); + comm.barrier(); } @@ -206,7 +206,8 @@ void ScaLBL_MRTModel::Run(){ //.......create and start timer............ double starttime,stoptime,cputime; - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); + comm.barrier(); starttime = MPI_Wtime(); if (rank==0) printf("Beginning AA timesteps, timestepMax = %i \n", timestepMax); if (rank==0) printf("********************************************************\n"); @@ -220,18 +221,21 @@ void ScaLBL_MRTModel::Run(){ ScaLBL_D3Q19_AAodd_MRT(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAodd_MRT(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); + comm.barrier(); timestep++; ScaLBL_Comm->SendD3Q19AA(fq); //READ FORM NORMAL ScaLBL_D3Q19_AAeven_MRT(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAeven_MRT(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); + comm.barrier(); //************************************************************************/ if (timestep%1000==0){ ScaLBL_D3Q19_Momentum(fq,Velocity, Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); + comm.barrier(); ScaLBL_Comm->RegularLayout(Map,&Velocity[0],Velocity_x); ScaLBL_Comm->RegularLayout(Map,&Velocity[Np],Velocity_y); ScaLBL_Comm->RegularLayout(Map,&Velocity[2*Np],Velocity_z); @@ -253,10 +257,10 @@ void ScaLBL_MRTModel::Run(){ } } } - MPI_Allreduce(&vax_loc,&vax,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - MPI_Allreduce(&vay_loc,&vay,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - MPI_Allreduce(&vaz_loc,&vaz,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - MPI_Allreduce(&count_loc,&count,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + vax = Mask->Comm.sumReduce( vax_loc ); + vay = Mask->Comm.sumReduce( vay_loc ); + vaz = Mask->Comm.sumReduce( vaz_loc ); + count = Mask->Comm.sumReduce( count_loc ); vax /= count; vay /= count; @@ -286,10 +290,10 @@ void ScaLBL_MRTModel::Run(){ double As = Morphology.A(); double Hs = Morphology.H(); double Xs = Morphology.X(); - Vs=sumReduce( Dm->Comm, Vs); - As=sumReduce( Dm->Comm, As); - Hs=sumReduce( Dm->Comm, Hs); - Xs=sumReduce( Dm->Comm, Xs); + Vs = Dm->Comm.sumReduce( Vs); + As = Dm->Comm.sumReduce( As); + Hs = Dm->Comm.sumReduce( Hs); + Xs = Dm->Comm.sumReduce( Xs); double h = Dm->voxel_length; double absperm = h*h*mu*Mask->Porosity()*flow_rate / force_mag; if (rank==0) { @@ -323,7 +327,8 @@ void ScaLBL_MRTModel::VelocityField(){ /* Minkowski Morphology(Mask); int SIZE=Np*sizeof(double); ScaLBL_D3Q19_Momentum(fq,Velocity, Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier();. 
+ comm.barrier(); ScaLBL_CopyToHost(&VELOCITY[0],&Velocity[0],3*SIZE); memcpy(Morphology.SDn.data(), Distance.data(), Nx*Ny*Nz*sizeof(double)); @@ -350,10 +355,10 @@ void ScaLBL_MRTModel::VelocityField(){ vaz_loc += VELOCITY[2*Np+n]; count_loc+=1.0; } - MPI_Allreduce(&vax_loc,&vax,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - MPI_Allreduce(&vay_loc,&vay,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - MPI_Allreduce(&vaz_loc,&vaz,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - MPI_Allreduce(&count_loc,&count,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + vax = Mask->Comm.sumReduce( vax_loc ); + vay = Mask->Comm.sumReduce( vay_loc ); + vaz = Mask->Comm.sumReduce( vaz_loc ); + count = Mask->Comm.sumReduce( count_loc ); vax /= count; vay /= count; diff --git a/models/MRTModel.h b/models/MRTModel.h index aa4ee1f0..7e23cc44 100644 --- a/models/MRTModel.h +++ b/models/MRTModel.h @@ -11,13 +11,13 @@ #include "common/ScaLBL.h" #include "common/Communication.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "analysis/Minkowski.h" #include "ProfilerApp.h" class ScaLBL_MRTModel{ public: - ScaLBL_MRTModel(int RANK, int NP, MPI_Comm COMM); + ScaLBL_MRTModel(int RANK, int NP, const Utilities::MPI& COMM); ~ScaLBL_MRTModel(); // functions in they should be run @@ -63,7 +63,7 @@ public: DoubleArray Velocity_y; DoubleArray Velocity_z; private: - MPI_Comm comm; + Utilities::MPI comm; // filenames char LocalRankString[8]; diff --git a/tests/BlobAnalyzeParallel.cpp b/tests/BlobAnalyzeParallel.cpp index c9e3f8fc..48e9e230 100644 --- a/tests/BlobAnalyzeParallel.cpp +++ b/tests/BlobAnalyzeParallel.cpp @@ -100,11 +100,10 @@ inline void WriteBlobStates(TwoPhase TCAT, double D, double porosity){ int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); Utilities::setAbortBehavior( true, true, true ); Utilities::setErrorHandlers(); PROFILE_ENABLE(0); @@ -137,7 +136,7 @@ int main(int argc, char **argv) domain >> Ly; domain >> Lz; } - MPI_Barrier(comm); + comm.barrier(); // Computational domain MPI_Bcast(&nx,1,MPI_INT,0,comm); MPI_Bcast(&ny,1,MPI_INT,0,comm); @@ -150,7 +149,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. - MPI_Barrier(comm); + comm.barrier(); // Check that the number of processors >= the number of ranks if ( rank==0 ) { @@ -209,7 +208,7 @@ int main(int argc, char **argv) // WriteLocalSolidID(LocalRankFilename, id, N); sprintf(LocalRankFilename,"%s%s","SignDist.",LocalRankString); ReadBinaryFile(LocalRankFilename, Averages.SDs.get(), N); - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; //....................................................................... //copies of data needed to perform checkpointing from cpu @@ -221,7 +220,7 @@ int main(int argc, char **argv) if (rank==0) printf("Reading restart file! \n"); // Read in the restart file to CPU buffers ReadCheckpoint(LocalRestartFile, Den, DistEven, DistOdd, N); - MPI_Barrier(comm); + comm.barrier(); //......................................................................... 
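The test drivers converge on one initialization pattern: construct the wrapper around MPI_COMM_WORLD right after MPI_Init and read rank and size through its accessors. Note that this file still broadcasts the domain parameters with raw MPI_Bcast calls whose last argument is comm, now a Utilities::MPI rather than an MPI_Comm; unless the wrapper converts implicitly, those calls will presumably move to comm.bcast(...) or comm.getCommunicator() as well. A minimal sketch of the pattern:

    #include "common/MPI.h"

    int main( int argc, char **argv )
    {
        MPI_Init( &argc, &argv );
        Utilities::MPI comm( MPI_COMM_WORLD );   // wrapper around the world communicator
        int rank   = comm.getRank();             // replaces MPI_Comm_rank
        int nprocs = comm.getSize();             // replaces MPI_Comm_size
        // ... read input on rank 0, broadcast, run the test ...
        comm.barrier();
        MPI_Finalize();
        return 0;
    }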
// Populate the arrays needed to perform averaging if (rank==0) printf("Populate arrays \n"); @@ -329,14 +328,14 @@ int main(int argc, char **argv) // BlobContainer Blobs; DoubleArray RecvBuffer(dimx); // MPI_Allreduce(&Averages.BlobAverages.get(),&Blobs.get(),1,MPI_DOUBLE,MPI_SUM,Dm.Comm); - MPI_Barrier(comm); + comm.barrier(); if (rank==0) printf("Number of components is %i \n",dimy); for (int b=0; b 0.0){ double Vn,pn,awn,ans,Jwn,Kwn,lwns,cwns,trawn,trJwn; @@ -482,7 +481,7 @@ int main(int argc, char **argv) fclose(BLOBS);*/ PROFILE_STOP("main"); PROFILE_SAVE("BlobIdentifyParallel",false); - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return 0; } diff --git a/tests/BlobIdentifyParallel.cpp b/tests/BlobIdentifyParallel.cpp index f93371cb..b8929a11 100644 --- a/tests/BlobIdentifyParallel.cpp +++ b/tests/BlobIdentifyParallel.cpp @@ -47,11 +47,10 @@ void readRankData( int proc, int nx, int ny, int nz, DoubleArray& Phase, DoubleA int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); #ifdef PROFILE PROFILE_ENABLE(0); PROFILE_DISABLE_TRACE(); @@ -129,7 +128,7 @@ int main(int argc, char **argv) PROFILE_STOP("main"); PROFILE_SAVE("BlobIdentifyParallel",false); #endif - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return 0; } diff --git a/tests/ColorToBinary.cpp b/tests/ColorToBinary.cpp index 7ac740bc..fae156d1 100644 --- a/tests/ColorToBinary.cpp +++ b/tests/ColorToBinary.cpp @@ -114,11 +114,10 @@ inline void ReadFromRank(char *FILENAME, DoubleArray &Phase, int nx, int ny, int int main(int argc, char **argv) { // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); printf("----------------------------------------------------------\n"); printf("Creating single Binary file from restart (8-bit integer)\n"); @@ -276,7 +275,7 @@ int main(int argc, char **argv) */ // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** } diff --git a/tests/ComponentLabel.cpp b/tests/ComponentLabel.cpp index 07ef6555..624ce8f4 100644 --- a/tests/ComponentLabel.cpp +++ b/tests/ComponentLabel.cpp @@ -119,11 +119,10 @@ inline void ReadFromRank(char *FILENAME, DoubleArray &Phase, DoubleArray &Pressu int main(int argc, char **argv) { // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); printf("----------------------------------------------------------\n"); printf("COMPUTING TCAT ANALYSIS FOR NON-WETTING PHASE FEATURES \n"); @@ -433,7 +432,7 @@ int main(int argc, char **argv) fclose(DISTANCE); */ // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** } diff --git a/tests/GenerateSphereTest.cpp b/tests/GenerateSphereTest.cpp index 53fc8746..5886be21 100644 --- a/tests/GenerateSphereTest.cpp +++ b/tests/GenerateSphereTest.cpp @@ -9,7 +9,7 @@ //#include 
"common/pmmc.h" #include "common/Domain.h" #include "common/SpherePack.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Communication.h" /* @@ -70,8 +70,8 @@ inline void MorphOpen(DoubleArray SignDist, char *id, Domain &Dm, int nx, int ny } } // total Global is the number of nodes in the pore-space - MPI_Allreduce(&count,&totalGlobal,1,MPI_DOUBLE,MPI_SUM,Dm.Comm); - MPI_Allreduce(&maxdist,&maxdistGlobal,1,MPI_DOUBLE,MPI_MAX,Dm.Comm); + totalGlobal = Dm.Comm.sumReduce( count ); + maxdistGlobal = Dm.Comm.sumReduce( maxdist ); double volume=double(nprocx*nprocy*nprocz)*double(nx-2)*double(ny-2)*double(nz-2); double porosity=totalGlobal/volume; if (rank==0) printf("Media Porosity: %f \n",porosity); @@ -148,7 +148,6 @@ inline void MorphOpen(DoubleArray SignDist, char *id, Domain &Dm, int nx, int ny double Rcrit_old; double Rcrit_new; - double GlobalNumber = 1.f; int imin,jmin,kmin,imax,jmax,kmax; Rcrit_new = maxdistGlobal; @@ -215,41 +214,41 @@ inline void MorphOpen(DoubleArray SignDist, char *id, Domain &Dm, int nx, int ny PackID(Dm.sendList_YZ, Dm.sendCount_YZ ,sendID_YZ, id); //...................................................................................... MPI_Sendrecv(sendID_x,Dm.sendCount_x,MPI_CHAR,Dm.rank_x(),sendtag, - recvID_X,Dm.recvCount_X,MPI_CHAR,Dm.rank_X(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_X,Dm.recvCount_X,MPI_CHAR,Dm.rank_X(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_X,Dm.sendCount_X,MPI_CHAR,Dm.rank_X(),sendtag, - recvID_x,Dm.recvCount_x,MPI_CHAR,Dm.rank_x(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_x,Dm.recvCount_x,MPI_CHAR,Dm.rank_x(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_y,Dm.sendCount_y,MPI_CHAR,Dm.rank_y(),sendtag, - recvID_Y,Dm.recvCount_Y,MPI_CHAR,Dm.rank_Y(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_Y,Dm.recvCount_Y,MPI_CHAR,Dm.rank_Y(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Y,Dm.sendCount_Y,MPI_CHAR,Dm.rank_Y(),sendtag, - recvID_y,Dm.recvCount_y,MPI_CHAR,Dm.rank_y(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_y,Dm.recvCount_y,MPI_CHAR,Dm.rank_y(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_z,Dm.sendCount_z,MPI_CHAR,Dm.rank_z(),sendtag, - recvID_Z,Dm.recvCount_Z,MPI_CHAR,Dm.rank_Z(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_Z,Dm.recvCount_Z,MPI_CHAR,Dm.rank_Z(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Z,Dm.sendCount_Z,MPI_CHAR,Dm.rank_Z(),sendtag, - recvID_z,Dm.recvCount_z,MPI_CHAR,Dm.rank_z(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_z,Dm.recvCount_z,MPI_CHAR,Dm.rank_z(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xy,Dm.sendCount_xy,MPI_CHAR,Dm.rank_xy(),sendtag, - recvID_XY,Dm.recvCount_XY,MPI_CHAR,Dm.rank_XY(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_XY,Dm.recvCount_XY,MPI_CHAR,Dm.rank_XY(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_XY,Dm.sendCount_XY,MPI_CHAR,Dm.rank_XY(),sendtag, - recvID_xy,Dm.recvCount_xy,MPI_CHAR,Dm.rank_xy(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_xy,Dm.recvCount_xy,MPI_CHAR,Dm.rank_xy(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Xy,Dm.sendCount_Xy,MPI_CHAR,Dm.rank_Xy(),sendtag, - recvID_xY,Dm.recvCount_xY,MPI_CHAR,Dm.rank_xY(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_xY,Dm.recvCount_xY,MPI_CHAR,Dm.rank_xY(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); 
MPI_Sendrecv(sendID_xY,Dm.sendCount_xY,MPI_CHAR,Dm.rank_xY(),sendtag, - recvID_Xy,Dm.recvCount_Xy,MPI_CHAR,Dm.rank_Xy(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_Xy,Dm.recvCount_Xy,MPI_CHAR,Dm.rank_Xy(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xz,Dm.sendCount_xz,MPI_CHAR,Dm.rank_xz(),sendtag, - recvID_XZ,Dm.recvCount_XZ,MPI_CHAR,Dm.rank_XZ(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_XZ,Dm.recvCount_XZ,MPI_CHAR,Dm.rank_XZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_XZ,Dm.sendCount_XZ,MPI_CHAR,Dm.rank_XZ(),sendtag, - recvID_xz,Dm.recvCount_xz,MPI_CHAR,Dm.rank_xz(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_xz,Dm.recvCount_xz,MPI_CHAR,Dm.rank_xz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Xz,Dm.sendCount_Xz,MPI_CHAR,Dm.rank_Xz(),sendtag, - recvID_xZ,Dm.recvCount_xZ,MPI_CHAR,Dm.rank_xZ(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_xZ,Dm.recvCount_xZ,MPI_CHAR,Dm.rank_xZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xZ,Dm.sendCount_xZ,MPI_CHAR,Dm.rank_xZ(),sendtag, - recvID_Xz,Dm.recvCount_Xz,MPI_CHAR,Dm.rank_Xz(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_Xz,Dm.recvCount_Xz,MPI_CHAR,Dm.rank_Xz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_yz,Dm.sendCount_yz,MPI_CHAR,Dm.rank_yz(),sendtag, - recvID_YZ,Dm.recvCount_YZ,MPI_CHAR,Dm.rank_YZ(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_YZ,Dm.recvCount_YZ,MPI_CHAR,Dm.rank_YZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_YZ,Dm.sendCount_YZ,MPI_CHAR,Dm.rank_YZ(),sendtag, - recvID_yz,Dm.recvCount_yz,MPI_CHAR,Dm.rank_yz(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_yz,Dm.recvCount_yz,MPI_CHAR,Dm.rank_yz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Yz,Dm.sendCount_Yz,MPI_CHAR,Dm.rank_Yz(),sendtag, - recvID_yZ,Dm.recvCount_yZ,MPI_CHAR,Dm.rank_yZ(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_yZ,Dm.recvCount_yZ,MPI_CHAR,Dm.rank_yZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_yZ,Dm.sendCount_yZ,MPI_CHAR,Dm.rank_yZ(),sendtag, - recvID_Yz,Dm.recvCount_Yz,MPI_CHAR,Dm.rank_Yz(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); + recvID_Yz,Dm.recvCount_Yz,MPI_CHAR,Dm.rank_Yz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); //...................................................................................... UnpackID(Dm.recvList_x, Dm.recvCount_x ,recvID_x, id); UnpackID(Dm.recvList_X, Dm.recvCount_X ,recvID_X, id); @@ -271,7 +270,7 @@ inline void MorphOpen(DoubleArray SignDist, char *id, Domain &Dm, int nx, int ny UnpackID(Dm.recvList_YZ, Dm.recvCount_YZ ,recvID_YZ, id); //...................................................................................... - MPI_Allreduce(&LocalNumber,&GlobalNumber,1,MPI_DOUBLE,MPI_SUM,Dm.Comm); + //double GlobalNumber = Dm.Comm.sumReduce( LocalNumber ); count = 0.f; for (int k=1; k= the number of ranks if ( rank==0 ) { @@ -254,14 +253,14 @@ int main(int argc, char **argv) cz[0]=0.25*Lz; cx[1]=0.75*Lz; cx[2]=0.25*Lz; cx[3]=0.25*Lz; rad[0]=rad[1]=rad[2]=rad[3]=0.1*Lx; - MPI_Barrier(comm); + comm.barrier(); // Broadcast the sphere packing to all processes MPI_Bcast(cx,nspheres,MPI_DOUBLE,0,comm); MPI_Bcast(cy,nspheres,MPI_DOUBLE,0,comm); MPI_Bcast(cz,nspheres,MPI_DOUBLE,0,comm); MPI_Bcast(rad,nspheres,MPI_DOUBLE,0,comm); //........................................................................... 
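Where raw point-to-point calls are kept, such as the halo exchange of ID labels above, the underlying MPI_Comm is pulled out of the wrapper with getCommunicator(). One exchange pair, condensed from the block above (the same form repeats for all 18 directions):

    MPI_Sendrecv( sendID_x, Dm.sendCount_x, MPI_CHAR, Dm.rank_x(), sendtag,
                  recvID_X, Dm.recvCount_X, MPI_CHAR, Dm.rank_X(), recvtag,
                  Dm.Comm.getCommunicator(), MPI_STATUS_IGNORE );   // raw handle obtained from the wrapper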
- MPI_Barrier(comm); + comm.barrier(); //....................................................................... SignedDistance(Averages.Phase.data(),nspheres,cx,cy,cz,rad,Lx,Ly,Lz,Nx,Ny,Nz, Dm->iproc(),Dm->jproc(),Dm->kproc(),Dm->nprocx(),Dm->nprocy(),Dm->nprocz()); @@ -317,7 +316,7 @@ int main(int argc, char **argv) delete [] rad; } // Limit scope so variables that contain communicators will free before MPI_Finialize - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return 0; } diff --git a/tests/TestBlobIdentify.cpp b/tests/TestBlobIdentify.cpp index ccfc6afc..7eb5c270 100644 --- a/tests/TestBlobIdentify.cpp +++ b/tests/TestBlobIdentify.cpp @@ -23,21 +23,19 @@ inline double rand2() // Test if all ranks agree on a value -bool allAgree( int x, MPI_Comm comm ) { +bool allAgree( int x, const Utilities::MPI& comm ) { int x2 = x; - MPI_Bcast(&x2,1,MPI_INT,0,comm); + comm.bcast(&x2,1,0); int diff = x==x2 ? 0:1; - int diff2 = 0; - MPI_Allreduce(&diff,&diff2,1,MPI_INT,MPI_SUM,comm); + int diff2 = comm.sumReduce( diff ); return diff2==0; } template -bool allAgree( const std::vector& x, MPI_Comm comm ) { +bool allAgree( const std::vector& x, const Utilities::MPI& comm ) { std::vector x2 = x; - MPI_Bcast(&x2[0],x.size()*sizeof(T)/sizeof(int),MPI_INT,0,comm); + comm.bcast(&x2[0],x.size()*sizeof(T)/sizeof(int),0); int diff = x==x2 ? 0:1; - int diff2 = 0; - MPI_Allreduce(&diff,&diff2,1,MPI_INT,MPI_SUM,comm); + int diff2 = comm.sumReduce( diff ); return diff2==0; } @@ -74,9 +72,9 @@ struct bubble_struct { // Create a random set of bubles -std::vector create_bubbles( int N_bubbles, double Lx, double Ly, double Lz, MPI_Comm comm ) +std::vector create_bubbles( int N_bubbles, double Lx, double Ly, double Lz, const Utilities::MPI& comm ) { - int rank = comm_rank(comm); + int rank = comm.getRank(); std::vector bubbles(N_bubbles); if ( rank == 0 ) { double R0 = 0.2*Lx*Ly*Lz/pow((double)N_bubbles,0.333); @@ -91,7 +89,7 @@ std::vector create_bubbles( int N_bubbles, double Lx, double Ly, } } size_t N_bytes = N_bubbles*sizeof(bubble_struct); - MPI_Bcast((char*)&bubbles[0],N_bytes,MPI_CHAR,0,comm); + comm.bcast((char*)&bubbles[0],N_bytes,0); return bubbles; } @@ -124,7 +122,7 @@ void fillBubbleData( const std::vector& bubbles, DoubleArray& Pha // Shift all of the data by the given number of cells -void shift_data( DoubleArray& data, int sx, int sy, int sz, const RankInfoStruct& rank_info, MPI_Comm comm ) +void shift_data( DoubleArray& data, int sx, int sy, int sz, const RankInfoStruct& rank_info, const Utilities::MPI& comm ) { int nx = data.size(0)-2; int ny = data.size(1)-2; @@ -154,11 +152,10 @@ void shift_data( DoubleArray& data, int sx, int sy, int sz, const RankInfoStruct int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); PROFILE_ENABLE(1); PROFILE_DISABLE_TRACE(); PROFILE_SYNCHRONIZE(); @@ -297,7 +294,7 @@ int main(int argc, char **argv) velocity[i].z = bubbles[i].radius*(2*rand2()-1); } } - MPI_Bcast((char*)&velocity[0],bubbles.size()*sizeof(Point),MPI_CHAR,0,comm); + comm.bcast((char*)&velocity[0],bubbles.size()*sizeof(Point),0); fillBubbleData( bubbles, Phase, SignDist, Lx, Ly, Lz, rank_info ); fillData.fill(Phase); fillData.fill(SignDist); @@ -391,8 +388,8 @@ int main(int argc, char **argv) printf("\n"); } } - MPI_Bcast(&N1,1,MPI_INT,0,comm); - 
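create_bubbles broadcasts the bubble array from rank 0 as raw bytes through the wrapper's bcast(buffer, count, root). That is only safe for plain-old-data types; a minimal sketch with an abbreviated struct layout (the real bubble_struct carries more fields, and comm is the Utilities::MPI object from main):

    #include <vector>

    struct bubble_struct { double x, y, z, radius; };      // POD: no pointers, safe to send byte-wise
    std::vector<bubble_struct> bubbles( N_bubbles );
    if ( comm.getRank() == 0 ) { /* fill bubbles with random centers and radii */ }
    size_t N_bytes = bubbles.size() * sizeof(bubble_struct);
    comm.bcast( (char*) &bubbles[0], N_bytes, 0 );          // every rank receives rank 0's bubbles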
MPI_Bcast(&N2,1,MPI_INT,0,comm); + comm.bcast(&N1,1,0); + comm.bcast(&N2,1,0); if ( N1!=nblobs || N2!=nblobs2 ) { if ( rank==0 ) printf("Error, blob ids do not map in moving bubble test (%i,%i,%i,%i)\n", @@ -412,7 +409,7 @@ int main(int argc, char **argv) // Finished PROFILE_STOP("main"); PROFILE_SAVE("TestBlobIdentify",false); - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return N_errors; } diff --git a/tests/TestBlobIdentifyCorners.cpp b/tests/TestBlobIdentifyCorners.cpp index 4795f610..904e52e0 100644 --- a/tests/TestBlobIdentifyCorners.cpp +++ b/tests/TestBlobIdentifyCorners.cpp @@ -18,10 +18,9 @@ int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm_rank(MPI_COMM_WORLD,&rank); - MPI_Comm_size(MPI_COMM_WORLD,&nprocs); + int rank = comm.getRank(); + int nprocs = comm.getSize(); /*if ( nprocs != 8 ) { printf("This tests requires 8 processors\n"); return -1; diff --git a/tests/TestBubble.cpp b/tests/TestBubble.cpp index c03e5dea..e7e0ced8 100644 --- a/tests/TestBubble.cpp +++ b/tests/TestBubble.cpp @@ -7,7 +7,7 @@ #include "analysis/pmmc.h" #include "common/ScaLBL.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Communication.h" #include "IO/Mesh.h" #include "IO/Writer.h" @@ -32,14 +32,15 @@ int main(int argc, char **argv) // Initialize MPI int provided_thread_support = -1; MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE,&provided_thread_support); - MPI_Comm comm; - MPI_Comm_dup(MPI_COMM_WORLD,&comm); - int rank = comm_rank(comm); - int nprocs = comm_size(comm); - if ( rank==0 && provided_thread_support(domain_db,comm); - MPI_Barrier(comm); + comm.barrier(); Nx+=2; Ny+=2; Nz += 2; int N = Nx*Ny*Nz; @@ -250,7 +249,7 @@ int main(int argc, char **argv) IntArray Map(Nx,Ny,Nz); auto neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Mask->id,Np); - MPI_Barrier(comm); + comm.barrier(); //........................................................................... // MAIN VARIABLES ALLOCATED HERE @@ -387,7 +386,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); starttime = MPI_Wtime(); //......................................... 
@@ -437,7 +436,7 @@ int main(int argc, char **argv) } ScaLBL_D3Q19_AAodd_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, SolidPotential, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->next, Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); // *************EVEN TIMESTEP************* timestep++; @@ -473,9 +472,9 @@ int main(int argc, char **argv) } ScaLBL_D3Q19_AAeven_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, SolidPotential, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->next, Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); //************************************************************************ - MPI_Barrier(comm); + comm.barrier(); PROFILE_STOP("Update"); // Run the analysis @@ -487,7 +486,7 @@ int main(int argc, char **argv) PROFILE_SAVE("lbpm_color_simulator",1); //************************************************************************ ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep @@ -547,9 +546,8 @@ int main(int argc, char **argv) PROFILE_STOP("Main"); PROFILE_SAVE("lbpm_color_simulator",1); // **************************************************** - MPI_Barrier(comm); + comm.barrier(); } // Limit scope so variables that contain communicators will free before MPI_Finialize - MPI_Comm_free(&comm); MPI_Finalize(); return check; } diff --git a/tests/TestColorBubble.cpp b/tests/TestColorBubble.cpp index 0e6ea25a..1f42a71e 100644 --- a/tests/TestColorBubble.cpp +++ b/tests/TestColorBubble.cpp @@ -7,7 +7,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "models/ColorModel.h" using namespace std; @@ -64,15 +64,11 @@ inline void InitializeBubble(ScaLBL_ColorModel &ColorModel, double BubbleRadius) //*************************************************************************************** int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); int check=0; { if (rank == 0){ @@ -97,7 +93,7 @@ int main(int argc, char **argv) ColorModel.WriteDebug(); } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** diff --git a/tests/TestColorGrad.cpp b/tests/TestColorGrad.cpp index 5cd6d924..df1c1daf 100644 --- a/tests/TestColorGrad.cpp +++ b/tests/TestColorGrad.cpp @@ -7,7 +7,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" using namespace std; @@ -15,15 +15,11 @@ using namespace std; //*************************************************************************************** int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( 
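With the wrapper holding the communicator handle, the explicit MPI_Comm_free at the end of TestBubble goes away; the existing "limit scope" comment still matters, since the wrapper is presumably released by its destructor before MPI_Finalize runs (MPI_COMM_WORLD itself is never freed). A sketch of the resulting teardown order:

    {
        Utilities::MPI comm( MPI_COMM_WORLD );
        // ... run the bubble test ...
        comm.barrier();
    }                  // wrapper (and anything else holding communicators) destroyed here
    MPI_Finalize();    // safe: no wrapper objects outlive this point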
MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); int check; { // parallel domain size (# of sub-domains) @@ -116,7 +112,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - MPI_Barrier(comm); + comm.barrier(); //................................................. MPI_Bcast(&Nx,1,MPI_INT,0,comm); MPI_Bcast(&Ny,1,MPI_INT,0,comm); @@ -129,7 +125,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. - MPI_Barrier(comm); + comm.barrier(); // ************************************************************** // ************************************************************** @@ -146,7 +142,7 @@ int main(int argc, char **argv) printf("********************************************************\n"); } - MPI_Barrier(comm); + comm.barrier(); double iVol_global = 1.0/Nx/Ny/Nz/nprocx/nprocy/nprocz; int BoundaryCondition=0; @@ -175,7 +171,7 @@ int main(int argc, char **argv) } } Dm.CommInit(); - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; if (rank==0) printf ("Create ScaLBL_Communicator \n"); @@ -192,7 +188,7 @@ int main(int argc, char **argv) neighborList= new int[18*Np]; ScaLBL_Comm.MemoryOptimizedLayoutAA(Map,neighborList,Dm.id,Np); - MPI_Barrier(comm); + comm.barrier(); //......................device distributions................................. int dist_mem_size = Np*sizeof(double); @@ -260,7 +256,7 @@ int main(int argc, char **argv) } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** diff --git a/tests/TestColorGradDFH.cpp b/tests/TestColorGradDFH.cpp index d6376d82..b04aebce 100644 --- a/tests/TestColorGradDFH.cpp +++ b/tests/TestColorGradDFH.cpp @@ -7,7 +7,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" using namespace std; @@ -25,15 +25,11 @@ std::shared_ptr loadInputs( int nprocs ) //*************************************************************************************** int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); int check=0; { // parallel domain size (# of sub-domains) @@ -82,7 +78,7 @@ int main(int argc, char **argv) } } Dm->CommInit(); - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; if (rank==0) printf ("Create ScaLBL_Communicator \n"); @@ -105,7 +101,7 @@ int main(int argc, char **argv) IntArray Map(Nx,Ny,Nz); neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Dm->id,Np); - MPI_Barrier(comm); + comm.barrier(); //......................device distributions................................. 
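TestColorGrad, like several of the other preprocessing tests, still broadcasts its domain parameters with raw MPI_Bcast calls whose last argument is comm, now a Utilities::MPI rather than an MPI_Comm. Unless the wrapper converts implicitly to MPI_Comm, these presumably end up on the wrapper's bcast or on getCommunicator(); a sketch of both options:

    // Option 1: wrapper broadcast, using the bcast(buffer, count, root) form shown earlier in this patch
    comm.bcast( &Nx, 1, 0 );
    comm.bcast( &Ny, 1, 0 );
    comm.bcast( &Nz, 1, 0 );
    // Option 2: keep the raw call but hand it the underlying handle
    MPI_Bcast( &Nx, 1, MPI_INT, 0, comm.getCommunicator() );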
int neighborSize=18*Np*sizeof(int); @@ -211,7 +207,7 @@ int main(int argc, char **argv) } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** diff --git a/tests/TestColorMassBounceback.cpp b/tests/TestColorMassBounceback.cpp index c05c245e..78508f9b 100644 --- a/tests/TestColorMassBounceback.cpp +++ b/tests/TestColorMassBounceback.cpp @@ -7,7 +7,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" using namespace std; @@ -15,15 +15,11 @@ using namespace std; //*************************************************************************************** int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); int check=0; { // parallel domain size (# of sub-domains) @@ -42,7 +38,7 @@ int main(int argc, char **argv) // Initialize compute device // int device=ScaLBL_SetDevice(rank); ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); Utilities::setErrorHandlers(); // Variables that specify the computational domain @@ -77,7 +73,7 @@ int main(int argc, char **argv) // Get the rank info const RankInfoStruct rank_info(rank,nprocx,nprocy,nprocz); - MPI_Barrier(comm); + comm.barrier(); if (nprocs != nprocx*nprocy*nprocz){ printf("nprocx = %i \n",nprocx); @@ -121,7 +117,7 @@ int main(int argc, char **argv) std::shared_ptr Dm(new Domain(domain_db,comm)); for (int i=0; iNx*Dm->Ny*Dm->Nz; i++) Dm->id[i] = 1; Dm->CommInit(); - MPI_Barrier(comm); + comm.barrier(); Nx+=2; Ny+=2; Nz += 2; int N = Nx*Ny*Nz; @@ -153,7 +149,7 @@ int main(int argc, char **argv) } } Dm->CommInit(); - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; if (rank==0) printf ("Create ScaLBL_Communicator \n"); @@ -170,7 +166,7 @@ int main(int argc, char **argv) Npad=Np+32; neighborList= new int[18*Npad]; Np=ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Dm->id,Np); - MPI_Barrier(comm); + comm.barrier(); //......................device distributions................................. 
int dist_mem_size = Np*sizeof(double); @@ -272,7 +268,7 @@ int main(int argc, char **argv) ScaLBL_D3Q19_AAodd_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); timestep++; @@ -332,7 +328,7 @@ int main(int argc, char **argv) ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAeven_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); timestep++; //************************************************************************ printf("Check after even time \n"); @@ -415,7 +411,7 @@ int main(int argc, char **argv) ScaLBL_D3Q19_AAodd_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); timestep++; @@ -476,7 +472,7 @@ int main(int argc, char **argv) ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAeven_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); timestep++; //************************************************************************ printf("Check after even time \n"); @@ -523,7 +519,7 @@ int main(int argc, char **argv) } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** return check; diff --git a/tests/TestColorSquareTube.cpp b/tests/TestColorSquareTube.cpp index 9807f0e8..cf8a9566 100644 --- a/tests/TestColorSquareTube.cpp +++ b/tests/TestColorSquareTube.cpp @@ -7,7 +7,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "models/ColorModel.h" std::shared_ptr loadInputs( int nprocs ) @@ -84,15 +84,11 @@ void InitializeSquareTube(ScaLBL_ColorModel &ColorModel){ //*************************************************************************************** int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); int check=0; { if (rank == 0){ @@ -113,7 +109,7 @@ int main(int argc, char **argv) } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** diff --git a/tests/TestCommD3Q19.cpp b/tests/TestCommD3Q19.cpp index e1fa821f..d2799355 100644 --- a/tests/TestCommD3Q19.cpp +++ b/tests/TestCommD3Q19.cpp @@ -6,7 +6,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" using namespace std; @@ -164,11 +164,10 @@ inline void UnpackID(int *list, int count, char *recvbuf, char *ID){ int main(int argc, char **argv) { // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - 
MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); int check; { @@ -263,14 +262,14 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,comm); + sum = comm.sumReduce( sum_local ); double iVol_global=1.f/double((Nx-2)*(Ny-2)*(Nz-2)*nprocx*nprocy*nprocz); porosity = 1.0-sum*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); //....................................................................... //........................................................................... - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; //........................................................................... @@ -285,7 +284,7 @@ int main(int argc, char **argv) IntArray Map(Nx,Ny,Nz); Map.fill(-2); Np = ScaLBL_Comm.MemoryOptimizedLayoutAA(Map,neighborList,Dm->id,Np); - MPI_Barrier(comm); + comm.barrier(); int neighborSize=18*Np*sizeof(int); //......................device distributions................................. dist_mem_size = Np*sizeof(double); @@ -355,7 +354,7 @@ int main(int argc, char **argv) GlobalFlipScaLBL_D3Q19_Init(fq_host, Map, Np, Nx-2, Ny-2, Nz-2, iproc,jproc,kproc,nprocx,nprocy,nprocz); ScaLBL_CopyToDevice(fq, fq_host, 19*dist_mem_size); ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); //************************************************************************* // First timestep ScaLBL_Comm.SendD3Q19AA(fq); //READ FROM NORMAL @@ -378,7 +377,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; - MPI_Barrier(comm); + comm.barrier(); starttime = MPI_Wtime(); //......................................... @@ -398,7 +397,7 @@ int main(int argc, char **argv) //********************************************* ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); // Iteration completed! timestep++; //................................................................... 
@@ -427,7 +426,7 @@ int main(int argc, char **argv) if (rank==0) printf("Aggregated communication bandwidth = %f Gbit/sec \n",nprocs*ScaLBL_Comm.CommunicationCount*64*timestep/1e9); } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** diff --git a/tests/TestDatabase.cpp b/tests/TestDatabase.cpp index 00bf87e2..ced704e2 100644 --- a/tests/TestDatabase.cpp +++ b/tests/TestDatabase.cpp @@ -9,7 +9,7 @@ #include "common/UnitTest.h" #include "common/Utilities.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Database.h" #include "ProfilerApp.h" @@ -17,11 +17,8 @@ // Main int main(int argc, char **argv) { - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); Utilities::setAbortBehavior(true,2); Utilities::setErrorHandlers(); UnitTest ut; @@ -69,7 +66,7 @@ int main(int argc, char **argv) // Finished PROFILE_SAVE("TestDatabase",true); - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return err; } diff --git a/tests/TestFluxBC.cpp b/tests/TestFluxBC.cpp index 020bbd89..3e999715 100644 --- a/tests/TestFluxBC.cpp +++ b/tests/TestFluxBC.cpp @@ -1,5 +1,5 @@ #include -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Utilities.h" #include "common/ScaLBL.h" @@ -18,9 +18,9 @@ std::shared_ptr loadInputs( int nprocs ) int main (int argc, char **argv) { MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - int rank = MPI_WORLD_RANK(); - int nprocs = MPI_WORLD_SIZE(); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); // set the error code // Note: the error code should be consistent across all processors @@ -89,7 +89,7 @@ int main (int argc, char **argv) neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Dm->id,Np); - MPI_Barrier(comm); + comm.barrier(); //......................device distributions................................. 
int dist_mem_size = Np*sizeof(double); @@ -149,7 +149,7 @@ int main (int argc, char **argv) double *VEL; VEL= new double [3*Np]; int SIZE=3*Np*sizeof(double); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); ScaLBL_CopyToHost(&VEL[0],&dvc_vel[0],SIZE); double Q = 0.f; @@ -192,7 +192,7 @@ int main (int argc, char **argv) din = ScaLBL_Comm->D3Q19_Flux_BC_z(NeighborList, fq, flux, timestep); ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); ScaLBL_D3Q19_AAodd_MRT(NeighborList, fq, 0, ScaLBL_Comm->next, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); timestep++; ScaLBL_Comm->SendD3Q19AA(fq); //READ FORM NORMAL @@ -201,7 +201,7 @@ int main (int argc, char **argv) din = ScaLBL_Comm->D3Q19_Flux_BC_z(NeighborList, fq, flux, timestep); ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); ScaLBL_D3Q19_AAeven_MRT(fq, 0, ScaLBL_Comm->next, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); timestep++; //************************************************************************/ @@ -265,7 +265,7 @@ int main (int argc, char **argv) } // Finished - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return error; } diff --git a/tests/TestForceD3Q19.cpp b/tests/TestForceD3Q19.cpp index b8f88aae..f8569624 100644 --- a/tests/TestForceD3Q19.cpp +++ b/tests/TestForceD3Q19.cpp @@ -1,5 +1,5 @@ #include -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Utilities.h" #include @@ -443,8 +443,9 @@ inline void MRT_Transform(double *dist, int Np, double Fx, double Fy, double Fz) int main (int argc, char **argv) { MPI_Init(&argc,&argv); - int rank = MPI_WORLD_RANK(); - int nprocs = MPI_WORLD_SIZE(); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); for (int i=0; i #include #include "common/ScaLBL.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" using namespace std; @@ -46,15 +46,11 @@ std::shared_ptr loadInputs( int nprocs ) //*************************************************************************************** int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); int check=0; { // parallel domain size (# of sub-domains) @@ -98,7 +94,7 @@ int main(int argc, char **argv) printf("********************************************************\n"); } - MPI_Barrier(comm); + comm.barrier(); kproc = rank/(nprocx*nprocy); jproc = (rank-nprocx*nprocy*kproc)/nprocx; iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -106,7 +102,7 @@ int main(int argc, char **argv) if (rank == 0) { printf("i,j,k proc=%d %d %d \n",iproc,jproc,kproc); } - MPI_Barrier(comm); + comm.barrier(); if (rank == 1){ printf("i,j,k proc=%d %d %d \n",iproc,jproc,kproc); printf("\n\n"); @@ -143,7 +139,7 @@ int main(int argc, char **argv) } } Dm->CommInit(); - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." 
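The TestFluxBC hunk keeps the ordering used throughout the patch when results are copied back for checking: finish the device work, synchronize the ranks, then copy to the host. A condensed sketch built only from the calls shown above:

    // ... device kernels produce the velocity in dvc_vel ...
    ScaLBL_DeviceBarrier();                                           // make sure device work has finished
    comm.barrier();                                                   // keep all ranks in step
    ScaLBL_CopyToHost( &VEL[0], &dvc_vel[0], 3*Np*sizeof(double) );   // now safe to read VEL on the host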
<< endl; int Np=0; // number of local pore nodes @@ -188,7 +184,7 @@ int main(int argc, char **argv) if (rank == 0) PrintNeighborList(neighborList,Np, rank); - MPI_Barrier(comm); + comm.barrier(); //......................device distributions................................. int dist_mem_size = Np*sizeof(double); @@ -213,13 +209,13 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); starttime = MPI_Wtime(); /************ MAIN ITERATION LOOP (timing communications)***************************************/ //ScaLBL_Comm->SendD3Q19(dist, &dist[10*Np]); //ScaLBL_Comm->RecvD3Q19(dist, &dist[10*Np]); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); if (rank==0) printf("Beginning AA timesteps...\n"); if (rank==0) printf("********************************************************\n"); @@ -231,14 +227,14 @@ int main(int argc, char **argv) ScaLBL_D3Q19_AAodd_MRT(NeighborList, dist, ScaLBL_Comm->first_interior, ScaLBL_Comm->last_interior, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_Comm->RecvD3Q19AA(dist); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAodd_MRT(NeighborList, dist, 0, ScaLBL_Comm->next, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); timestep++; ScaLBL_Comm->SendD3Q19AA(dist); //READ FORM NORMAL ScaLBL_D3Q19_AAeven_MRT(dist, ScaLBL_Comm->first_interior, ScaLBL_Comm->last_interior, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_Comm->RecvD3Q19AA(dist); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAeven_MRT(dist, 0, ScaLBL_Comm->next, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); timestep++; //************************************************************************/ @@ -331,7 +327,7 @@ int main(int argc, char **argv) } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** diff --git a/tests/TestInterfaceSpeed.cpp b/tests/TestInterfaceSpeed.cpp index 40d53b47..d2c901df 100644 --- a/tests/TestInterfaceSpeed.cpp +++ b/tests/TestInterfaceSpeed.cpp @@ -2,7 +2,7 @@ #include #include "analysis/TwoPhase.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Communication.h" #include "IO/Mesh.h" #include "IO/Writer.h" @@ -18,13 +18,9 @@ int main (int argc, char *argv[]) { // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); - - int i,j,k; + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); // Load inputs string FILENAME = argv[1]; @@ -40,7 +36,7 @@ int main (int argc, char *argv[]) Nx+=2; Ny+=2; Nz+=2; - for (i=0; iid[i] = 1; + for (int i=0; iid[i] = 1; Dm->CommInit(); @@ -51,9 +47,9 @@ int main (int argc, char *argv[]) double dist1,dist2; Cx = Cy = Cz = N*0.5; - for (k=0; k #include #include "common/ScaLBL.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" using namespace std; @@ -488,15 +488,11 @@ inline void UnpackID(int *list, int count, char *recvbuf, char *ID){ //*************************************************************************************** int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // 
Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); int check; { // parallel domain size (# of sub-domains) @@ -582,7 +578,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - MPI_Barrier(comm); + comm.barrier(); //................................................. MPI_Bcast(&Nx,1,MPI_INT,0,comm); MPI_Bcast(&Ny,1,MPI_INT,0,comm); @@ -595,7 +591,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. - MPI_Barrier(comm); + comm.barrier(); // ************************************************************** // ************************************************************** @@ -613,7 +609,7 @@ int main(int argc, char **argv) printf("********************************************************\n"); } - MPI_Barrier(comm); + comm.barrier(); kproc = rank/(nprocx*nprocy); jproc = (rank-nprocx*nprocy*kproc)/nprocx; iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -621,7 +617,7 @@ int main(int argc, char **argv) if (rank == 0) { printf("i,j,k proc=%d %d %d \n",iproc,jproc,kproc); } - MPI_Barrier(comm); + comm.barrier(); if (rank == 1){ printf("i,j,k proc=%d %d %d \n",iproc,jproc,kproc); printf("\n\n"); @@ -650,7 +646,7 @@ int main(int argc, char **argv) fread(Dm.id,1,N,IDFILE); fclose(IDFILE); - MPI_Barrier(comm); + comm.barrier(); Dm.CommInit(); //....................................................................... @@ -671,12 +667,12 @@ int main(int argc, char **argv) } } } - MPI_Barrier(comm); + comm.barrier(); MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,comm); porosity = sum*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; if (rank==0) printf ("Create ScaLBL_Communicator \n"); @@ -706,7 +702,7 @@ int main(int argc, char **argv) neighborList= new int[18*Np]; ScaLBL_Comm.MemoryOptimizedLayoutAA(Map,neighborList,Dm.id,Np); - MPI_Barrier(comm); + comm.barrier(); //......................device distributions................................. int dist_mem_size = Np*sizeof(double); @@ -734,7 +730,7 @@ int main(int argc, char **argv) //.......create and start timer............ 
double starttime,stoptime,cputime; - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); starttime = MPI_Wtime(); while (timestep < timesteps) { @@ -743,14 +739,14 @@ int main(int argc, char **argv) ScaLBL_D3Q19_AAodd_MRT(NeighborList, dist, ScaLBL_Comm.next, Np, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_Comm.RecvD3Q19AA(dist); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAodd_MRT(NeighborList, dist, 0, ScaLBL_Comm.next, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); timestep++; ScaLBL_Comm.SendD3Q19AA(dist); //READ FORM NORMAL ScaLBL_D3Q19_AAeven_MRT(dist, ScaLBL_Comm.next, Np, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_Comm.RecvD3Q19AA(dist); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAeven_MRT(dist, 0, ScaLBL_Comm.next, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); timestep++; //************************************************************************/ @@ -783,7 +779,7 @@ int main(int argc, char **argv) VEL= new double [3*Np]; int SIZE=3*Np*sizeof(double); ScaLBL_D3Q19_Momentum(dist,Velocity, Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); ScaLBL_CopyToHost(&VEL[0],&Velocity[0],SIZE); sum_local=0.f; @@ -805,7 +801,7 @@ int main(int argc, char **argv) } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** diff --git a/tests/TestMap.cpp b/tests/TestMap.cpp index a47c0d9e..f3010081 100644 --- a/tests/TestMap.cpp +++ b/tests/TestMap.cpp @@ -7,7 +7,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" using namespace std; @@ -26,15 +26,9 @@ std::shared_ptr loadInputs( int nprocs ) //*************************************************************************************** int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); int check=0; { @@ -45,6 +39,7 @@ int main(int argc, char **argv) {1,0,1},{-1,0,-1},{1,0,-1},{-1,0,1}, {0,1,1},{0,-1,-1},{0,1,-1},{0,-1,1}}; + int rank = comm.getRank(); if (rank == 0){ printf("********************************************************\n"); printf("Running unit test: TestMap \n"); @@ -52,7 +47,7 @@ int main(int argc, char **argv) } // Load inputs - auto db = loadInputs( nprocs ); + auto db = loadInputs( comm.getSize() ); int Nx = db->getVector( "n" )[0]; int Ny = db->getVector( "n" )[1]; int Nz = db->getVector( "n" )[2]; @@ -94,7 +89,7 @@ int main(int argc, char **argv) neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Dm->id,Np); - MPI_Barrier(comm); + comm.barrier(); // Check the neighborlist printf("Check neighborlist: exterior %i, first interior %i last interior %i \n",ScaLBL_Comm->LastExterior(),ScaLBL_Comm->FirstInterior(),ScaLBL_Comm->LastInterior()); @@ -197,7 +192,7 @@ int main(int argc, char **argv) } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** diff --git a/tests/TestMassConservationD3Q7.cpp 
b/tests/TestMassConservationD3Q7.cpp index bbfe8cae..68183cd2 100644 --- a/tests/TestMassConservationD3Q7.cpp +++ b/tests/TestMassConservationD3Q7.cpp @@ -8,7 +8,7 @@ #include #include "common/ScaLBL.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "models/ColorModel.h" inline void InitializeBubble(ScaLBL_ColorModel &ColorModel, double BubbleRadius){ @@ -67,11 +67,10 @@ inline void InitializeBubble(ScaLBL_ColorModel &ColorModel, double BubbleRadius) int main(int argc, char **argv) { // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); // parallel domain size (# of sub-domains) if (rank == 0){ @@ -266,7 +265,7 @@ int main(int argc, char **argv) } } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** } diff --git a/tests/TestMicroCTReader.cpp b/tests/TestMicroCTReader.cpp index 4a4c6aac..9a54610c 100644 --- a/tests/TestMicroCTReader.cpp +++ b/tests/TestMicroCTReader.cpp @@ -1,6 +1,6 @@ // Test reading high-resolution files from the microct database -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/UnitTest.h" #include "common/Database.h" #include "common/Domain.h" @@ -13,12 +13,14 @@ void testReadMicroCT( const std::string& filename, UnitTest& ut ) { + Utilities::MPI comm( MPI_COMM_WORLD ); + // Get the domain info auto db = std::make_shared( filename ); auto domain_db = db->getDatabase( "Domain" ); // Test reading microCT files - auto data = readMicroCT( *domain_db, MPI_COMM_WORLD ); + auto data = readMicroCT( *domain_db, comm ); // Check if we loaded the data correctly if ( data.size() == domain_db->getVector( "n" ) ) @@ -30,7 +32,7 @@ void testReadMicroCT( const std::string& filename, UnitTest& ut ) auto n = domain_db->getVector( "n" ); auto nproc = domain_db->getVector( "nproc" ); int N[3] = { n[0]*nproc[0], n[1]*nproc[1], n[2]*nproc[2] }; - int rank = comm_rank(MPI_COMM_WORLD); + int rank = comm.getRank(); RankInfoStruct rankInfo( rank, nproc[0], nproc[1], nproc[2] ); std::vector meshData( 1 ); auto Var = std::make_shared(); @@ -41,7 +43,7 @@ void testReadMicroCT( const std::string& filename, UnitTest& ut ) meshData[0].meshName = "grid"; meshData[0].mesh = std::make_shared(rankInfo,n[0],n[1],n[2],N[0],N[1],N[2]); meshData[0].vars.push_back(Var); - IO::writeData( 0, meshData, MPI_COMM_WORLD ); + IO::writeData( 0, meshData, comm ); } diff --git a/tests/TestMomentsD3Q19.cpp b/tests/TestMomentsD3Q19.cpp index b26d7bed..6bd3e8ff 100644 --- a/tests/TestMomentsD3Q19.cpp +++ b/tests/TestMomentsD3Q19.cpp @@ -1,5 +1,5 @@ #include -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Utilities.h" #include @@ -463,13 +463,14 @@ inline void MRT_Transform(double *dist, int Np) { int main (int argc, char **argv) { MPI_Init(&argc,&argv); - int rank = MPI_WORLD_RANK(); - int nprocs = MPI_WORLD_SIZE(); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); for (int i=0; i tmp = netcdf::getVar( fid, "tmp" ); @@ -95,7 +96,8 @@ int main(int argc, char **argv) { // Initialize MPI MPI_Init(&argc,&argv); - int rank = comm_rank(MPI_COMM_WORLD); + Utilities::MPI comm( MPI_COMM_WORLD ); + const int rank = comm.getRank(); UnitTest ut; PROFILE_START("Main"); diff --git 
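TestMicroCTReader now passes the wrapper object itself into readMicroCT and IO::writeData (and, elsewhere in this patch, into the Domain constructor), which suggests those interfaces are being changed to accept the wrapper in place of a raw MPI_Comm. The presumed direction of that change, with declarations that are assumptions rather than content of this hunk:

    // before:  ... readMicroCT( const Database& domain, MPI_Comm comm );
    // after:   ... readMicroCT( const Database& domain, const Utilities::MPI& comm );
    // and similarly for IO::writeData( int timestep, const std::vector<IO::MeshDataStruct>& meshData,
    //                                  const Utilities::MPI& comm );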
a/tests/TestPoiseuille.cpp b/tests/TestPoiseuille.cpp index e69507e1..744d292d 100644 --- a/tests/TestPoiseuille.cpp +++ b/tests/TestPoiseuille.cpp @@ -7,7 +7,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "models/MRTModel.h" void ParallelPlates(ScaLBL_MRTModel &MRT){ @@ -47,15 +47,11 @@ void ParallelPlates(ScaLBL_MRTModel &MRT){ //*************************************************************************************** int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); int check=0; { if (rank == 0){ @@ -77,7 +73,7 @@ int main(int argc, char **argv) int SIZE=MRT.Np*sizeof(double); ScaLBL_D3Q19_Momentum(MRT.fq,MRT.Velocity, MRT.Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); ScaLBL_CopyToHost(&Vz[0],&MRT.Velocity[0],3*SIZE); if (rank == 0) printf("Force: %f,%f,%f \n",MRT.Fx,MRT.Fy,MRT.Fz); @@ -91,7 +87,7 @@ int main(int argc, char **argv) j=Ny/2; k=Nz/2; if (rank == 0) printf("Channel width=%f \n",W); if (rank == 0) printf("ID flag vz analytical\n"); - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) { for (i=0;i #include #include "common/ScaLBL.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" //*************************************************************************************** int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); int check=0; { if (rank == 0){ @@ -50,7 +45,7 @@ int main(int argc, char **argv) printf("********************************************************\n"); } - MPI_Barrier(comm); + comm.barrier(); int kproc = rank/(nprocx*nprocy); int jproc = (rank-nprocx*nprocy*kproc)/nprocx; int iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -58,7 +53,7 @@ int main(int argc, char **argv) if (rank == 0) { printf("i,j,k proc=%d %d %d \n",iproc,jproc,kproc); } - MPI_Barrier(comm); + comm.barrier(); if (rank == 1){ printf("i,j,k proc=%d %d %d \n",iproc,jproc,kproc); printf("\n\n"); @@ -102,11 +97,11 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,comm); + sum = comm.sumReduce( sum_local ); porosity = sum*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; if (rank==0) printf ("Create ScaLBL_Communicator \n"); @@ -133,7 +128,7 @@ int main(int argc, char **argv) IntArray Map(Nx,Ny,Nz); neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Dm->id,Np); - MPI_Barrier(comm); + comm.barrier(); //......................device distributions................................. 
if (rank==0) printf ("Allocating distributions \n"); @@ -194,7 +189,7 @@ int main(int argc, char **argv) } } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** return check; diff --git a/tests/TestSegDist.cpp b/tests/TestSegDist.cpp index ece3222d..b5e23ec8 100644 --- a/tests/TestSegDist.cpp +++ b/tests/TestSegDist.cpp @@ -39,11 +39,10 @@ std::shared_ptr loadInputs( int nprocs ) int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { @@ -98,7 +97,7 @@ int main(int argc, char **argv) } } - MPI_Barrier(comm); + comm.barrier(); if (rank==0) printf("Initialized! Converting to Signed Distance function \n"); double t1 = MPI_Wtime(); @@ -116,7 +115,7 @@ int main(int argc, char **argv) } } } - err = sumReduce( Dm.Comm, err ); + err = Dm.Comm.sumReduce( err ); err = sqrt( err / (nx*ny*nz*nprocs) ); if (rank==0) printf("Mean error %0.4f \n", err); @@ -142,7 +141,7 @@ int main(int argc, char **argv) IO::writeData( "testSegDist", data, MPI_COMM_WORLD ); } - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return 0; diff --git a/tests/TestSubphase.cpp b/tests/TestSubphase.cpp index fd6383be..9738812f 100644 --- a/tests/TestSubphase.cpp +++ b/tests/TestSubphase.cpp @@ -26,11 +26,10 @@ std::shared_ptr loadInputs( int nprocs ) int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { // Limit scope so variables that contain communicators will free before MPI_Finialize if ( rank==0 ) { @@ -137,7 +136,7 @@ int main(int argc, char **argv) // Averages->Reduce(); } // Limit scope so variables that contain communicators will free before MPI_Finialize - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return 0; } diff --git a/tests/TestTopo3D.cpp b/tests/TestTopo3D.cpp index 8d00ef5a..948bb1d6 100644 --- a/tests/TestTopo3D.cpp +++ b/tests/TestTopo3D.cpp @@ -26,11 +26,10 @@ std::shared_ptr loadInputs( int nprocs ) int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { // Limit scope so variables that contain communicators will free before MPI_Finialize if ( rank==0 ) { @@ -226,7 +225,7 @@ int main(int argc, char **argv) IO::writeData( timestep, visData, comm ); } // Limit scope so variables that contain communicators will free before MPI_Finialize - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return 0; } diff --git a/tests/TestTorus.cpp b/tests/TestTorus.cpp index 2d486774..5125ce92 100644 --- a/tests/TestTorus.cpp +++ b/tests/TestTorus.cpp @@ -26,11 +26,10 @@ std::shared_ptr loadInputs( int nprocs ) int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { // 
Limit scope so variables that contain communicators will free before MPI_Finialize if ( rank==0 ) { @@ -165,7 +164,7 @@ int main(int argc, char **argv) // Averages->Reduce(); } // Limit scope so variables that contain communicators will free before MPI_Finialize - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return 0; } diff --git a/tests/TestTorusEvolve.cpp b/tests/TestTorusEvolve.cpp index 1a65d268..32cf7fd8 100644 --- a/tests/TestTorusEvolve.cpp +++ b/tests/TestTorusEvolve.cpp @@ -26,11 +26,10 @@ std::shared_ptr loadInputs( int nprocs ) int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { // Limit scope so variables that contain communicators will free before MPI_Finialize if ( rank==0 ) { @@ -157,7 +156,7 @@ int main(int argc, char **argv) } } // Limit scope so variables that contain communicators will free before MPI_Finialize - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return 0; } diff --git a/tests/TestTwoPhase.cpp b/tests/TestTwoPhase.cpp index a979314a..fa54d98d 100644 --- a/tests/TestTwoPhase.cpp +++ b/tests/TestTwoPhase.cpp @@ -8,7 +8,7 @@ #include #include "analysis/TwoPhase.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Communication.h" #include "IO/Mesh.h" #include "IO/Writer.h" @@ -17,11 +17,10 @@ int main(int argc, char **argv) { // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { // Limit scope so Domain can free it's communicator printf("Running two-phase averaging test on %i processors \n",nprocs); @@ -110,7 +109,7 @@ int main(int argc, char **argv) fclose(PHASE); } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); } // Limit scope so Domain will free it's communicator MPI_Finalize(); return 0; diff --git a/tests/TestWriter.cpp b/tests/TestWriter.cpp index 8936aaff..37858202 100644 --- a/tests/TestWriter.cpp +++ b/tests/TestWriter.cpp @@ -8,7 +8,7 @@ #include "common/UnitTest.h" #include "common/Utilities.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "IO/MeshDatabase.h" #include "IO/Reader.h" #include "IO/Writer.h" @@ -34,11 +34,9 @@ inline double distance( const Point& p ) // Test writing and reading the given format void testWriter( const std::string& format, std::vector& meshData, UnitTest& ut ) { - int rank, nprocs; - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); - MPI_Barrier(comm); + Utilities::MPI comm( MPI_COMM_WORLD ); + int nprocs = comm.getSize(); + comm.barrier(); // Get the format std::string format2 = format; @@ -63,7 +61,7 @@ void testWriter( const std::string& format, std::vector& mes IO::initialize( "test_"+format, format2, false ); IO::writeData( 0, meshData, comm ); IO::writeData( 3, meshData, comm ); - MPI_Barrier(comm); + comm.barrier(); PROFILE_STOP(format+"-write"); // Get the summary name for reading @@ -228,11 +226,10 @@ void testWriter( const std::string& format, std::vector& mes // Main int main(int argc, char **argv) { - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - 
MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); Utilities::setAbortBehavior(true,2); Utilities::setErrorHandlers(); UnitTest ut; @@ -389,7 +386,7 @@ int main(int argc, char **argv) ut.report(); PROFILE_SAVE("TestWriter",true); int N_errors = ut.NumFailGlobal(); - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return N_errors; } diff --git a/tests/convertIO.cpp b/tests/convertIO.cpp index 0937729f..27605237 100644 --- a/tests/convertIO.cpp +++ b/tests/convertIO.cpp @@ -5,7 +5,7 @@ #include #include -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Communication.h" #include "common/Utilities.h" #include "IO/Mesh.h" @@ -17,11 +17,10 @@ int main(int argc, char **argv) { // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); Utilities::setErrorHandlers(); PROFILE_ENABLE(2); PROFILE_ENABLE_TRACE(); @@ -70,20 +69,20 @@ int main(int argc, char **argv) i++; } - MPI_Barrier(comm); + comm.barrier(); PROFILE_STOP("Read"); // Save the mesh data to a new file PROFILE_START("Write"); IO::writeData( timestep, meshData, MPI_COMM_WORLD ); - MPI_Barrier(comm); + comm.barrier(); PROFILE_STOP("Write"); } } // Limit scope PROFILE_STOP("Main"); PROFILE_SAVE("convertData",true); - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return 0; } diff --git a/tests/hello_world.cpp b/tests/hello_world.cpp index d236bf0e..810d3a9c 100644 --- a/tests/hello_world.cpp +++ b/tests/hello_world.cpp @@ -1,18 +1,19 @@ #include -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Utilities.h" int main (int argc, char **argv) { MPI_Init(&argc,&argv); - int rank = MPI_WORLD_RANK(); - int nprocs = MPI_WORLD_SIZE(); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); for (int i=0; i loadInputs( ) @@ -24,15 +24,11 @@ std::shared_ptr loadInputs( ) //*************************************************************************************** int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { //***************************************** // MPI ranks for all 18 neighbors @@ -96,7 +92,7 @@ int main(int argc, char **argv) rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - MPI_Barrier(comm); + comm.barrier(); Nz += 2; Nx = Ny = Nz; // Cubic domain @@ -185,7 +181,7 @@ int main(int argc, char **argv) } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_color_macro_simulator.cpp b/tests/lbpm_color_macro_simulator.cpp index 1c619c5a..97df6812 100644 --- a/tests/lbpm_color_macro_simulator.cpp +++ b/tests/lbpm_color_macro_simulator.cpp @@ -9,7 +9,7 @@ #include "common/Communication.h" #include "analysis/TwoPhase.h" #include "analysis/runAnalysis.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" 
#include "ProfilerApp.h" #include "threadpool/thread_pool.h" @@ -30,10 +30,9 @@ int main(int argc, char **argv) // Initialize MPI int provided_thread_support = -1; MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE,&provided_thread_support); - MPI_Comm comm; - MPI_Comm_dup(MPI_COMM_WORLD,&comm); - int rank = comm_rank(comm); - int nprocs = comm_size(comm); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { // Limit scope so variables that contain communicators will free before MPI_Finialize // parallel domain size (# of sub-domains) @@ -52,7 +51,7 @@ int main(int argc, char **argv) // int device=ScaLBL_SetDevice(rank); //printf("Using GPU ID %i for rank %i \n",device,rank); ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); PROFILE_ENABLE(1); //PROFILE_ENABLE_TRACE(); @@ -171,7 +170,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - MPI_Barrier(comm); + comm.barrier(); //................................................. MPI_Bcast(&tauA,1,MPI_DOUBLE,0,comm); MPI_Bcast(&tauB,1,MPI_DOUBLE,0,comm); @@ -207,7 +206,7 @@ int main(int argc, char **argv) // Get the rank info const RankInfoStruct rank_info(rank,nprocx,nprocy,nprocz); - MPI_Barrier(comm); + comm.barrier(); if (nprocs != nprocx*nprocy*nprocz){ printf("nprocx = %i \n",nprocx); @@ -262,7 +261,7 @@ int main(int argc, char **argv) // Mask that excludes the solid phase Domain Mask(Nx,Ny,Nz,rank,nprocx,nprocy,nprocz,Lx,Ly,Lz,BoundaryCondition); - MPI_Barrier(comm); + comm.barrier(); Nx+=2; Ny+=2; Nz += 2; int N = Nx*Ny*Nz; @@ -297,7 +296,7 @@ int main(int argc, char **argv) sprintf(LocalRankString,"%05d",rank); sprintf(LocalRankFilename,"%s%s","SignDist.",LocalRankString); ReadBinaryFile(LocalRankFilename, Averages->SDs.data(), N); - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; if (rank==0) printf("Initialize from segmented data: solid=0, NWP=1, WP=2 \n"); @@ -341,7 +340,7 @@ int main(int argc, char **argv) delete [] cDen; delete [] cfq; */ - MPI_Barrier(comm); + comm.barrier(); } fflush(stdout); @@ -416,7 +415,7 @@ int main(int argc, char **argv) neighborList= new int[18*Npad]; Np = ScaLBL_Comm.MemoryOptimizedLayoutAA(Map,neighborList,Mask.id,Np); if (rank==0) printf ("Set up memory efficient layout Npad=%i, Np=%i \n",Npad,Np); - MPI_Barrier(comm); + comm.barrier(); //........................................................................... // MAIN VARIABLES ALLOCATED HERE //........................................................................... @@ -537,7 +536,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); starttime = MPI_Wtime(); //......................................... 
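Across the test-driver hunks above, the recurring edit is the same three-line setup: construct a Utilities::MPI object from MPI_COMM_WORLD, query rank and size through member functions, and call comm.barrier() where MPI_Barrier(comm) used to be. The following is a minimal standalone sketch of that shape, assuming only the wrapper methods that actually appear in these hunks (getRank, getSize, barrier); everything else in it is illustrative scaffolding, not the project's code.

    // Sketch of the setup pattern applied throughout this patch; only getRank,
    // getSize and barrier are taken from the hunks above, the rest is filler.
    #include <cstdio>
    #include <mpi.h>
    #include "common/MPI.h"

    int main( int argc, char **argv )
    {
        MPI_Init( &argc, &argv );
        { // limit scope so the wrapped communicator is destroyed before MPI_Finalize
            Utilities::MPI comm( MPI_COMM_WORLD );
            int rank   = comm.getRank();   // replaces MPI_Comm_rank(comm,&rank)
            int nprocs = comm.getSize();   // replaces MPI_Comm_size(comm,&nprocs)
            if ( rank == 0 )
                printf( "Running on %i ranks\n", nprocs );
            comm.barrier();                // replaces MPI_Barrier(comm)
        }
        MPI_Finalize();
        return 0;
    }

The scope trick the tests already use (freeing communicator-holding objects before MPI_Finalize) carries over unchanged; the wrapper object simply takes the place of the duplicated MPI_Comm handle.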
@@ -589,7 +588,7 @@ int main(int argc, char **argv) } ScaLBL_D3Q19_AAodd_Color(NeighborList, dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm.next, Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); // *************EVEN TIMESTEP************* timestep++; @@ -622,10 +621,10 @@ int main(int argc, char **argv) } ScaLBL_D3Q19_AAeven_Color(dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm.next, Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); comm.barrier(); //************************************************************************ - MPI_Barrier(comm); + comm.barrier(); PROFILE_STOP("Update"); // Run the analysis @@ -637,7 +636,7 @@ int main(int argc, char **argv) PROFILE_SAVE("lbpm_color_simulator",1); //************************************************************************ ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep @@ -657,9 +656,8 @@ int main(int argc, char **argv) PROFILE_STOP("Main"); PROFILE_SAVE("lbpm_color_simulator",1); // **************************************************** - MPI_Barrier(comm); + comm.barrier(); } // Limit scope so variables that contain communicators will free before MPI_Finialize - MPI_Comm_free(&comm); MPI_Finalize(); } diff --git a/tests/lbpm_color_simulator.cpp b/tests/lbpm_color_simulator.cpp index 1f63c653..cef13189 100644 --- a/tests/lbpm_color_simulator.cpp +++ b/tests/lbpm_color_simulator.cpp @@ -28,10 +28,9 @@ int main(int argc, char **argv) { // Limit scope so variables that contain communicators will free before MPI_Finialize - MPI_Comm comm; - MPI_Comm_dup(MPI_COMM_WORLD,&comm); - int rank = comm_rank(comm); - int nprocs = comm_size(comm); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); if (rank == 0){ printf("********************************************************\n"); @@ -41,7 +40,7 @@ int main(int argc, char **argv) // Initialize compute device ScaLBL_SetDevice(rank); ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); PROFILE_ENABLE(1); //PROFILE_ENABLE_TRACE(); @@ -51,7 +50,7 @@ int main(int argc, char **argv) Utilities::setErrorHandlers(); auto filename = argv[1]; - ScaLBL_ColorModel ColorModel(rank,nprocs,comm); + ScaLBL_ColorModel ColorModel(rank,nprocs,comm.dup()); ColorModel.ReadParams(filename); ColorModel.SetDomain(); ColorModel.ReadInput(); @@ -64,8 +63,7 @@ int main(int argc, char **argv) PROFILE_SAVE("lbpm_color_simulator",1); // **************************************************** - MPI_Barrier(comm); - MPI_Comm_free(&comm); + comm.barrier(); } // Limit scope so variables that contain communicators will free before MPI_Finialize diff --git a/tests/lbpm_dfh_simulator.cpp b/tests/lbpm_dfh_simulator.cpp index 1e8dc0f9..0d5902df 100644 --- a/tests/lbpm_dfh_simulator.cpp +++ b/tests/lbpm_dfh_simulator.cpp @@ -26,10 +26,9 @@ int main(int argc, char **argv) // Initialize MPI int provided_thread_support = -1; MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE,&provided_thread_support); - MPI_Comm comm; - MPI_Comm_dup(MPI_COMM_WORLD,&comm); - int rank = comm_rank(comm); - int nprocs = comm_size(comm); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); if ( rank==0 && 
provided_thread_support 1) depth=atoi(argv[1]); @@ -222,7 +218,7 @@ int main(int argc, char **argv) rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - MPI_Barrier(comm); + comm.barrier(); Nx += 2; Ny += 2; @@ -277,13 +273,13 @@ int main(int argc, char **argv) //....................................................................... if (rank == 0) printf("Reading the disc packing \n"); if (rank == 0) ReadDiscPacking(ndiscs,cx,cy,rad); - MPI_Barrier(comm); + comm.barrier(); // Broadcast the sphere packing to all processes MPI_Bcast(cx,ndiscs,MPI_DOUBLE,0,comm); MPI_Bcast(cy,ndiscs,MPI_DOUBLE,0,comm); MPI_Bcast(rad,ndiscs,MPI_DOUBLE,0,comm); //........................................................................... - MPI_Barrier(comm); + comm.barrier(); if (rank == 0){ cout << "Domain set." << endl; printf("************ \n"); @@ -388,7 +384,7 @@ int main(int argc, char **argv) //...................................................................... // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_inkbottle_pp.cpp b/tests/lbpm_inkbottle_pp.cpp index 3c39219d..669ab8c0 100644 --- a/tests/lbpm_inkbottle_pp.cpp +++ b/tests/lbpm_inkbottle_pp.cpp @@ -9,19 +9,15 @@ #include "common/ScaLBL.h" #include "common/Communication.h" #include "analysis/TwoPhase.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; @@ -83,7 +79,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - MPI_Barrier(comm); + comm.barrier(); // Computational domain MPI_Bcast(&Nx,1,MPI_INT,0,comm); MPI_Bcast(&Ny,1,MPI_INT,0,comm); @@ -96,7 +92,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. 
- MPI_Barrier(comm); + comm.barrier(); // ************************************************************** if (nprocs != nprocx*nprocy*nprocz){ @@ -123,7 +119,7 @@ int main(int argc, char **argv) rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - MPI_Barrier(comm); + comm.barrier(); Nz += 2; Nx = Ny = Nz; // Cubic domain @@ -221,7 +217,7 @@ int main(int argc, char **argv) } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_juanes_bench_disc_pp.cpp b/tests/lbpm_juanes_bench_disc_pp.cpp index 6f04cffa..47d8cb84 100644 --- a/tests/lbpm_juanes_bench_disc_pp.cpp +++ b/tests/lbpm_juanes_bench_disc_pp.cpp @@ -9,7 +9,7 @@ #include "analysis/pmmc.h" #include "common/Domain.h" #include "common/Communication.h" -#include "common/MPI_Helpers.h" // This includes mpi.h +#include "common/MPI.h" // This includes mpi.h #include "common/SpherePack.h" /* @@ -130,15 +130,11 @@ inline void SignedDistanceDiscPack(double *Distance, int ndiscs, double *List_cx int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; int iproc,jproc,kproc; @@ -194,7 +190,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - MPI_Barrier(comm); + comm.barrier(); //................................................. // Computational domain MPI_Bcast(&Nx,1,MPI_INT,0,comm); @@ -208,7 +204,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. - MPI_Barrier(comm); + comm.barrier(); // ************************************************************** double Rin,Rout; @@ -240,7 +236,7 @@ int main(int argc, char **argv) rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - MPI_Barrier(comm); + comm.barrier(); Nx += 2; Ny += 2; Nz += 2; int N = Nx*Ny*Nz; @@ -294,13 +290,13 @@ int main(int argc, char **argv) //....................................................................... if (rank == 0) printf("Reading the disc packing \n"); if (rank == 0) ReadDiscPacking(ndiscs,cx,cy,rad); - MPI_Barrier(comm); + comm.barrier(); // Broadcast the sphere packing to all processes MPI_Bcast(cx,ndiscs,MPI_DOUBLE,0,comm); MPI_Bcast(cy,ndiscs,MPI_DOUBLE,0,comm); MPI_Bcast(rad,ndiscs,MPI_DOUBLE,0,comm); //........................................................................... - MPI_Barrier(comm); + comm.barrier(); /* if (rank == 0){ cout << "Domain set." << endl; printf("************ \n"); @@ -312,7 +308,7 @@ int main(int argc, char **argv) } */ - MPI_Barrier(comm); + comm.barrier(); if (nprocz > 1 && rank==0) printf("Disc packs are 2D -- are you sure you want nprocz > 1? \n"); if (rank ==0) printf("Compute the signed distance part I \n"); //....................................................................... 
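The pre-processor tools in the surrounding hunks keep their "read on rank 0, then broadcast" structure: the MPI_Bcast calls stay as raw MPI while only the barriers move onto the wrapper. A hedged sketch of that mixed style is below; getCommunicator() is taken from the morphology hunks later in this patch, and the fact that the Bcast call sites pass comm unchanged suggests the wrapper also converts implicitly to MPI_Comm, but that conversion is an assumption here, not something this diff states.

    // Sketch of the rank-0 parameter broadcast pattern these tools keep using.
    // The dimensions and their values are placeholders; only barrier(),
    // getCommunicator() and the raw MPI_Bcast usage mirror the patch.
    #include <cstdio>
    #include <mpi.h>
    #include "common/MPI.h"

    int main( int argc, char **argv )
    {
        MPI_Init( &argc, &argv );
        {
            Utilities::MPI comm( MPI_COMM_WORLD );
            int rank = comm.getRank();
            int Nx = 0, Ny = 0, Nz = 0;
            if ( rank == 0 ) {
                Nx = Ny = Nz = 64;   // normally parsed from the input file on rank 0
            }
            comm.barrier();
            MPI_Bcast( &Nx, 1, MPI_INT, 0, comm.getCommunicator() );
            MPI_Bcast( &Ny, 1, MPI_INT, 0, comm.getCommunicator() );
            MPI_Bcast( &Nz, 1, MPI_INT, 0, comm.getCommunicator() );
            comm.barrier();
            if ( rank == 0 )
                printf( "Domain: %i x %i x %i\n", Nx, Ny, Nz );
        }
        MPI_Finalize();
        return 0;
    }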
@@ -490,7 +486,7 @@ int main(int argc, char **argv) //...................................................................... // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_minkowski_scalar.cpp b/tests/lbpm_minkowski_scalar.cpp index 3e3ede6d..721207a1 100644 --- a/tests/lbpm_minkowski_scalar.cpp +++ b/tests/lbpm_minkowski_scalar.cpp @@ -14,7 +14,7 @@ #include "common/Array.h" #include "common/Domain.h" #include "common/Communication.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "IO/MeshDatabase.h" #include "IO/Mesh.h" #include "IO/Writer.h" @@ -28,13 +28,11 @@ int main(int argc, char **argv) { - // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { Utilities::setErrorHandlers(); PROFILE_START("Main"); @@ -87,7 +85,7 @@ int main(int argc, char **argv) fclose(SEGDAT); printf("Read segmented data from %s \n",Filename.c_str()); } - MPI_Barrier(comm); + comm.barrier(); // Get the rank info int N = (nx+2)*(ny+2)*(nz+2); @@ -152,7 +150,7 @@ int main(int argc, char **argv) } else{ printf("Sending data to process %i \n", rnk); - MPI_Send(tmp,N,MPI_CHAR,rnk,15,comm); + comm.send(tmp,N,rnk,15); } } } @@ -161,13 +159,12 @@ int main(int argc, char **argv) else{ // Recieve the subdomain from rank = 0 printf("Ready to recieve data %i at process %i \n", N,rank); - MPI_Recv(Dm->id,N,MPI_CHAR,0,15,comm,MPI_STATUS_IGNORE); + comm.recv(Dm->id,N,0,15); } - MPI_Barrier(comm); + comm.barrier(); // Compute the Minkowski functionals - MPI_Barrier(comm); - std::shared_ptr Averages(new Minkowski(Dm)); + auto Averages = std::make_shared(Dm); // Calculate the distance // Initialize the domain and communication @@ -212,7 +209,7 @@ int main(int argc, char **argv) } PROFILE_STOP("Main"); PROFILE_SAVE("Minkowski",true); - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return 0; } diff --git a/tests/lbpm_morph_pp.cpp b/tests/lbpm_morph_pp.cpp index 8fe8b228..939fdc32 100644 --- a/tests/lbpm_morph_pp.cpp +++ b/tests/lbpm_morph_pp.cpp @@ -23,11 +23,9 @@ int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); { //....................................................................... 
// Reading the domain information file @@ -127,13 +125,13 @@ int main(int argc, char **argv) if (rank==0) printf("Initialized solid phase -- Converting to Signed Distance function \n"); CalcDist(SignDist,id_solid,*Dm); - MPI_Barrier(comm); + comm.barrier(); // Extract only the connected part of NWP BlobIDstruct new_index; double vF=0.0; double vS=0.0; ComputeGlobalBlobIDs(nx-2,ny-2,nz-2,Dm->rank_info,phase,SignDist,vF,vS,phase_label,Dm->Comm); - MPI_Barrier(Dm->Comm); + Dm->Comm.barrier(); int count_connected=0; int count_porespace=0; @@ -155,9 +153,9 @@ int main(int argc, char **argv) } } } - count_connected=sumReduce( Dm->Comm, count_connected); - count_porespace=sumReduce( Dm->Comm, count_porespace); - count_water=sumReduce( Dm->Comm, count_water); + count_connected = Dm->Comm.sumReduce( count_connected ); + count_porespace = Dm->Comm.sumReduce( count_porespace ); + count_water = Dm->Comm.sumReduce( count_water ); for (int k=0; kComm, count_water); + count_water = Dm->Comm.sumReduce( count_water ); SW = double(count_water) / count_porespace; if(rank==0) printf("Final saturation: %f \n", SW); @@ -236,13 +234,13 @@ int main(int argc, char **argv) } } } - MPI_Barrier(comm); + comm.barrier(); auto filename2 = READFILE + ".morph.raw"; if (rank==0) printf("Writing file to: %s \n", filename2.c_str()); Mask->AggregateLabels(filename2); } - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); } diff --git a/tests/lbpm_morphdrain_pp.cpp b/tests/lbpm_morphdrain_pp.cpp index 8d73b1e4..d3c5a428 100644 --- a/tests/lbpm_morphdrain_pp.cpp +++ b/tests/lbpm_morphdrain_pp.cpp @@ -23,11 +23,9 @@ int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); { //....................................................................... // Reading the domain information file @@ -121,7 +119,7 @@ int main(int argc, char **argv) if (rank==0) printf("Initialized solid phase -- Converting to Signed Distance function \n"); CalcDist(SignDist,id_solid,*Dm); - MPI_Barrier(comm); + comm.barrier(); // Run the morphological opening MorphDrain(SignDist, id, Dm, SW); @@ -196,13 +194,13 @@ int main(int argc, char **argv) } } } - MPI_Barrier(comm); + comm.barrier(); auto filename2 = READFILE + ".morphdrain.raw"; if (rank==0) printf("Writing file to: %s \n", filename2.data() ); Mask->AggregateLabels( filename2 ); } - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); } diff --git a/tests/lbpm_morphopen_pp.cpp b/tests/lbpm_morphopen_pp.cpp index f8819348..a6209240 100644 --- a/tests/lbpm_morphopen_pp.cpp +++ b/tests/lbpm_morphopen_pp.cpp @@ -23,11 +23,9 @@ int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); { //....................................................................... 
// Reading the domain information file @@ -123,7 +121,7 @@ int main(int argc, char **argv) if (rank==0) printf("Initialized solid phase -- Converting to Signed Distance function \n"); CalcDist(SignDist,id_solid,*Dm); - MPI_Barrier(comm); + comm.barrier(); // Run the morphological opening MorphOpen(SignDist, id, Dm, SW, ErodeLabel, OpenLabel); @@ -198,13 +196,13 @@ int main(int argc, char **argv) } } } - MPI_Barrier(comm); + comm.barrier(); auto filename2 = READFILE + ".morphopen.raw"; if (rank==0) printf("Writing file to: %s \n", filename2.data()); Mask->AggregateLabels(filename2); } - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); } diff --git a/tests/lbpm_nondarcy_simulator.cpp b/tests/lbpm_nondarcy_simulator.cpp index 40672375..096dc790 100644 --- a/tests/lbpm_nondarcy_simulator.cpp +++ b/tests/lbpm_nondarcy_simulator.cpp @@ -9,7 +9,7 @@ #include "common/ScaLBL.h" #include "common/Communication.h" #include "analysis/TwoPhase.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" //#define WRITE_SURFACES @@ -77,15 +77,11 @@ int main(int argc, char **argv) } else { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; @@ -160,7 +156,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - MPI_Barrier(comm); + comm.barrier(); //................................................. MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); //MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm); @@ -185,7 +181,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. - MPI_Barrier(comm); + comm.barrier(); RESTART_INTERVAL=interval; // ************************************************************** @@ -222,7 +218,7 @@ int main(int argc, char **argv) rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - MPI_Barrier(comm); + comm.barrier(); Nx += 2; Ny += 2; Nz += 2; @@ -262,7 +258,7 @@ int main(int argc, char **argv) // WriteLocalSolidID(LocalRankFilename, id, N); sprintf(LocalRankFilename,"%s%s","SignDist.",LocalRankString); ReadBinaryFile(LocalRankFilename, Averages.SDs.data(), N); - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; //....................................................................... @@ -436,7 +432,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; - MPI_Barrier(comm); + comm.barrier(); starttime = MPI_Wtime(); //......................................... @@ -485,7 +481,7 @@ int main(int argc, char **argv) } //................................................................................... ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); // Timestep completed! 
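The simulator hunks on either side of this point preserve the existing timing idiom while swapping the synchronization call: barrier on the wrapped communicator, MPI_Wtime() before and after the timestep loop, then walltime per timestep. A minimal sketch of that idiom follows; the loop body is a stand-in, and only the barrier()/MPI_Wtime() usage is taken from the patch.

    // Sketch of the timing pattern around the timestep loop; the "work" inside
    // the loop is a placeholder for the collision/streaming/halo-exchange calls.
    #include <cstdio>
    #include <mpi.h>
    #include "common/MPI.h"

    int main( int argc, char **argv )
    {
        MPI_Init( &argc, &argv );
        {
            Utilities::MPI comm( MPI_COMM_WORLD );
            int rank = comm.getRank();
            const int timestepMax = 1000;
            comm.barrier();                      // replaces MPI_Barrier(comm) before timing
            double starttime = MPI_Wtime();
            int timestep = 0;
            while ( timestep < timestepMax ) {
                // ... timestep work would go here ...
                timestep++;
            }
            comm.barrier();
            double stoptime = MPI_Wtime();
            double cputime  = ( stoptime - starttime ) / timestep;
            if ( rank == 0 )
                printf( "CPU time = %f per timestep\n", cputime );
        }
        MPI_Finalize();
        return 0;
    }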
@@ -557,7 +553,7 @@ int main(int argc, char **argv) //************************************************************************/ fclose(NONDARCY); ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep @@ -575,7 +571,7 @@ int main(int argc, char **argv) NULL_USE(RESTART_INTERVAL); } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_nonnewtonian_simulator.cpp b/tests/lbpm_nonnewtonian_simulator.cpp index 5c33841f..ff8792e7 100644 --- a/tests/lbpm_nonnewtonian_simulator.cpp +++ b/tests/lbpm_nonnewtonian_simulator.cpp @@ -9,7 +9,7 @@ #include "common/ScaLBL.h" #include "common/Communication.h" #include "common/TwoPhase.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "ProfilerApp.h" #include "threadpool/thread_pool.h" @@ -99,21 +99,12 @@ inline void ZeroHalo(double *Data, int Nx, int Ny, int Nz) int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - //MPI_Init(&argc,&argv); - - /* - * Definitely seems to be an issue - let's hope James gets back to me... - */ int provided_thread_support = -1; MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE,&provided_thread_support); - MPI_Comm comm; - MPI_Comm_dup(MPI_COMM_WORLD,&comm); - int rank = comm_rank(comm); - int nprocs = comm_size(comm); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); if ( rank==0 && provided_thread_supportSDs.data(), N); - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; /* 3 */ //....................................................................... @@ -598,14 +589,14 @@ int main(int argc, char **argv) delete [] cDen; delete [] cDistEven; delete [] cDistOdd; - MPI_Barrier(comm); + comm.barrier(); } /* 14 */ // //...................................................................... // ScaLBL_D3Q7_Init(ID, A_even, A_odd, &Den[0], Nx, Ny, Nz); // ScaLBL_D3Q7_Init(ID, B_even, B_odd, &Den[N], Nx, Ny, Nz); // ScaLBL_DeviceBarrier(); -// MPI_Barrier(comm); /* 15 */ +// comm.barrier(); /* 15 */ //....................................................................... // Once phase has been initialized, map solid to account for 'smeared' interface @@ -631,7 +622,7 @@ int main(int argc, char **argv) // ScaLBL_Comm.SendHalo(Phi); // ScaLBL_Comm.RecvHalo(Phi); // ScaLBL_DeviceBarrier(); -// MPI_Barrier(comm); +// comm.barrier(); // //************************************************************************* /* 18 */ @@ -670,7 +661,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; - MPI_Barrier(comm); + comm.barrier(); starttime = MPI_Wtime(); /* @@ -804,7 +795,7 @@ int main(int argc, char **argv) } //................................................................................... ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); // Timestep completed! 
timestep++; @@ -818,7 +809,7 @@ int main(int argc, char **argv) } //************************************************************************/ ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep @@ -835,7 +826,7 @@ int main(int argc, char **argv) NULL_USE(RESTART_INTERVAL); } - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); //**************************************************** } diff --git a/tests/lbpm_nonnewtonian_simulator.h b/tests/lbpm_nonnewtonian_simulator.h index 20da1ac3..4df5e628 100644 --- a/tests/lbpm_nonnewtonian_simulator.h +++ b/tests/lbpm_nonnewtonian_simulator.h @@ -1,7 +1,7 @@ // Run the analysis, blob identification, and write restart files #include "common/Array.h" #include "common/Communication.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "IO/MeshDatabase.h" //#define ANALYSIS_INTERVAL 6 @@ -9,20 +9,9 @@ #define BLOBID_INTERVAL 1000 - - - - enum AnalysisType{ AnalyzeNone=0, IdentifyBlobs=0x01, CopyPhaseIndicator=0x02, CopySimState=0x04, ComputeAverages=0x08, CreateRestart=0x10, WriteVis=0x20 }; - - - - - - - template void DeleteArray( const TYPE *p ) { @@ -30,12 +19,6 @@ void DeleteArray( const TYPE *p ) } - - - - - - // Structure used to store ids struct AnalysisWaitIdStruct { ThreadPool::thread_id_t blobID; @@ -45,7 +28,6 @@ struct AnalysisWaitIdStruct { }; - // Helper class to write the restart file from a seperate thread class WriteRestartWorkItem: public ThreadPool::WorkItem { @@ -84,9 +66,9 @@ typedef std::shared_ptr > BlobIDList; // timestep(timestep_), Nx(Nx_), Ny(Ny_), Nz(Nz_), rank_info(rank_info_), // phase(phase_), dist(dist_), last_id(last_id_), new_index(new_index_), new_id(new_id_), new_list(new_list_) // { -// MPI_Comm_dup(MPI_COMM_WORLD,&newcomm); +// newcomm = Utilities::MPI(MPI_COMM_WORLD).dup(); // } -// ~BlobIdentificationWorkItem1() { MPI_Comm_free(&newcomm); } +// ~BlobIdentificationWorkItem1() {} // virtual void run() { // // Compute the global blob id and compare to the previous version // PROFILE_START("Identify blobs",1); @@ -106,7 +88,7 @@ typedef std::shared_ptr > BlobIDList; // const DoubleArray& dist; // BlobIDstruct last_id, new_index, new_id; // BlobIDList new_list; -// MPI_Comm newcomm; +// Utilities::MPI newcomm; //}; // @@ -122,9 +104,9 @@ typedef std::shared_ptr > BlobIDList; // timestep(timestep_), Nx(Nx_), Ny(Ny_), Nz(Nz_), rank_info(rank_info_), // phase(phase_), dist(dist_), last_id(last_id_), new_index(new_index_), new_id(new_id_), new_list(new_list_) // { -// MPI_Comm_dup(MPI_COMM_WORLD,&newcomm); +// newcomm = Utilities::MPI(MPI_COMM_WORLD).dup(); // } -// ~BlobIdentificationWorkItem2() { MPI_Comm_free(&newcomm); } +// ~BlobIdentificationWorkItem2() { } // virtual void run() { // // Compute the global blob id and compare to the previous version // PROFILE_START("Identify blobs maps",1); @@ -158,7 +140,7 @@ typedef std::shared_ptr > BlobIDList; // const DoubleArray& dist; // BlobIDstruct last_id, new_index, new_id; // BlobIDList new_list; -// MPI_Comm newcomm; +// Utilities::MPI newcomm; //}; // @@ -171,9 +153,9 @@ public: TwoPhase& Avgerages_, fillHalo& fillData_ ): timestep(timestep_), visData(visData_), Averages(Avgerages_), fillData(fillData_) { - MPI_Comm_dup(MPI_COMM_WORLD,&newcomm); + newcomm = Utilities::MPI(MPI_COMM_WORLD).dup(); } - ~WriteVisWorkItem() { MPI_Comm_free(&newcomm); } + ~WriteVisWorkItem() {} virtual void run() 
{ PROFILE_START("Save Vis",1); ASSERT(visData[0].vars[0]->name=="phase"); @@ -198,7 +180,7 @@ private: std::vector& visData; TwoPhase& Averages; fillHalo& fillData; - MPI_Comm newcomm; + Utilities::MPI newcomm; }; @@ -418,7 +400,7 @@ void run_analysis( int timestep, int restart_interval, // Spawn a thread to write the restart file if ( (type&CreateRestart) != 0 ) { - int rank = MPI_WORLD_RANK(); + int rank = comm.getRank(); // Wait for previous restart files to finish writing (not necessary, but helps to ensure memory usage is limited) tpool.wait(wait.restart); diff --git a/tests/lbpm_permeability_simulator.cpp b/tests/lbpm_permeability_simulator.cpp index dbcfb96b..eb5e6d4b 100644 --- a/tests/lbpm_permeability_simulator.cpp +++ b/tests/lbpm_permeability_simulator.cpp @@ -9,7 +9,7 @@ #include "common/ScaLBL.h" #include "common/Communication.h" #include "analysis/TwoPhase.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "models/MRTModel.h" //#define WRITE_SURFACES @@ -24,11 +24,10 @@ using namespace std; int main(int argc, char **argv) { // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { if (rank == 0){ printf("********************************************************\n"); @@ -39,7 +38,7 @@ int main(int argc, char **argv) int device=ScaLBL_SetDevice(rank); NULL_USE( device ); ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + comm.barrier(); ScaLBL_MRTModel MRT(rank,nprocs,comm); auto filename = argv[1]; @@ -52,7 +51,7 @@ int main(int argc, char **argv) MRT.VelocityField(); } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_plates_pp.cpp b/tests/lbpm_plates_pp.cpp index 8344df47..acd64f52 100644 --- a/tests/lbpm_plates_pp.cpp +++ b/tests/lbpm_plates_pp.cpp @@ -9,19 +9,15 @@ #include "common/ScaLBL.h" #include "common/Communication.h" #include "analysis/TwoPhase.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; @@ -79,7 +75,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - MPI_Barrier(comm); + comm.barrier(); // Computational domain MPI_Bcast(&Nx,1,MPI_INT,0,comm); MPI_Bcast(&Ny,1,MPI_INT,0,comm); @@ -92,7 +88,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. 
- MPI_Barrier(comm); + comm.barrier(); // ************************************************************** if (nprocs != nprocx*nprocy*nprocz){ @@ -116,7 +112,7 @@ int main(int argc, char **argv) std::shared_ptr Averages( new TwoPhase(Dm) ); - MPI_Barrier(comm); + comm.barrier(); Nz += 2; Nx = Ny = Nz; // Cubic domain @@ -200,7 +196,7 @@ int main(int argc, char **argv) } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_porenetwork_pp.cpp b/tests/lbpm_porenetwork_pp.cpp index 496f9d86..4a6ccda7 100644 --- a/tests/lbpm_porenetwork_pp.cpp +++ b/tests/lbpm_porenetwork_pp.cpp @@ -9,19 +9,15 @@ #include "common/ScaLBL.h" #include "common/Communication.h" #include "analysis/TwoPhase.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; @@ -69,7 +65,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - MPI_Barrier(comm); + comm.barrier(); // Computational domain MPI_Bcast(&Nx,1,MPI_INT,0,comm); MPI_Bcast(&Ny,1,MPI_INT,0,comm); @@ -82,7 +78,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. - MPI_Barrier(comm); + comm.barrier(); // ************************************************************** if (nprocs != nprocx*nprocy*nprocz){ @@ -108,7 +104,7 @@ int main(int argc, char **argv) Dm->CommInit(); std::shared_ptr Averages( new TwoPhase(Dm) ); - MPI_Barrier(comm); + comm.barrier(); Nx += 2; Ny += 2; Nz += 2; @@ -293,7 +289,7 @@ int main(int argc, char **argv) } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_random_pp.cpp b/tests/lbpm_random_pp.cpp index 07c56e6f..ad4b83cc 100644 --- a/tests/lbpm_random_pp.cpp +++ b/tests/lbpm_random_pp.cpp @@ -52,11 +52,10 @@ inline void UnpackID(int *list, int count, char *recvbuf, char *ID){ int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); int InitialWetting; double Saturation; @@ -97,7 +96,7 @@ int main(int argc, char **argv) domain >> Lz; } - MPI_Barrier(comm); + comm.barrier(); // Computational domain MPI_Bcast(&nx,1,MPI_INT,0,comm); MPI_Bcast(&ny,1,MPI_INT,0,comm); @@ -110,7 +109,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. 
- MPI_Barrier(comm); + comm.barrier(); // Check that the number of processors >= the number of ranks if ( rank==0 ) { @@ -422,7 +421,7 @@ int main(int argc, char **argv) fwrite(id,1,N,ID); fclose(ID); - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return 0; } diff --git a/tests/lbpm_refine_pp.cpp b/tests/lbpm_refine_pp.cpp index d90dbb04..149ae673 100644 --- a/tests/lbpm_refine_pp.cpp +++ b/tests/lbpm_refine_pp.cpp @@ -16,11 +16,10 @@ int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { //....................................................................... @@ -422,7 +421,7 @@ int main(int argc, char **argv) } - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return 0; } diff --git a/tests/lbpm_segmented_decomp.cpp b/tests/lbpm_segmented_decomp.cpp index 3384e454..1bc89adb 100644 --- a/tests/lbpm_segmented_decomp.cpp +++ b/tests/lbpm_segmented_decomp.cpp @@ -18,12 +18,10 @@ int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { @@ -84,7 +82,7 @@ int main(int argc, char **argv) image >> zStart; } - MPI_Barrier(comm); + comm.barrier(); // Computational domain //................................................. MPI_Bcast(&nx,1,MPI_INT,0,comm); @@ -105,7 +103,7 @@ int main(int argc, char **argv) MPI_Bcast(&yStart,1,MPI_INT,0,comm); MPI_Bcast(&zStart,1,MPI_INT,0,comm); //................................................. - MPI_Barrier(comm); + comm.barrier(); // Check that the number of processors >= the number of ranks if ( rank==0 ) { @@ -129,7 +127,7 @@ int main(int argc, char **argv) fclose(SEGDAT); printf("Read segmented data from %s \n",Filename); } - MPI_Barrier(comm); + comm.barrier(); // Get the rank info int N = (nx+2)*(ny+2)*(nz+2); @@ -204,7 +202,7 @@ int main(int argc, char **argv) printf("Ready to recieve data %i at process %i \n", N,rank); MPI_Recv(Dm.id,N,MPI_CHAR,0,15,comm,MPI_STATUS_IGNORE); } - MPI_Barrier(comm); + comm.barrier(); nx+=2; ny+=2; nz+=2; N=nx*ny*nz; @@ -340,7 +338,7 @@ int main(int argc, char **argv) if (!MULTINPUT){ if (rank==0) printf("Writing symmetric domain reflection\n"); - MPI_Barrier(comm); + comm.barrier(); int symrank,sympz; sympz = 2*nprocz - Dm.kproc() -1; symrank = sympz*nprocx*nprocy + Dm.jproc()*nprocx + Dm.iproc(); @@ -366,6 +364,6 @@ int main(int argc, char **argv) fclose(SYMID); } } - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); } diff --git a/tests/lbpm_segmented_pp.cpp b/tests/lbpm_segmented_pp.cpp index 007ff9d1..39cf0bd1 100644 --- a/tests/lbpm_segmented_pp.cpp +++ b/tests/lbpm_segmented_pp.cpp @@ -115,11 +115,10 @@ double ReadFromBlock( char *ID, int iproc, int jproc, int kproc, int Nx, int Ny, int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { //....................................................................... 
// Reading the domain information file @@ -231,7 +230,7 @@ int main(int argc, char **argv) fclose(DIST); } - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return 0; diff --git a/tests/lbpm_sphere_pp.cpp b/tests/lbpm_sphere_pp.cpp index 98778b8d..2e053eed 100644 --- a/tests/lbpm_sphere_pp.cpp +++ b/tests/lbpm_sphere_pp.cpp @@ -9,7 +9,7 @@ #include "analysis/pmmc.h" #include "common/Domain.h" #include "common/SpherePack.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Communication.h" /* @@ -22,15 +22,11 @@ using namespace std; int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); // parallel domain size (# of sub-domains) int iproc,jproc,kproc; int sendtag,recvtag; @@ -127,14 +123,14 @@ int main(int argc, char **argv) //....................................................................... if (rank == 0) printf("Reading the sphere packing \n"); if (rank == 0) ReadSpherePacking(nspheres,cx,cy,cz,rad); - MPI_Barrier(comm); + comm.barrier(); // Broadcast the sphere packing to all processes MPI_Bcast(cx,nspheres,MPI_DOUBLE,0,comm); MPI_Bcast(cy,nspheres,MPI_DOUBLE,0,comm); MPI_Bcast(cz,nspheres,MPI_DOUBLE,0,comm); MPI_Bcast(rad,nspheres,MPI_DOUBLE,0,comm); //........................................................................... - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; if (rank == 0){ // Compute the Sauter mean diameter @@ -217,7 +213,7 @@ int main(int argc, char **argv) fclose(ID); // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_squaretube_pp.cpp b/tests/lbpm_squaretube_pp.cpp index 42715773..c1f05aee 100644 --- a/tests/lbpm_squaretube_pp.cpp +++ b/tests/lbpm_squaretube_pp.cpp @@ -9,19 +9,15 @@ #include "common/ScaLBL.h" #include "common/Communication.h" #include "analysis/TwoPhase.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; @@ -85,7 +81,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - MPI_Barrier(comm); + comm.barrier(); // Computational domain MPI_Bcast(&Nx,1,MPI_INT,0,comm); MPI_Bcast(&Ny,1,MPI_INT,0,comm); @@ -98,7 +94,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. 
- MPI_Barrier(comm); + comm.barrier(); // ************************************************************** if (nprocs != nprocx*nprocy*nprocz){ @@ -125,7 +121,7 @@ int main(int argc, char **argv) rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - MPI_Barrier(comm); + comm.barrier(); Nz += 2; Nx = Ny = Nz; // Cubic domain @@ -259,7 +255,7 @@ int main(int argc, char **argv) } // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_uCT_maskfilter.cpp b/tests/lbpm_uCT_maskfilter.cpp index cff41ad7..857bc4e0 100644 --- a/tests/lbpm_uCT_maskfilter.cpp +++ b/tests/lbpm_uCT_maskfilter.cpp @@ -14,7 +14,7 @@ #include "common/Array.h" #include "common/Domain.h" #include "common/Communication.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "IO/MeshDatabase.h" #include "IO/Mesh.h" #include "IO/Writer.h" @@ -30,13 +30,11 @@ int main(int argc, char **argv) { - // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); Utilities::setErrorHandlers(); PROFILE_START("Main"); @@ -151,7 +149,7 @@ int main(int argc, char **argv) } netcdf::close( distid ); - MPI_Barrier(comm); + comm.barrier(); PROFILE_STOP("ReadDistance"); if (rank==0) printf("Finished reading distance =\n"); @@ -184,7 +182,7 @@ int main(int argc, char **argv) fillFloat[0]->fill( LOCVOL[0] ); } netcdf::close( fid ); - MPI_Barrier(comm); + comm.barrier(); PROFILE_STOP("ReadVolume"); if (rank==0) printf("Read complete\n"); @@ -447,7 +445,7 @@ int main(int argc, char **argv) PROFILE_STOP("Main"); PROFILE_SAVE("lbpm_uCT_maskfilter",true); - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return 0; } diff --git a/tests/lbpm_uCT_pp.cpp b/tests/lbpm_uCT_pp.cpp index 0285b864..6e8d1bde 100644 --- a/tests/lbpm_uCT_pp.cpp +++ b/tests/lbpm_uCT_pp.cpp @@ -14,7 +14,7 @@ #include "common/Array.h" #include "common/Domain.h" #include "common/Communication.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "IO/MeshDatabase.h" #include "IO/Mesh.h" #include "IO/Writer.h" @@ -31,11 +31,10 @@ int main(int argc, char **argv) { // Initialize MPI - int rank, nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); { Utilities::setErrorHandlers(); PROFILE_START("Main"); @@ -188,7 +187,7 @@ int main(int argc, char **argv) fillFloat[0]->fill( LOCVOL[0] ); } netcdf::close( fid ); - MPI_Barrier(comm); + comm.barrier(); PROFILE_STOP("ReadVolume"); if (rank==0) printf("Read complete\n"); @@ -251,15 +250,15 @@ int main(int argc, char **argv) } } } - count_plus=sumReduce( Dm[0]->Comm, count_plus); - count_minus=sumReduce( Dm[0]->Comm, count_minus); + count_plus = Dm[0]->Comm.sumReduce( count_plus); + count_minus = Dm[0]->Comm.sumReduce( count_minus); if (rank==0) printf("minimum value=%f, max value=%f \n",min_value,max_value); if (rank==0) printf("plus=%i, minus=%i \n",count_plus,count_minus); ASSERT( count_plus > 0 && count_minus > 0 ); - MPI_Barrier(comm); - mean_plus = sumReduce( Dm[0]->Comm, mean_plus ) / count_plus; - mean_minus = sumReduce( Dm[0]->Comm, mean_minus ) / 
count_minus; - MPI_Barrier(comm); + comm.barrier(); + mean_plus = Dm[0]->Comm.sumReduce( mean_plus ) / count_plus; + mean_minus = Dm[0]->Comm.sumReduce( mean_minus ) / count_minus; + comm.barrier(); if (rank==0) printf(" Region 1 mean (+): %f, Region 2 mean (-): %f \n",mean_plus, mean_minus); //if (rank==0) printf("Scale the input data (size = %i) \n",LOCVOL[0].length()); @@ -280,7 +279,7 @@ int main(int argc, char **argv) // Fill the source data for the coarse meshes if (rank==0) printf("Coarsen the mesh for N_levels=%i \n",N_levels); - MPI_Barrier(comm); + comm.barrier(); PROFILE_START("CoarsenMesh"); for (int i=1; i filter(ratio[0],ratio[1],ratio[2]); @@ -296,7 +295,7 @@ int main(int argc, char **argv) printf(" filter_x=%i, filter_y=%i, filter_z=%i \n",int(filter.size(0)),int(filter.size(1)),int(filter.size(2)) ); printf(" ratio= %i,%i,%i \n",int(ratio[0]),int(ratio[1]),int(ratio[2]) ); } - MPI_Barrier(comm); + comm.barrier(); } PROFILE_STOP("CoarsenMesh"); @@ -308,7 +307,7 @@ int main(int argc, char **argv) NonLocalMean.back(), *fillFloat.back(), *Dm.back(), nprocx, rough_cutoff, lamda, nlm_sigsq, nlm_depth); PROFILE_STOP("Solve coarse mesh"); - MPI_Barrier(comm); + comm.barrier(); // Refine the solution PROFILE_START("Refine distance"); @@ -322,7 +321,7 @@ int main(int argc, char **argv) rough_cutoff, lamda, nlm_sigsq, nlm_depth); } PROFILE_STOP("Refine distance"); - MPI_Barrier(comm); + comm.barrier(); // Perform a final filter PROFILE_START("Filtering final domains"); @@ -418,14 +417,14 @@ int main(int argc, char **argv) meshData[0].vars.push_back(filter_Dist2_var); fillDouble[0]->copy( filter_Dist2, filter_Dist2_var->data ); #endif - MPI_Barrier(comm); + comm.barrier(); if (rank==0) printf("Writing output \n"); // Write visulization data IO::writeData( 0, meshData, comm ); if (rank==0) printf("Finished. 
\n"); // Compute the Minkowski functionals - MPI_Barrier(comm); + comm.barrier(); auto Averages = std::make_shared(Dm[0]); Array phase_label(Nx[0]+2,Ny[0]+2,Nz[0]+2); @@ -457,7 +456,7 @@ int main(int argc, char **argv) } PROFILE_STOP("Main"); PROFILE_SAVE("lbpm_uCT_pp",true); - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return 0; } diff --git a/tests/testCommunication.cpp b/tests/testCommunication.cpp index 57ce0959..911ef1c5 100644 --- a/tests/testCommunication.cpp +++ b/tests/testCommunication.cpp @@ -6,7 +6,7 @@ #include #include "common/Communication.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Array.h" using namespace std; @@ -15,11 +15,9 @@ using namespace std; //*************************************************************************************** -int test_communication( MPI_Comm comm, int nprocx, int nprocy, int nprocz ) +int test_communication( const Utilities::MPI& comm, int nprocx, int nprocy, int nprocz ) { - int rank,nprocs; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + int rank = comm.getRank(); int iproc,jproc,kproc; int sendtag,recvtag; if (rank==0) printf("\nRunning test %i %i %i\n",nprocx,nprocy,nprocz); @@ -38,7 +36,7 @@ int test_communication( MPI_Comm comm, int nprocx, int nprocy, int nprocz ) rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - MPI_Barrier(comm); + comm.barrier(); //********************************** @@ -85,7 +83,7 @@ int test_communication( MPI_Comm comm, int nprocx, int nprocy, int nprocz ) sendCount_xy = sendCount_yz = sendCount_xz = sendCount_Xy = sendCount_Yz = sendCount_xZ = 0; sendCount_xY = sendCount_yZ = sendCount_Xz = sendCount_XY = sendCount_YZ = sendCount_XZ = 0; - MPI_Barrier(comm); + comm.barrier(); if (rank==0) printf ("SendLists are ready on host\n"); //...................................................................................... 
// Use MPI to fill in the recvCounts form the associated processes @@ -158,7 +156,7 @@ int test_communication( MPI_Comm comm, int nprocx, int nprocy, int nprocz ) recvCount_yz, recvCount_YZ, recvCount_yZ, recvCount_Yz, rank_x, rank_y, rank_z, rank_X, rank_Y, rank_Z, rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - MPI_Barrier(comm); + comm.barrier(); if (rank==0) printf ("RecvLists finished\n"); // Free memory @@ -181,11 +179,9 @@ int test_communication( MPI_Comm comm, int nprocx, int nprocy, int nprocz ) template -int testHalo( MPI_Comm comm, int nprocx, int nprocy, int nprocz, int depth ) +int testHalo( const Utilities::MPI& comm, int nprocx, int nprocy, int nprocz, int depth ) { - int rank,nprocs; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + int rank = comm.getRank(); if ( rank==0 ) printf("\nRunning Halo test %i %i %i %i\n",nprocx,nprocy,nprocz,depth); @@ -255,11 +251,10 @@ int testHalo( MPI_Comm comm, int nprocx, int nprocy, int nprocz, int depth ) int main(int argc, char **argv) { // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); // Run the test with different domains int N_errors = 0; @@ -289,10 +284,9 @@ int main(int argc, char **argv) } // Finished - MPI_Barrier(comm); - int N_errors_global=0; - MPI_Allreduce( &N_errors, &N_errors_global, 1, MPI_INT, MPI_SUM, comm ); - MPI_Barrier(comm); + comm.barrier(); + int N_errors_global = comm.sumReduce( N_errors ); + comm.barrier(); MPI_Finalize(); if ( rank==0 ) { if ( N_errors_global==0 ) diff --git a/tests/test_dcel_minkowski.cpp b/tests/test_dcel_minkowski.cpp index 0d6cbca9..2669b522 100644 --- a/tests/test_dcel_minkowski.cpp +++ b/tests/test_dcel_minkowski.cpp @@ -26,9 +26,9 @@ std::shared_ptr loadInputs( ) int main(int argc, char **argv) { MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - //int rank = MPI_WORLD_RANK(); - //int nprocs = MPI_WORLD_SIZE(); + Utilities::MPI comm( MPI_COMM_WORLD ); + //int rank = comm.getRank(); + //int nprocs = comm.getSize(); int toReturn = 0; { int i,j,k; @@ -99,7 +99,7 @@ int main(int argc, char **argv) } PROFILE_SAVE("test_dcel_minkowski"); - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return toReturn; } diff --git a/tests/test_dcel_tri_normal.cpp b/tests/test_dcel_tri_normal.cpp index 1e85b1f3..b6497140 100644 --- a/tests/test_dcel_tri_normal.cpp +++ b/tests/test_dcel_tri_normal.cpp @@ -26,7 +26,7 @@ std::shared_ptr loadInputs( ) int main(int argc, char **argv) { MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; + Utilities::MPI comm( MPI_COMM_WORLD ); int toReturn = 0; { int i,j,k; @@ -136,7 +136,7 @@ int main(int argc, char **argv) if (count_check > 0) toReturn=2; else printf("Succeeded. 
\n"); } - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); return toReturn; } From 0f91767b6c870101084fbae0978280c04c85a004 Mon Sep 17 00:00:00 2001 From: Mark Berrill Date: Tue, 28 Jan 2020 12:33:36 -0500 Subject: [PATCH 030/121] Moving more MPI calls to the wrapper --- IO/netcdf.cpp | 2 +- StackTrace/ErrorHandlers.h | 2 +- StackTrace/Utilities.cpp | 2 +- analysis/TwoPhase.cpp | 7 +- analysis/morphology.cpp | 132 +++-- common/Communication.h | 216 ++++---- common/Domain.cpp | 206 ++++---- common/Domain.h | 3 - common/MPI.I | 33 ++ common/MPI.cpp | 48 ++ common/MPI.h | 7 + common/ScaLBL.h | 1 - common/Utilities.cpp | 2 +- cpu/exe/lb2_Color_mpi.cpp | 538 ++++++++++--------- cpu/exe/lb2_Color_wia_mpi_bubble.cpp | 711 ++++++++++++-------------- gpu/exe/lb1_MRT_mpi.cpp | 348 ++++++------- gpu/exe/lb1_MRT_mpi.cu | 352 ++++++------- gpu/exe/lb2_Color.cu | 65 +-- gpu/exe/lb2_Color_mpi.cpp | 541 ++++++++++---------- gpu/exe/lb2_Color_pBC_wia_mpi.cpp | 621 ++++++++++------------ models/ColorModel.cpp | 8 +- models/DFHModel.cpp | 4 +- models/MRTModel.cpp | 4 +- tests/BlobAnalyzeParallel.cpp | 22 +- tests/GenerateSphereTest.cpp | 54 +- tests/TestBlobAnalyze.cpp | 28 +- tests/TestBubble.cpp | 41 +- tests/TestBubbleDFH.cpp | 4 +- tests/TestColorGrad.cpp | 20 +- tests/TestCommD3Q19.cpp | 4 +- tests/TestForceD3Q19.cpp | 4 +- tests/TestForceMoments.cpp | 4 +- tests/TestMRT.cpp | 28 +- tests/TestMicroCTReader.cpp | 1 - tests/TestMomentsD3Q19.cpp | 2 +- tests/TestNetcdf.cpp | 2 +- tests/TestSegDist.cpp | 4 +- tests/lb2_CMT_wia.cpp | 30 +- tests/lb2_Color_blob_wia_mpi.cpp | 427 ++++++++-------- tests/lbpm_BGK_simulator.cpp | 48 +- tests/lbpm_color_macro_simulator.cpp | 61 ++- tests/lbpm_disc_pp.cpp | 34 +- tests/lbpm_inkbottle_pp.cpp | 22 +- tests/lbpm_juanes_bench_disc_pp.cpp | 35 +- tests/lbpm_nondarcy_simulator.cpp | 52 +- tests/lbpm_nonnewtonian_simulator.cpp | 26 +- tests/lbpm_plates_pp.cpp | 24 +- tests/lbpm_porenetwork_pp.cpp | 25 +- tests/lbpm_random_pp.cpp | 92 ++-- tests/lbpm_segmented_decomp.cpp | 48 +- tests/lbpm_segmented_pp.cpp | 2 +- tests/lbpm_sphere_pp.cpp | 16 +- tests/lbpm_squaretube_pp.cpp | 25 +- 53 files changed, 2360 insertions(+), 2678 deletions(-) diff --git a/IO/netcdf.cpp b/IO/netcdf.cpp index e061579a..6c3773e3 100644 --- a/IO/netcdf.cpp +++ b/IO/netcdf.cpp @@ -119,7 +119,7 @@ std::string VariableTypeName( VariableType type ) int open( const std::string& filename, FileMode mode, const Utilities::MPI& comm ) { int fid = 0; - if ( comm == MPI_COMM_NULL ) { + if ( comm.isNull() ) { if ( mode == READ ) { int err = nc_open( filename.c_str(), NC_NOWRITE, &fid ); CHECK_NC_ERR( err ); diff --git a/StackTrace/ErrorHandlers.h b/StackTrace/ErrorHandlers.h index 12b8d7de..e43a4688 100644 --- a/StackTrace/ErrorHandlers.h +++ b/StackTrace/ErrorHandlers.h @@ -6,7 +6,7 @@ #include -#include "mpi.h" +#include "common/MPI.h" namespace StackTrace diff --git a/StackTrace/Utilities.cpp b/StackTrace/Utilities.cpp index 11f05777..5fb8e9b8 100644 --- a/StackTrace/Utilities.cpp +++ b/StackTrace/Utilities.cpp @@ -14,7 +14,7 @@ #include #ifdef USE_MPI -#include "mpi.h" +#include "common/MPI.h" #endif #ifdef USE_TIMER diff --git a/analysis/TwoPhase.cpp b/analysis/TwoPhase.cpp index ea136758..1dbdfbfa 100644 --- a/analysis/TwoPhase.cpp +++ b/analysis/TwoPhase.cpp @@ -890,14 +890,14 @@ void TwoPhase::ComponentAverages() RecvBuffer.resize(BLOB_AVG_COUNT,NumberComponents_NWP); /* for (int b=0; bComm); - MPI_Allreduce(&ComponentAverages_NWP(0,b),&RecvBuffer(0),BLOB_AVG_COUNT,MPI_DOUBLE,MPI_SUM,Dm->Comm); + 
Dm->Comm.barrier(); + Dm->Comm.sumReduce(&ComponentAverages_NWP(0,b),&RecvBuffer(0),BLOB_AVG_COUNT); for (int idx=0; idxComm.barrier(); Dm->Comm.sumReduce(ComponentAverages_NWP.data(),RecvBuffer.data(),BLOB_AVG_COUNT*NumberComponents_NWP); - // MPI_Reduce(ComponentAverages_NWP.data(),RecvBuffer.data(),BLOB_AVG_COUNT,MPI_DOUBLE,MPI_SUM,0,Dm->Comm); + // Dm->Comm.sumReduce(ComponentAverages_NWP.data(),RecvBuffer.data(),BLOB_AVG_COUNT); if (Dm->rank()==0){ printf("rescaling... \n"); @@ -994,7 +994,6 @@ void TwoPhase::ComponentAverages() // reduce the wetting phase averages for (int b=0; bComm.barrier(); -// MPI_Allreduce(&ComponentAverages_WP(0,b),RecvBuffer.data(),BLOB_AVG_COUNT,MPI_DOUBLE,MPI_SUM,Dm->Comm); Dm->Comm.sumReduce(&ComponentAverages_WP(0,b),RecvBuffer.data(),BLOB_AVG_COUNT); for (int idx=0; idx PackID(Dm->sendList_yZ, Dm->sendCount_yZ ,sendID_yZ, id); PackID(Dm->sendList_YZ, Dm->sendCount_YZ ,sendID_YZ, id); //...................................................................................... - MPI_Sendrecv(sendID_x,Dm->sendCount_x,MPI_CHAR,Dm->rank_x(),sendtag, - recvID_X,Dm->recvCount_X,MPI_CHAR,Dm->rank_X(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_X,Dm->sendCount_X,MPI_CHAR,Dm->rank_X(),sendtag, - recvID_x,Dm->recvCount_x,MPI_CHAR,Dm->rank_x(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_y,Dm->sendCount_y,MPI_CHAR,Dm->rank_y(),sendtag, - recvID_Y,Dm->recvCount_Y,MPI_CHAR,Dm->rank_Y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Y,Dm->sendCount_Y,MPI_CHAR,Dm->rank_Y(),sendtag, - recvID_y,Dm->recvCount_y,MPI_CHAR,Dm->rank_y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_z,Dm->sendCount_z,MPI_CHAR,Dm->rank_z(),sendtag, - recvID_Z,Dm->recvCount_Z,MPI_CHAR,Dm->rank_Z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Z,Dm->sendCount_Z,MPI_CHAR,Dm->rank_Z(),sendtag, - recvID_z,Dm->recvCount_z,MPI_CHAR,Dm->rank_z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xy,Dm->sendCount_xy,MPI_CHAR,Dm->rank_xy(),sendtag, - recvID_XY,Dm->recvCount_XY,MPI_CHAR,Dm->rank_XY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_XY,Dm->sendCount_XY,MPI_CHAR,Dm->rank_XY(),sendtag, - recvID_xy,Dm->recvCount_xy,MPI_CHAR,Dm->rank_xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Xy,Dm->sendCount_Xy,MPI_CHAR,Dm->rank_Xy(),sendtag, - recvID_xY,Dm->recvCount_xY,MPI_CHAR,Dm->rank_xY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xY,Dm->sendCount_xY,MPI_CHAR,Dm->rank_xY(),sendtag, - recvID_Xy,Dm->recvCount_Xy,MPI_CHAR,Dm->rank_Xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xz,Dm->sendCount_xz,MPI_CHAR,Dm->rank_xz(),sendtag, - recvID_XZ,Dm->recvCount_XZ,MPI_CHAR,Dm->rank_XZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_XZ,Dm->sendCount_XZ,MPI_CHAR,Dm->rank_XZ(),sendtag, - recvID_xz,Dm->recvCount_xz,MPI_CHAR,Dm->rank_xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Xz,Dm->sendCount_Xz,MPI_CHAR,Dm->rank_Xz(),sendtag, - recvID_xZ,Dm->recvCount_xZ,MPI_CHAR,Dm->rank_xZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xZ,Dm->sendCount_xZ,MPI_CHAR,Dm->rank_xZ(),sendtag, - recvID_Xz,Dm->recvCount_Xz,MPI_CHAR,Dm->rank_Xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - 
MPI_Sendrecv(sendID_yz,Dm->sendCount_yz,MPI_CHAR,Dm->rank_yz(),sendtag, - recvID_YZ,Dm->recvCount_YZ,MPI_CHAR,Dm->rank_YZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_YZ,Dm->sendCount_YZ,MPI_CHAR,Dm->rank_YZ(),sendtag, - recvID_yz,Dm->recvCount_yz,MPI_CHAR,Dm->rank_yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Yz,Dm->sendCount_Yz,MPI_CHAR,Dm->rank_Yz(),sendtag, - recvID_yZ,Dm->recvCount_yZ,MPI_CHAR,Dm->rank_yZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_yZ,Dm->sendCount_yZ,MPI_CHAR,Dm->rank_yZ(),sendtag, - recvID_Yz,Dm->recvCount_Yz,MPI_CHAR,Dm->rank_Yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + Dm->Comm.sendrecv(sendID_x,Dm->sendCount_x,Dm->rank_x(),sendtag,recvID_X,Dm->recvCount_X,Dm->rank_X(),recvtag); + Dm->Comm.sendrecv(sendID_X,Dm->sendCount_X,Dm->rank_X(),sendtag,recvID_x,Dm->recvCount_x,Dm->rank_x(),recvtag); + Dm->Comm.sendrecv(sendID_y,Dm->sendCount_y,Dm->rank_y(),sendtag,recvID_Y,Dm->recvCount_Y,Dm->rank_Y(),recvtag); + Dm->Comm.sendrecv(sendID_Y,Dm->sendCount_Y,Dm->rank_Y(),sendtag,recvID_y,Dm->recvCount_y,Dm->rank_y(),recvtag); + Dm->Comm.sendrecv(sendID_z,Dm->sendCount_z,Dm->rank_z(),sendtag,recvID_Z,Dm->recvCount_Z,Dm->rank_Z(),recvtag); + Dm->Comm.sendrecv(sendID_Z,Dm->sendCount_Z,Dm->rank_Z(),sendtag,recvID_z,Dm->recvCount_z,Dm->rank_z(),recvtag); + Dm->Comm.sendrecv(sendID_xy,Dm->sendCount_xy,Dm->rank_xy(),sendtag,recvID_XY,Dm->recvCount_XY,Dm->rank_XY(),recvtag); + Dm->Comm.sendrecv(sendID_XY,Dm->sendCount_XY,Dm->rank_XY(),sendtag,recvID_xy,Dm->recvCount_xy,Dm->rank_xy(),recvtag); + Dm->Comm.sendrecv(sendID_Xy,Dm->sendCount_Xy,Dm->rank_Xy(),sendtag,recvID_xY,Dm->recvCount_xY,Dm->rank_xY(),recvtag); + Dm->Comm.sendrecv(sendID_xY,Dm->sendCount_xY,Dm->rank_xY(),sendtag,recvID_Xy,Dm->recvCount_Xy,Dm->rank_Xy(),recvtag); + Dm->Comm.sendrecv(sendID_xz,Dm->sendCount_xz,Dm->rank_xz(),sendtag,recvID_XZ,Dm->recvCount_XZ,Dm->rank_XZ(),recvtag); + Dm->Comm.sendrecv(sendID_XZ,Dm->sendCount_XZ,Dm->rank_XZ(),sendtag,recvID_xz,Dm->recvCount_xz,Dm->rank_xz(),recvtag); + Dm->Comm.sendrecv(sendID_Xz,Dm->sendCount_Xz,Dm->rank_Xz(),sendtag,recvID_xZ,Dm->recvCount_xZ,Dm->rank_xZ(),recvtag); + Dm->Comm.sendrecv(sendID_xZ,Dm->sendCount_xZ,Dm->rank_xZ(),sendtag,recvID_Xz,Dm->recvCount_Xz,Dm->rank_Xz(),recvtag); + Dm->Comm.sendrecv(sendID_yz,Dm->sendCount_yz,Dm->rank_yz(),sendtag,recvID_YZ,Dm->recvCount_YZ,Dm->rank_YZ(),recvtag); + Dm->Comm.sendrecv(sendID_YZ,Dm->sendCount_YZ,Dm->rank_YZ(),sendtag,recvID_yz,Dm->recvCount_yz,Dm->rank_yz(),recvtag); + Dm->Comm.sendrecv(sendID_Yz,Dm->sendCount_Yz,Dm->rank_Yz(),sendtag,recvID_yZ,Dm->recvCount_yZ,Dm->rank_yZ(),recvtag); + Dm->Comm.sendrecv(sendID_yZ,Dm->sendCount_yZ,Dm->rank_yZ(),sendtag,recvID_Yz,Dm->recvCount_Yz,Dm->rank_Yz(),recvtag); //...................................................................................... 
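The eighteen MPI_Sendrecv pairs above collapse into single wrapper calls: the explicit MPI_CHAR datatype, the raw communicator, and MPI_STATUS_IGNORE all drop out. A self-contained sketch of one direction is below; the parameter names are local stand-ins rather than the Domain members used in the real code.

#include "common/MPI.h"

// Illustrative only: one direction of the ID halo exchange, in isolation.
static void exchange_ids( const Utilities::MPI& comm,
                          const char *sendID, int sendCount, int dest,
                          char *recvID, int recvCount, int source, int tag )
{
    // Replaces MPI_Sendrecv(..., MPI_CHAR, ..., comm.getCommunicator(), MPI_STATUS_IGNORE):
    // the datatype is deduced from the char* buffers and the status is discarded.
    comm.sendrecv( sendID, sendCount, dest, tag,
                   recvID, recvCount, source, tag );
}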
UnpackID(Dm->recvList_x, Dm->recvCount_x ,recvID_x, id); UnpackID(Dm->recvList_X, Dm->recvCount_X ,recvID_X, id); @@ -303,7 +285,7 @@ double morph_open() fillHalo fillChar(Dm->Comm,Dm->rank_info,{Nx-2,Ny-2,Nz-2},{1,1,1},0,1); - MPI_Allreduce(&LocalNumber,&GlobalNumber,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + GlobalNumber = Dm->Comm.sumReduce( LocalNumber ); count = 0.f; for (int k=1; kComm); + countGlobal = Dm->Comm.sumReduce( count ); return countGlobal; } */ @@ -506,42 +488,42 @@ double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptrsendList_yZ, Dm->sendCount_yZ ,sendID_yZ, id); PackID(Dm->sendList_YZ, Dm->sendCount_YZ ,sendID_YZ, id); //...................................................................................... - MPI_Sendrecv(sendID_x,Dm->sendCount_x,MPI_CHAR,Dm->rank_x(),sendtag, - recvID_X,Dm->recvCount_X,MPI_CHAR,Dm->rank_X(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_X,Dm->sendCount_X,MPI_CHAR,Dm->rank_X(),sendtag, - recvID_x,Dm->recvCount_x,MPI_CHAR,Dm->rank_x(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_y,Dm->sendCount_y,MPI_CHAR,Dm->rank_y(),sendtag, - recvID_Y,Dm->recvCount_Y,MPI_CHAR,Dm->rank_Y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Y,Dm->sendCount_Y,MPI_CHAR,Dm->rank_Y(),sendtag, - recvID_y,Dm->recvCount_y,MPI_CHAR,Dm->rank_y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_z,Dm->sendCount_z,MPI_CHAR,Dm->rank_z(),sendtag, - recvID_Z,Dm->recvCount_Z,MPI_CHAR,Dm->rank_Z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Z,Dm->sendCount_Z,MPI_CHAR,Dm->rank_Z(),sendtag, - recvID_z,Dm->recvCount_z,MPI_CHAR,Dm->rank_z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xy,Dm->sendCount_xy,MPI_CHAR,Dm->rank_xy(),sendtag, - recvID_XY,Dm->recvCount_XY,MPI_CHAR,Dm->rank_XY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_XY,Dm->sendCount_XY,MPI_CHAR,Dm->rank_XY(),sendtag, - recvID_xy,Dm->recvCount_xy,MPI_CHAR,Dm->rank_xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Xy,Dm->sendCount_Xy,MPI_CHAR,Dm->rank_Xy(),sendtag, - recvID_xY,Dm->recvCount_xY,MPI_CHAR,Dm->rank_xY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xY,Dm->sendCount_xY,MPI_CHAR,Dm->rank_xY(),sendtag, - recvID_Xy,Dm->recvCount_Xy,MPI_CHAR,Dm->rank_Xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xz,Dm->sendCount_xz,MPI_CHAR,Dm->rank_xz(),sendtag, - recvID_XZ,Dm->recvCount_XZ,MPI_CHAR,Dm->rank_XZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_XZ,Dm->sendCount_XZ,MPI_CHAR,Dm->rank_XZ(),sendtag, - recvID_xz,Dm->recvCount_xz,MPI_CHAR,Dm->rank_xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Xz,Dm->sendCount_Xz,MPI_CHAR,Dm->rank_Xz(),sendtag, - recvID_xZ,Dm->recvCount_xZ,MPI_CHAR,Dm->rank_xZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xZ,Dm->sendCount_xZ,MPI_CHAR,Dm->rank_xZ(),sendtag, - recvID_Xz,Dm->recvCount_Xz,MPI_CHAR,Dm->rank_Xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_yz,Dm->sendCount_yz,MPI_CHAR,Dm->rank_yz(),sendtag, - recvID_YZ,Dm->recvCount_YZ,MPI_CHAR,Dm->rank_YZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_YZ,Dm->sendCount_YZ,MPI_CHAR,Dm->rank_YZ(),sendtag, - 
recvID_yz,Dm->recvCount_yz,MPI_CHAR,Dm->rank_yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Yz,Dm->sendCount_Yz,MPI_CHAR,Dm->rank_Yz(),sendtag, - recvID_yZ,Dm->recvCount_yZ,MPI_CHAR,Dm->rank_yZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_yZ,Dm->sendCount_yZ,MPI_CHAR,Dm->rank_yZ(),sendtag, - recvID_Yz,Dm->recvCount_Yz,MPI_CHAR,Dm->rank_Yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + Dm->Comm.sendrecv(sendID_x,Dm->sendCount_x,Dm->rank_x(),sendtag, + recvID_X,Dm->recvCount_X,Dm->rank_X(),recvtag); + Dm->Comm.sendrecv(sendID_X,Dm->sendCount_X,Dm->rank_X(),sendtag, + recvID_x,Dm->recvCount_x,Dm->rank_x(),recvtag); + Dm->Comm.sendrecv(sendID_y,Dm->sendCount_y,Dm->rank_y(),sendtag, + recvID_Y,Dm->recvCount_Y,Dm->rank_Y(),recvtag); + Dm->Comm.sendrecv(sendID_Y,Dm->sendCount_Y,Dm->rank_Y(),sendtag, + recvID_y,Dm->recvCount_y,Dm->rank_y(),recvtag); + Dm->Comm.sendrecv(sendID_z,Dm->sendCount_z,Dm->rank_z(),sendtag, + recvID_Z,Dm->recvCount_Z,Dm->rank_Z(),recvtag); + Dm->Comm.sendrecv(sendID_Z,Dm->sendCount_Z,Dm->rank_Z(),sendtag, + recvID_z,Dm->recvCount_z,Dm->rank_z(),recvtag); + Dm->Comm.sendrecv(sendID_xy,Dm->sendCount_xy,Dm->rank_xy(),sendtag, + recvID_XY,Dm->recvCount_XY,Dm->rank_XY(),recvtag); + Dm->Comm.sendrecv(sendID_XY,Dm->sendCount_XY,Dm->rank_XY(),sendtag, + recvID_xy,Dm->recvCount_xy,Dm->rank_xy(),recvtag); + Dm->Comm.sendrecv(sendID_Xy,Dm->sendCount_Xy,Dm->rank_Xy(),sendtag, + recvID_xY,Dm->recvCount_xY,Dm->rank_xY(),recvtag); + Dm->Comm.sendrecv(sendID_xY,Dm->sendCount_xY,Dm->rank_xY(),sendtag, + recvID_Xy,Dm->recvCount_Xy,Dm->rank_Xy(),recvtag); + Dm->Comm.sendrecv(sendID_xz,Dm->sendCount_xz,Dm->rank_xz(),sendtag, + recvID_XZ,Dm->recvCount_XZ,Dm->rank_XZ(),recvtag); + Dm->Comm.sendrecv(sendID_XZ,Dm->sendCount_XZ,Dm->rank_XZ(),sendtag, + recvID_xz,Dm->recvCount_xz,Dm->rank_xz(),recvtag); + Dm->Comm.sendrecv(sendID_Xz,Dm->sendCount_Xz,Dm->rank_Xz(),sendtag, + recvID_xZ,Dm->recvCount_xZ,Dm->rank_xZ(),recvtag); + Dm->Comm.sendrecv(sendID_xZ,Dm->sendCount_xZ,Dm->rank_xZ(),sendtag, + recvID_Xz,Dm->recvCount_Xz,Dm->rank_Xz(),recvtag); + Dm->Comm.sendrecv(sendID_yz,Dm->sendCount_yz,Dm->rank_yz(),sendtag, + recvID_YZ,Dm->recvCount_YZ,Dm->rank_YZ(),recvtag); + Dm->Comm.sendrecv(sendID_YZ,Dm->sendCount_YZ,Dm->rank_YZ(),sendtag, + recvID_yz,Dm->recvCount_yz,Dm->rank_yz(),recvtag); + Dm->Comm.sendrecv(sendID_Yz,Dm->sendCount_Yz,Dm->rank_Yz(),sendtag, + recvID_yZ,Dm->recvCount_yZ,Dm->rank_yZ(),recvtag); + Dm->Comm.sendrecv(sendID_yZ,Dm->sendCount_yZ,Dm->rank_yZ(),sendtag, + recvID_Yz,Dm->recvCount_Yz,Dm->rank_Yz(),recvtag); //...................................................................................... 
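Besides the halo exchanges, the reductions in this file and in TwoPhase.cpp follow the same idea: sumReduce stands in for MPI_Allreduce with MPI_SUM. A short sketch of the two forms used above; the values and the count are placeholders, and const-qualification of the wrapper methods is assumed.

#include "common/MPI.h"

static void reduce_examples( const Utilities::MPI& comm )
{
    // Scalar form: returns the reduced value directly.
    double LocalNumber  = 1.0;                        // hypothetical per-rank contribution
    double GlobalNumber = comm.sumReduce( LocalNumber );

    // Buffer form: reduces count elements into a separate receive buffer,
    // replacing MPI_Allreduce(local, global, count, MPI_DOUBLE, MPI_SUM, comm).
    const int count = 8;                              // hypothetical
    double local[count]  = { 0 };
    double global[count] = { 0 };
    comm.sumReduce( local, global, count );
    (void) GlobalNumber;
    (void) global;
}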
UnpackID(Dm->recvList_x, Dm->recvCount_x ,recvID_x, id); UnpackID(Dm->recvList_X, Dm->recvCount_X ,recvID_X, id); @@ -617,7 +599,7 @@ double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptrrank_info,phase,SignDist,vF,vS,phase_label,Dm->Comm); - MPI_Barrier(Dm->Comm); + Dm->Comm.barrier(); for (int k=1; k +void MPI_CLASS::sendrecv( const char*, int, int, int, char*, int, int, int ) const; +template<> +void MPI_CLASS::sendrecv( const int*, int, int, int, int*, int, int, int ) const; +template<> +void MPI_CLASS::sendrecv( const float*, int, int, int, float*, int, int, int ) const; +template<> +void MPI_CLASS::sendrecv( const double*, int, int, int, double*, int, int, int ) const; +template +void MPI_CLASS::sendrecv( const TYPE *sendbuf, int sendcount, int dest, int sendtag, + TYPE *recvbuf, int recvcount, int source, int recvtag ) const +{ + ERROR( "Not implimented" ); +} +#else +template +void MPI_CLASS::sendrecv( const TYPE *sendbuf, int sendcount, int dest, int sendtag, + TYPE *recvbuf, int recvcount, int source, int recvtag ) const +{ + ASSERT( dest == 0 ); + ASSERT( source == 0 ); + ASSERT( sendcount == recvcount ); + ASSERT( sendtag == recvtag ); + memcpy( recvbuf, sendbuf, sendcount * sizeof( TYPE ) ); +} +#endif + + + /************************************************************************ * allGather * ************************************************************************/ diff --git a/common/MPI.cpp b/common/MPI.cpp index d20c1af2..9495372d 100644 --- a/common/MPI.cpp +++ b/common/MPI.cpp @@ -2805,6 +2805,54 @@ MPI_Request MPI_CLASS::IrecvBytes( } + +/************************************************************************ + * sendrecv * + ************************************************************************/ +#if defined( USE_MPI ) || defined( USE_EXT_MPI ) +template<> +void MPI_CLASS::sendrecv( const char* sendbuf, int sendcount, int dest, int sendtag, + char* recvbuf, int recvcount, int source, int recvtag ) const +{ + PROFILE_START( "sendrecv", profile_level ); + MPI_Sendrecv( sendbuf, sendcount, MPI_CHAR, dest, sendtag, + recvbuf, recvcount, MPI_CHAR, source, recvtag, + communicator, MPI_STATUS_IGNORE ); + PROFILE_STOP( "sendrecv", profile_level ); +} +template<> +void MPI_CLASS::sendrecv( const int* sendbuf, int sendcount, int dest, int sendtag, + int* recvbuf, int recvcount, int source, int recvtag ) const +{ + PROFILE_START( "sendrecv", profile_level ); + MPI_Sendrecv( sendbuf, sendcount, MPI_INT, dest, sendtag, + recvbuf, recvcount, MPI_INT, source, recvtag, + communicator, MPI_STATUS_IGNORE ); + PROFILE_STOP( "sendrecv", profile_level ); +} +template<> +void MPI_CLASS::sendrecv( const float* sendbuf, int sendcount, int dest, int sendtag, + float* recvbuf, int recvcount, int source, int recvtag ) const +{ + PROFILE_START( "sendrecv", profile_level ); + MPI_Sendrecv( sendbuf, sendcount, MPI_FLOAT, dest, sendtag, + recvbuf, recvcount, MPI_FLOAT, source, recvtag, + communicator, MPI_STATUS_IGNORE ); + PROFILE_STOP( "sendrecv", profile_level ); +} +template<> +void MPI_CLASS::sendrecv( const double* sendbuf, int sendcount, int dest, int sendtag, + double* recvbuf, int recvcount, int source, int recvtag ) const +{ + PROFILE_START( "sendrecv", profile_level ); + MPI_Sendrecv( sendbuf, sendcount, MPI_DOUBLE, dest, sendtag, + recvbuf, recvcount, MPI_DOUBLE, source, recvtag, + communicator, MPI_STATUS_IGNORE ); + PROFILE_STOP( "sendrecv", profile_level ); +} +#endif + + /************************************************************************ * allGather * * 
Note: these specializations are only called when using MPI. * diff --git a/common/MPI.h b/common/MPI.h index e3fd3e13..4161d6a7 100644 --- a/common/MPI.h +++ b/common/MPI.h @@ -792,6 +792,13 @@ public: // Member functions void *buf, const int N_bytes, const int send_proc, const int tag ) const; + /*! + * @brief This function sends and recieves data using a blocking call + */ + template + void sendrecv( const type *sendbuf, int sendcount, int dest, int sendtag, type *recvbuf, int recvcount, int source, int recvtag ) const; + + /*! * Each processor sends every other processor a single value. * @param[in] x Input value for allGather diff --git a/common/ScaLBL.h b/common/ScaLBL.h index 78896d3f..d7f012d1 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -206,7 +206,6 @@ private: int sendtag,recvtag; // Give the object it's own MPI communicator RankInfoStruct rank_info; - MPI_Group Group; // Group of processors associated with this domain Utilities::MPI MPI_COMM_SCALBL; // MPI Communicator for this domain MPI_Request req1[18],req2[18]; //...................................................................................... diff --git a/common/Utilities.cpp b/common/Utilities.cpp index 1cf764be..11d2b261 100644 --- a/common/Utilities.cpp +++ b/common/Utilities.cpp @@ -8,7 +8,7 @@ #endif #ifdef USE_MPI -#include "mpi.h" +#include "common/MPI.h" #endif #include diff --git a/cpu/exe/lb2_Color_mpi.cpp b/cpu/exe/lb2_Color_mpi.cpp index 0cade21e..cdf56af9 100644 --- a/cpu/exe/lb2_Color_mpi.cpp +++ b/cpu/exe/lb2_Color_mpi.cpp @@ -36,15 +36,11 @@ inline void UnpackID(int *list, int count, char *recvbuf, char *ID){ //*************************************************************************************** int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; int iproc,jproc,kproc; @@ -58,7 +54,6 @@ int main(int argc, char **argv) int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; if (rank == 0){ printf("********************************************************\n"); @@ -115,31 +110,30 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - MPI_Barrier(comm); + comm.barrier(); //................................................. 
- MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nBlocks,1,MPI_INT,0,comm); - MPI_Bcast(&nthreads,1,MPI_INT,0,comm); - MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&alpha,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&beta,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&das,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&dbs,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm); - MPI_Bcast(&din,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm); - MPI_Bcast(×tepMax,1,MPI_INT,0,comm); - MPI_Bcast(&interval,1,MPI_INT,0,comm); - MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); - - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + comm.bcast(&Nz,1,0); + comm.bcast(&nBlocks,1,0); + comm.bcast(&nthreads,1,0); + comm.bcast(&Fx,1,0); + comm.bcast(&Fy,1,0); + comm.bcast(&Fz,1,0); + comm.bcast(&tau,1,0); + comm.bcast(&alpha,1,0); + comm.bcast(&beta,1,0); + comm.bcast(&das,1,0); + comm.bcast(&dbs,1,0); + comm.bcast(&pBC,1,0); + comm.bcast(&din,1,0); + comm.bcast(&dout,1,0); + comm.bcast(×tepMax,1,0); + comm.bcast(&interval,1,0); + comm.bcast(&tol,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); //................................................. - MPI_Barrier(comm); + comm.barrier(); // ************************************************************** // ************************************************************** @@ -169,7 +163,7 @@ int main(int argc, char **argv) } - MPI_Barrier(comm); + comm.barrier(); kproc = rank/(nprocx*nprocy); jproc = (rank-nprocx*nprocy*kproc)/nprocx; iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -451,7 +445,7 @@ int main(int argc, char **argv) PM.close(); // printf("File porosity = %f\n", double(sum)/N); //........................................................................... - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; //........................................................................... // Write the communcation structure into a file for debugging @@ -588,7 +582,7 @@ int main(int argc, char **argv) } } } - MPI_Barrier(comm); + comm.barrier(); if (rank==0) printf ("SendLists are ready on host\n"); //...................................................................................... 
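The broadcast block above shows the other recurring conversion: comm.bcast(ptr, count, root) deduces the MPI datatype from the pointer type, so the explicit MPI_INT / MPI_DOUBLE argument and the trailing communicator disappear. A small sketch of the same idiom; the variable values are placeholders, not the driver's real inputs.

#include "common/MPI.h"

static void broadcast_parameters( const Utilities::MPI& comm )
{
    int    Nz  = 0;     // meaningful only on rank 0 before the broadcast
    double tau = 0.0;
    if ( comm.getRank() == 0 ) {
        Nz  = 64;       // hypothetical values read on rank 0
        tau = 1.0;
    }
    comm.bcast( &Nz,  1, 0 );   // was: MPI_Bcast(&Nz, 1,MPI_INT,   0,comm);
    comm.bcast( &tau, 1, 0 );   // was: MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm);
    comm.barrier();
}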
// Use MPI to fill in the recvCounts form the associated processes @@ -599,46 +593,46 @@ int main(int argc, char **argv) //********************************************************************************** // Fill in the recieve counts using MPI sendtag = recvtag = 3; - MPI_Send(&sendCount_x,1,MPI_INT,rank_X,sendtag,comm); - MPI_Recv(&recvCount_X,1,MPI_INT,rank_x,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_X,1,MPI_INT,rank_x,sendtag,comm); - MPI_Recv(&recvCount_x,1,MPI_INT,rank_X,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_y,1,MPI_INT,rank_Y,sendtag,comm); - MPI_Recv(&recvCount_Y,1,MPI_INT,rank_y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Y,1,MPI_INT,rank_y,sendtag,comm); - MPI_Recv(&recvCount_y,1,MPI_INT,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_z,1,MPI_INT,rank_Z,sendtag,comm); - MPI_Recv(&recvCount_Z,1,MPI_INT,rank_z,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Z,1,MPI_INT,rank_z,sendtag,comm); - MPI_Recv(&recvCount_z,1,MPI_INT,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); + comm.Send(&sendCount_x,1,rank_X,sendtag); + comm.Recv(&recvCount_X,1,rank_x,recvtag); + comm.Send(&sendCount_X,1,rank_x,sendtag); + comm.Recv(&recvCount_x,1,rank_X,recvtag); + comm.Send(&sendCount_y,1,rank_Y,sendtag); + comm.Recv(&recvCount_Y,1,rank_y,recvtag); + comm.Send(&sendCount_Y,1,rank_y,sendtag); + comm.Recv(&recvCount_y,1,rank_Y,recvtag); + comm.Send(&sendCount_z,1,rank_Z,sendtag); + comm.Recv(&recvCount_Z,1,rank_z,recvtag); + comm.Send(&sendCount_Z,1,rank_z,sendtag); + comm.Recv(&recvCount_z,1,rank_Z,recvtag); - MPI_Send(&sendCount_xy,1,MPI_INT,rank_XY,sendtag,comm); - MPI_Recv(&recvCount_XY,1,MPI_INT,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_XY,1,MPI_INT,rank_xy,sendtag,comm); - MPI_Recv(&recvCount_xy,1,MPI_INT,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Xy,1,MPI_INT,rank_xY,sendtag,comm); - MPI_Recv(&recvCount_xY,1,MPI_INT,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_xY,1,MPI_INT,rank_Xy,sendtag,comm); - MPI_Recv(&recvCount_Xy,1,MPI_INT,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); + comm.Send(&sendCount_xy,1,rank_XY,sendtag); + comm.Recv(&recvCount_XY,1,rank_xy,recvtag); + comm.Send(&sendCount_XY,1,rank_xy,sendtag); + comm.Recv(&recvCount_xy,1,rank_XY,recvtag); + comm.Send(&sendCount_Xy,1,rank_xY,sendtag); + comm.Recv(&recvCount_xY,1,rank_Xy,recvtag); + comm.Send(&sendCount_xY,1,rank_Xy,sendtag); + comm.Recv(&recvCount_Xy,1,rank_xY,recvtag); - MPI_Send(&sendCount_xz,1,MPI_INT,rank_XZ,sendtag,comm); - MPI_Recv(&recvCount_XZ,1,MPI_INT,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_XZ,1,MPI_INT,rank_xz,sendtag,comm); - MPI_Recv(&recvCount_xz,1,MPI_INT,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Xz,1,MPI_INT,rank_xZ,sendtag,comm); - MPI_Recv(&recvCount_xZ,1,MPI_INT,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_xZ,1,MPI_INT,rank_Xz,sendtag,comm); - MPI_Recv(&recvCount_Xz,1,MPI_INT,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); + comm.Send(&sendCount_xz,1,rank_XZ,sendtag); + comm.Recv(&recvCount_XZ,1,rank_xz,recvtag); + comm.Send(&sendCount_XZ,1,rank_xz,sendtag); + comm.Recv(&recvCount_xz,1,rank_XZ,recvtag); + comm.Send(&sendCount_Xz,1,rank_xZ,sendtag); + comm.Recv(&recvCount_xZ,1,rank_Xz,recvtag); + comm.Send(&sendCount_xZ,1,rank_Xz,sendtag); + comm.Recv(&recvCount_Xz,1,rank_xZ,recvtag); - MPI_Send(&sendCount_yz,1,MPI_INT,rank_YZ,sendtag,comm); - MPI_Recv(&recvCount_YZ,1,MPI_INT,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); - 
MPI_Send(&sendCount_YZ,1,MPI_INT,rank_yz,sendtag,comm); - MPI_Recv(&recvCount_yz,1,MPI_INT,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Yz,1,MPI_INT,rank_yZ,sendtag,comm); - MPI_Recv(&recvCount_yZ,1,MPI_INT,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_yZ,1,MPI_INT,rank_Yz,sendtag,comm); - MPI_Recv(&recvCount_Yz,1,MPI_INT,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Barrier(comm); + comm.Send(&sendCount_yz,1,rank_YZ,sendtag); + comm.Recv(&recvCount_YZ,1,rank_yz,recvtag); + comm.Send(&sendCount_YZ,1,rank_yz,sendtag); + comm.Recv(&recvCount_yz,1,rank_YZ,recvtag); + comm.Send(&sendCount_Yz,1,rank_yZ,sendtag); + comm.Recv(&recvCount_yZ,1,rank_Yz,recvtag); + comm.Send(&sendCount_yZ,1,rank_Yz,sendtag); + comm.Recv(&recvCount_Yz,1,rank_yZ,recvtag); + comm.barrier(); //********************************************************************************** //...................................................................................... int *recvList_x, *recvList_y, *recvList_z, *recvList_X, *recvList_Y, *recvList_Z; @@ -669,48 +663,48 @@ int main(int argc, char **argv) // Use MPI to fill in the appropriate values for recvList // Fill in the recieve lists using MPI sendtag = recvtag = 4; - MPI_Isend(sendList_x, sendCount_x,MPI_INT,rank_X,sendtag,comm,&req1[0]); - MPI_Irecv(recvList_X, recvCount_X,MPI_INT,rank_x,recvtag,comm,&req2[0]); - MPI_Isend(sendList_X, sendCount_X,MPI_INT,rank_x,sendtag,comm,&req1[1]); - MPI_Irecv(recvList_x, recvCount_x,MPI_INT,rank_X,recvtag,comm,&req2[1]); - MPI_Isend(sendList_y, sendCount_y,MPI_INT,rank_Y,sendtag,comm,&req1[2]); - MPI_Irecv(recvList_Y, recvCount_Y,MPI_INT,rank_y,recvtag,comm,&req2[2]); - MPI_Isend(sendList_Y, sendCount_Y,MPI_INT,rank_y,sendtag,comm,&req1[3]); - MPI_Irecv(recvList_y, recvCount_y,MPI_INT,rank_Y,recvtag,comm,&req2[3]); - MPI_Isend(sendList_z, sendCount_z,MPI_INT,rank_Z,sendtag,comm,&req1[4]); - MPI_Irecv(recvList_Z, recvCount_Z,MPI_INT,rank_z,recvtag,comm,&req2[4]); - MPI_Isend(sendList_Z, sendCount_Z,MPI_INT,rank_z,sendtag,comm,&req1[5]); - MPI_Irecv(recvList_z, recvCount_z,MPI_INT,rank_Z,recvtag,comm,&req2[5]); + req1[0] = comm.Isend(sendList_x,sendCount_x,rank_X,sendtag); + req2[0] = comm.Irecv(recvList_X,recvCount_X,rank_x,recvtag); + req1[1] = comm.Isend(sendList_X,sendCount_X,rank_x,sendtag); + req2[1] = comm.Irecv(recvList_x,recvCount_x,rank_X,recvtag); + req1[2] = comm.Isend(sendList_y,sendCount_y,rank_Y,sendtag); + req2[2] = comm.Irecv(recvList_Y,recvCount_Y,rank_y,recvtag); + req1[3] = comm.Isend(sendList_Y,sendCount_Y,rank_y,sendtag); + req2[3] = comm.Irecv(recvList_y,recvCount_y,rank_Y,recvtag); + req1[4] = comm.Isend(sendList_z,sendCount_z,rank_Z,sendtag); + req2[4] = comm.Irecv(recvList_Z,recvCount_Z,rank_z,recvtag); + req1[5] = comm.Isend(sendList_Z,sendCount_Z,rank_z,sendtag); + req2[5] = comm.Irecv(recvList_z,recvCount_z,rank_Z,recvtag); - MPI_Isend(sendList_xy, sendCount_xy,MPI_INT,rank_XY,sendtag,comm,&req1[6]); - MPI_Irecv(recvList_XY, recvCount_XY,MPI_INT,rank_xy,recvtag,comm,&req2[6]); - MPI_Isend(sendList_XY, sendCount_XY,MPI_INT,rank_xy,sendtag,comm,&req1[7]); - MPI_Irecv(recvList_xy, recvCount_xy,MPI_INT,rank_XY,recvtag,comm,&req2[7]); - MPI_Isend(sendList_Xy, sendCount_Xy,MPI_INT,rank_xY,sendtag,comm,&req1[8]); - MPI_Irecv(recvList_xY, recvCount_xY,MPI_INT,rank_Xy,recvtag,comm,&req2[8]); - MPI_Isend(sendList_xY, sendCount_xY,MPI_INT,rank_Xy,sendtag,comm,&req1[9]); - MPI_Irecv(recvList_Xy, recvCount_Xy,MPI_INT,rank_xY,recvtag,comm,&req2[9]); + req1[6] = 
comm.Isend(sendList_xy,sendCount_xy,rank_XY,sendtag); + req2[6] = comm.Irecv(recvList_XY,recvCount_XY,rank_xy,recvtag); + req1[7] = comm.Isend(sendList_XY,sendCount_XY,rank_xy,sendtag); + req2[7] = comm.Irecv(recvList_xy,recvCount_xy,rank_XY,recvtag); + req1[8] = comm.Isend(sendList_Xy,sendCount_Xy,rank_xY,sendtag); + req2[8] = comm.Irecv(recvList_xY,recvCount_xY,rank_Xy,recvtag); + req1[9] = comm.Isend(sendList_xY,sendCount_xY,rank_Xy,sendtag); + req2[9] = comm.Irecv(recvList_Xy,recvCount_Xy,rank_xY,recvtag); - MPI_Isend(sendList_xz, sendCount_xz,MPI_INT,rank_XZ,sendtag,comm,&req1[10]); - MPI_Irecv(recvList_XZ, recvCount_XZ,MPI_INT,rank_xz,recvtag,comm,&req2[10]); - MPI_Isend(sendList_XZ, sendCount_XZ,MPI_INT,rank_xz,sendtag,comm,&req1[11]); - MPI_Irecv(recvList_xz, recvCount_xz,MPI_INT,rank_XZ,recvtag,comm,&req2[11]); - MPI_Isend(sendList_Xz, sendCount_Xz,MPI_INT,rank_xZ,sendtag,comm,&req1[12]); - MPI_Irecv(recvList_xZ, recvCount_xZ,MPI_INT,rank_Xz,recvtag,comm,&req2[12]); - MPI_Isend(sendList_xZ, sendCount_xZ,MPI_INT,rank_Xz,sendtag,comm,&req1[13]); - MPI_Irecv(recvList_Xz, recvCount_Xz,MPI_INT,rank_xZ,recvtag,comm,&req2[13]); + req1[10] = comm.Isend(sendList_xz,sendCount_xz,rank_XZ,sendtag); + req2[10] = comm.Irecv(recvList_XZ,recvCount_XZ,rank_xz,recvtag); + req1[11] = comm.Isend(sendList_XZ,sendCount_XZ,rank_xz,sendtag); + req2[11] = comm.Irecv(recvList_xz,recvCount_xz,rank_XZ,recvtag); + req1[12] = comm.Isend(sendList_Xz,sendCount_Xz,rank_xZ,sendtag); + req2[12] = comm.Irecv(recvList_xZ,recvCount_xZ,rank_Xz,recvtag); + req1[13] = comm.Isend(sendList_xZ,sendCount_xZ,rank_Xz,sendtag); + req2[13] = comm.Irecv(recvList_Xz,recvCount_Xz,rank_xZ,recvtag); - MPI_Isend(sendList_yz, sendCount_yz,MPI_INT,rank_YZ,sendtag,comm,&req1[14]); - MPI_Irecv(recvList_YZ, recvCount_YZ,MPI_INT,rank_yz,recvtag,comm,&req2[14]); - MPI_Isend(sendList_YZ, sendCount_YZ,MPI_INT,rank_yz,sendtag,comm,&req1[15]); - MPI_Irecv(recvList_yz, recvCount_yz,MPI_INT,rank_YZ,recvtag,comm,&req2[15]); - MPI_Isend(sendList_Yz, sendCount_Yz,MPI_INT,rank_yZ,sendtag,comm,&req1[16]); - MPI_Irecv(recvList_yZ, recvCount_yZ,MPI_INT,rank_Yz,recvtag,comm,&req2[16]); - MPI_Isend(sendList_yZ, sendCount_yZ,MPI_INT,rank_Yz,sendtag,comm,&req1[17]); - MPI_Irecv(recvList_Yz, recvCount_Yz,MPI_INT,rank_yZ,recvtag,comm,&req2[17]); - MPI_Waitall(18,req1,stat1); - MPI_Waitall(18,req2,stat2); - MPI_Barrier(comm); + req1[14] = comm.Isend(sendList_yz,sendCount_yz,rank_YZ,sendtag); + req2[14] = comm.Irecv(recvList_YZ,recvCount_YZ,rank_yz,recvtag); + req1[15] = comm.Isend(sendList_YZ,sendCount_YZ,rank_yz,sendtag); + req2[15] = comm.Irecv(recvList_yz,recvCount_yz,rank_YZ,recvtag); + req1[16] = comm.Isend(sendList_Yz,sendCount_Yz,rank_yZ,sendtag); + req2[16] = comm.Irecv(recvList_yZ,recvCount_yZ,rank_Yz,recvtag); + req1[17] = comm.Isend(sendList_yZ,sendCount_yZ,rank_Yz,sendtag); + req2[17] = comm.Irecv(recvList_Yz,recvCount_Yz,rank_yZ,recvtag); + comm.waitAll(18,req1); + comm.waitAll(18,req2); + comm.barrier(); //...................................................................................... 
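The request handling above changes shape as well: Isend/Irecv now return the MPI_Request instead of filling an &req output argument, and waitAll replaces MPI_Waitall while discarding the statuses (which is why the stat1/stat2 arrays were deleted earlier in this file). A reduced two-direction sketch, with hypothetical neighbor ranks and placeholder buffers:

#include "common/MPI.h"

static void exchange_counts( const Utilities::MPI& comm, int rank_x, int rank_X )
{
    int sendCount_x = 0, sendCount_X = 0;   // placeholders for the real send counts
    int recvCount_x = 0, recvCount_X = 0;
    int sendtag = 4, recvtag = 4;
    MPI_Request req1[2], req2[2];
    req1[0] = comm.Isend( &sendCount_x, 1, rank_X, sendtag );   // was MPI_Isend(...,&req1[0])
    req2[0] = comm.Irecv( &recvCount_X, 1, rank_x, recvtag );   // was MPI_Irecv(...,&req2[0])
    req1[1] = comm.Isend( &sendCount_X, 1, rank_x, sendtag );
    req2[1] = comm.Irecv( &recvCount_x, 1, rank_X, recvtag );
    comm.waitAll( 2, req1 );   // was MPI_Waitall(2, req1, stat1)
    comm.waitAll( 2, req2 );
}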
for (int idx=0; idx #include #include -#include +#include "common/MPI.h" #include using namespace std; @@ -64,15 +64,11 @@ inline void UnpackID(int *list, int count, char *recvbuf, char *ID){ int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; int iproc,jproc,kproc; @@ -86,7 +82,6 @@ int main(int argc, char **argv) int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; //********************************** //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! //!!!!!!!!!!! Random debugging communications!!!!!!!!!!!!!!!!!!!!!!!!!!!!! @@ -136,24 +131,23 @@ int main(int argc, char **argv) // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - MPI_Barrier(comm); + comm.barrier(); //................................................. - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nBlocks,1,MPI_INT,0,comm); - MPI_Bcast(&nthreads,1,MPI_INT,0,comm); - MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); - MPI_Bcast(×tepMax,1,MPI_INT,0,comm); - MPI_Bcast(&interval,1,MPI_INT,0,comm); - MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); - - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + comm.bcast(&Nz,1,0); + comm.bcast(&nBlocks,1,0); + comm.bcast(&nthreads,1,0); + comm.bcast(&tau,1,0); + comm.bcast(&Fx,1,0); + comm.bcast(&Fy,1,0); + comm.bcast(&Fz,1,0); + comm.bcast(×tepMax,1,0); + comm.bcast(&interval,1,0); + comm.bcast(&tol,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); //................................................. - MPI_Barrier(comm); + comm.barrier(); // ************************************************************** double rlx_setA = 1.f/tau; @@ -176,7 +170,7 @@ int main(int argc, char **argv) printf("Sub-domain size = %i x %i x %i\n",Nz,Nz,Nz); } - MPI_Barrier(comm); + comm.barrier(); kproc = rank/(nprocx*nprocy); jproc = (rank-nprocx*nprocy*kproc)/nprocx; iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -457,7 +451,7 @@ int main(int argc, char **argv) PM.close(); // printf("File porosity = %f\n", double(sum)/N); //........................................................................... - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; //........................................................................... // Write the communcation structure into a file for debugging @@ -594,7 +588,7 @@ int main(int argc, char **argv) } } } - MPI_Barrier(comm); + comm.barrier(); if (rank==0) printf ("SendLists are ready on host\n"); //...................................................................................... 
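Each of these drivers now starts the same way: MPI_Init and MPI_Finalize remain raw calls, and everything in between goes through a Utilities::MPI object constructed from MPI_COMM_WORLD. Reduced to its skeleton (illustrative only; the printf is a stand-in for the real setup):

#include "common/MPI.h"
#include <cstdio>

int main( int argc, char **argv )
{
    MPI_Init( &argc, &argv );
    Utilities::MPI comm( MPI_COMM_WORLD );
    int rank   = comm.getRank();
    int nprocs = comm.getSize();
    if ( rank == 0 )
        printf( "Running with %i processes\n", nprocs );
    comm.barrier();
    MPI_Finalize();
    return 0;
}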
// Use MPI to fill in the recvCounts form the associated processes @@ -605,46 +599,46 @@ int main(int argc, char **argv) //********************************************************************************** // Fill in the recieve counts using MPI sendtag = recvtag = 3; - MPI_Send(&sendCount_x,1,MPI_INT,rank_X,sendtag,comm); - MPI_Recv(&recvCount_X,1,MPI_INT,rank_x,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_X,1,MPI_INT,rank_x,sendtag,comm); - MPI_Recv(&recvCount_x,1,MPI_INT,rank_X,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_y,1,MPI_INT,rank_Y,sendtag,comm); - MPI_Recv(&recvCount_Y,1,MPI_INT,rank_y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Y,1,MPI_INT,rank_y,sendtag,comm); - MPI_Recv(&recvCount_y,1,MPI_INT,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_z,1,MPI_INT,rank_Z,sendtag,comm); - MPI_Recv(&recvCount_Z,1,MPI_INT,rank_z,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Z,1,MPI_INT,rank_z,sendtag,comm); - MPI_Recv(&recvCount_z,1,MPI_INT,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); + comm.send(&sendCount_x,1,rank_X,sendtag); + comm.recv(&recvCount_X,1,rank_x,recvtag); + comm.send(&sendCount_X,1,rank_x,sendtag); + comm.recv(&recvCount_x,1,rank_X,recvtag); + comm.send(&sendCount_y,1,rank_Y,sendtag); + comm.recv(&recvCount_Y,1,rank_y,recvtag); + comm.send(&sendCount_Y,1,rank_y,sendtag); + comm.recv(&recvCount_y,1,rank_Y,recvtag); + comm.send(&sendCount_z,1,rank_Z,sendtag); + comm.recv(&recvCount_Z,1,rank_z,recvtag); + comm.send(&sendCount_Z,1,rank_z,sendtag); + comm.recv(&recvCount_z,1,rank_Z,recvtag); - MPI_Send(&sendCount_xy,1,MPI_INT,rank_XY,sendtag,comm); - MPI_Recv(&recvCount_XY,1,MPI_INT,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_XY,1,MPI_INT,rank_xy,sendtag,comm); - MPI_Recv(&recvCount_xy,1,MPI_INT,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Xy,1,MPI_INT,rank_xY,sendtag,comm); - MPI_Recv(&recvCount_xY,1,MPI_INT,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_xY,1,MPI_INT,rank_Xy,sendtag,comm); - MPI_Recv(&recvCount_Xy,1,MPI_INT,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); + comm.send(&sendCount_xy,1,rank_XY,sendtag); + comm.recv(&recvCount_XY,1,rank_xy,recvtag); + comm.send(&sendCount_XY,1,rank_xy,sendtag); + comm.recv(&recvCount_xy,1,rank_XY,recvtag); + comm.send(&sendCount_Xy,1,rank_xY,sendtag); + comm.recv(&recvCount_xY,1,rank_Xy,recvtag); + comm.send(&sendCount_xY,1,rank_Xy,sendtag); + comm.recv(&recvCount_Xy,1,rank_xY,recvtag); - MPI_Send(&sendCount_xz,1,MPI_INT,rank_XZ,sendtag,comm); - MPI_Recv(&recvCount_XZ,1,MPI_INT,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_XZ,1,MPI_INT,rank_xz,sendtag,comm); - MPI_Recv(&recvCount_xz,1,MPI_INT,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Xz,1,MPI_INT,rank_xZ,sendtag,comm); - MPI_Recv(&recvCount_xZ,1,MPI_INT,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_xZ,1,MPI_INT,rank_Xz,sendtag,comm); - MPI_Recv(&recvCount_Xz,1,MPI_INT,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); + comm.send(&sendCount_xz,1,rank_XZ,sendtag); + comm.recv(&recvCount_XZ,1,rank_xz,recvtag); + comm.send(&sendCount_XZ,1,rank_xz,sendtag); + comm.recv(&recvCount_xz,1,rank_XZ,recvtag); + comm.send(&sendCount_Xz,1,rank_xZ,sendtag); + comm.recv(&recvCount_xZ,1,rank_Xz,recvtag); + comm.send(&sendCount_xZ,1,rank_Xz,sendtag); + comm.recv(&recvCount_Xz,1,rank_xZ,recvtag); - MPI_Send(&sendCount_yz,1,MPI_INT,rank_YZ,sendtag,comm); - MPI_Recv(&recvCount_YZ,1,MPI_INT,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); - 
MPI_Send(&sendCount_YZ,1,MPI_INT,rank_yz,sendtag,comm); - MPI_Recv(&recvCount_yz,1,MPI_INT,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Yz,1,MPI_INT,rank_yZ,sendtag,comm); - MPI_Recv(&recvCount_yZ,1,MPI_INT,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_yZ,1,MPI_INT,rank_Yz,sendtag,comm); - MPI_Recv(&recvCount_Yz,1,MPI_INT,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Barrier(comm); + comm.send(&sendCount_yz,1,rank_YZ,sendtag); + comm.recv(&recvCount_YZ,1,rank_yz,recvtag); + comm.send(&sendCount_YZ,1,rank_yz,sendtag); + comm.recv(&recvCount_yz,1,rank_YZ,recvtag); + comm.send(&sendCount_Yz,1,rank_yZ,sendtag); + comm.recv(&recvCount_yZ,1,rank_Yz,recvtag); + comm.send(&sendCount_yZ,1,rank_Yz,sendtag); + comm.recv(&recvCount_Yz,1,rank_yZ,recvtag); + comm.barrier(); //********************************************************************************** //...................................................................................... int *recvList_x, *recvList_y, *recvList_z, *recvList_X, *recvList_Y, *recvList_Z; @@ -675,48 +669,48 @@ int main(int argc, char **argv) // Use MPI to fill in the appropriate values for recvList // Fill in the recieve lists using MPI sendtag = recvtag = 4; - MPI_Isend(sendList_x, sendCount_x,MPI_INT,rank_X,sendtag,comm,&req1[0]); - MPI_Irecv(recvList_X, recvCount_X,MPI_INT,rank_x,recvtag,comm,&req2[0]); - MPI_Isend(sendList_X, sendCount_X,MPI_INT,rank_x,sendtag,comm,&req1[1]); - MPI_Irecv(recvList_x, recvCount_x,MPI_INT,rank_X,recvtag,comm,&req2[1]); - MPI_Isend(sendList_y, sendCount_y,MPI_INT,rank_Y,sendtag,comm,&req1[2]); - MPI_Irecv(recvList_Y, recvCount_Y,MPI_INT,rank_y,recvtag,comm,&req2[2]); - MPI_Isend(sendList_Y, sendCount_Y,MPI_INT,rank_y,sendtag,comm,&req1[3]); - MPI_Irecv(recvList_y, recvCount_y,MPI_INT,rank_Y,recvtag,comm,&req2[3]); - MPI_Isend(sendList_z, sendCount_z,MPI_INT,rank_Z,sendtag,comm,&req1[4]); - MPI_Irecv(recvList_Z, recvCount_Z,MPI_INT,rank_z,recvtag,comm,&req2[4]); - MPI_Isend(sendList_Z, sendCount_Z,MPI_INT,rank_z,sendtag,comm,&req1[5]); - MPI_Irecv(recvList_z, recvCount_z,MPI_INT,rank_Z,recvtag,comm,&req2[5]); + req1[0] = comm.Isend(sendList_x,sendCount_x,rank_X,sendtag); + req2[0] = comm.Irecv(recvList_X,recvCount_X,rank_x,recvtag); + req1[1] = comm.Isend(sendList_X,sendCount_X,rank_x,sendtag); + req2[1] = comm.Irecv(recvList_x,recvCount_x,rank_X,recvtag); + req1[2] = comm.Isend(sendList_y,sendCount_y,rank_Y,sendtag); + req2[2] = comm.Irecv(recvList_Y,recvCount_Y,rank_y,recvtag); + req1[3] = comm.Isend(sendList_Y,sendCount_Y,rank_y,sendtag); + req2[3] = comm.Irecv(recvList_y,recvCount_y,rank_Y,recvtag); + req1[4] = comm.Isend(sendList_z,sendCount_z,rank_Z,sendtag); + req2[4] = comm.Irecv(recvList_Z,recvCount_Z,rank_z,recvtag); + req1[5] = comm.Isend(sendList_Z,sendCount_Z,rank_z,sendtag); + req2[5] = comm.Irecv(recvList_z,recvCount_z,rank_Z,recvtag); - MPI_Isend(sendList_xy, sendCount_xy,MPI_INT,rank_XY,sendtag,comm,&req1[6]); - MPI_Irecv(recvList_XY, recvCount_XY,MPI_INT,rank_xy,recvtag,comm,&req2[6]); - MPI_Isend(sendList_XY, sendCount_XY,MPI_INT,rank_xy,sendtag,comm,&req1[7]); - MPI_Irecv(recvList_xy, recvCount_xy,MPI_INT,rank_XY,recvtag,comm,&req2[7]); - MPI_Isend(sendList_Xy, sendCount_Xy,MPI_INT,rank_xY,sendtag,comm,&req1[8]); - MPI_Irecv(recvList_xY, recvCount_xY,MPI_INT,rank_Xy,recvtag,comm,&req2[8]); - MPI_Isend(sendList_xY, sendCount_xY,MPI_INT,rank_Xy,sendtag,comm,&req1[9]); - MPI_Irecv(recvList_Xy, recvCount_Xy,MPI_INT,rank_xY,recvtag,comm,&req2[9]); + req1[6] = 
comm.Isend(sendList_xy,sendCount_xy,rank_XY,sendtag); + req2[6] = comm.Irecv(recvList_XY,recvCount_XY,rank_xy,recvtag); + req1[7] = comm.Isend(sendList_XY,sendCount_XY,rank_xy,sendtag); + req2[7] = comm.Irecv(recvList_xy,recvCount_xy,rank_XY,recvtag); + req1[8] = comm.Isend(sendList_Xy,sendCount_Xy,rank_xY,sendtag); + req2[8] = comm.Irecv(recvList_xY,recvCount_xY,rank_Xy,recvtag); + req1[9] = comm.Isend(sendList_xY,sendCount_xY,rank_Xy,sendtag); + req2[9] = comm.Irecv(recvList_Xy,recvCount_Xy,rank_xY,recvtag); - MPI_Isend(sendList_xz, sendCount_xz,MPI_INT,rank_XZ,sendtag,comm,&req1[10]); - MPI_Irecv(recvList_XZ, recvCount_XZ,MPI_INT,rank_xz,recvtag,comm,&req2[10]); - MPI_Isend(sendList_XZ, sendCount_XZ,MPI_INT,rank_xz,sendtag,comm,&req1[11]); - MPI_Irecv(recvList_xz, recvCount_xz,MPI_INT,rank_XZ,recvtag,comm,&req2[11]); - MPI_Isend(sendList_Xz, sendCount_Xz,MPI_INT,rank_xZ,sendtag,comm,&req1[12]); - MPI_Irecv(recvList_xZ, recvCount_xZ,MPI_INT,rank_Xz,recvtag,comm,&req2[12]); - MPI_Isend(sendList_xZ, sendCount_xZ,MPI_INT,rank_Xz,sendtag,comm,&req1[13]); - MPI_Irecv(recvList_Xz, recvCount_Xz,MPI_INT,rank_xZ,recvtag,comm,&req2[13]); + req1[10] = comm.Isend(sendList_xz,sendCount_xz,rank_XZ,sendtag); + req2[10] = comm.Irecv(recvList_XZ,recvCount_XZ,rank_xz,recvtag); + req1[11] = comm.Isend(sendList_XZ,sendCount_XZ,rank_xz,sendtag); + req2[11] = comm.Irecv(recvList_xz,recvCount_xz,rank_XZ,recvtag); + req1[12] = comm.Isend(sendList_Xz,sendCount_Xz,rank_xZ,sendtag); + req2[12] = comm.Irecv(recvList_xZ,recvCount_xZ,rank_Xz,recvtag); + req1[13] = comm.Isend(sendList_xZ,sendCount_xZ,rank_Xz,sendtag); + req2[13] = comm.Irecv(recvList_Xz,recvCount_Xz,rank_xZ,recvtag); - MPI_Isend(sendList_yz, sendCount_yz,MPI_INT,rank_YZ,sendtag,comm,&req1[14]); - MPI_Irecv(recvList_YZ, recvCount_YZ,MPI_INT,rank_yz,recvtag,comm,&req2[14]); - MPI_Isend(sendList_YZ, sendCount_YZ,MPI_INT,rank_yz,sendtag,comm,&req1[15]); - MPI_Irecv(recvList_yz, recvCount_yz,MPI_INT,rank_YZ,recvtag,comm,&req2[15]); - MPI_Isend(sendList_Yz, sendCount_Yz,MPI_INT,rank_yZ,sendtag,comm,&req1[16]); - MPI_Irecv(recvList_yZ, recvCount_yZ,MPI_INT,rank_Yz,recvtag,comm,&req2[16]); - MPI_Isend(sendList_yZ, sendCount_yZ,MPI_INT,rank_Yz,sendtag,comm,&req1[17]); - MPI_Irecv(recvList_Yz, recvCount_Yz,MPI_INT,rank_yZ,recvtag,comm,&req2[17]); - MPI_Waitall(18,req1,stat1); - MPI_Waitall(18,req2,stat2); - MPI_Barrier(comm); + req1[14] = comm.Isend(sendList_yz,sendCount_yz,rank_YZ,sendtag); + req2[14] = comm.Irecv(recvList_YZ,recvCount_YZ,rank_yz,recvtag); + req1[15] = comm.Isend(sendList_YZ,sendCount_YZ,rank_yz,sendtag); + req2[15] = comm.Irecv(recvList_yz,recvCount_yz,rank_YZ,recvtag); + req1[16] = comm.Isend(sendList_Yz,sendCount_Yz,rank_yZ,sendtag); + req2[16] = comm.Irecv(recvList_yZ,recvCount_yZ,rank_Yz,recvtag); + req1[17] = comm.Isend(sendList_yZ,sendCount_yZ,rank_Yz,sendtag); + req2[17] = comm.Irecv(recvList_Yz,recvCount_Yz,rank_yZ,recvtag); + comm.waitAll(18,req1); + comm.waitAll(18,req2); + comm.barrier(); //...................................................................................... double *sendbuf_x, *sendbuf_y, *sendbuf_z, *sendbuf_X, *sendbuf_Y, *sendbuf_Z; double *sendbuf_xy, *sendbuf_yz, *sendbuf_xz, *sendbuf_Xy, *sendbuf_Yz, *sendbuf_xZ; @@ -915,42 +909,24 @@ int main(int argc, char **argv) PackID(sendList_yZ, sendCount_yZ ,sendID_yZ, id); PackID(sendList_YZ, sendCount_YZ ,sendID_YZ, id); //...................................................................................... 
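The PackID calls above and the sendrecv block that follows form the usual three-step halo update for the ID array: gather boundary sites into contiguous send buffers, exchange them with the neighboring sub-domains, then scatter the received values into the halo. Collapsed to a single direction as a sketch; the parameter names are stand-ins, and PackID/UnpackID are assumed to be the helpers defined at the top of this file.

#include "common/MPI.h"

// Sketch only: one direction of the pack / exchange / unpack halo update.
static void halo_update_one_direction( const Utilities::MPI& comm,
                                       int *sendList, int sendCount, char *sendbuf,
                                       int *recvList, int recvCount, char *recvbuf,
                                       char *ID, int dest, int source, int tag )
{
    PackID( sendList, sendCount, sendbuf, ID );            // gather boundary values
    comm.sendrecv( sendbuf, sendCount, dest, tag,
                   recvbuf, recvCount, source, tag );      // blocking exchange
    UnpackID( recvList, recvCount, recvbuf, ID );          // scatter into the halo
}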
- MPI_Sendrecv(sendID_x,sendCount_x,MPI_CHAR,rank_X,sendtag, - recvID_X,recvCount_X,MPI_CHAR,rank_x,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_X,sendCount_X,MPI_CHAR,rank_x,sendtag, - recvID_x,recvCount_x,MPI_CHAR,rank_X,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_y,sendCount_y,MPI_CHAR,rank_Y,sendtag, - recvID_Y,recvCount_Y,MPI_CHAR,rank_y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Y,sendCount_Y,MPI_CHAR,rank_y,sendtag, - recvID_y,recvCount_y,MPI_CHAR,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_z,sendCount_z,MPI_CHAR,rank_Z,sendtag, - recvID_Z,recvCount_Z,MPI_CHAR,rank_z,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Z,sendCount_Z,MPI_CHAR,rank_z,sendtag, - recvID_z,recvCount_z,MPI_CHAR,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xy,sendCount_xy,MPI_CHAR,rank_XY,sendtag, - recvID_XY,recvCount_XY,MPI_CHAR,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_XY,sendCount_XY,MPI_CHAR,rank_xy,sendtag, - recvID_xy,recvCount_xy,MPI_CHAR,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Xy,sendCount_Xy,MPI_CHAR,rank_xY,sendtag, - recvID_xY,recvCount_xY,MPI_CHAR,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xY,sendCount_xY,MPI_CHAR,rank_Xy,sendtag, - recvID_Xy,recvCount_Xy,MPI_CHAR,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xz,sendCount_xz,MPI_CHAR,rank_XZ,sendtag, - recvID_XZ,recvCount_XZ,MPI_CHAR,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_XZ,sendCount_XZ,MPI_CHAR,rank_xz,sendtag, - recvID_xz,recvCount_xz,MPI_CHAR,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Xz,sendCount_Xz,MPI_CHAR,rank_xZ,sendtag, - recvID_xZ,recvCount_xZ,MPI_CHAR,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xZ,sendCount_xZ,MPI_CHAR,rank_Xz,sendtag, - recvID_Xz,recvCount_Xz,MPI_CHAR,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_yz,sendCount_yz,MPI_CHAR,rank_YZ,sendtag, - recvID_YZ,recvCount_YZ,MPI_CHAR,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_YZ,sendCount_YZ,MPI_CHAR,rank_yz,sendtag, - recvID_yz,recvCount_yz,MPI_CHAR,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Yz,sendCount_Yz,MPI_CHAR,rank_yZ,sendtag, - recvID_yZ,recvCount_yZ,MPI_CHAR,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_yZ,sendCount_yZ,MPI_CHAR,rank_Yz,sendtag, - recvID_Yz,recvCount_Yz,MPI_CHAR,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); + comm.sendrecv(sendID_x,sendCount_x,rank_X,sendtag,recvID_X,recvCount_X,rank_x,recvtag); + comm.sendrecv(sendID_X,sendCount_X,rank_x,sendtag,recvID_x,recvCount_x,rank_X,recvtag); + comm.sendrecv(sendID_y,sendCount_y,rank_Y,sendtag,recvID_Y,recvCount_Y,rank_y,recvtag); + comm.sendrecv(sendID_Y,sendCount_Y,rank_y,sendtag,recvID_y,recvCount_y,rank_Y,recvtag); + comm.sendrecv(sendID_z,sendCount_z,rank_Z,sendtag,recvID_Z,recvCount_Z,rank_z,recvtag); + comm.sendrecv(sendID_Z,sendCount_Z,rank_z,sendtag,recvID_z,recvCount_z,rank_Z,recvtag); + comm.sendrecv(sendID_xy,sendCount_xy,rank_XY,sendtag,recvID_XY,recvCount_XY,rank_xy,recvtag); + comm.sendrecv(sendID_XY,sendCount_XY,rank_xy,sendtag,recvID_xy,recvCount_xy,rank_XY,recvtag); + comm.sendrecv(sendID_Xy,sendCount_Xy,rank_xY,sendtag,recvID_xY,recvCount_xY,rank_Xy,recvtag); + comm.sendrecv(sendID_xY,sendCount_xY,rank_Xy,sendtag,recvID_Xy,recvCount_Xy,rank_xY,recvtag); + comm.sendrecv(sendID_xz,sendCount_xz,rank_XZ,sendtag,recvID_XZ,recvCount_XZ,rank_xz,recvtag); + 
comm.sendrecv(sendID_XZ,sendCount_XZ,rank_xz,sendtag,recvID_xz,recvCount_xz,rank_XZ,recvtag); + comm.sendrecv(sendID_Xz,sendCount_Xz,rank_xZ,sendtag,recvID_xZ,recvCount_xZ,rank_Xz,recvtag); + comm.sendrecv(sendID_xZ,sendCount_xZ,rank_Xz,sendtag,recvID_Xz,recvCount_Xz,rank_xZ,recvtag); + comm.sendrecv(sendID_yz,sendCount_yz,rank_YZ,sendtag,recvID_YZ,recvCount_YZ,rank_yz,recvtag); + comm.sendrecv(sendID_YZ,sendCount_YZ,rank_yz,sendtag,recvID_yz,recvCount_yz,rank_YZ,recvtag); + comm.sendrecv(sendID_Yz,sendCount_Yz,rank_yZ,sendtag,recvID_yZ,recvCount_yZ,rank_Yz,recvtag); + comm.sendrecv(sendID_yZ,sendCount_yZ,rank_Yz,sendtag,recvID_Yz,recvCount_Yz,rank_yZ,recvtag); //...................................................................................... UnpackID(recvList_x, recvCount_x ,recvID_x, id); UnpackID(recvList_X, recvCount_X ,recvID_X, id); @@ -983,7 +959,7 @@ int main(int argc, char **argv) free(recvID_yz); free(recvID_YZ); free(recvID_yZ); free(recvID_Yz); //...................................................................................... if (rank==0) printf ("Devices are ready to communicate. \n"); - MPI_Barrier(comm); + comm.barrier(); //...........device phase ID................................................. if (rank==0) printf ("Copying phase ID to device \n"); @@ -1023,8 +999,8 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; - MPI_Barrier(comm); - starttime = MPI_Wtime(); + comm.barrier(); + starttime = Utilities::MPI::time(); // Old cuda timer is below // cudaEvent_t start, stop; // float time; @@ -1136,48 +1112,48 @@ int main(int argc, char **argv) //................................................................................... // Send all the distributions - MPI_Isend(sendbuf_x, 5*sendCount_x,MPI_DOUBLE,rank_X,sendtag,comm,&req1[0]); - MPI_Irecv(recvbuf_X, 5*recvCount_X,MPI_DOUBLE,rank_x,recvtag,comm,&req2[0]); - MPI_Isend(sendbuf_X, 5*sendCount_X,MPI_DOUBLE,rank_x,sendtag,comm,&req1[1]); - MPI_Irecv(recvbuf_x, 5*recvCount_x,MPI_DOUBLE,rank_X,recvtag,comm,&req2[1]); - MPI_Isend(sendbuf_y, 5*sendCount_y,MPI_DOUBLE,rank_Y,sendtag,comm,&req1[2]); - MPI_Irecv(recvbuf_Y, 5*recvCount_Y,MPI_DOUBLE,rank_y,recvtag,comm,&req2[2]); - MPI_Isend(sendbuf_Y, 5*sendCount_Y,MPI_DOUBLE,rank_y,sendtag,comm,&req1[3]); - MPI_Irecv(recvbuf_y, 5*recvCount_y,MPI_DOUBLE,rank_Y,recvtag,comm,&req2[3]); - MPI_Isend(sendbuf_z, 5*sendCount_z,MPI_DOUBLE,rank_Z,sendtag,comm,&req1[4]); - MPI_Irecv(recvbuf_Z, 5*recvCount_Z,MPI_DOUBLE,rank_z,recvtag,comm,&req2[4]); - MPI_Isend(sendbuf_Z, 5*sendCount_Z,MPI_DOUBLE,rank_z,sendtag,comm,&req1[5]); - MPI_Irecv(recvbuf_z, 5*recvCount_z,MPI_DOUBLE,rank_Z,recvtag,comm,&req2[5]); - MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_XY,sendtag,comm,&req1[6]); - MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_xy,recvtag,comm,&req2[6]); - MPI_Isend(sendbuf_XY, sendCount_XY,MPI_DOUBLE,rank_xy,sendtag,comm,&req1[7]); - MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_XY,recvtag,comm,&req2[7]); - MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_xY,sendtag,comm,&req1[8]); - MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_Xy,recvtag,comm,&req2[8]); - MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_Xy,sendtag,comm,&req1[9]); - MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_xY,recvtag,comm,&req2[9]); - MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_XZ,sendtag,comm,&req1[10]); - MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_xz,recvtag,comm,&req2[10]); - MPI_Isend(sendbuf_XZ, 
sendCount_XZ,MPI_DOUBLE,rank_xz,sendtag,comm,&req1[11]); - MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_XZ,recvtag,comm,&req2[11]); - MPI_Isend(sendbuf_Xz, sendCount_Xz,MPI_DOUBLE,rank_xZ,sendtag,comm,&req1[12]); - MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_Xz,recvtag,comm,&req2[12]); - MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_Xz,sendtag,comm,&req1[13]); - MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_xZ,recvtag,comm,&req2[13]); - MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_YZ,sendtag,comm,&req1[14]); - MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_yz,recvtag,comm,&req2[14]); - MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_yz,sendtag,comm,&req1[15]); - MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_YZ,recvtag,comm,&req2[15]); - MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_yZ,sendtag,comm,&req1[16]); - MPI_Irecv(recvbuf_yZ, recvCount_yZ,MPI_DOUBLE,rank_Yz,recvtag,comm,&req2[16]); - MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_Yz,sendtag,comm,&req1[17]); - MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_yZ,recvtag,comm,&req2[17]); + req1[0] = comm.Isend(sendbuf_x,5*sendCount_x,rank_X,sendtag); + req2[0] = comm.Irecv(recvbuf_X,5*recvCount_X,rank_x,recvtag); + req1[1] = comm.Isend(sendbuf_X,5*sendCount_X,rank_x,sendtag); + req2[1] = comm.Irecv(recvbuf_x,5*recvCount_x,rank_X,recvtag); + req1[2] = comm.Isend(sendbuf_y,5*sendCount_y,rank_Y,sendtag); + req2[2] = comm.Irecv(recvbuf_Y,5*recvCount_Y,rank_y,recvtag); + req1[3] = comm.Isend(sendbuf_Y,5*sendCount_Y,rank_y,sendtag); + req2[3] = comm.Irecv(recvbuf_y,5*recvCount_y,rank_Y,recvtag); + req1[4] = comm.Isend(sendbuf_z,5*sendCount_z,rank_Z,sendtag); + req2[4] = comm.Irecv(recvbuf_Z,5*recvCount_Z,rank_z,recvtag); + req1[5] = comm.Isend(sendbuf_Z,5*sendCount_Z,rank_z,sendtag); + req2[5] = comm.Irecv(recvbuf_z,5*recvCount_z,rank_Z,recvtag); + req1[6] = comm.Isend(sendbuf_xy,sendCount_xy,rank_XY,sendtag); + req2[6] = comm.Irecv(recvbuf_XY,recvCount_XY,rank_xy,recvtag); + req1[7] = comm.Isend(sendbuf_XY,sendCount_XY,rank_xy,sendtag); + req2[7] = comm.Irecv(recvbuf_xy,recvCount_xy,rank_XY,recvtag); + req1[8] = comm.Isend(sendbuf_Xy,sendCount_Xy,rank_xY,sendtag); + req2[8] = comm.Irecv(recvbuf_xY,recvCount_xY,rank_Xy,recvtag); + req1[9] = comm.Isend(sendbuf_xY,sendCount_xY,rank_Xy,sendtag); + req2[9] = comm.Irecv(recvbuf_Xy,recvCount_Xy,rank_xY,recvtag); + req1[10] = comm.Isend(sendbuf_xz,sendCount_xz,rank_XZ,sendtag); + req2[10] = comm.Irecv(recvbuf_XZ,recvCount_XZ,rank_xz,recvtag); + req1[11] = comm.Isend(sendbuf_XZ,sendCount_XZ,rank_xz,sendtag); + req2[11] = comm.Irecv(recvbuf_xz,recvCount_xz,rank_XZ,recvtag); + req1[12] = comm.Isend(sendbuf_Xz,sendCount_Xz,rank_xZ,sendtag); + req2[12] = comm.Irecv(recvbuf_xZ,recvCount_xZ,rank_Xz,recvtag); + req1[13] = comm.Isend(sendbuf_xZ,sendCount_xZ,rank_Xz,sendtag); + req2[13] = comm.Irecv(recvbuf_Xz,recvCount_Xz,rank_xZ,recvtag); + req1[14] = comm.Isend(sendbuf_yz,sendCount_yz,rank_YZ,sendtag); + req2[14] = comm.Irecv(recvbuf_YZ,recvCount_YZ,rank_yz,recvtag); + req1[15] = comm.Isend(sendbuf_YZ,sendCount_YZ,rank_yz,sendtag); + req2[15] = comm.Irecv(recvbuf_yz,recvCount_yz,rank_YZ,recvtag); + req1[16] = comm.Isend(sendbuf_Yz,sendCount_Yz,rank_yZ,sendtag); + req2[16] = comm.Irecv(recvbuf_yZ,recvCount_yZ,rank_Yz,recvtag); + req1[17] = comm.Isend(sendbuf_yZ,sendCount_yZ,rank_Yz,sendtag); + req2[17] = comm.Irecv(recvbuf_Yz,recvCount_Yz,rank_yZ,recvtag); //................................................................................... 
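One more small conversion in this driver is the wall-clock timing around the main loop, which now uses the static Utilities::MPI::time() helper in place of MPI_Wtime(). A sketch of the timing pattern; do_timestep() is a hypothetical stand-in for the per-step work.

#include "common/MPI.h"
#include <cstdio>

static void do_timestep() { /* hypothetical per-step work */ }

static void timed_loop( const Utilities::MPI& comm, int timestepMax )
{
    comm.barrier();
    double starttime = Utilities::MPI::time();   // was: MPI_Wtime()
    for ( int timestep = 0; timestep < timestepMax; timestep++ )
        do_timestep();
    comm.barrier();
    double stoptime = Utilities::MPI::time();
    double cputime  = stoptime - starttime;
    if ( comm.getRank() == 0 )
        printf( "Elapsed time: %f seconds\n", cputime );
}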
//................................................................................... // Wait for completion of D3Q19 communication - MPI_Waitall(18,req1,stat1); - MPI_Waitall(18,req2,stat2); + comm.waitAll(18,req1); + comm.waitAll(18,req2); //................................................................................... // Unpack the distributions on the device //................................................................................... @@ -1260,7 +1236,7 @@ int main(int argc, char **argv) //***************************************************************************** //***************************************************************************** - MPI_Barrier(comm); + comm.barrier(); // Iteration completed! timestep++; //................................................................... @@ -1269,8 +1245,8 @@ int main(int argc, char **argv) // cudaThreadSynchronize(); dvc_Barrier(); - MPI_Barrier(comm); - stoptime = MPI_Wtime(); + comm.barrier(); + stoptime = Utilities::MPI::time(); // cout << "CPU time: " << (stoptime - starttime) << " seconds" << endl; cputime = stoptime - starttime; // cout << "Lattice update rate: "<< double(Nx*Ny*Nz*timestep)/cputime/1000000 << " MLUPS" << endl; @@ -1304,7 +1280,7 @@ int main(int argc, char **argv) // dvc_CopyToDevice(velocity, vel, 3*dist_mem_size, dvc_CopyToDeviceDeviceToHost); //.............................................................................. // cudaThreadSynchronize(); -// MPI_Barrier(comm); +// comm.barrier(); //............................................................ //....Write the z-velocity to test poiseuille flow............ // double vz,vz_avg; @@ -1333,7 +1309,7 @@ int main(int argc, char **argv) // free (velocity); free(id); // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** } diff --git a/gpu/exe/lb1_MRT_mpi.cu b/gpu/exe/lb1_MRT_mpi.cu index 0c0863c7..776ea29f 100644 --- a/gpu/exe/lb1_MRT_mpi.cu +++ b/gpu/exe/lb1_MRT_mpi.cu @@ -1,8 +1,10 @@ +#include "common/MPI.h" + #include #include #include #include -#include + inline void PackID(int *list, int count, char *sendbuf, char *ID){ // Fill in the phase ID values from neighboring processors @@ -553,15 +555,11 @@ void Write_Out(double *array, int Nx, int Ny, int Nz){ int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; int iproc,jproc,kproc; @@ -575,7 +573,6 @@ int main(int argc, char **argv) int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; //********************************** //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! //!!!!!!!!!!! Random debugging communications!!!!!!!!!!!!!!!!!!!!!!!!!!!!! @@ -625,24 +622,21 @@ int main(int argc, char **argv) // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - MPI_Barrier(comm); + comm.barrier(); //................................................. 
- MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nBlocks,1,MPI_INT,0,comm); - MPI_Bcast(&nthreads,1,MPI_INT,0,comm); - MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&iterMax,1,MPI_INT,0,comm); - MPI_Bcast(&interval,1,MPI_INT,0,comm); - MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); - - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - //................................................. - MPI_Barrier(comm); + comm.bcast(&Nz,1,0); + comm.bcast(&nBlocks,1,0); + comm.bcast(&nthreads,1,0); + comm.bcast(&tau,1,0); + comm.bcast(&Fx,1,0); + comm.bcast(&Fy,1,0); + comm.bcast(&Fz,1,0); + comm.bcast(&iterMax,1,0); + comm.bcast(&interval,1,0); + comm.bcast(&tol,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); // ************************************************************** double rlx_setA = 1.f/tau; @@ -665,7 +659,7 @@ int main(int argc, char **argv) printf("Sub-domain size = %i x %i x %i\n",Nz,Nz,Nz); } - MPI_Barrier(comm); + comm.barrier(); kproc = rank/(nprocx*nprocy); jproc = (rank-nprocx*nprocy*kproc)/nprocx; iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -946,7 +940,7 @@ int main(int argc, char **argv) PM.close(); // printf("File porosity = %f\n", double(sum)/N); //........................................................................... - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; //........................................................................... // Write the communcation structure into a file for debugging @@ -1083,7 +1077,7 @@ int main(int argc, char **argv) } } } - MPI_Barrier(comm); + comm.barrier(); if (rank==0) printf ("SendLists are ready on host\n"); //...................................................................................... 
// Use MPI to fill in the recvCounts form the associated processes @@ -1094,46 +1088,46 @@ int main(int argc, char **argv) //********************************************************************************** // Fill in the recieve counts using MPI sendtag = recvtag = 3; - MPI_Send(&sendCount_x,1,MPI_INT,rank_X,sendtag,comm); - MPI_Recv(&recvCount_X,1,MPI_INT,rank_x,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_X,1,MPI_INT,rank_x,sendtag,comm); - MPI_Recv(&recvCount_x,1,MPI_INT,rank_X,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_y,1,MPI_INT,rank_Y,sendtag,comm); - MPI_Recv(&recvCount_Y,1,MPI_INT,rank_y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Y,1,MPI_INT,rank_y,sendtag,comm); - MPI_Recv(&recvCount_y,1,MPI_INT,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_z,1,MPI_INT,rank_Z,sendtag,comm); - MPI_Recv(&recvCount_Z,1,MPI_INT,rank_z,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Z,1,MPI_INT,rank_z,sendtag,comm); - MPI_Recv(&recvCount_z,1,MPI_INT,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); + comm.send(&sendCount_x,1,rank_X,sendtag); + comm.recv(&recvCount_X,1,rank_x,recvtag); + comm.send(&sendCount_X,1,rank_x,sendtag); + comm.recv(&recvCount_x,1,rank_X,recvtag); + comm.send(&sendCount_y,1,rank_Y,sendtag); + comm.recv(&recvCount_Y,1,rank_y,recvtag); + comm.send(&sendCount_Y,1,rank_y,sendtag); + comm.recv(&recvCount_y,1,rank_Y,recvtag); + comm.send(&sendCount_z,1,rank_Z,sendtag); + comm.recv(&recvCount_Z,1,rank_z,recvtag); + comm.send(&sendCount_Z,1,rank_z,sendtag); + comm.recv(&recvCount_z,1,rank_Z,recvtag); - MPI_Send(&sendCount_xy,1,MPI_INT,rank_XY,sendtag,comm); - MPI_Recv(&recvCount_XY,1,MPI_INT,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_XY,1,MPI_INT,rank_xy,sendtag,comm); - MPI_Recv(&recvCount_xy,1,MPI_INT,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Xy,1,MPI_INT,rank_xY,sendtag,comm); - MPI_Recv(&recvCount_xY,1,MPI_INT,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_xY,1,MPI_INT,rank_Xy,sendtag,comm); - MPI_Recv(&recvCount_Xy,1,MPI_INT,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); + comm.send(&sendCount_xy,1,rank_XY,sendtag); + comm.recv(&recvCount_XY,1,rank_xy,recvtag); + comm.send(&sendCount_XY,1,rank_xy,sendtag); + comm.recv(&recvCount_xy,1,rank_XY,recvtag); + comm.send(&sendCount_Xy,1,rank_xY,sendtag); + comm.recv(&recvCount_xY,1,rank_Xy,recvtag); + comm.send(&sendCount_xY,1,rank_Xy,sendtag); + comm.recv(&recvCount_Xy,1,rank_xY,recvtag); - MPI_Send(&sendCount_xz,1,MPI_INT,rank_XZ,sendtag,comm); - MPI_Recv(&recvCount_XZ,1,MPI_INT,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_XZ,1,MPI_INT,rank_xz,sendtag,comm); - MPI_Recv(&recvCount_xz,1,MPI_INT,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Xz,1,MPI_INT,rank_xZ,sendtag,comm); - MPI_Recv(&recvCount_xZ,1,MPI_INT,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_xZ,1,MPI_INT,rank_Xz,sendtag,comm); - MPI_Recv(&recvCount_Xz,1,MPI_INT,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); + comm.send(&sendCount_xz,1,rank_XZ,sendtag); + comm.recv(&recvCount_XZ,1,rank_xz,recvtag); + comm.send(&sendCount_XZ,1,rank_xz,sendtag); + comm.recv(&recvCount_xz,1,rank_XZ,recvtag); + comm.send(&sendCount_Xz,1,rank_xZ,sendtag); + comm.recv(&recvCount_xZ,1,rank_Xz,recvtag); + comm.send(&sendCount_xZ,1,rank_Xz,sendtag); + comm.recv(&recvCount_Xz,1,rank_xZ,recvtag); - MPI_Send(&sendCount_yz,1,MPI_INT,rank_YZ,sendtag,comm); - MPI_Recv(&recvCount_YZ,1,MPI_INT,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); - 
MPI_Send(&sendCount_YZ,1,MPI_INT,rank_yz,sendtag,comm); - MPI_Recv(&recvCount_yz,1,MPI_INT,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Yz,1,MPI_INT,rank_yZ,sendtag,comm); - MPI_Recv(&recvCount_yZ,1,MPI_INT,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_yZ,1,MPI_INT,rank_Yz,sendtag,comm); - MPI_Recv(&recvCount_Yz,1,MPI_INT,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Barrier(comm); + comm.send(&sendCount_yz,1,rank_YZ,sendtag); + comm.recv(&recvCount_YZ,1,rank_yz,recvtag); + comm.send(&sendCount_YZ,1,rank_yz,sendtag); + comm.recv(&recvCount_yz,1,rank_YZ,recvtag); + comm.send(&sendCount_Yz,1,rank_yZ,sendtag); + comm.recv(&recvCount_yZ,1,rank_Yz,recvtag); + comm.send(&sendCount_yZ,1,rank_Yz,sendtag); + comm.recv(&recvCount_Yz,1,rank_yZ,recvtag); + comm.barrier(); //********************************************************************************** //recvCount_x = sendCount_x; //recvCount_X = sendCount_X; @@ -1157,7 +1151,7 @@ int main(int argc, char **argv) //...................................................................................... // Use MPI to fill in the appropriate values // int tag = 5; - // MPI_Sendrecv(sendCount_x,1,MPI_INT,rank_x,tag,sendCount_X,1,MPI_INT,comm,req); + // Mcomm.sendrecv(sendCount_x,1,rank_x,tag,sendCount_X,1); //...................................................................................... int *recvList_x, *recvList_y, *recvList_z, *recvList_X, *recvList_Y, *recvList_Z; int *recvList_xy, *recvList_yz, *recvList_xz, *recvList_Xy, *recvList_Yz, *recvList_xZ; @@ -1187,48 +1181,48 @@ int main(int argc, char **argv) // Use MPI to fill in the appropriate values for recvList // Fill in the recieve lists using MPI sendtag = recvtag = 4; - MPI_Isend(sendList_x, sendCount_x,MPI_INT,rank_X,sendtag,comm,&req1[0]); - MPI_Irecv(recvList_X, recvCount_X,MPI_INT,rank_x,recvtag,comm,&req2[0]); - MPI_Isend(sendList_X, sendCount_X,MPI_INT,rank_x,sendtag,comm,&req1[1]); - MPI_Irecv(recvList_x, recvCount_x,MPI_INT,rank_X,recvtag,comm,&req2[1]); - MPI_Isend(sendList_y, sendCount_y,MPI_INT,rank_Y,sendtag,comm,&req1[2]); - MPI_Irecv(recvList_Y, recvCount_Y,MPI_INT,rank_y,recvtag,comm,&req2[2]); - MPI_Isend(sendList_Y, sendCount_Y,MPI_INT,rank_y,sendtag,comm,&req1[3]); - MPI_Irecv(recvList_y, recvCount_y,MPI_INT,rank_Y,recvtag,comm,&req2[3]); - MPI_Isend(sendList_z, sendCount_z,MPI_INT,rank_Z,sendtag,comm,&req1[4]); - MPI_Irecv(recvList_Z, recvCount_Z,MPI_INT,rank_z,recvtag,comm,&req2[4]); - MPI_Isend(sendList_Z, sendCount_Z,MPI_INT,rank_z,sendtag,comm,&req1[5]); - MPI_Irecv(recvList_z, recvCount_z,MPI_INT,rank_Z,recvtag,comm,&req2[5]); + req1[0] = comm.Isend(sendList_x,sendCount_x,rank_X,sendtag); + req2[0] = comm.Irecv(recvList_X,recvCount_X,rank_x,recvtag); + req1[1] = comm.Isend(sendList_X,sendCount_X,rank_x,sendtag); + req2[1] = comm.Irecv(recvList_x,recvCount_x,rank_X,recvtag); + req1[2] = comm.Isend(sendList_y,sendCount_y,rank_Y,sendtag); + req2[2] = comm.Irecv(recvList_Y,recvCount_Y,rank_y,recvtag); + req1[3] = comm.Isend(sendList_Y,sendCount_Y,rank_y,sendtag); + req2[3] = comm.Irecv(recvList_y,recvCount_y,rank_Y,recvtag); + req1[4] = comm.Isend(sendList_z,sendCount_z,rank_Z,sendtag); + req2[4] = comm.Irecv(recvList_Z,recvCount_Z,rank_z,recvtag); + req1[5] = comm.Isend(sendList_Z,sendCount_Z,rank_z,sendtag); + req2[5] = comm.Irecv(recvList_z,recvCount_z,rank_Z,recvtag); - MPI_Isend(sendList_xy, sendCount_xy,MPI_INT,rank_XY,sendtag,comm,&req1[6]); - MPI_Irecv(recvList_XY, recvCount_XY,MPI_INT,rank_xy,recvtag,comm,&req2[6]); - 
MPI_Isend(sendList_XY, sendCount_XY,MPI_INT,rank_xy,sendtag,comm,&req1[7]); - MPI_Irecv(recvList_xy, recvCount_xy,MPI_INT,rank_XY,recvtag,comm,&req2[7]); - MPI_Isend(sendList_Xy, sendCount_Xy,MPI_INT,rank_xY,sendtag,comm,&req1[8]); - MPI_Irecv(recvList_xY, recvCount_xY,MPI_INT,rank_Xy,recvtag,comm,&req2[8]); - MPI_Isend(sendList_xY, sendCount_xY,MPI_INT,rank_Xy,sendtag,comm,&req1[9]); - MPI_Irecv(recvList_Xy, recvCount_Xy,MPI_INT,rank_xY,recvtag,comm,&req2[9]); + req1[6] = comm.Isend(sendList_xy,sendCount_xy,rank_XY,sendtag); + req2[6] = comm.Irecv(recvList_XY,recvCount_XY,rank_xy,recvtag); + req1[7] = comm.Isend(sendList_XY,sendCount_XY,rank_xy,sendtag); + req2[7] = comm.Irecv(recvList_xy,recvCount_xy,rank_XY,recvtag); + req1[8] = comm.Isend(sendList_Xy,sendCount_Xy,rank_xY,sendtag); + req2[8] = comm.Irecv(recvList_xY,recvCount_xY,rank_Xy,recvtag); + req1[9] = comm.Isend(sendList_xY,sendCount_xY,rank_Xy,sendtag); + req2[9] = comm.Irecv(recvList_Xy,recvCount_Xy,rank_xY,recvtag); - MPI_Isend(sendList_xz, sendCount_xz,MPI_INT,rank_XZ,sendtag,comm,&req1[10]); - MPI_Irecv(recvList_XZ, recvCount_XZ,MPI_INT,rank_xz,recvtag,comm,&req2[10]); - MPI_Isend(sendList_XZ, sendCount_XZ,MPI_INT,rank_xz,sendtag,comm,&req1[11]); - MPI_Irecv(recvList_xz, recvCount_xz,MPI_INT,rank_XZ,recvtag,comm,&req2[11]); - MPI_Isend(sendList_Xz, sendCount_Xz,MPI_INT,rank_xZ,sendtag,comm,&req1[12]); - MPI_Irecv(recvList_xZ, recvCount_xZ,MPI_INT,rank_Xz,recvtag,comm,&req2[12]); - MPI_Isend(sendList_xZ, sendCount_xZ,MPI_INT,rank_Xz,sendtag,comm,&req1[13]); - MPI_Irecv(recvList_Xz, recvCount_Xz,MPI_INT,rank_xZ,recvtag,comm,&req2[13]); + req1[10] = comm.Isend(sendList_xz,sendCount_xz,rank_XZ,sendtag); + req2[10] = comm.Irecv(recvList_XZ,recvCount_XZ,rank_xz,recvtag); + req1[11] = comm.Isend(sendList_XZ,sendCount_XZ,rank_xz,sendtag); + req2[11] = comm.Irecv(recvList_xz,recvCount_xz,rank_XZ,recvtag); + req1[12] = comm.Isend(sendList_Xz,sendCount_Xz,rank_xZ,sendtag); + req2[12] = comm.Irecv(recvList_xZ,recvCount_xZ,rank_Xz,recvtag); + req1[13] = comm.Isend(sendList_xZ,sendCount_xZ,rank_Xz,sendtag); + req2[13] = comm.Irecv(recvList_Xz,recvCount_Xz,rank_xZ,recvtag); - MPI_Isend(sendList_yz, sendCount_yz,MPI_INT,rank_YZ,sendtag,comm,&req1[14]); - MPI_Irecv(recvList_YZ, recvCount_YZ,MPI_INT,rank_yz,recvtag,comm,&req2[14]); - MPI_Isend(sendList_YZ, sendCount_YZ,MPI_INT,rank_yz,sendtag,comm,&req1[15]); - MPI_Irecv(recvList_yz, recvCount_yz,MPI_INT,rank_YZ,recvtag,comm,&req2[15]); - MPI_Isend(sendList_Yz, sendCount_Yz,MPI_INT,rank_yZ,sendtag,comm,&req1[16]); - MPI_Irecv(recvList_yZ, recvCount_yZ,MPI_INT,rank_Yz,recvtag,comm,&req2[16]); - MPI_Isend(sendList_yZ, sendCount_yZ,MPI_INT,rank_Yz,sendtag,comm,&req1[17]); - MPI_Irecv(recvList_Yz, recvCount_Yz,MPI_INT,rank_yZ,recvtag,comm,&req2[17]); - MPI_Waitall(18,req1,stat1); - MPI_Waitall(18,req2,stat2); - MPI_Barrier(comm); + req1[14] = comm.Isend(sendList_yz,sendCount_yz,rank_YZ,sendtag); + req2[14] = comm.Irecv(recvList_YZ,recvCount_YZ,rank_yz,recvtag); + req1[15] = comm.Isend(sendList_YZ,sendCount_YZ,rank_yz,sendtag); + req2[15] = comm.Irecv(recvList_yz,recvCount_yz,rank_YZ,recvtag); + req1[16] = comm.Isend(sendList_Yz,sendCount_Yz,rank_yZ,sendtag); + req2[16] = comm.Irecv(recvList_yZ,recvCount_yZ,rank_Yz,recvtag); + req1[17] = comm.Isend(sendList_yZ,sendCount_yZ,rank_Yz,sendtag); + req2[17] = comm.Irecv(recvList_Yz,recvCount_Yz,rank_yZ,recvtag); + comm.waitAll(18,req1); + comm.waitAll(18,req2); + comm.barrier(); 
//...................................................................................... double *sendbuf_x, *sendbuf_y, *sendbuf_z, *sendbuf_X, *sendbuf_Y, *sendbuf_Z; double *sendbuf_xy, *sendbuf_yz, *sendbuf_xz, *sendbuf_Xy, *sendbuf_Yz, *sendbuf_xZ; @@ -1427,42 +1421,24 @@ int main(int argc, char **argv) PackID(sendList_yZ, sendCount_yZ ,sendID_yZ, id); PackID(sendList_YZ, sendCount_YZ ,sendID_YZ, id); //...................................................................................... - MPI_Sendrecv(sendID_x,sendCount_x,MPI_CHAR,rank_X,sendtag, - recvID_X,recvCount_X,MPI_CHAR,rank_x,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_X,sendCount_X,MPI_CHAR,rank_x,sendtag, - recvID_x,recvCount_x,MPI_CHAR,rank_X,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_y,sendCount_y,MPI_CHAR,rank_Y,sendtag, - recvID_Y,recvCount_Y,MPI_CHAR,rank_y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Y,sendCount_Y,MPI_CHAR,rank_y,sendtag, - recvID_y,recvCount_y,MPI_CHAR,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_z,sendCount_z,MPI_CHAR,rank_Z,sendtag, - recvID_Z,recvCount_Z,MPI_CHAR,rank_z,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Z,sendCount_Z,MPI_CHAR,rank_z,sendtag, - recvID_z,recvCount_z,MPI_CHAR,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xy,sendCount_xy,MPI_CHAR,rank_XY,sendtag, - recvID_XY,recvCount_XY,MPI_CHAR,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_XY,sendCount_XY,MPI_CHAR,rank_xy,sendtag, - recvID_xy,recvCount_xy,MPI_CHAR,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Xy,sendCount_Xy,MPI_CHAR,rank_xY,sendtag, - recvID_xY,recvCount_xY,MPI_CHAR,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xY,sendCount_xY,MPI_CHAR,rank_Xy,sendtag, - recvID_Xy,recvCount_Xy,MPI_CHAR,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xz,sendCount_xz,MPI_CHAR,rank_XZ,sendtag, - recvID_XZ,recvCount_XZ,MPI_CHAR,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_XZ,sendCount_XZ,MPI_CHAR,rank_xz,sendtag, - recvID_xz,recvCount_xz,MPI_CHAR,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Xz,sendCount_Xz,MPI_CHAR,rank_xZ,sendtag, - recvID_xZ,recvCount_xZ,MPI_CHAR,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xZ,sendCount_xZ,MPI_CHAR,rank_Xz,sendtag, - recvID_Xz,recvCount_Xz,MPI_CHAR,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_yz,sendCount_yz,MPI_CHAR,rank_YZ,sendtag, - recvID_YZ,recvCount_YZ,MPI_CHAR,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_YZ,sendCount_YZ,MPI_CHAR,rank_yz,sendtag, - recvID_yz,recvCount_yz,MPI_CHAR,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Yz,sendCount_Yz,MPI_CHAR,rank_yZ,sendtag, - recvID_yZ,recvCount_yZ,MPI_CHAR,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_yZ,sendCount_yZ,MPI_CHAR,rank_Yz,sendtag, - recvID_Yz,recvCount_Yz,MPI_CHAR,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); + comm.sendrecv(sendID_x,sendCount_x,rank_X,sendtag,recvID_X,recvCount_X,rank_x,recvtag); + comm.sendrecv(sendID_X,sendCount_X,rank_x,sendtag,recvID_x,recvCount_x,rank_X,recvtag); + comm.sendrecv(sendID_y,sendCount_y,rank_Y,sendtag,recvID_Y,recvCount_Y,rank_y,recvtag); + comm.sendrecv(sendID_Y,sendCount_Y,rank_y,sendtag,recvID_y,recvCount_y,rank_Y,recvtag); + comm.sendrecv(sendID_z,sendCount_z,rank_Z,sendtag,recvID_Z,recvCount_Z,rank_z,recvtag); + comm.sendrecv(sendID_Z,sendCount_Z,rank_z,sendtag,recvID_z,recvCount_z,rank_Z,recvtag); + 
comm.sendrecv(sendID_xy,sendCount_xy,rank_XY,sendtag,recvID_XY,recvCount_XY,rank_xy,recvtag); + comm.sendrecv(sendID_XY,sendCount_XY,rank_xy,sendtag,recvID_xy,recvCount_xy,rank_XY,recvtag); + comm.sendrecv(sendID_Xy,sendCount_Xy,rank_xY,sendtag,recvID_xY,recvCount_xY,rank_Xy,recvtag); + comm.sendrecv(sendID_xY,sendCount_xY,rank_Xy,sendtag,recvID_Xy,recvCount_Xy,rank_xY,recvtag); + comm.sendrecv(sendID_xz,sendCount_xz,rank_XZ,sendtag,recvID_XZ,recvCount_XZ,rank_xz,recvtag); + comm.sendrecv(sendID_XZ,sendCount_XZ,rank_xz,sendtag,recvID_xz,recvCount_xz,rank_XZ,recvtag); + comm.sendrecv(sendID_Xz,sendCount_Xz,rank_xZ,sendtag,recvID_xZ,recvCount_xZ,rank_Xz,recvtag); + comm.sendrecv(sendID_xZ,sendCount_xZ,rank_Xz,sendtag,recvID_Xz,recvCount_Xz,rank_xZ,recvtag); + comm.sendrecv(sendID_yz,sendCount_yz,rank_YZ,sendtag,recvID_YZ,recvCount_YZ,rank_yz,recvtag); + comm.sendrecv(sendID_YZ,sendCount_YZ,rank_yz,sendtag,recvID_yz,recvCount_yz,rank_YZ,recvtag); + comm.sendrecv(sendID_Yz,sendCount_Yz,rank_yZ,sendtag,recvID_yZ,recvCount_yZ,rank_Yz,recvtag); + comm.sendrecv(sendID_yZ,sendCount_yZ,rank_Yz,sendtag,recvID_Yz,recvCount_Yz,rank_yZ,recvtag); //...................................................................................... UnpackID(recvList_x, recvCount_x ,recvID_x, id); UnpackID(recvList_X, recvCount_X ,recvID_X, id); @@ -1495,7 +1471,7 @@ int main(int argc, char **argv) free(recvID_yz); free(recvID_YZ); free(recvID_yZ); free(recvID_Yz); //...................................................................................... if (rank==0) printf ("Devices are ready to communicate. \n"); - MPI_Barrier(comm); + comm.barrier(); //...........device phase ID................................................. if (rank==0) printf ("Copying phase ID to device \n"); @@ -1535,8 +1511,8 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; - MPI_Barrier(comm); - starttime = MPI_Wtime(); + comm.barrier(); + starttime = Utilities::MPI::time(); // Old cuda timer is below // cudaEvent_t start, stop; // float time; @@ -1633,48 +1609,48 @@ int main(int argc, char **argv) //................................................................................... 
// Send all the distributions - MPI_Isend(sendbuf_x, 5*sendCount_x,MPI_DOUBLE,rank_X,sendtag,comm,&req1[0]); - MPI_Irecv(recvbuf_X, 5*recvCount_X,MPI_DOUBLE,rank_x,recvtag,comm,&req2[0]); - MPI_Isend(sendbuf_X, 5*sendCount_X,MPI_DOUBLE,rank_x,sendtag,comm,&req1[1]); - MPI_Irecv(recvbuf_x, 5*recvCount_x,MPI_DOUBLE,rank_X,recvtag,comm,&req2[1]); - MPI_Isend(sendbuf_y, 5*sendCount_y,MPI_DOUBLE,rank_Y,sendtag,comm,&req1[2]); - MPI_Irecv(recvbuf_Y, 5*recvCount_Y,MPI_DOUBLE,rank_y,recvtag,comm,&req2[2]); - MPI_Isend(sendbuf_Y, 5*sendCount_Y,MPI_DOUBLE,rank_y,sendtag,comm,&req1[3]); - MPI_Irecv(recvbuf_y, 5*recvCount_y,MPI_DOUBLE,rank_Y,recvtag,comm,&req2[3]); - MPI_Isend(sendbuf_z, 5*sendCount_z,MPI_DOUBLE,rank_Z,sendtag,comm,&req1[4]); - MPI_Irecv(recvbuf_Z, 5*recvCount_Z,MPI_DOUBLE,rank_z,recvtag,comm,&req2[4]); - MPI_Isend(sendbuf_Z, 5*sendCount_Z,MPI_DOUBLE,rank_z,sendtag,comm,&req1[5]); - MPI_Irecv(recvbuf_z, 5*recvCount_z,MPI_DOUBLE,rank_Z,recvtag,comm,&req2[5]); - MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_XY,sendtag,comm,&req1[6]); - MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_xy,recvtag,comm,&req2[6]); - MPI_Isend(sendbuf_XY, sendCount_XY,MPI_DOUBLE,rank_xy,sendtag,comm,&req1[7]); - MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_XY,recvtag,comm,&req2[7]); - MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_xY,sendtag,comm,&req1[8]); - MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_Xy,recvtag,comm,&req2[8]); - MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_Xy,sendtag,comm,&req1[9]); - MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_xY,recvtag,comm,&req2[9]); - MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_XZ,sendtag,comm,&req1[10]); - MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_xz,recvtag,comm,&req2[10]); - MPI_Isend(sendbuf_XZ, sendCount_XZ,MPI_DOUBLE,rank_xz,sendtag,comm,&req1[11]); - MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_XZ,recvtag,comm,&req2[11]); - MPI_Isend(sendbuf_Xz, sendCount_Xz,MPI_DOUBLE,rank_xZ,sendtag,comm,&req1[12]); - MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_Xz,recvtag,comm,&req2[12]); - MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_Xz,sendtag,comm,&req1[13]); - MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_xZ,recvtag,comm,&req2[13]); - MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_YZ,sendtag,comm,&req1[14]); - MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_yz,recvtag,comm,&req2[14]); - MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_yz,sendtag,comm,&req1[15]); - MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_YZ,recvtag,comm,&req2[15]); - MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_yZ,sendtag,comm,&req1[16]); - MPI_Irecv(recvbuf_yZ, recvCount_yZ,MPI_DOUBLE,rank_Yz,recvtag,comm,&req2[16]); - MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_Yz,sendtag,comm,&req1[17]); - MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_yZ,recvtag,comm,&req2[17]); + req1[0] = comm.Isend(sendbuf_x,5*sendCount_x,rank_X,sendtag); + req2[0] = comm.Irecv(recvbuf_X,5*recvCount_X,rank_x,recvtag); + req1[1] = comm.Isend(sendbuf_X,5*sendCount_X,rank_x,sendtag); + req2[1] = comm.Irecv(recvbuf_x,5*recvCount_x,rank_X,recvtag); + req1[2] = comm.Isend(sendbuf_y,5*sendCount_y,rank_Y,sendtag); + req2[2] = comm.Irecv(recvbuf_Y,5*recvCount_Y,rank_y,recvtag); + req1[3] = comm.Isend(sendbuf_Y,5*sendCount_Y,rank_y,sendtag); + req2[3] = comm.Irecv(recvbuf_y,5*recvCount_y,rank_Y,recvtag); + req1[4] = comm.Isend(sendbuf_z,5*sendCount_z,rank_Z,sendtag); + req2[4] = comm.Irecv(recvbuf_Z,5*recvCount_Z,rank_z,recvtag); 
+ req1[5] = comm.Isend(sendbuf_Z,5*sendCount_Z,rank_z,sendtag); + req2[5] = comm.Irecv(recvbuf_z,5*recvCount_z,rank_Z,recvtag); + req1[6] = comm.Isend(sendbuf_xy,sendCount_xy,rank_XY,sendtag); + req2[6] = comm.Irecv(recvbuf_XY,recvCount_XY,rank_xy,recvtag); + req1[7] = comm.Isend(sendbuf_XY,sendCount_XY,rank_xy,sendtag); + req2[7] = comm.Irecv(recvbuf_xy,recvCount_xy,rank_XY,recvtag); + req1[8] = comm.Isend(sendbuf_Xy,sendCount_Xy,rank_xY,sendtag); + req2[8] = comm.Irecv(recvbuf_xY,recvCount_xY,rank_Xy,recvtag); + req1[9] = comm.Isend(sendbuf_xY,sendCount_xY,rank_Xy,sendtag); + req2[9] = comm.Irecv(recvbuf_Xy,recvCount_Xy,rank_xY,recvtag); + req1[10] = comm.Isend(sendbuf_xz,sendCount_xz,rank_XZ,sendtag); + req2[10] = comm.Irecv(recvbuf_XZ,recvCount_XZ,rank_xz,recvtag); + req1[11] = comm.Isend(sendbuf_XZ,sendCount_XZ,rank_xz,sendtag); + req2[11] = comm.Irecv(recvbuf_xz,recvCount_xz,rank_XZ,recvtag); + req1[12] = comm.Isend(sendbuf_Xz,sendCount_Xz,rank_xZ,sendtag); + req2[12] = comm.Irecv(recvbuf_xZ,recvCount_xZ,rank_Xz,recvtag); + req1[13] = comm.Isend(sendbuf_xZ,sendCount_xZ,rank_Xz,sendtag); + req2[13] = comm.Irecv(recvbuf_Xz,recvCount_Xz,rank_xZ,recvtag); + req1[14] = comm.Isend(sendbuf_yz,sendCount_yz,rank_YZ,sendtag); + req2[14] = comm.Irecv(recvbuf_YZ,recvCount_YZ,rank_yz,recvtag); + req1[15] = comm.Isend(sendbuf_YZ,sendCount_YZ,rank_yz,sendtag); + req2[15] = comm.Irecv(recvbuf_yz,recvCount_yz,rank_YZ,recvtag); + req1[16] = comm.Isend(sendbuf_Yz,sendCount_Yz,rank_yZ,sendtag); + req2[16] = comm.Irecv(recvbuf_yZ,recvCount_yZ,rank_Yz,recvtag); + req1[17] = comm.Isend(sendbuf_yZ,sendCount_yZ,rank_Yz,sendtag); + req2[17] = comm.Irecv(recvbuf_Yz,recvCount_Yz,rank_yZ,recvtag); //................................................................................... //................................................................................... // Wait for completion of D3Q19 communication - MPI_Waitall(18,req1,stat1); - MPI_Waitall(18,req2,stat2); + comm.waitAll(18,req1); + comm.waitAll(18,req2); //................................................................................... // Unpack the distributions on the device //................................................................................... @@ -1758,7 +1734,7 @@ int main(int argc, char **argv) //***************************************************************************** //***************************************************************************** - MPI_Barrier(comm); + comm.barrier(); // Iteration completed! iter++; //................................................................... @@ -1766,8 +1742,8 @@ int main(int argc, char **argv) //************************************************************************/ cudaThreadSynchronize(); - MPI_Barrier(comm); - stoptime = MPI_Wtime(); + comm.barrier(); + stoptime = Utilities::MPI::time(); // cout << "CPU time: " << (stoptime - starttime) << " seconds" << endl; cputime = stoptime - starttime; // cout << "Lattice update rate: "<< double(Nx*Ny*Nz*iter)/cputime/1000000 << " MLUPS" << endl; @@ -1802,7 +1778,7 @@ int main(int argc, char **argv) cudaMemcpy(velocity, vel, 3*dist_mem_size, cudaMemcpyDeviceToHost); //.............................................................................. cudaThreadSynchronize(); - MPI_Barrier(comm); + comm.barrier(); //............................................................ //....Write the z-velocity to test poiseuille flow............ 
double vz,vz_avg; @@ -1831,7 +1807,7 @@ int main(int argc, char **argv) free (velocity); free(id); // **************************************************** - MPI_Barrier(comm); + comm.barrier(); MPI_Finalize(); // **************************************************** } diff --git a/gpu/exe/lb2_Color.cu b/gpu/exe/lb2_Color.cu index 1871b23c..1f227d08 100644 --- a/gpu/exe/lb2_Color.cu +++ b/gpu/exe/lb2_Color.cu @@ -1,6 +1,4 @@ -#ifdef useMPI -#include -#endif +#include "common/MPI.h" #include #include @@ -62,18 +60,10 @@ int main(int argc, char *argv[]) { //********** Initialize MPI **************** - int numprocs,rank; -#ifdef useMPI - MPI_Status stat; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_size(comm,&numprocs); - MPI_Comm_rank(comm,&rank); -#else - MPI_Comm comm = MPI_COMM_WORLD; - numprocs = 1; - rank = 0; -#endif + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int numprocs = comm.getSize(); //****************************************** if (rank == 0){ @@ -123,32 +113,31 @@ int main(int argc, char *argv[]) input >> tol; // error tolerance //............................................................. } -#ifdef useMPI // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - MPI_Barrier(comm); + comm.barrier(); //................................................. - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nBlocks,1,MPI_INT,0,comm); - MPI_Bcast(&nthreads,1,MPI_INT,0,comm); - MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&alpha,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&beta,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&das,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&dbs,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm); - MPI_Bcast(&din,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm); - MPI_Bcast(×tepMax,1,MPI_INT,0,comm); - MPI_Bcast(&interval,1,MPI_INT,0,comm); - MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); + comm.bcast(&Nz,1,0); + comm.bcast(&nBlocks,1,0); + comm.bcast(&nthreads,1,0); + comm.bcast(&Fx,1,0); + comm.bcast(&Fy,1,0); + comm.bcast(&Fz,1,0); + comm.bcast(&tau,1,0); + comm.bcast(&alpha,1,0); + comm.bcast(&beta,1,0); + comm.bcast(&das,1,0); + comm.bcast(&dbs,1,0); + comm.bcast(&pBC,1,0); + comm.bcast(&din,1,0); + comm.bcast(&dout,1,0); + + comm.bcast(×tepMax,1,0); + comm.bcast(&interval,1,0); + comm.bcast(&tol,1,0); //................................................. - MPI_Barrier(comm); + comm.barrier(); // ************************************************************** -#endif double rlxA = 1.f/tau; double rlxB = 8.f*(2.f-rlxA)/(8.f-rlxA); @@ -243,11 +232,7 @@ int main(int argc, char *argv[]) if (k==4) k=Nz-5; } } -#ifdef useMPI //............................................................ - MPI_Barrier(comm); - MPI_Bcast(&id[0],N,MPI_CHAR,0,comm); - MPI_Barrier(comm); -#endif + comm.bcast(&id[0],N,0); if (rank == 0) printf("Domain set.\n"); //........................................................................... 
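The hunks above and below all apply one refactoring pattern: each raw MPI call is replaced by the equivalent method on the Utilities::MPI wrapper from common/MPI.h, dropping the explicit MPI datatype, communicator, and MPI_Status arguments. The following condensed sketch shows that pattern for a single face exchange. The helper name and argument list are illustrative only, and the wrapper signatures (Isend/Irecv returning an MPI_Request, waitAll taking a count and a request array) are inferred from how they are used in this patch rather than quoted from the header.

#include "common/MPI.h"   // LBPM's MPI wrapper; signatures below are inferred from this patch

// Hypothetical helper showing one face exchange written against the wrapper,
// as the converted code does for all 18 D3Q19 communication directions.
static void exchangeFace( Utilities::MPI &comm,
                          double *sendbuf, int sendCount, int rank_send,
                          double *recvbuf, int recvCount, int rank_recv,
                          int sendtag, int recvtag )
{
    MPI_Request req1[1], req2[1];
    // Formerly: MPI_Isend(sendbuf,5*sendCount,MPI_DOUBLE,rank_send,sendtag,comm,&req1[0]);
    // The wrapper carries the communicator and (presumably) deduces the MPI datatype
    // from the pointer type, so only buffer, count, rank, and tag remain.
    req1[0] = comm.Isend( sendbuf, 5*sendCount, rank_send, sendtag );
    req2[0] = comm.Irecv( recvbuf, 5*recvCount, rank_recv, recvtag );
    comm.waitAll( 1, req1 );   // replaces MPI_Waitall(1,req1,stat1); no MPI_Status array needed
    comm.waitAll( 1, req2 );
    comm.barrier();            // replaces MPI_Barrier(comm)
}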
diff --git a/gpu/exe/lb2_Color_mpi.cpp b/gpu/exe/lb2_Color_mpi.cpp index fe11d32f..a2f3d8a9 100644 --- a/gpu/exe/lb2_Color_mpi.cpp +++ b/gpu/exe/lb2_Color_mpi.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include "common/MPI.h" using namespace std; @@ -98,15 +98,11 @@ inline void UnpackID(int *list, int count, char *recvbuf, char *ID){ int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; int iproc,jproc,kproc; @@ -120,7 +116,6 @@ int main(int argc, char **argv) int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; if (rank == 0){ printf("********************************************************\n"); @@ -177,31 +172,30 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - MPI_Barrier(comm); + comm.barrier(); //................................................. - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nBlocks,1,MPI_INT,0,comm); - MPI_Bcast(&nthreads,1,MPI_INT,0,comm); - MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&alpha,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&beta,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&das,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&dbs,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm); - MPI_Bcast(&din,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm); - MPI_Bcast(×tepMax,1,MPI_INT,0,comm); - MPI_Bcast(&interval,1,MPI_INT,0,comm); - MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); - - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + comm.bcast(&Nz,1,0); + comm.bcast(&nBlocks,1,0); + comm.bcast(&nthreads,1,0); + comm.bcast(&Fx,1,0); + comm.bcast(&Fy,1,0); + comm.bcast(&Fz,1,0); + comm.bcast(&tau,1,0); + comm.bcast(&alpha,1,0); + comm.bcast(&beta,1,0); + comm.bcast(&das,1,0); + comm.bcast(&dbs,1,0); + comm.bcast(&pBC,1,0); + comm.bcast(&din,1,0); + comm.bcast(&dout,1,0); + comm.bcast(×tepMax,1,0); + comm.bcast(&interval,1,0); + comm.bcast(&tol,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); //................................................. - MPI_Barrier(comm); + comm.barrier(); // ************************************************************** // ************************************************************** @@ -231,7 +225,7 @@ int main(int argc, char **argv) } - MPI_Barrier(comm); + comm.barrier(); kproc = rank/(nprocx*nprocy); jproc = (rank-nprocx*nprocy*kproc)/nprocx; iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -513,7 +507,7 @@ int main(int argc, char **argv) PM.close(); // printf("File porosity = %f\n", double(sum)/N); //........................................................................... - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; //........................................................................... 
// Write the communcation structure into a file for debugging @@ -650,7 +644,7 @@ int main(int argc, char **argv) } } } - MPI_Barrier(comm); + comm.barrier(); if (rank==0) printf ("SendLists are ready on host\n"); //...................................................................................... // Use MPI to fill in the recvCounts form the associated processes @@ -661,46 +655,46 @@ int main(int argc, char **argv) //********************************************************************************** // Fill in the recieve counts using MPI sendtag = recvtag = 3; - MPI_Send(&sendCount_x,1,MPI_INT,rank_X,sendtag,comm); - MPI_Recv(&recvCount_X,1,MPI_INT,rank_x,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_X,1,MPI_INT,rank_x,sendtag,comm); - MPI_Recv(&recvCount_x,1,MPI_INT,rank_X,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_y,1,MPI_INT,rank_Y,sendtag,comm); - MPI_Recv(&recvCount_Y,1,MPI_INT,rank_y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Y,1,MPI_INT,rank_y,sendtag,comm); - MPI_Recv(&recvCount_y,1,MPI_INT,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_z,1,MPI_INT,rank_Z,sendtag,comm); - MPI_Recv(&recvCount_Z,1,MPI_INT,rank_z,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Z,1,MPI_INT,rank_z,sendtag,comm); - MPI_Recv(&recvCount_z,1,MPI_INT,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); + comm.Send(&sendCount_x,1,rank_X,sendtag); + comm.Recv(&recvCount_X,1,rank_x,recvtag); + comm.Send(&sendCount_X,1,rank_x,sendtag); + comm.Recv(&recvCount_x,1,rank_X,recvtag); + comm.Send(&sendCount_y,1,rank_Y,sendtag); + comm.Recv(&recvCount_Y,1,rank_y,recvtag); + comm.Send(&sendCount_Y,1,rank_y,sendtag); + comm.Recv(&recvCount_y,1,rank_Y,recvtag); + comm.Send(&sendCount_z,1,rank_Z,sendtag); + comm.Recv(&recvCount_Z,1,rank_z,recvtag); + comm.Send(&sendCount_Z,1,rank_z,sendtag); + comm.Recv(&recvCount_z,1,rank_Z,recvtag); - MPI_Send(&sendCount_xy,1,MPI_INT,rank_XY,sendtag,comm); - MPI_Recv(&recvCount_XY,1,MPI_INT,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_XY,1,MPI_INT,rank_xy,sendtag,comm); - MPI_Recv(&recvCount_xy,1,MPI_INT,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Xy,1,MPI_INT,rank_xY,sendtag,comm); - MPI_Recv(&recvCount_xY,1,MPI_INT,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_xY,1,MPI_INT,rank_Xy,sendtag,comm); - MPI_Recv(&recvCount_Xy,1,MPI_INT,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); + comm.Send(&sendCount_xy,1,rank_XY,sendtag); + comm.Recv(&recvCount_XY,1,rank_xy,recvtag); + comm.Send(&sendCount_XY,1,rank_xy,sendtag); + comm.Recv(&recvCount_xy,1,rank_XY,recvtag); + comm.Send(&sendCount_Xy,1,rank_xY,sendtag); + comm.Recv(&recvCount_xY,1,rank_Xy,recvtag); + comm.Send(&sendCount_xY,1,rank_Xy,sendtag); + comm.Recv(&recvCount_Xy,1,rank_xY,recvtag); - MPI_Send(&sendCount_xz,1,MPI_INT,rank_XZ,sendtag,comm); - MPI_Recv(&recvCount_XZ,1,MPI_INT,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_XZ,1,MPI_INT,rank_xz,sendtag,comm); - MPI_Recv(&recvCount_xz,1,MPI_INT,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Xz,1,MPI_INT,rank_xZ,sendtag,comm); - MPI_Recv(&recvCount_xZ,1,MPI_INT,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_xZ,1,MPI_INT,rank_Xz,sendtag,comm); - MPI_Recv(&recvCount_Xz,1,MPI_INT,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); + comm.Send(&sendCount_xz,1,rank_XZ,sendtag); + comm.Recv(&recvCount_XZ,1,rank_xz,recvtag); + comm.Send(&sendCount_XZ,1,rank_xz,sendtag); + comm.Recv(&recvCount_xz,1,rank_XZ,recvtag); + comm.Send(&sendCount_Xz,1,rank_xZ,sendtag); + 
comm.Recv(&recvCount_xZ,1,rank_Xz,recvtag); + comm.Send(&sendCount_xZ,1,rank_Xz,sendtag); + comm.Recv(&recvCount_Xz,1,rank_xZ,recvtag); - MPI_Send(&sendCount_yz,1,MPI_INT,rank_YZ,sendtag,comm); - MPI_Recv(&recvCount_YZ,1,MPI_INT,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_YZ,1,MPI_INT,rank_yz,sendtag,comm); - MPI_Recv(&recvCount_yz,1,MPI_INT,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Yz,1,MPI_INT,rank_yZ,sendtag,comm); - MPI_Recv(&recvCount_yZ,1,MPI_INT,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_yZ,1,MPI_INT,rank_Yz,sendtag,comm); - MPI_Recv(&recvCount_Yz,1,MPI_INT,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Barrier(comm); + comm.Send(&sendCount_yz,1,rank_YZ,sendtag); + comm.Recv(&recvCount_YZ,1,rank_yz,recvtag); + comm.Send(&sendCount_YZ,1,rank_yz,sendtag); + comm.Recv(&recvCount_yz,1,rank_YZ,recvtag); + comm.Send(&sendCount_Yz,1,rank_yZ,sendtag); + comm.Recv(&recvCount_yZ,1,rank_Yz,recvtag); + comm.Send(&sendCount_yZ,1,rank_Yz,sendtag); + comm.Recv(&recvCount_Yz,1,rank_yZ,recvtag); + comm.barrier(); //********************************************************************************** //...................................................................................... int *recvList_x, *recvList_y, *recvList_z, *recvList_X, *recvList_Y, *recvList_Z; @@ -731,48 +725,48 @@ int main(int argc, char **argv) // Use MPI to fill in the appropriate values for recvList // Fill in the recieve lists using MPI sendtag = recvtag = 4; - MPI_Isend(sendList_x, sendCount_x,MPI_INT,rank_X,sendtag,comm,&req1[0]); - MPI_Irecv(recvList_X, recvCount_X,MPI_INT,rank_x,recvtag,comm,&req2[0]); - MPI_Isend(sendList_X, sendCount_X,MPI_INT,rank_x,sendtag,comm,&req1[1]); - MPI_Irecv(recvList_x, recvCount_x,MPI_INT,rank_X,recvtag,comm,&req2[1]); - MPI_Isend(sendList_y, sendCount_y,MPI_INT,rank_Y,sendtag,comm,&req1[2]); - MPI_Irecv(recvList_Y, recvCount_Y,MPI_INT,rank_y,recvtag,comm,&req2[2]); - MPI_Isend(sendList_Y, sendCount_Y,MPI_INT,rank_y,sendtag,comm,&req1[3]); - MPI_Irecv(recvList_y, recvCount_y,MPI_INT,rank_Y,recvtag,comm,&req2[3]); - MPI_Isend(sendList_z, sendCount_z,MPI_INT,rank_Z,sendtag,comm,&req1[4]); - MPI_Irecv(recvList_Z, recvCount_Z,MPI_INT,rank_z,recvtag,comm,&req2[4]); - MPI_Isend(sendList_Z, sendCount_Z,MPI_INT,rank_z,sendtag,comm,&req1[5]); - MPI_Irecv(recvList_z, recvCount_z,MPI_INT,rank_Z,recvtag,comm,&req2[5]); + req1[0] = comm.Isend(sendList_x,sendCount_x,rank_X,sendtag); + req2[0] = comm.Irecv(recvList_X,recvCount_X,rank_x,recvtag); + req1[1] = comm.Isend(sendList_X,sendCount_X,rank_x,sendtag); + req2[1] = comm.Irecv(recvList_x,recvCount_x,rank_X,recvtag); + req1[2] = comm.Isend(sendList_y,sendCount_y,rank_Y,sendtag); + req2[2] = comm.Irecv(recvList_Y,recvCount_Y,rank_y,recvtag); + req1[3] = comm.Isend(sendList_Y,sendCount_Y,rank_y,sendtag); + req2[3] = comm.Irecv(recvList_y,recvCount_y,rank_Y,recvtag); + req1[4] = comm.Isend(sendList_z,sendCount_z,rank_Z,sendtag); + req2[4] = comm.Irecv(recvList_Z,recvCount_Z,rank_z,recvtag); + req1[5] = comm.Isend(sendList_Z,sendCount_Z,rank_z,sendtag); + req2[5] = comm.Irecv(recvList_z,recvCount_z,rank_Z,recvtag); - MPI_Isend(sendList_xy, sendCount_xy,MPI_INT,rank_XY,sendtag,comm,&req1[6]); - MPI_Irecv(recvList_XY, recvCount_XY,MPI_INT,rank_xy,recvtag,comm,&req2[6]); - MPI_Isend(sendList_XY, sendCount_XY,MPI_INT,rank_xy,sendtag,comm,&req1[7]); - MPI_Irecv(recvList_xy, recvCount_xy,MPI_INT,rank_XY,recvtag,comm,&req2[7]); - MPI_Isend(sendList_Xy, sendCount_Xy,MPI_INT,rank_xY,sendtag,comm,&req1[8]); 
- MPI_Irecv(recvList_xY, recvCount_xY,MPI_INT,rank_Xy,recvtag,comm,&req2[8]); - MPI_Isend(sendList_xY, sendCount_xY,MPI_INT,rank_Xy,sendtag,comm,&req1[9]); - MPI_Irecv(recvList_Xy, recvCount_Xy,MPI_INT,rank_xY,recvtag,comm,&req2[9]); + req1[6] = comm.Isend(sendList_xy,sendCount_xy,rank_XY,sendtag); + req2[6] = comm.Irecv(recvList_XY,recvCount_XY,rank_xy,recvtag); + req1[7] = comm.Isend(sendList_XY,sendCount_XY,rank_xy,sendtag); + req2[7] = comm.Irecv(recvList_xy,recvCount_xy,rank_XY,recvtag); + req1[8] = comm.Isend(sendList_Xy,sendCount_Xy,rank_xY,sendtag); + req2[8] = comm.Irecv(recvList_xY,recvCount_xY,rank_Xy,recvtag); + req1[9] = comm.Isend(sendList_xY,sendCount_xY,rank_Xy,sendtag); + req2[9] = comm.Irecv(recvList_Xy,recvCount_Xy,rank_xY,recvtag); - MPI_Isend(sendList_xz, sendCount_xz,MPI_INT,rank_XZ,sendtag,comm,&req1[10]); - MPI_Irecv(recvList_XZ, recvCount_XZ,MPI_INT,rank_xz,recvtag,comm,&req2[10]); - MPI_Isend(sendList_XZ, sendCount_XZ,MPI_INT,rank_xz,sendtag,comm,&req1[11]); - MPI_Irecv(recvList_xz, recvCount_xz,MPI_INT,rank_XZ,recvtag,comm,&req2[11]); - MPI_Isend(sendList_Xz, sendCount_Xz,MPI_INT,rank_xZ,sendtag,comm,&req1[12]); - MPI_Irecv(recvList_xZ, recvCount_xZ,MPI_INT,rank_Xz,recvtag,comm,&req2[12]); - MPI_Isend(sendList_xZ, sendCount_xZ,MPI_INT,rank_Xz,sendtag,comm,&req1[13]); - MPI_Irecv(recvList_Xz, recvCount_Xz,MPI_INT,rank_xZ,recvtag,comm,&req2[13]); + req1[10] = comm.Isend(sendList_xz,sendCount_xz,rank_XZ,sendtag); + req2[10] = comm.Irecv(recvList_XZ,recvCount_XZ,rank_xz,recvtag); + req1[11] = comm.Isend(sendList_XZ,sendCount_XZ,rank_xz,sendtag); + req2[11] = comm.Irecv(recvList_xz,recvCount_xz,rank_XZ,recvtag); + req1[12] = comm.Isend(sendList_Xz,sendCount_Xz,rank_xZ,sendtag); + req2[12] = comm.Irecv(recvList_xZ,recvCount_xZ,rank_Xz,recvtag); + req1[13] = comm.Isend(sendList_xZ,sendCount_xZ,rank_Xz,sendtag); + req2[13] = comm.Irecv(recvList_Xz,recvCount_Xz,rank_xZ,recvtag); - MPI_Isend(sendList_yz, sendCount_yz,MPI_INT,rank_YZ,sendtag,comm,&req1[14]); - MPI_Irecv(recvList_YZ, recvCount_YZ,MPI_INT,rank_yz,recvtag,comm,&req2[14]); - MPI_Isend(sendList_YZ, sendCount_YZ,MPI_INT,rank_yz,sendtag,comm,&req1[15]); - MPI_Irecv(recvList_yz, recvCount_yz,MPI_INT,rank_YZ,recvtag,comm,&req2[15]); - MPI_Isend(sendList_Yz, sendCount_Yz,MPI_INT,rank_yZ,sendtag,comm,&req1[16]); - MPI_Irecv(recvList_yZ, recvCount_yZ,MPI_INT,rank_Yz,recvtag,comm,&req2[16]); - MPI_Isend(sendList_yZ, sendCount_yZ,MPI_INT,rank_Yz,sendtag,comm,&req1[17]); - MPI_Irecv(recvList_Yz, recvCount_Yz,MPI_INT,rank_yZ,recvtag,comm,&req2[17]); - MPI_Waitall(18,req1,stat1); - MPI_Waitall(18,req2,stat2); - MPI_Barrier(comm); + req1[14] = comm.Isend(sendList_yz,sendCount_yz,rank_YZ,sendtag); + req2[14] = comm.Irecv(recvList_YZ,recvCount_YZ,rank_yz,recvtag); + req1[15] = comm.Isend(sendList_YZ,sendCount_YZ,rank_yz,sendtag); + req2[15] = comm.Irecv(recvList_yz,recvCount_yz,rank_YZ,recvtag); + req1[16] = comm.Isend(sendList_Yz,sendCount_Yz,rank_yZ,sendtag); + req2[16] = comm.Irecv(recvList_yZ,recvCount_yZ,rank_Yz,recvtag); + req1[17] = comm.Isend(sendList_yZ,sendCount_yZ,rank_Yz,sendtag); + req2[17] = comm.Irecv(recvList_Yz,recvCount_Yz,rank_yZ,recvtag); + comm.waitAll(18,req1); + comm.waitAll(18,req2); + comm.barrier(); //...................................................................................... 
for (int idx=0; idx #include #include -#include +#include "common/MPI.h" #include "pmmc.h" #include "Domain.h" @@ -101,15 +101,11 @@ inline void UnpackID(int *list, int count, char *recvbuf, char *ID){ int main(int argc, char **argv) { - //***************************************** - // ***** MPI STUFF **************** - //***************************************** // Initialize MPI - int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm,&rank); - MPI_Comm_size(comm,&nprocs); + Utilities::MPI comm( MPI_COMM_WORLD ); + int rank = comm.getRank(); + int nprocs = comm.getSize(); // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; int iproc,jproc,kproc; @@ -123,7 +119,6 @@ int main(int argc, char **argv) int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; if (rank == 0){ printf("********************************************************\n"); @@ -203,35 +198,35 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - MPI_Barrier(comm); + comm.barrier(); //................................................. - MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&alpha,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&beta,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&das,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&dbs,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm); - MPI_Bcast(&din,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); - MPI_Bcast(×tepMax,1,MPI_INT,0,comm); - MPI_Bcast(&interval,1,MPI_INT,0,comm); - MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); + comm.bcast(&tau,1,0); + comm.bcast(&alpha,1,0); + comm.bcast(&beta,1,0); + comm.bcast(&das,1,0); + comm.bcast(&dbs,1,0); + comm.bcast(&pBC,1,0); + comm.bcast(&din,1,0); + comm.bcast(&dout,1,0); + comm.bcast(&Fx,1,0); + comm.bcast(&Fy,1,0); + comm.bcast(&Fz,1,0); + comm.bcast(×tepMax,1,0); + comm.bcast(&interval,1,0); + comm.bcast(&tol,1,0); // Computational domain - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nBlocks,1,MPI_INT,0,comm); - MPI_Bcast(&nthreads,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - MPI_Bcast(&nspheres,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&Nz,1,0); + comm.bcast(&nBlocks,1,0); + comm.bcast(&nthreads,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + comm.bcast(&nspheres,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. - MPI_Barrier(comm); + comm.barrier(); // ************************************************************** // ************************************************************** double Ps = -(das-dbs)/(das+dbs); @@ -263,7 +258,7 @@ int main(int argc, char **argv) printf("********************************************************\n"); } - MPI_Barrier(comm); + comm.barrier(); kproc = rank/(nprocx*nprocy); jproc = (rank-nprocx*nprocy*kproc)/nprocx; iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -561,14 +556,14 @@ int main(int argc, char **argv) //....................................................................... 
if (rank == 0) printf("Reading the sphere packing \n"); if (rank == 0) ReadSpherePacking(nspheres,cx,cy,cz,rad); - MPI_Barrier(comm); + comm.barrier(); // Broadcast the sphere packing to all processes - MPI_Bcast(cx,nspheres,MPI_DOUBLE,0,comm); - MPI_Bcast(cy,nspheres,MPI_DOUBLE,0,comm); - MPI_Bcast(cz,nspheres,MPI_DOUBLE,0,comm); - MPI_Bcast(rad,nspheres,MPI_DOUBLE,0,comm); + comm.bcast(cx,nspheres,0); + comm.bcast(cy,nspheres,0); + comm.bcast(cz,nspheres,0); + comm.bcast(rad,nspheres,0); //........................................................................... - MPI_Barrier(comm); + comm.barrier(); if (rank == 0) cout << "Domain set." << endl; //....................................................................... // sprintf(LocalRankString,"%05d",rank); @@ -718,7 +713,7 @@ int main(int argc, char **argv) } } } - MPI_Barrier(comm); + comm.barrier(); if (rank==0) printf ("SendLists are ready on host\n"); //...................................................................................... // Use MPI to fill in the recvCounts form the associated processes @@ -729,89 +724,49 @@ int main(int argc, char **argv) //********************************************************************************** // Fill in the recieve counts using MPI sendtag = recvtag = 3; - MPI_Isend(&sendCount_x, 1,MPI_INT,rank_X,sendtag,comm,&req1[0]); - MPI_Irecv(&recvCount_X, 1,MPI_INT,rank_x,recvtag,comm,&req2[0]); - MPI_Isend(&sendCount_X, 1,MPI_INT,rank_x,sendtag,comm,&req1[1]); - MPI_Irecv(&recvCount_x, 1,MPI_INT,rank_X,recvtag,comm,&req2[1]); - MPI_Isend(&sendCount_y, 1,MPI_INT,rank_Y,sendtag,comm,&req1[2]); - MPI_Irecv(&recvCount_Y, 1,MPI_INT,rank_y,recvtag,comm,&req2[2]); - MPI_Isend(&sendCount_Y, 1,MPI_INT,rank_y,sendtag,comm,&req1[3]); - MPI_Irecv(&recvCount_y, 1,MPI_INT,rank_Y,recvtag,comm,&req2[3]); - MPI_Isend(&sendCount_z, 1,MPI_INT,rank_Z,sendtag,comm,&req1[4]); - MPI_Irecv(&recvCount_Z, 1,MPI_INT,rank_z,recvtag,comm,&req2[4]); - MPI_Isend(&sendCount_Z, 1,MPI_INT,rank_z,sendtag,comm,&req1[5]); - MPI_Irecv(&recvCount_z, 1,MPI_INT,rank_Z,recvtag,comm,&req2[5]); + req1[0] = comm.Isend(&sendCount_x,1,rank_X,sendtag); + req2[0] = comm.Irecv(&recvCount_X,1,rank_x,recvtag); + req1[1] = comm.Isend(&sendCount_X,1,rank_x,sendtag); + req2[1] = comm.Irecv(&recvCount_x,1,rank_X,recvtag); + req1[2] = comm.Isend(&sendCount_y,1,rank_Y,sendtag); + req2[2] = comm.Irecv(&recvCount_Y,1,rank_y,recvtag); + req1[3] = comm.Isend(&sendCount_Y,1,rank_y,sendtag); + req2[3] = comm.Irecv(&recvCount_y,1,rank_Y,recvtag); + req1[4] = comm.Isend(&sendCount_z,1,rank_Z,sendtag); + req2[4] = comm.Irecv(&recvCount_Z,1,rank_z,recvtag); + req1[5] = comm.Isend(&sendCount_Z,1,rank_z,sendtag); + req2[5] = comm.Irecv(&recvCount_z,1,rank_Z,recvtag); - MPI_Isend(&sendCount_xy, 1,MPI_INT,rank_XY,sendtag,comm,&req1[6]); - MPI_Irecv(&recvCount_XY, 1,MPI_INT,rank_xy,recvtag,comm,&req2[6]); - MPI_Isend(&sendCount_XY, 1,MPI_INT,rank_xy,sendtag,comm,&req1[7]); - MPI_Irecv(&recvCount_xy, 1,MPI_INT,rank_XY,recvtag,comm,&req2[7]); - MPI_Isend(&sendCount_Xy, 1,MPI_INT,rank_xY,sendtag,comm,&req1[8]); - MPI_Irecv(&recvCount_xY, 1,MPI_INT,rank_Xy,recvtag,comm,&req2[8]); - MPI_Isend(&sendCount_xY, 1,MPI_INT,rank_Xy,sendtag,comm,&req1[9]); - MPI_Irecv(&recvCount_Xy, 1,MPI_INT,rank_xY,recvtag,comm,&req2[9]); + req1[6] = comm.Isend(&sendCount_xy,1,rank_XY,sendtag); + req2[6] = comm.Irecv(&recvCount_XY,1,rank_xy,recvtag); + req1[7] = comm.Isend(&sendCount_XY,1,rank_xy,sendtag); + req2[7] = comm.Irecv(&recvCount_xy,1,rank_XY,recvtag); + req1[8] = 
comm.Isend(&sendCount_Xy,1,rank_xY,sendtag); + req2[8] = comm.Irecv(&recvCount_xY,1,rank_Xy,recvtag); + req1[9] = comm.Isend(&sendCount_xY,1,rank_Xy,sendtag); + req2[9] = comm.Irecv(&recvCount_Xy,1,rank_xY,recvtag); - MPI_Isend(&sendCount_xz, 1,MPI_INT,rank_XZ,sendtag,comm,&req1[10]); - MPI_Irecv(&recvCount_XZ, 1,MPI_INT,rank_xz,recvtag,comm,&req2[10]); - MPI_Isend(&sendCount_XZ, 1,MPI_INT,rank_xz,sendtag,comm,&req1[11]); - MPI_Irecv(&recvCount_xz, 1,MPI_INT,rank_XZ,recvtag,comm,&req2[11]); - MPI_Isend(&sendCount_Xz, 1,MPI_INT,rank_xZ,sendtag,comm,&req1[12]); - MPI_Irecv(&recvCount_xZ, 1,MPI_INT,rank_Xz,recvtag,comm,&req2[12]); - MPI_Isend(&sendCount_xZ, 1,MPI_INT,rank_Xz,sendtag,comm,&req1[13]); - MPI_Irecv(&recvCount_Xz, 1,MPI_INT,rank_xZ,recvtag,comm,&req2[13]); + req1[10] = comm.Isend(&sendCount_xz,1,rank_XZ,sendtag); + req2[10] = comm.Irecv(&recvCount_XZ,1,rank_xz,recvtag); + req1[11] = comm.Isend(&sendCount_XZ,1,rank_xz,sendtag); + req2[11] = comm.Irecv(&recvCount_xz,1,rank_XZ,recvtag); + req1[12] = comm.Isend(&sendCount_Xz,1,rank_xZ,sendtag); + req2[12] = comm.Irecv(&recvCount_xZ,1,rank_Xz,recvtag); + req1[13] = comm.Isend(&sendCount_xZ,1,rank_Xz,sendtag); + req2[13] = comm.Irecv(&recvCount_Xz,1,rank_xZ,recvtag); - MPI_Isend(&sendCount_yz, 1,MPI_INT,rank_YZ,sendtag,comm,&req1[14]); - MPI_Irecv(&recvCount_YZ, 1,MPI_INT,rank_yz,recvtag,comm,&req2[14]); - MPI_Isend(&sendCount_YZ, 1,MPI_INT,rank_yz,sendtag,comm,&req1[15]); - MPI_Irecv(&recvCount_yz, 1,MPI_INT,rank_YZ,recvtag,comm,&req2[15]); - MPI_Isend(&sendCount_Yz, 1,MPI_INT,rank_yZ,sendtag,comm,&req1[16]); - MPI_Irecv(&recvCount_yZ, 1,MPI_INT,rank_Yz,recvtag,comm,&req2[16]); - MPI_Isend(&sendCount_yZ, 1,MPI_INT,rank_Yz,sendtag,comm,&req1[17]); - MPI_Irecv(&recvCount_Yz, 1,MPI_INT,rank_yZ,recvtag,comm,&req2[17]); - MPI_Waitall(18,req1,stat1); - MPI_Waitall(18,req2,stat2); - MPI_Barrier(comm); -/* MPI_Send(&sendCount_x,1,MPI_INT,rank_X,sendtag,comm); - MPI_Recv(&recvCount_X,1,MPI_INT,rank_x,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_X,1,MPI_INT,rank_x,sendtag,comm); - MPI_Recv(&recvCount_x,1,MPI_INT,rank_X,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_y,1,MPI_INT,rank_Y,sendtag,comm); - MPI_Recv(&recvCount_Y,1,MPI_INT,rank_y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Y,1,MPI_INT,rank_y,sendtag,comm); - MPI_Recv(&recvCount_y,1,MPI_INT,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_z,1,MPI_INT,rank_Z,sendtag,comm); - MPI_Recv(&recvCount_Z,1,MPI_INT,rank_z,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Z,1,MPI_INT,rank_z,sendtag,comm); - MPI_Recv(&recvCount_z,1,MPI_INT,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); - - MPI_Send(&sendCount_xy,1,MPI_INT,rank_XY,sendtag,comm); - MPI_Recv(&recvCount_XY,1,MPI_INT,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_XY,1,MPI_INT,rank_xy,sendtag,comm); - MPI_Recv(&recvCount_xy,1,MPI_INT,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Xy,1,MPI_INT,rank_xY,sendtag,comm); - MPI_Recv(&recvCount_xY,1,MPI_INT,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_xY,1,MPI_INT,rank_Xy,sendtag,comm); - MPI_Recv(&recvCount_Xy,1,MPI_INT,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); - - MPI_Send(&sendCount_xz,1,MPI_INT,rank_XZ,sendtag,comm); - MPI_Recv(&recvCount_XZ,1,MPI_INT,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_XZ,1,MPI_INT,rank_xz,sendtag,comm); - MPI_Recv(&recvCount_xz,1,MPI_INT,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Xz,1,MPI_INT,rank_xZ,sendtag,comm); - 
MPI_Recv(&recvCount_xZ,1,MPI_INT,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_xZ,1,MPI_INT,rank_Xz,sendtag,comm); - MPI_Recv(&recvCount_Xz,1,MPI_INT,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); - - MPI_Send(&sendCount_yz,1,MPI_INT,rank_YZ,sendtag,comm); - MPI_Recv(&recvCount_YZ,1,MPI_INT,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_YZ,1,MPI_INT,rank_yz,sendtag,comm); - MPI_Recv(&recvCount_yz,1,MPI_INT,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_Yz,1,MPI_INT,rank_yZ,sendtag,comm); - MPI_Recv(&recvCount_yZ,1,MPI_INT,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Send(&sendCount_yZ,1,MPI_INT,rank_Yz,sendtag,comm); - MPI_Recv(&recvCount_Yz,1,MPI_INT,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Barrier(comm); -*/ //********************************************************************************** + req1[14] = comm.Isend(&sendCount_yz,1,rank_YZ,sendtag); + req2[14] = comm.Irecv(&recvCount_YZ,1,rank_yz,recvtag); + req1[15] = comm.Isend(&sendCount_YZ,1,rank_yz,sendtag); + req2[15] = comm.Irecv(&recvCount_yz,1,rank_YZ,recvtag); + req1[16] = comm.Isend(&sendCount_Yz,1,rank_yZ,sendtag); + req2[16] = comm.Irecv(&recvCount_yZ,1,rank_Yz,recvtag); + req1[17] = comm.Isend(&sendCount_yZ,1,rank_Yz,sendtag); + req2[17] = comm.Irecv(&recvCount_Yz,1,rank_yZ,recvtag); + comm.waitAll(18,req1); + comm.waitAll(18,req2); + comm.barrier(); + //********************************************************************************** //...................................................................................... int *recvList_x, *recvList_y, *recvList_z, *recvList_X, *recvList_Y, *recvList_Z; int *recvList_xy, *recvList_yz, *recvList_xz, *recvList_Xy, *recvList_Yz, *recvList_xZ; @@ -841,48 +796,48 @@ int main(int argc, char **argv) // Use MPI to fill in the appropriate values for recvList // Fill in the recieve lists using MPI sendtag = recvtag = 4; - MPI_Isend(sendList_x, sendCount_x,MPI_INT,rank_X,sendtag,comm,&req1[0]); - MPI_Irecv(recvList_X, recvCount_X,MPI_INT,rank_x,recvtag,comm,&req2[0]); - MPI_Isend(sendList_X, sendCount_X,MPI_INT,rank_x,sendtag,comm,&req1[1]); - MPI_Irecv(recvList_x, recvCount_x,MPI_INT,rank_X,recvtag,comm,&req2[1]); - MPI_Isend(sendList_y, sendCount_y,MPI_INT,rank_Y,sendtag,comm,&req1[2]); - MPI_Irecv(recvList_Y, recvCount_Y,MPI_INT,rank_y,recvtag,comm,&req2[2]); - MPI_Isend(sendList_Y, sendCount_Y,MPI_INT,rank_y,sendtag,comm,&req1[3]); - MPI_Irecv(recvList_y, recvCount_y,MPI_INT,rank_Y,recvtag,comm,&req2[3]); - MPI_Isend(sendList_z, sendCount_z,MPI_INT,rank_Z,sendtag,comm,&req1[4]); - MPI_Irecv(recvList_Z, recvCount_Z,MPI_INT,rank_z,recvtag,comm,&req2[4]); - MPI_Isend(sendList_Z, sendCount_Z,MPI_INT,rank_z,sendtag,comm,&req1[5]); - MPI_Irecv(recvList_z, recvCount_z,MPI_INT,rank_Z,recvtag,comm,&req2[5]); + req1[0] = comm.Isend(sendList_x,sendCount_x,rank_X,sendtag); + req2[0] = comm.Irecv(recvList_X,recvCount_X,rank_x,recvtag); + req1[1] = comm.Isend(sendList_X,sendCount_X,rank_x,sendtag); + req2[1] = comm.Irecv(recvList_x,recvCount_x,rank_X,recvtag); + req1[2] = comm.Isend(sendList_y,sendCount_y,rank_Y,sendtag); + req2[2] = comm.Irecv(recvList_Y,recvCount_Y,rank_y,recvtag); + req1[3] = comm.Isend(sendList_Y,sendCount_Y,rank_y,sendtag); + req2[3] = comm.Irecv(recvList_y,recvCount_y,rank_Y,recvtag); + req1[4] = comm.Isend(sendList_z,sendCount_z,rank_Z,sendtag); + req2[4] = comm.Irecv(recvList_Z,recvCount_Z,rank_z,recvtag); + req1[5] = comm.Isend(sendList_Z,sendCount_Z,rank_z,sendtag); + req2[5] = 
comm.Irecv(recvList_z,recvCount_z,rank_Z,recvtag); - MPI_Isend(sendList_xy, sendCount_xy,MPI_INT,rank_XY,sendtag,comm,&req1[6]); - MPI_Irecv(recvList_XY, recvCount_XY,MPI_INT,rank_xy,recvtag,comm,&req2[6]); - MPI_Isend(sendList_XY, sendCount_XY,MPI_INT,rank_xy,sendtag,comm,&req1[7]); - MPI_Irecv(recvList_xy, recvCount_xy,MPI_INT,rank_XY,recvtag,comm,&req2[7]); - MPI_Isend(sendList_Xy, sendCount_Xy,MPI_INT,rank_xY,sendtag,comm,&req1[8]); - MPI_Irecv(recvList_xY, recvCount_xY,MPI_INT,rank_Xy,recvtag,comm,&req2[8]); - MPI_Isend(sendList_xY, sendCount_xY,MPI_INT,rank_Xy,sendtag,comm,&req1[9]); - MPI_Irecv(recvList_Xy, recvCount_Xy,MPI_INT,rank_xY,recvtag,comm,&req2[9]); + req1[6] = comm.Isend(sendList_xy,sendCount_xy,rank_XY,sendtag); + req2[6] = comm.Irecv(recvList_XY,recvCount_XY,rank_xy,recvtag); + req1[7] = comm.Isend(sendList_XY,sendCount_XY,rank_xy,sendtag); + req2[7] = comm.Irecv(recvList_xy,recvCount_xy,rank_XY,recvtag); + req1[8] = comm.Isend(sendList_Xy,sendCount_Xy,rank_xY,sendtag); + req2[8] = comm.Irecv(recvList_xY,recvCount_xY,rank_Xy,recvtag); + req1[9] = comm.Isend(sendList_xY,sendCount_xY,rank_Xy,sendtag); + req2[9] = comm.Irecv(recvList_Xy,recvCount_Xy,rank_xY,recvtag); - MPI_Isend(sendList_xz, sendCount_xz,MPI_INT,rank_XZ,sendtag,comm,&req1[10]); - MPI_Irecv(recvList_XZ, recvCount_XZ,MPI_INT,rank_xz,recvtag,comm,&req2[10]); - MPI_Isend(sendList_XZ, sendCount_XZ,MPI_INT,rank_xz,sendtag,comm,&req1[11]); - MPI_Irecv(recvList_xz, recvCount_xz,MPI_INT,rank_XZ,recvtag,comm,&req2[11]); - MPI_Isend(sendList_Xz, sendCount_Xz,MPI_INT,rank_xZ,sendtag,comm,&req1[12]); - MPI_Irecv(recvList_xZ, recvCount_xZ,MPI_INT,rank_Xz,recvtag,comm,&req2[12]); - MPI_Isend(sendList_xZ, sendCount_xZ,MPI_INT,rank_Xz,sendtag,comm,&req1[13]); - MPI_Irecv(recvList_Xz, recvCount_Xz,MPI_INT,rank_xZ,recvtag,comm,&req2[13]); + req1[10] = comm.Isend(sendList_xz,sendCount_xz,rank_XZ,sendtag); + req2[10] = comm.Irecv(recvList_XZ,recvCount_XZ,rank_xz,recvtag); + req1[11] = comm.Isend(sendList_XZ,sendCount_XZ,rank_xz,sendtag); + req2[11] = comm.Irecv(recvList_xz,recvCount_xz,rank_XZ,recvtag); + req1[12] = comm.Isend(sendList_Xz,sendCount_Xz,rank_xZ,sendtag); + req2[12] = comm.Irecv(recvList_xZ,recvCount_xZ,rank_Xz,recvtag); + req1[13] = comm.Isend(sendList_xZ,sendCount_xZ,rank_Xz,sendtag); + req2[13] = comm.Irecv(recvList_Xz,recvCount_Xz,rank_xZ,recvtag); - MPI_Isend(sendList_yz, sendCount_yz,MPI_INT,rank_YZ,sendtag,comm,&req1[14]); - MPI_Irecv(recvList_YZ, recvCount_YZ,MPI_INT,rank_yz,recvtag,comm,&req2[14]); - MPI_Isend(sendList_YZ, sendCount_YZ,MPI_INT,rank_yz,sendtag,comm,&req1[15]); - MPI_Irecv(recvList_yz, recvCount_yz,MPI_INT,rank_YZ,recvtag,comm,&req2[15]); - MPI_Isend(sendList_Yz, sendCount_Yz,MPI_INT,rank_yZ,sendtag,comm,&req1[16]); - MPI_Irecv(recvList_yZ, recvCount_yZ,MPI_INT,rank_Yz,recvtag,comm,&req2[16]); - MPI_Isend(sendList_yZ, sendCount_yZ,MPI_INT,rank_Yz,sendtag,comm,&req1[17]); - MPI_Irecv(recvList_Yz, recvCount_Yz,MPI_INT,rank_yZ,recvtag,comm,&req2[17]); - MPI_Waitall(18,req1,stat1); - MPI_Waitall(18,req2,stat2); - MPI_Barrier(comm); + req1[14] = comm.Isend(sendList_yz,sendCount_yz,rank_YZ,sendtag); + req2[14] = comm.Irecv(recvList_YZ,recvCount_YZ,rank_yz,recvtag); + req1[15] = comm.Isend(sendList_YZ,sendCount_YZ,rank_yz,sendtag); + req2[15] = comm.Irecv(recvList_yz,recvCount_yz,rank_YZ,recvtag); + req1[16] = comm.Isend(sendList_Yz,sendCount_Yz,rank_yZ,sendtag); + req2[16] = comm.Irecv(recvList_yZ,recvCount_yZ,rank_Yz,recvtag); + req1[17] = comm.Isend(sendList_yZ,sendCount_yZ,rank_Yz,sendtag); + 
req2[17] = comm.Irecv(recvList_Yz,recvCount_Yz,rank_yZ,recvtag); + comm.waitAll(18,req1); + comm.waitAll(18,req2); + comm.barrier(); //...................................................................................... for (int idx=0; idxkeyExists( "GridFile" )){ // Read the local domain data - auto input_id = readMicroCT( *domain_db, MPI_COMM_WORLD ); + auto input_id = readMicroCT( *domain_db, comm ); // Fill the halo (assuming GCW of 1) array size0 = { (int) input_id.size(0), (int) input_id.size(1), (int) input_id.size(2) }; ArraySize size1 = { (size_t) Mask->Nx, (size_t) Mask->Ny, (size_t) Mask->Nz }; ASSERT( (int) size1[0] == size0[0]+2 && (int) size1[1] == size0[1]+2 && (int) size1[2] == size0[2]+2 ); - fillHalo fill( MPI_COMM_WORLD, Mask->rank_info, size0, { 1, 1, 1 }, 0, 1 ); + fillHalo fill( comm, Mask->rank_info, size0, { 1, 1, 1 }, 0, 1 ); Array id_view; id_view.viewRaw( size1, Mask->id ); fill.copy( input_id, id_view ); @@ -652,7 +652,7 @@ void ScaLBL_ColorModel::Run(){ double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); comm.barrier(); - starttime = MPI_Wtime(); + starttime = Utilities::MPI::time(); //......................................... //************ MAIN ITERATION LOOP ***************************************/ @@ -991,7 +991,7 @@ void ScaLBL_ColorModel::Run(){ //************************************************************************ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = MPI_Wtime(); + stoptime = Utilities::MPI::time(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/models/DFHModel.cpp b/models/DFHModel.cpp index ced5853f..9709b107 100644 --- a/models/DFHModel.cpp +++ b/models/DFHModel.cpp @@ -487,7 +487,7 @@ void ScaLBL_DFHModel::Run(){ double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); comm.barrier(); - starttime = MPI_Wtime(); + starttime = Utilities::MPI::time(); //......................................... 
//************ MAIN ITERATION LOOP ***************************************/ @@ -583,7 +583,7 @@ void ScaLBL_DFHModel::Run(){ //************************************************************************ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = MPI_Wtime(); + stoptime = Utilities::MPI::time(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/models/MRTModel.cpp b/models/MRTModel.cpp index d9b8069d..76d54571 100644 --- a/models/MRTModel.cpp +++ b/models/MRTModel.cpp @@ -208,7 +208,7 @@ void ScaLBL_MRTModel::Run(){ double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); comm.barrier(); - starttime = MPI_Wtime(); + starttime = Utilities::MPI::time(); if (rank==0) printf("Beginning AA timesteps, timestepMax = %i \n", timestepMax); if (rank==0) printf("********************************************************\n"); timestep=0; @@ -306,7 +306,7 @@ void ScaLBL_MRTModel::Run(){ } } //************************************************************************/ - stoptime = MPI_Wtime(); + stoptime = Utilities::MPI::time(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/tests/BlobAnalyzeParallel.cpp b/tests/BlobAnalyzeParallel.cpp index 48e9e230..773309f9 100644 --- a/tests/BlobAnalyzeParallel.cpp +++ b/tests/BlobAnalyzeParallel.cpp @@ -138,16 +138,16 @@ int main(int argc, char **argv) } comm.barrier(); // Computational domain - MPI_Bcast(&nx,1,MPI_INT,0,comm); - MPI_Bcast(&ny,1,MPI_INT,0,comm); - MPI_Bcast(&nz,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - MPI_Bcast(&nspheres,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&nx,1,0); + comm.bcast(&ny,1,0); + comm.bcast(&nz,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + comm.bcast(&nspheres,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. comm.barrier(); @@ -291,7 +291,7 @@ int main(int argc, char **argv) } Dm.CommInit(); // Initialize communications for domains - MPI_Allreduce(&sum,&sum_global,1,MPI_DOUBLE,MPI_SUM,comm); + sum_global = comm.sumReduce( sum ); porosity = sum_global/Dm.Volume; if (rank==0) printf("Porosity = %f \n",porosity); diff --git a/tests/GenerateSphereTest.cpp b/tests/GenerateSphereTest.cpp index 5886be21..0c84287e 100644 --- a/tests/GenerateSphereTest.cpp +++ b/tests/GenerateSphereTest.cpp @@ -213,42 +213,24 @@ inline void MorphOpen(DoubleArray SignDist, char *id, Domain &Dm, int nx, int ny PackID(Dm.sendList_yZ, Dm.sendCount_yZ ,sendID_yZ, id); PackID(Dm.sendList_YZ, Dm.sendCount_YZ ,sendID_YZ, id); //...................................................................................... 
- MPI_Sendrecv(sendID_x,Dm.sendCount_x,MPI_CHAR,Dm.rank_x(),sendtag, - recvID_X,Dm.recvCount_X,MPI_CHAR,Dm.rank_X(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_X,Dm.sendCount_X,MPI_CHAR,Dm.rank_X(),sendtag, - recvID_x,Dm.recvCount_x,MPI_CHAR,Dm.rank_x(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_y,Dm.sendCount_y,MPI_CHAR,Dm.rank_y(),sendtag, - recvID_Y,Dm.recvCount_Y,MPI_CHAR,Dm.rank_Y(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Y,Dm.sendCount_Y,MPI_CHAR,Dm.rank_Y(),sendtag, - recvID_y,Dm.recvCount_y,MPI_CHAR,Dm.rank_y(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_z,Dm.sendCount_z,MPI_CHAR,Dm.rank_z(),sendtag, - recvID_Z,Dm.recvCount_Z,MPI_CHAR,Dm.rank_Z(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Z,Dm.sendCount_Z,MPI_CHAR,Dm.rank_Z(),sendtag, - recvID_z,Dm.recvCount_z,MPI_CHAR,Dm.rank_z(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xy,Dm.sendCount_xy,MPI_CHAR,Dm.rank_xy(),sendtag, - recvID_XY,Dm.recvCount_XY,MPI_CHAR,Dm.rank_XY(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_XY,Dm.sendCount_XY,MPI_CHAR,Dm.rank_XY(),sendtag, - recvID_xy,Dm.recvCount_xy,MPI_CHAR,Dm.rank_xy(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Xy,Dm.sendCount_Xy,MPI_CHAR,Dm.rank_Xy(),sendtag, - recvID_xY,Dm.recvCount_xY,MPI_CHAR,Dm.rank_xY(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xY,Dm.sendCount_xY,MPI_CHAR,Dm.rank_xY(),sendtag, - recvID_Xy,Dm.recvCount_Xy,MPI_CHAR,Dm.rank_Xy(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xz,Dm.sendCount_xz,MPI_CHAR,Dm.rank_xz(),sendtag, - recvID_XZ,Dm.recvCount_XZ,MPI_CHAR,Dm.rank_XZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_XZ,Dm.sendCount_XZ,MPI_CHAR,Dm.rank_XZ(),sendtag, - recvID_xz,Dm.recvCount_xz,MPI_CHAR,Dm.rank_xz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Xz,Dm.sendCount_Xz,MPI_CHAR,Dm.rank_Xz(),sendtag, - recvID_xZ,Dm.recvCount_xZ,MPI_CHAR,Dm.rank_xZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xZ,Dm.sendCount_xZ,MPI_CHAR,Dm.rank_xZ(),sendtag, - recvID_Xz,Dm.recvCount_Xz,MPI_CHAR,Dm.rank_Xz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_yz,Dm.sendCount_yz,MPI_CHAR,Dm.rank_yz(),sendtag, - recvID_YZ,Dm.recvCount_YZ,MPI_CHAR,Dm.rank_YZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_YZ,Dm.sendCount_YZ,MPI_CHAR,Dm.rank_YZ(),sendtag, - recvID_yz,Dm.recvCount_yz,MPI_CHAR,Dm.rank_yz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Yz,Dm.sendCount_Yz,MPI_CHAR,Dm.rank_Yz(),sendtag, - recvID_yZ,Dm.recvCount_yZ,MPI_CHAR,Dm.rank_yZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_yZ,Dm.sendCount_yZ,MPI_CHAR,Dm.rank_yZ(),sendtag, - recvID_Yz,Dm.recvCount_Yz,MPI_CHAR,Dm.rank_Yz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + Dm.Comm.sendrecv(sendID_x,Dm.sendCount_x,Dm.rank_x(),sendtag,recvID_X,Dm.recvCount_X,Dm.rank_X(),recvtag); + Dm.Comm.sendrecv(sendID_X,Dm.sendCount_X,Dm.rank_X(),sendtag,recvID_x,Dm.recvCount_x,Dm.rank_x(),recvtag); + Dm.Comm.sendrecv(sendID_y,Dm.sendCount_y,Dm.rank_y(),sendtag,recvID_Y,Dm.recvCount_Y,Dm.rank_Y(),recvtag); + 
Dm.Comm.sendrecv(sendID_Y,Dm.sendCount_Y,Dm.rank_Y(),sendtag,recvID_y,Dm.recvCount_y,Dm.rank_y(),recvtag); + Dm.Comm.sendrecv(sendID_z,Dm.sendCount_z,Dm.rank_z(),sendtag,recvID_Z,Dm.recvCount_Z,Dm.rank_Z(),recvtag); + Dm.Comm.sendrecv(sendID_Z,Dm.sendCount_Z,Dm.rank_Z(),sendtag,recvID_z,Dm.recvCount_z,Dm.rank_z(),recvtag); + Dm.Comm.sendrecv(sendID_xy,Dm.sendCount_xy,Dm.rank_xy(),sendtag,recvID_XY,Dm.recvCount_XY,Dm.rank_XY(),recvtag); + Dm.Comm.sendrecv(sendID_XY,Dm.sendCount_XY,Dm.rank_XY(),sendtag,recvID_xy,Dm.recvCount_xy,Dm.rank_xy(),recvtag); + Dm.Comm.sendrecv(sendID_Xy,Dm.sendCount_Xy,Dm.rank_Xy(),sendtag,recvID_xY,Dm.recvCount_xY,Dm.rank_xY(),recvtag); + Dm.Comm.sendrecv(sendID_xY,Dm.sendCount_xY,Dm.rank_xY(),sendtag,recvID_Xy,Dm.recvCount_Xy,Dm.rank_Xy(),recvtag); + Dm.Comm.sendrecv(sendID_xz,Dm.sendCount_xz,Dm.rank_xz(),sendtag,recvID_XZ,Dm.recvCount_XZ,Dm.rank_XZ(),recvtag); + Dm.Comm.sendrecv(sendID_XZ,Dm.sendCount_XZ,Dm.rank_XZ(),sendtag,recvID_xz,Dm.recvCount_xz,Dm.rank_xz(),recvtag); + Dm.Comm.sendrecv(sendID_Xz,Dm.sendCount_Xz,Dm.rank_Xz(),sendtag,recvID_xZ,Dm.recvCount_xZ,Dm.rank_xZ(),recvtag); + Dm.Comm.sendrecv(sendID_xZ,Dm.sendCount_xZ,Dm.rank_xZ(),sendtag,recvID_Xz,Dm.recvCount_Xz,Dm.rank_Xz(),recvtag); + Dm.Comm.sendrecv(sendID_yz,Dm.sendCount_yz,Dm.rank_yz(),sendtag,recvID_YZ,Dm.recvCount_YZ,Dm.rank_YZ(),recvtag); + Dm.Comm.sendrecv(sendID_YZ,Dm.sendCount_YZ,Dm.rank_YZ(),sendtag,recvID_yz,Dm.recvCount_yz,Dm.rank_yz(),recvtag); + Dm.Comm.sendrecv(sendID_Yz,Dm.sendCount_Yz,Dm.rank_Yz(),sendtag,recvID_yZ,Dm.recvCount_yZ,Dm.rank_yZ(),recvtag); + Dm.Comm.sendrecv(sendID_yZ,Dm.sendCount_yZ,Dm.rank_yZ(),sendtag,recvID_Yz,Dm.recvCount_Yz,Dm.rank_Yz(),recvtag); //...................................................................................... UnpackID(Dm.recvList_x, Dm.recvCount_x ,recvID_x, id); UnpackID(Dm.recvList_X, Dm.recvCount_X ,recvID_X, id); diff --git a/tests/TestBlobAnalyze.cpp b/tests/TestBlobAnalyze.cpp index 63d928c1..19360fe3 100644 --- a/tests/TestBlobAnalyze.cpp +++ b/tests/TestBlobAnalyze.cpp @@ -190,16 +190,16 @@ int main(int argc, char **argv) } comm.barrier(); // Computational domain - MPI_Bcast(&nx,1,MPI_INT,0,comm); - MPI_Bcast(&ny,1,MPI_INT,0,comm); - MPI_Bcast(&nz,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - MPI_Bcast(&nspheres,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&nx,1,0); + comm.bcast(&ny,1,0); + comm.bcast(&nz,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + comm.bcast(&nspheres,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. comm.barrier(); @@ -255,10 +255,10 @@ int main(int argc, char **argv) comm.barrier(); // Broadcast the sphere packing to all processes - MPI_Bcast(cx,nspheres,MPI_DOUBLE,0,comm); - MPI_Bcast(cy,nspheres,MPI_DOUBLE,0,comm); - MPI_Bcast(cz,nspheres,MPI_DOUBLE,0,comm); - MPI_Bcast(rad,nspheres,MPI_DOUBLE,0,comm); + comm.bcast(cx,nspheres,0); + comm.bcast(cy,nspheres,0); + comm.bcast(cz,nspheres,0); + comm.bcast(rad,nspheres,0); //........................................................................... comm.barrier(); //....................................................................... 
diff --git a/tests/TestBubble.cpp b/tests/TestBubble.cpp index e7e0ced8..6eb74b37 100644 --- a/tests/TestBubble.cpp +++ b/tests/TestBubble.cpp @@ -45,7 +45,6 @@ int main(int argc, char **argv) int nprocx,nprocy,nprocz; MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; if (rank == 0){ printf("********************************************************\n"); @@ -434,7 +433,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; comm.barrier(); - starttime = MPI_Wtime(); + starttime = Utilities::MPI::time(); //......................................... //........................................................................... // MAIN VARIABLES INITIALIZED HERE @@ -809,25 +808,25 @@ int main(int argc, char **argv) } //........................................................................... comm.barrier(); - MPI_Allreduce(&nwp_volume,&nwp_volume_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&awn,&awn_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&ans,&ans_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&aws,&aws_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&lwns,&lwns_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&As,&As_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&Jwn,&Jwn_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&efawns,&efawns_global,1,MPI_DOUBLE,MPI_SUM,comm); + nwp_volume_global = comm.sumReduce( nwp_volume ); + awn_global = comm.sumReduce( awn ); + ans_global = comm.sumReduce( ans ); + aws_global = comm.sumReduce( aws ); + lwns_global = comm.sumReduce( lwns ); + As_global = comm.sumReduce( As ); + Jwn_global = comm.sumReduce( Jwn ); + efawns_global = comm.sumReduce( efawns ); // Phase averages - MPI_Allreduce(&vol_w,&vol_w_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&vol_n,&vol_n_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&paw,&paw_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&pan,&pan_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&vaw(0),&vaw_global(0),3,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&van(0),&van_global(0),3,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&vawn(0),&vawn_global(0),3,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&Gwn(0),&Gwn_global(0),6,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&Gns(0),&Gns_global(0),6,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&Gws(0),&Gws_global(0),6,MPI_DOUBLE,MPI_SUM,comm); + vol_w_global = comm.sumReduce( vol_w ); + vol_n_global = comm.sumReduce( vol_n ); + paw_global = comm.sumReduce( paw ); + pan_global = comm.sumReduce( pan ); + vaw_global(0) = comm.sumReduce( vaw(0) ); + van_global(0) = comm.sumReduce( van(0) ); + vawn_global(0) = comm.sumReduce( vawn(0) ); + Gwn_global(0) = comm.sumReduce( Gwn(0) ); + Gns_global(0) = comm.sumReduce( Gns(0) ); + Gws_global(0) = comm.sumReduce( Gws(0) ); comm.barrier(); //......................................................................... 
// Compute the change in the total surface energy based on the defined interval @@ -952,7 +951,7 @@ int main(int argc, char **argv) //************************************************************************/ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = MPI_Wtime(); + stoptime = Utilities::MPI::time(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/tests/TestBubbleDFH.cpp b/tests/TestBubbleDFH.cpp index 7f5d0047..8b4f1a9b 100644 --- a/tests/TestBubbleDFH.cpp +++ b/tests/TestBubbleDFH.cpp @@ -387,7 +387,7 @@ int main(int argc, char **argv) double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); comm.barrier(); - starttime = MPI_Wtime(); + starttime = Utilities::MPI::time(); //......................................... err = 1.0; @@ -487,7 +487,7 @@ int main(int argc, char **argv) //************************************************************************ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = MPI_Wtime(); + stoptime = Utilities::MPI::time(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/tests/TestColorGrad.cpp b/tests/TestColorGrad.cpp index df1c1daf..2566f8c0 100644 --- a/tests/TestColorGrad.cpp +++ b/tests/TestColorGrad.cpp @@ -114,16 +114,16 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); //................................................. - MPI_Bcast(&Nx,1,MPI_INT,0,comm); - MPI_Bcast(&Ny,1,MPI_INT,0,comm); - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - MPI_Bcast(&nspheres,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&Nx,1,0); + comm.bcast(&Ny,1,0); + comm.bcast(&Nz,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + comm.bcast(&nspheres,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. comm.barrier(); // ************************************************************** diff --git a/tests/TestCommD3Q19.cpp b/tests/TestCommD3Q19.cpp index d2799355..c4a045ae 100644 --- a/tests/TestCommD3Q19.cpp +++ b/tests/TestCommD3Q19.cpp @@ -378,7 +378,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; comm.barrier(); - starttime = MPI_Wtime(); + starttime = Utilities::MPI::time(); //......................................... @@ -403,7 +403,7 @@ int main(int argc, char **argv) //................................................................... 
} //************************************************************************/ - stoptime = MPI_Wtime(); + stoptime = Utilities::MPI::time(); // cout << "CPU time: " << (stoptime - starttime) << " seconds" << endl; cputime = stoptime - starttime; // cout << "Lattice update rate: "<< double(Nx*Ny*Nz*timestep)/cputime/1000000 << " MLUPS" << endl; diff --git a/tests/TestForceD3Q19.cpp b/tests/TestForceD3Q19.cpp index f8569624..31151584 100644 --- a/tests/TestForceD3Q19.cpp +++ b/tests/TestForceD3Q19.cpp @@ -450,7 +450,7 @@ int main (int argc, char **argv) for (int i=0; iSendD3Q19(dist, &dist[10*Np]); @@ -244,7 +244,7 @@ int main(int argc, char **argv) //************************************************************************/ - stoptime = MPI_Wtime(); + stoptime = Utilities::MPI::time(); // cout << "CPU time: " << (stoptime - starttime) << " seconds" << endl; cputime = stoptime - starttime; // cout << "Lattice update rate: "<< double(Nx*Ny*Nz*timestep)/cputime/1000000 << " MLUPS" << endl; diff --git a/tests/TestMRT.cpp b/tests/TestMRT.cpp index 5f2c4449..e4acba99 100644 --- a/tests/TestMRT.cpp +++ b/tests/TestMRT.cpp @@ -580,16 +580,16 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); //................................................. - MPI_Bcast(&Nx,1,MPI_INT,0,comm); - MPI_Bcast(&Ny,1,MPI_INT,0,comm); - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - MPI_Bcast(&nspheres,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&Nx,1,0); + comm.bcast(&Ny,1,0); + comm.bcast(&Nz,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + comm.bcast(&nspheres,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. 
comm.barrier(); // ************************************************************** @@ -668,7 +668,7 @@ int main(int argc, char **argv) } } comm.barrier(); - MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,comm); + sum = comm.sumReduce( sum_local ); porosity = sum*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); @@ -731,7 +731,7 @@ int main(int argc, char **argv) double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); comm.barrier(); - starttime = MPI_Wtime(); + starttime = Utilities::MPI::time(); while (timestep < timesteps) { @@ -752,7 +752,7 @@ int main(int argc, char **argv) } //************************************************************************/ - stoptime = MPI_Wtime(); + stoptime = Utilities::MPI::time(); // cout << "CPU time: " << (stoptime - starttime) << " seconds" << endl; cputime = stoptime - starttime; // cout << "Lattice update rate: "<< double(Nx*Ny*Nz*timestep)/cputime/1000000 << " MLUPS" << endl; @@ -795,7 +795,7 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,comm); + sum = comm.sumReduce( sum_local ); double PoreVel = sum*iVol_global; if (rank==0) printf("Velocity = %f \n",PoreVel); diff --git a/tests/TestMicroCTReader.cpp b/tests/TestMicroCTReader.cpp index 9a54610c..52a5b9d3 100644 --- a/tests/TestMicroCTReader.cpp +++ b/tests/TestMicroCTReader.cpp @@ -62,7 +62,6 @@ int main(int argc, char **argv) int N_errors = ut.NumFailGlobal(); // Close MPI - MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); return N_errors; } diff --git a/tests/TestMomentsD3Q19.cpp b/tests/TestMomentsD3Q19.cpp index 6bd3e8ff..2660ed26 100644 --- a/tests/TestMomentsD3Q19.cpp +++ b/tests/TestMomentsD3Q19.cpp @@ -539,7 +539,7 @@ int main (int argc, char **argv) error=count; // Finished - MPI_Barrier(MPI_COMM_WORLD); + comm.barrier(); MPI_Finalize(); return error; } diff --git a/tests/TestNetcdf.cpp b/tests/TestNetcdf.cpp index 8768c9ea..3d0498d2 100644 --- a/tests/TestNetcdf.cpp +++ b/tests/TestNetcdf.cpp @@ -116,7 +116,7 @@ int main(int argc, char **argv) PROFILE_SAVE("TestNetcdf"); // Close MPI - MPI_Barrier(MPI_COMM_WORLD); + comm.barrier(); MPI_Finalize(); return N_errors; } diff --git a/tests/TestSegDist.cpp b/tests/TestSegDist.cpp index b5e23ec8..ecb6d6b9 100644 --- a/tests/TestSegDist.cpp +++ b/tests/TestSegDist.cpp @@ -100,10 +100,10 @@ int main(int argc, char **argv) comm.barrier(); if (rank==0) printf("Initialized! Converting to Signed Distance function \n"); - double t1 = MPI_Wtime(); + double t1 = Utilities::MPI::time(); DoubleArray Distance(nx,ny,nz); CalcDist(Distance,id,Dm,{false,false,false}); - double t2 = MPI_Wtime(); + double t2 = Utilities::MPI::time(); if (rank==0) printf("Total time: %f seconds \n",t2-t1); diff --git a/tests/lb2_CMT_wia.cpp b/tests/lb2_CMT_wia.cpp index 820428a3..389bc8a8 100644 --- a/tests/lb2_CMT_wia.cpp +++ b/tests/lb2_CMT_wia.cpp @@ -292,18 +292,18 @@ int main(int argc, char **argv) //................................................................................... 
// Send all the distributions - MPI_Isend(sendbuf_x, 2*sendCount_x,MPI_DOUBLE,rank_x,sendtag,comm,&req1[0]); - MPI_Irecv(recvbuf_X, 2*recvCount_X,MPI_DOUBLE,rank_X,recvtag,comm,&req2[0]); - MPI_Isend(sendbuf_X, 2*sendCount_X,MPI_DOUBLE,rank_X,sendtag,comm,&req1[1]); - MPI_Irecv(recvbuf_x, 2*recvCount_x,MPI_DOUBLE,rank_x,recvtag,comm,&req2[1]); - MPI_Isend(sendbuf_y, 2*sendCount_y,MPI_DOUBLE,rank_y,sendtag,comm,&req1[2]); - MPI_Irecv(recvbuf_Y, 2*recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,comm,&req2[2]); - MPI_Isend(sendbuf_Y, 2*sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,comm,&req1[3]); - MPI_Irecv(recvbuf_y, 2*recvCount_y,MPI_DOUBLE,rank_y,recvtag,comm,&req2[3]); - MPI_Isend(sendbuf_z, 2*sendCount_z,MPI_DOUBLE,rank_z,sendtag,comm,&req1[4]); - MPI_Irecv(recvbuf_Z, 2*recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,comm,&req2[4]); - MPI_Isend(sendbuf_Z, 2*sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,comm,&req1[5]); - MPI_Irecv(recvbuf_z, 2*recvCount_z,MPI_DOUBLE,rank_z,recvtag,comm,&req2[5]); + req1[0] = comm.Isend(sendbuf_x,2*sendCount_x,rank_x,sendtag); + req2[0] = comm.Irecv(recvbuf_X,2*recvCount_X,rank_X,recvtag); + req1[1] = comm.Isend(sendbuf_X,2*sendCount_X,rank_X,sendtag); + req2[1] = comm.Irecv(recvbuf_x,2*recvCount_x,rank_x,recvtag); + req1[2] = comm.Isend(sendbuf_y,2*sendCount_y,rank_y,sendtag); + req2[2] = comm.Irecv(recvbuf_Y,2*recvCount_Y,rank_Y,recvtag); + req1[3] = comm.Isend(sendbuf_Y,2*sendCount_Y,rank_Y,sendtag); + req2[3] = comm.Irecv(recvbuf_y,2*recvCount_y,rank_y,recvtag); + req1[4] = comm.Isend(sendbuf_z,2*sendCount_z,rank_z,sendtag); + req2[4] = comm.Irecv(recvbuf_Z,2*recvCount_Z,rank_Z,recvtag); + req1[5] = comm.Isend(sendbuf_Z,2*sendCount_Z,rank_Z,sendtag); + req2[5] = comm.Irecv(recvbuf_z,2*recvCount_z,rank_z,recvtag); */ //................................................................................... ScaLBL_D3Q7_Swap(ID, &packed_even[0], &packed_odd[0], Nx, Ny, Nz); @@ -311,8 +311,8 @@ int main(int argc, char **argv) /* //................................................................................... // Wait for completion of D3Q19 communication - MPI_Waitall(6,req1,stat1); - MPI_Waitall(6,req2,stat2); + comm.waitAll(6,req1); + comm.waitAll(6,req2); //................................................................................... // Unpack the distributions on the device //................................................................................... @@ -358,7 +358,7 @@ int main(int argc, char **argv) fclose(PHASE); // Close MPI - MPI_Barrier(MPI_COMM_WORLD); + comm.barrier(); MPI_Finalize(); return 0; } diff --git a/tests/lb2_Color_blob_wia_mpi.cpp b/tests/lb2_Color_blob_wia_mpi.cpp index 70342176..e3323612 100644 --- a/tests/lb2_Color_blob_wia_mpi.cpp +++ b/tests/lb2_Color_blob_wia_mpi.cpp @@ -114,7 +114,6 @@ int main(int argc, char **argv) int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; if (rank == 0){ printf("********************************************************\n"); @@ -207,36 +206,36 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); //................................................. 
- MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm);
- MPI_Bcast(&alpha,1,MPI_DOUBLE,0,comm);
- MPI_Bcast(&beta,1,MPI_DOUBLE,0,comm);
- MPI_Bcast(&das,1,MPI_DOUBLE,0,comm);
- MPI_Bcast(&dbs,1,MPI_DOUBLE,0,comm);
- MPI_Bcast(&phi_s,1,MPI_DOUBLE,0,comm);
- MPI_Bcast(&wp_saturation,1,MPI_DOUBLE,0,comm);
- MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm);
- MPI_Bcast(&Restart,1,MPI_LOGICAL,0,comm);
- MPI_Bcast(&din,1,MPI_DOUBLE,0,comm);
- MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm);
- MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm);
- MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm);
- MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm);
- MPI_Bcast(&timestepMax,1,MPI_INT,0,comm);
- MPI_Bcast(&interval,1,MPI_INT,0,comm);
- MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm);
+ comm.bcast(&tau,1,0);
+ comm.bcast(&alpha,1,0);
+ comm.bcast(&beta,1,0);
+ comm.bcast(&das,1,0);
+ comm.bcast(&dbs,1,0);
+ comm.bcast(&phi_s,1,0);
+ comm.bcast(&wp_saturation,1,0);
+ comm.bcast(&pBC,1,0);
+ comm.bcast(&Restart,1,0);
+ comm.bcast(&din,1,0);
+ comm.bcast(&dout,1,0);
+ comm.bcast(&Fx,1,0);
+ comm.bcast(&Fy,1,0);
+ comm.bcast(&Fz,1,0);
+ comm.bcast(&timestepMax,1,0);
+ comm.bcast(&interval,1,0);
+ comm.bcast(&tol,1,0);
// Computational domain
- MPI_Bcast(&Nx,1,MPI_INT,0,comm);
- MPI_Bcast(&Ny,1,MPI_INT,0,comm);
- MPI_Bcast(&Nz,1,MPI_INT,0,comm);
-// MPI_Bcast(&nBlocks,1,MPI_INT,0,comm);
-// MPI_Bcast(&nthreads,1,MPI_INT,0,comm);
- MPI_Bcast(&nprocx,1,MPI_INT,0,comm);
- MPI_Bcast(&nprocy,1,MPI_INT,0,comm);
- MPI_Bcast(&nprocz,1,MPI_INT,0,comm);
- MPI_Bcast(&nspheres,1,MPI_INT,0,comm);
- MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm);
- MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm);
- MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm);
+ comm.bcast(&Nx,1,0);
+ comm.bcast(&Ny,1,0);
+ comm.bcast(&Nz,1,0);
+// comm.bcast(&nBlocks,1,0);
+// comm.bcast(&nthreads,1,0);
+ comm.bcast(&nprocx,1,0);
+ comm.bcast(&nprocy,1,0);
+ comm.bcast(&nprocz,1,0);
+ comm.bcast(&nspheres,1,0);
+ comm.bcast(&Lx,1,0);
+ comm.bcast(&Ly,1,0);
+ comm.bcast(&Lz,1,0);
//.................................................
comm.barrier();
@@ -399,10 +398,10 @@ int main(int argc, char **argv)
if (rank == 0) ReadSpherePacking(nspheres,cx,cy,cz,rad);
comm.barrier();
// Broadcast the sphere packing to all processes
- MPI_Bcast(cx,nspheres,MPI_DOUBLE,0,comm);
- MPI_Bcast(cy,nspheres,MPI_DOUBLE,0,comm);
- MPI_Bcast(cz,nspheres,MPI_DOUBLE,0,comm);
- MPI_Bcast(rad,nspheres,MPI_DOUBLE,0,comm);
+ comm.bcast(cx,nspheres,0);
+ comm.bcast(cy,nspheres,0);
+ comm.bcast(cz,nspheres,0);
+ comm.bcast(rad,nspheres,0);
//...........................................................................
comm.barrier();
if (rank == 0) cout << "Domain set." << endl;
@@ -418,7 +417,7 @@ int main(int argc, char **argv)
D = 6.0*(Nx-2)*nprocx*totVol / totArea / Lx;
printf("Sauter Mean Diameter (computed from sphere packing) = %f \n ",D);
}
- MPI_Bcast(&D,1,MPI_DOUBLE,0,comm);
+ comm.bcast(&D,1,0);
//.......................................................................
// sprintf(LocalRankString,"%05d",rank);
@@ -478,7 +477,7 @@ int main(int argc, char **argv)
id[(Nz-1)*Nx*Ny] = id[(Nz-1)*Nx*Ny+Nx-1] = id[(Nz-1)*Nx*Ny+(Ny-1)*Nx] = id[(Nz-1)*Nx*Ny+(Ny-1)*Nx + Nx-1] = 0;
//.........................................................
sum_local = 1.0*sum; - MPI_Allreduce(&sum_local,&porosity,1,MPI_DOUBLE,MPI_SUM,comm); + porosity = comm.sumReduce( sum_local ); porosity = porosity*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); @@ -886,42 +885,24 @@ int main(int argc, char **argv) PackID(sendList_yZ, sendCount_yZ ,sendID_yZ, id); PackID(sendList_YZ, sendCount_YZ ,sendID_YZ, id); //...................................................................................... - MPI_Sendrecv(sendID_x,sendCount_x,MPI_CHAR,rank_x,sendtag, - recvID_X,recvCount_X,MPI_CHAR,rank_X,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_X,sendCount_X,MPI_CHAR,rank_X,sendtag, - recvID_x,recvCount_x,MPI_CHAR,rank_x,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_y,sendCount_y,MPI_CHAR,rank_y,sendtag, - recvID_Y,recvCount_Y,MPI_CHAR,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Y,sendCount_Y,MPI_CHAR,rank_Y,sendtag, - recvID_y,recvCount_y,MPI_CHAR,rank_y,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_z,sendCount_z,MPI_CHAR,rank_z,sendtag, - recvID_Z,recvCount_Z,MPI_CHAR,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Z,sendCount_Z,MPI_CHAR,rank_Z,sendtag, - recvID_z,recvCount_z,MPI_CHAR,rank_z,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xy,sendCount_xy,MPI_CHAR,rank_xy,sendtag, - recvID_XY,recvCount_XY,MPI_CHAR,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_XY,sendCount_XY,MPI_CHAR,rank_XY,sendtag, - recvID_xy,recvCount_xy,MPI_CHAR,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Xy,sendCount_Xy,MPI_CHAR,rank_Xy,sendtag, - recvID_xY,recvCount_xY,MPI_CHAR,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xY,sendCount_xY,MPI_CHAR,rank_xY,sendtag, - recvID_Xy,recvCount_Xy,MPI_CHAR,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xz,sendCount_xz,MPI_CHAR,rank_xz,sendtag, - recvID_XZ,recvCount_XZ,MPI_CHAR,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_XZ,sendCount_XZ,MPI_CHAR,rank_XZ,sendtag, - recvID_xz,recvCount_xz,MPI_CHAR,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Xz,sendCount_Xz,MPI_CHAR,rank_Xz,sendtag, - recvID_xZ,recvCount_xZ,MPI_CHAR,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xZ,sendCount_xZ,MPI_CHAR,rank_xZ,sendtag, - recvID_Xz,recvCount_Xz,MPI_CHAR,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_yz,sendCount_yz,MPI_CHAR,rank_yz,sendtag, - recvID_YZ,recvCount_YZ,MPI_CHAR,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_YZ,sendCount_YZ,MPI_CHAR,rank_YZ,sendtag, - recvID_yz,recvCount_yz,MPI_CHAR,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Yz,sendCount_Yz,MPI_CHAR,rank_Yz,sendtag, - recvID_yZ,recvCount_yZ,MPI_CHAR,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_yZ,sendCount_yZ,MPI_CHAR,rank_yZ,sendtag, - recvID_Yz,recvCount_Yz,MPI_CHAR,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); + comm.sendrecv(sendID_x,sendCount_x,rank_x,sendtag,recvID_X,recvCount_X,rank_X,recvtag); + comm.sendrecv(sendID_X,sendCount_X,rank_X,sendtag,recvID_x,recvCount_x,rank_x,recvtag); + comm.sendrecv(sendID_y,sendCount_y,rank_y,sendtag,recvID_Y,recvCount_Y,rank_Y,recvtag); + comm.sendrecv(sendID_Y,sendCount_Y,rank_Y,sendtag,recvID_y,recvCount_y,rank_y,recvtag); + comm.sendrecv(sendID_z,sendCount_z,rank_z,sendtag,recvID_Z,recvCount_Z,rank_Z,recvtag); + comm.sendrecv(sendID_Z,sendCount_Z,rank_Z,sendtag,recvID_z,recvCount_z,rank_z,recvtag); + 
comm.sendrecv(sendID_xy,sendCount_xy,rank_xy,sendtag,recvID_XY,recvCount_XY,rank_XY,recvtag); + comm.sendrecv(sendID_XY,sendCount_XY,rank_XY,sendtag,recvID_xy,recvCount_xy,rank_xy,recvtag); + comm.sendrecv(sendID_Xy,sendCount_Xy,rank_Xy,sendtag,recvID_xY,recvCount_xY,rank_xY,recvtag); + comm.sendrecv(sendID_xY,sendCount_xY,rank_xY,sendtag,recvID_Xy,recvCount_Xy,rank_Xy,recvtag); + comm.sendrecv(sendID_xz,sendCount_xz,rank_xz,sendtag,recvID_XZ,recvCount_XZ,rank_XZ,recvtag); + comm.sendrecv(sendID_XZ,sendCount_XZ,rank_XZ,sendtag,recvID_xz,recvCount_xz,rank_xz,recvtag); + comm.sendrecv(sendID_Xz,sendCount_Xz,rank_Xz,sendtag,recvID_xZ,recvCount_xZ,rank_xZ,recvtag); + comm.sendrecv(sendID_xZ,sendCount_xZ,rank_xZ,sendtag,recvID_Xz,recvCount_Xz,rank_Xz,recvtag); + comm.sendrecv(sendID_yz,sendCount_yz,rank_yz,sendtag,recvID_YZ,recvCount_YZ,rank_YZ,recvtag); + comm.sendrecv(sendID_YZ,sendCount_YZ,rank_YZ,sendtag,recvID_yz,recvCount_yz,rank_yz,recvtag); + comm.sendrecv(sendID_Yz,sendCount_Yz,rank_Yz,sendtag,recvID_yZ,recvCount_yZ,rank_yZ,recvtag); + comm.sendrecv(sendID_yZ,sendCount_yZ,rank_yZ,sendtag,recvID_Yz,recvCount_Yz,rank_Yz,recvtag); //...................................................................................... UnpackID(recvList_x, recvCount_x ,recvID_x, id); UnpackID(recvList_X, recvCount_X ,recvID_X, id); @@ -1380,48 +1361,48 @@ int main(int argc, char **argv) //................................................................................... // Send / Recv all the phase indcator field values //................................................................................... - MPI_Isend(sendbuf_x, sendCount_x,MPI_DOUBLE,rank_x,sendtag,comm,&req1[0]); - MPI_Irecv(recvbuf_X, recvCount_X,MPI_DOUBLE,rank_X,recvtag,comm,&req2[0]); - MPI_Isend(sendbuf_X, sendCount_X,MPI_DOUBLE,rank_X,sendtag,comm,&req1[1]); - MPI_Irecv(recvbuf_x, recvCount_x,MPI_DOUBLE,rank_x,recvtag,comm,&req2[1]); - MPI_Isend(sendbuf_y, sendCount_y,MPI_DOUBLE,rank_y,sendtag,comm,&req1[2]); - MPI_Irecv(recvbuf_Y, recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,comm,&req2[2]); - MPI_Isend(sendbuf_Y, sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,comm,&req1[3]); - MPI_Irecv(recvbuf_y, recvCount_y,MPI_DOUBLE,rank_y,recvtag,comm,&req2[3]); - MPI_Isend(sendbuf_z, sendCount_z,MPI_DOUBLE,rank_z,sendtag,comm,&req1[4]); - MPI_Irecv(recvbuf_Z, recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,comm,&req2[4]); - MPI_Isend(sendbuf_Z, sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,comm,&req1[5]); - MPI_Irecv(recvbuf_z, recvCount_z,MPI_DOUBLE,rank_z,recvtag,comm,&req2[5]); - MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_xy,sendtag,comm,&req1[6]); - MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,comm,&req2[6]); - MPI_Isend(sendbuf_XY, sendCount_XY,MPI_DOUBLE,rank_XY,sendtag,comm,&req1[7]); - MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,comm,&req2[7]); - MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_Xy,sendtag,comm,&req1[8]); - MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,comm,&req2[8]); - MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_xY,sendtag,comm,&req1[9]); - MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,comm,&req2[9]); - MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_xz,sendtag,comm,&req1[10]); - MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,comm,&req2[10]); - MPI_Isend(sendbuf_XZ, sendCount_XZ,MPI_DOUBLE,rank_XZ,sendtag,comm,&req1[11]); - MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,comm,&req2[11]); - MPI_Isend(sendbuf_Xz, 
sendCount_Xz,MPI_DOUBLE,rank_Xz,sendtag,comm,&req1[12]); - MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,comm,&req2[12]); - MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_xZ,sendtag,comm,&req1[13]); - MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,comm,&req2[13]); - MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_yz,sendtag,comm,&req1[14]); - MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,comm,&req2[14]); - MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_YZ,sendtag,comm,&req1[15]); - MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,comm,&req2[15]); - MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_Yz,sendtag,comm,&req1[16]); - MPI_Irecv(recvbuf_yZ, recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,comm,&req2[16]); - MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_yZ,sendtag,comm,&req1[17]); - MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,comm,&req2[17]); + req1[0] = comm.Isend(sendbuf_x, sendCount_x,rank_x,sendtag); + req2[0] = comm.Irecv(recvbuf_X, recvCount_X,rank_X,recvtag); + req1[1] = comm.Isend(sendbuf_X, sendCount_X,rank_X,sendtag); + req2[1] = comm.Irecv(recvbuf_x, recvCount_x,rank_x,recvtag); + req1[2] = comm.Isend(sendbuf_y, sendCount_y,rank_y,sendtag); + req2[2] = comm.Irecv(recvbuf_Y, recvCount_Y,rank_Y,recvtag); + req1[3] = comm.Isend(sendbuf_Y, sendCount_Y,rank_Y,sendtag); + req2[3] = comm.Irecv(recvbuf_y, recvCount_y,rank_y,recvtag); + req1[4] = comm.Isend(sendbuf_z, sendCount_z,rank_z,sendtag); + req2[4] = comm.Irecv(recvbuf_Z, recvCount_Z,rank_Z,recvtag); + req1[5] = comm.Isend(sendbuf_Z, sendCount_Z,rank_Z,sendtag); + req2[5] = comm.Irecv(recvbuf_z, recvCount_z,rank_z,recvtag); + req1[6] = comm.Isend(sendbuf_xy, sendCount_xy,rank_xy,sendtag); + req2[6] = comm.Irecv(recvbuf_XY, recvCount_XY,rank_XY,recvtag); + req1[7] = comm.Isend(sendbuf_XY, sendCount_XY,rank_XY,sendtag); + req2[7] = comm.Irecv(recvbuf_xy, recvCount_xy,rank_xy,recvtag); + req1[8] = comm.Isend(sendbuf_Xy, sendCount_Xy,rank_Xy,sendtag); + req2[8] = comm.Irecv(recvbuf_xY, recvCount_xY,rank_xY,recvtag); + req1[9] = comm.Isend(sendbuf_xY, sendCount_xY,rank_xY,sendtag); + req2[9] = comm.Irecv(recvbuf_Xy, recvCount_Xy,rank_Xy,recvtag); + req1[10] = comm.Isend(sendbuf_xz, sendCount_xz,rank_xz,sendtag); + req2[10] = comm.Irecv(recvbuf_XZ, recvCount_XZ,rank_XZ,recvtag); + req1[11] = comm.Isend(sendbuf_XZ, sendCount_XZ,rank_XZ,sendtag); + req2[11] = comm.Irecv(recvbuf_xz, recvCount_xz,rank_xz,recvtag); + req1[12] = comm.Isend(sendbuf_Xz, sendCount_Xz,rank_Xz,sendtag); + req2[12] = comm.Irecv(recvbuf_xZ, recvCount_xZ,rank_xZ,recvtag); + req1[13] = comm.Isend(sendbuf_xZ, sendCount_xZ,rank_xZ,sendtag); + req2[13] = comm.Irecv(recvbuf_Xz, recvCount_Xz,rank_Xz,recvtag); + req1[14] = comm.Isend(sendbuf_yz, sendCount_yz,rank_yz,sendtag); + req2[14] = comm.Irecv(recvbuf_YZ, recvCount_YZ,rank_YZ,recvtag); + req1[15] = comm.Isend(sendbuf_YZ, sendCount_YZ,rank_YZ,sendtag); + req2[15] = comm.Irecv(recvbuf_yz, recvCount_yz,rank_yz,recvtag); + req1[16] = comm.Isend(sendbuf_Yz, sendCount_Yz,rank_Yz,sendtag); + req2[16] = comm.Irecv(recvbuf_yZ, recvCount_yZ,rank_yZ,recvtag); + req1[17] = comm.Isend(sendbuf_yZ, sendCount_yZ,rank_yZ,sendtag); + req2[17] = comm.Irecv(recvbuf_Yz, recvCount_Yz,rank_Yz,recvtag); //................................................................................... //................................................................................... 
// Wait for completion of Indicator Field communication //................................................................................... - MPI_Waitall(18,req1,stat1); - MPI_Waitall(18,req2,stat2); + comm.waitAll(18,req1); + comm.waitAll(18,req2); ScaLBL_DeviceBarrier(); //................................................................................... //................................................................................... @@ -1497,7 +1478,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; comm.barrier(); - starttime = MPI_Wtime(); + starttime = Utilities::MPI::time(); //......................................... sendtag = recvtag = 5; @@ -1593,42 +1574,42 @@ int main(int argc, char **argv) //................................................................................... // Send all the distributions - MPI_Isend(sendbuf_x, 5*sendCount_x,MPI_DOUBLE,rank_x,sendtag,comm,&req1[0]); - MPI_Irecv(recvbuf_X, 5*recvCount_X,MPI_DOUBLE,rank_X,recvtag,comm,&req2[0]); - MPI_Isend(sendbuf_X, 5*sendCount_X,MPI_DOUBLE,rank_X,sendtag,comm,&req1[1]); - MPI_Irecv(recvbuf_x, 5*recvCount_x,MPI_DOUBLE,rank_x,recvtag,comm,&req2[1]); - MPI_Isend(sendbuf_y, 5*sendCount_y,MPI_DOUBLE,rank_y,sendtag,comm,&req1[2]); - MPI_Irecv(recvbuf_Y, 5*recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,comm,&req2[2]); - MPI_Isend(sendbuf_Y, 5*sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,comm,&req1[3]); - MPI_Irecv(recvbuf_y, 5*recvCount_y,MPI_DOUBLE,rank_y,recvtag,comm,&req2[3]); - MPI_Isend(sendbuf_z, 5*sendCount_z,MPI_DOUBLE,rank_z,sendtag,comm,&req1[4]); - MPI_Irecv(recvbuf_Z, 5*recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,comm,&req2[4]); - MPI_Isend(sendbuf_Z, 5*sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,comm,&req1[5]); - MPI_Irecv(recvbuf_z, 5*recvCount_z,MPI_DOUBLE,rank_z,recvtag,comm,&req2[5]); - MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_xy,sendtag,comm,&req1[6]); - MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,comm,&req2[6]); - MPI_Isend(sendbuf_XY, sendCount_XY,MPI_DOUBLE,rank_XY,sendtag,comm,&req1[7]); - MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,comm,&req2[7]); - MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_Xy,sendtag,comm,&req1[8]); - MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,comm,&req2[8]); - MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_xY,sendtag,comm,&req1[9]); - MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,comm,&req2[9]); - MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_xz,sendtag,comm,&req1[10]); - MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,comm,&req2[10]); - MPI_Isend(sendbuf_XZ, sendCount_XZ,MPI_DOUBLE,rank_XZ,sendtag,comm,&req1[11]); - MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,comm,&req2[11]); - MPI_Isend(sendbuf_Xz, sendCount_Xz,MPI_DOUBLE,rank_Xz,sendtag,comm,&req1[12]); - MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,comm,&req2[12]); - MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_xZ,sendtag,comm,&req1[13]); - MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,comm,&req2[13]); - MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_yz,sendtag,comm,&req1[14]); - MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,comm,&req2[14]); - MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_YZ,sendtag,comm,&req1[15]); - MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,comm,&req2[15]); - MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_Yz,sendtag,comm,&req1[16]); - 
MPI_Irecv(recvbuf_yZ, recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,comm,&req2[16]); - MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_yZ,sendtag,comm,&req1[17]); - MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,comm,&req2[17]); + req1[0] = comm.Isend(sendbuf_x, 5*sendCount_x,rank_x,sendtag); + req2[0] = comm.Irecv(recvbuf_X, 5*recvCount_X,rank_X,recvtag); + req1[1] = comm.Isend(sendbuf_X, 5*sendCount_X,rank_X,sendtag); + req2[1] = comm.Irecv(recvbuf_x, 5*recvCount_x,rank_x,recvtag); + req1[2] = comm.Isend(sendbuf_y, 5*sendCount_y,rank_y,sendtag); + req2[2] = comm.Irecv(recvbuf_Y, 5*recvCount_Y,rank_Y,recvtag); + req1[3] = comm.Isend(sendbuf_Y, 5*sendCount_Y,rank_Y,sendtag); + req2[3] = comm.Irecv(recvbuf_y, 5*recvCount_y,rank_y,recvtag); + req1[4] = comm.Isend(sendbuf_z, 5*sendCount_z,rank_z,sendtag); + req2[4] = comm.Irecv(recvbuf_Z, 5*recvCount_Z,rank_Z,recvtag); + req1[5] = comm.Isend(sendbuf_Z, 5*sendCount_Z,rank_Z,sendtag); + req2[5] = comm.Irecv(recvbuf_z, 5*recvCount_z,rank_z,recvtag); + req1[6] = comm.Isend(sendbuf_xy, sendCount_xy,rank_xy,sendtag); + req2[6] = comm.Irecv(recvbuf_XY, recvCount_XY,rank_XY,recvtag); + req1[7] = comm.Isend(sendbuf_XY, sendCount_XY,rank_XY,sendtag); + req2[7] = comm.Irecv(recvbuf_xy, recvCount_xy,rank_xy,recvtag); + req1[8] = comm.Isend(sendbuf_Xy, sendCount_Xy,rank_Xy,sendtag); + req2[8] = comm.Irecv(recvbuf_xY, recvCount_xY,rank_xY,recvtag); + req1[9] = comm.Isend(sendbuf_xY, sendCount_xY,rank_xY,sendtag); + req2[9] = comm.Irecv(recvbuf_Xy, recvCount_Xy,rank_Xy,recvtag); + req1[10] = comm.Isend(sendbuf_xz, sendCount_xz,rank_xz,sendtag); + req2[10] = comm.Irecv(recvbuf_XZ, recvCount_XZ,rank_XZ,recvtag); + req1[11] = comm.Isend(sendbuf_XZ, sendCount_XZ,rank_XZ,sendtag); + req2[11] = comm.Irecv(recvbuf_xz, recvCount_xz,rank_xz,recvtag); + req1[12] = comm.Isend(sendbuf_Xz, sendCount_Xz,rank_Xz,sendtag); + req2[12] = comm.Irecv(recvbuf_xZ, recvCount_xZ,rank_xZ,recvtag); + req1[13] = comm.Isend(sendbuf_xZ, sendCount_xZ,rank_xZ,sendtag); + req2[13] = comm.Irecv(recvbuf_Xz, recvCount_Xz,rank_Xz,recvtag); + req1[14] = comm.Isend(sendbuf_yz, sendCount_yz,rank_yz,sendtag); + req2[14] = comm.Irecv(recvbuf_YZ, recvCount_YZ,rank_YZ,recvtag); + req1[15] = comm.Isend(sendbuf_YZ, sendCount_YZ,rank_YZ,sendtag); + req2[15] = comm.Irecv(recvbuf_yz, recvCount_yz,rank_yz,recvtag); + req1[16] = comm.Isend(sendbuf_Yz, sendCount_Yz,rank_Yz,sendtag); + req2[16] = comm.Irecv(recvbuf_yZ, recvCount_yZ,rank_yZ,recvtag); + req1[17] = comm.Isend(sendbuf_yZ, sendCount_yZ,rank_yZ,sendtag); + req2[17] = comm.Irecv(recvbuf_Yz, recvCount_Yz,rank_Yz,recvtag); //................................................................................... //************************************************************************* @@ -1648,8 +1629,8 @@ int main(int argc, char **argv) //................................................................................... // Wait for completion of D3Q19 communication - MPI_Waitall(18,req1,stat1); - MPI_Waitall(18,req2,stat2); + comm.waitAll(18,req1); + comm.waitAll(18,req2); //................................................................................... // Unpack the distributions on the device @@ -1743,18 +1724,18 @@ int main(int argc, char **argv) //................................................................................... 
// Send all the distributions - MPI_Isend(sendbuf_x, 2*sendCount_x,MPI_DOUBLE,rank_x,sendtag,comm,&req1[0]); - MPI_Irecv(recvbuf_X, 2*recvCount_X,MPI_DOUBLE,rank_X,recvtag,comm,&req2[0]); - MPI_Isend(sendbuf_X, 2*sendCount_X,MPI_DOUBLE,rank_X,sendtag,comm,&req1[1]); - MPI_Irecv(recvbuf_x, 2*recvCount_x,MPI_DOUBLE,rank_x,recvtag,comm,&req2[1]); - MPI_Isend(sendbuf_y, 2*sendCount_y,MPI_DOUBLE,rank_y,sendtag,comm,&req1[2]); - MPI_Irecv(recvbuf_Y, 2*recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,comm,&req2[2]); - MPI_Isend(sendbuf_Y, 2*sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,comm,&req1[3]); - MPI_Irecv(recvbuf_y, 2*recvCount_y,MPI_DOUBLE,rank_y,recvtag,comm,&req2[3]); - MPI_Isend(sendbuf_z, 2*sendCount_z,MPI_DOUBLE,rank_z,sendtag,comm,&req1[4]); - MPI_Irecv(recvbuf_Z, 2*recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,comm,&req2[4]); - MPI_Isend(sendbuf_Z, 2*sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,comm,&req1[5]); - MPI_Irecv(recvbuf_z, 2*recvCount_z,MPI_DOUBLE,rank_z,recvtag,comm,&req2[5]); + req1[0] = comm.Isend(sendbuf_x, 2*sendCount_x,rank_x,sendtag); + req2[0] = comm.Irecv(recvbuf_X, 2*recvCount_X,rank_X,recvtag); + req1[1] = comm.Isend(sendbuf_X, 2*sendCount_X,rank_X,sendtag); + req2[1] = comm.Irecv(recvbuf_x, 2*recvCount_x,rank_x,recvtag); + req1[2] = comm.Isend(sendbuf_y, 2*sendCount_y,rank_y,sendtag); + req2[2] = comm.Irecv(recvbuf_Y, 2*recvCount_Y,rank_Y,recvtag); + req1[3] = comm.Isend(sendbuf_Y, 2*sendCount_Y,rank_Y,sendtag); + req2[3] = comm.Irecv(recvbuf_y, 2*recvCount_y,rank_y,recvtag); + req1[4] = comm.Isend(sendbuf_z, 2*sendCount_z,rank_z,sendtag); + req2[4] = comm.Irecv(recvbuf_Z, 2*recvCount_Z,rank_Z,recvtag); + req1[5] = comm.Isend(sendbuf_Z, 2*sendCount_Z,rank_Z,sendtag); + req2[5] = comm.Irecv(recvbuf_z, 2*recvCount_z,rank_z,recvtag); //................................................................................... ScaLBL_D3Q7_Swap(ID, A_even, A_odd, Nx, Ny, Nz); @@ -1762,8 +1743,8 @@ int main(int argc, char **argv) //................................................................................... // Wait for completion of D3Q19 communication - MPI_Waitall(6,req1,stat1); - MPI_Waitall(6,req2,stat2); + comm.waitAll(6,req1); + comm.waitAll(6,req2); //................................................................................... // Unpack the distributions on the device //................................................................................... @@ -1824,48 +1805,48 @@ int main(int argc, char **argv) //................................................................................... // Send / Recv all the phase indcator field values //................................................................................... 
- MPI_Isend(sendbuf_x, sendCount_x,MPI_DOUBLE,rank_x,sendtag,comm,&req1[0]);
- MPI_Irecv(recvbuf_X, recvCount_X,MPI_DOUBLE,rank_X,recvtag,comm,&req2[0]);
- MPI_Isend(sendbuf_X, sendCount_X,MPI_DOUBLE,rank_X,sendtag,comm,&req1[1]);
- MPI_Irecv(recvbuf_x, recvCount_x,MPI_DOUBLE,rank_x,recvtag,comm,&req2[1]);
- MPI_Isend(sendbuf_y, sendCount_y,MPI_DOUBLE,rank_y,sendtag,comm,&req1[2]);
- MPI_Irecv(recvbuf_Y, recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,comm,&req2[2]);
- MPI_Isend(sendbuf_Y, sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,comm,&req1[3]);
- MPI_Irecv(recvbuf_y, recvCount_y,MPI_DOUBLE,rank_y,recvtag,comm,&req2[3]);
- MPI_Isend(sendbuf_z, sendCount_z,MPI_DOUBLE,rank_z,sendtag,comm,&req1[4]);
- MPI_Irecv(recvbuf_Z, recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,comm,&req2[4]);
- MPI_Isend(sendbuf_Z, sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,comm,&req1[5]);
- MPI_Irecv(recvbuf_z, recvCount_z,MPI_DOUBLE,rank_z,recvtag,comm,&req2[5]);
- MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_xy,sendtag,comm,&req1[6]);
- MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,comm,&req2[6]);
- MPI_Isend(sendbuf_XY, sendCount_XY,MPI_DOUBLE,rank_XY,sendtag,comm,&req1[7]);
- MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,comm,&req2[7]);
- MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_Xy,sendtag,comm,&req1[8]);
- MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,comm,&req2[8]);
- MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_xY,sendtag,comm,&req1[9]);
- MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,comm,&req2[9]);
- MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_xz,sendtag,comm,&req1[10]);
- MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,comm,&req2[10]);
- MPI_Isend(sendbuf_XZ, sendCount_XZ,MPI_DOUBLE,rank_XZ,sendtag,comm,&req1[11]);
- MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,comm,&req2[11]);
- MPI_Isend(sendbuf_Xz, sendCount_Xz,MPI_DOUBLE,rank_Xz,sendtag,comm,&req1[12]);
- MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,comm,&req2[12]);
- MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_xZ,sendtag,comm,&req1[13]);
- MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,comm,&req2[13]);
- MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_yz,sendtag,comm,&req1[14]);
- MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,comm,&req2[14]);
- MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_YZ,sendtag,comm,&req1[15]);
- MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,comm,&req2[15]);
- MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_Yz,sendtag,comm,&req1[16]);
- MPI_Irecv(recvbuf_yZ, recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,comm,&req2[16]);
- MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_yZ,sendtag,comm,&req1[17]);
- MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,comm,&req2[17]);
+ req1[0] = comm.Isend(sendbuf_x, sendCount_x,rank_x,sendtag);
+ req2[0] = comm.Irecv(recvbuf_X, recvCount_X,rank_X,recvtag);
+ req1[1] = comm.Isend(sendbuf_X, sendCount_X,rank_X,sendtag);
+ req2[1] = comm.Irecv(recvbuf_x, recvCount_x,rank_x,recvtag);
+ req1[2] = comm.Isend(sendbuf_y, sendCount_y,rank_y,sendtag);
+ req2[2] = comm.Irecv(recvbuf_Y, recvCount_Y,rank_Y,recvtag);
+ req1[3] = comm.Isend(sendbuf_Y, sendCount_Y,rank_Y,sendtag);
+ req2[3] = comm.Irecv(recvbuf_y, recvCount_y,rank_y,recvtag);
+ req1[4] = comm.Isend(sendbuf_z, sendCount_z,rank_z,sendtag);
+ req2[4] = comm.Irecv(recvbuf_Z, recvCount_Z,rank_Z,recvtag);
+ req1[5] = comm.Isend(sendbuf_Z, sendCount_Z,rank_Z,sendtag);
+ req2[5] = comm.Irecv(recvbuf_z, recvCount_z,rank_z,recvtag);
+ req1[6] = comm.Isend(sendbuf_xy, sendCount_xy,rank_xy,sendtag);
+ req2[6] = comm.Irecv(recvbuf_XY, recvCount_XY,rank_XY,recvtag);
+ req1[7] = comm.Isend(sendbuf_XY, sendCount_XY,rank_XY,sendtag);
+ req2[7] = comm.Irecv(recvbuf_xy, recvCount_xy,rank_xy,recvtag);
+ req1[8] = comm.Isend(sendbuf_Xy, sendCount_Xy,rank_Xy,sendtag);
+ req2[8] = comm.Irecv(recvbuf_xY, recvCount_xY,rank_xY,recvtag);
+ req1[9] = comm.Isend(sendbuf_xY, sendCount_xY,rank_xY,sendtag);
+ req2[9] = comm.Irecv(recvbuf_Xy, recvCount_Xy,rank_Xy,recvtag);
+ req1[10] = comm.Isend(sendbuf_xz, sendCount_xz,rank_xz,sendtag);
+ req2[10] = comm.Irecv(recvbuf_XZ, recvCount_XZ,rank_XZ,recvtag);
+ req1[11] = comm.Isend(sendbuf_XZ, sendCount_XZ,rank_XZ,sendtag);
+ req2[11] = comm.Irecv(recvbuf_xz, recvCount_xz,rank_xz,recvtag);
+ req1[12] = comm.Isend(sendbuf_Xz, sendCount_Xz,rank_Xz,sendtag);
+ req2[12] = comm.Irecv(recvbuf_xZ, recvCount_xZ,rank_xZ,recvtag);
+ req1[13] = comm.Isend(sendbuf_xZ, sendCount_xZ,rank_xZ,sendtag);
+ req2[13] = comm.Irecv(recvbuf_Xz, recvCount_Xz,rank_Xz,recvtag);
+ req1[14] = comm.Isend(sendbuf_yz, sendCount_yz,rank_yz,sendtag);
+ req2[14] = comm.Irecv(recvbuf_YZ, recvCount_YZ,rank_YZ,recvtag);
+ req1[15] = comm.Isend(sendbuf_YZ, sendCount_YZ,rank_YZ,sendtag);
+ req2[15] = comm.Irecv(recvbuf_yz, recvCount_yz,rank_yz,recvtag);
+ req1[16] = comm.Isend(sendbuf_Yz, sendCount_Yz,rank_Yz,sendtag);
+ req2[16] = comm.Irecv(recvbuf_yZ, recvCount_yZ,rank_yZ,recvtag);
+ req1[17] = comm.Isend(sendbuf_yZ, sendCount_yZ,rank_yZ,sendtag);
+ req2[17] = comm.Irecv(recvbuf_Yz, recvCount_Yz,rank_Yz,recvtag);
//...................................................................................
//...................................................................................
// Wait for completion of Indicator Field communication
//...................................................................................
- MPI_Waitall(18,req1,stat1);
- MPI_Waitall(18,req2,stat2);
+ comm.waitAll(18,req1);
+ comm.waitAll(18,req2);
ScaLBL_DeviceBarrier();
//...................................................................................
//...................................................................................
@@ -2442,28 +2423,28 @@ int main(int argc, char **argv)
//...........................................................................
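For reference, the converted calls in this hunk assume the wrapper interface used throughout the patch: comm.Isend/comm.Irecv take (buffer, count, rank, tag) and return the MPI_Request, comm.waitAll(count, requests) completes them, and comm.barrier() synchronizes. A minimal sketch of that pattern for a single face exchange, assuming the Utilities::MPI communicator class from this patch; the header path and helper function below are illustrative only, not part of the patch:

    #include "common/MPI.h"   // assumed location of the Utilities::MPI wrapper

    // Post and complete one non-blocking send/recv pair through the wrapper;
    // the remaining D3Q19 directions follow the same pattern.
    void exchangeFace( Utilities::MPI &comm, double *sendbuf, double *recvbuf,
                       int sendCount, int recvCount, int rank_send, int rank_recv,
                       int sendtag, int recvtag )
    {
        MPI_Request req1[1], req2[1];
        req1[0] = comm.Isend( sendbuf, sendCount, rank_send, sendtag );
        req2[0] = comm.Irecv( recvbuf, recvCount, rank_recv, recvtag );
        comm.waitAll( 1, req1 );   // replaces MPI_Waitall(..., stat1)
        comm.waitAll( 1, req2 );   // replaces MPI_Waitall(..., stat2)
        comm.barrier();
    }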
comm.barrier(); - MPI_Allreduce(&nwp_volume,&nwp_volume_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&awn,&awn_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&ans,&ans_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&aws,&aws_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&lwns,&lwns_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&As,&As_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&Jwn,&Jwn_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&Kwn,&Kwn_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&efawns,&efawns_global,1,MPI_DOUBLE,MPI_SUM,comm); + nwp_volume_global = comm.sumReduce( nwp_volume ); + awn_global = comm.sumReduce( awn ); + ans_global = comm.sumReduce( ans ); + aws_global = comm.sumReduce( aws ); + lwns_global = comm.sumReduce( lwns ); + As_global = comm.sumReduce( As ); + Jwn_global = comm.sumReduce( Jwn ); + Kwn_global = comm.sumReduce( Kwn ); + efawns_global = comm.sumReduce( efawns ); // Phase averages - MPI_Allreduce(&vol_w,&vol_w_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&vol_n,&vol_n_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&paw,&paw_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&pan,&pan_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&vaw(0),&vaw_global(0),3,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&van(0),&van_global(0),3,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&vawn(0),&vawn_global(0),3,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&Gwn(0),&Gwn_global(0),6,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&Gns(0),&Gns_global(0),6,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&Gws(0),&Gws_global(0),6,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&trawn,&trawn_global,1,MPI_DOUBLE,MPI_SUM,comm); - MPI_Allreduce(&trJwn,&trJwn_global,1,MPI_DOUBLE,MPI_SUM,comm); + vol_w_global = comm.sumReduce( vol_w ); + vol_n_global = comm.sumReduce( vol_n ); + paw_global = comm.sumReduce( paw ); + pan_global = comm.sumReduce( pan ); + vaw_global(0) = comm.sumReduce( vaw(0) ); + van_global(0) = comm.sumReduce( van(0) ); + vawn_global(0) = comm.sumReduce( vawn(0) ); + Gwn_global(0) = comm.sumReduce( Gwn(0) ); + Gns_global(0) = comm.sumReduce( Gns(0) ); + Gws_global(0) = comm.sumReduce( Gws(0) ); + trawn_global = comm.sumReduce( trawn ); + trJwn_global = comm.sumReduce( trJwn ); comm.barrier(); //......................................................................... // Compute the change in the total surface energy based on the defined interval @@ -2689,7 +2670,7 @@ int main(int argc, char **argv) //************************************************************************/ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = MPI_Wtime(); + stoptime = Utilities::MPI::time(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/tests/lbpm_BGK_simulator.cpp b/tests/lbpm_BGK_simulator.cpp index 8b079900..1ac61853 100644 --- a/tests/lbpm_BGK_simulator.cpp +++ b/tests/lbpm_BGK_simulator.cpp @@ -97,28 +97,28 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); //................................................. 
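// The scalar reductions above assume a sumReduce convenience method that wraps
// MPI_Allreduce.  A minimal sketch, assuming the comm object is a Utilities::MPI
// instance; the member names and the getMPIType helper are placeholders, not the
// actual implementation:
//
//     template<class T>
//     T Utilities::MPI::sumReduce( const T &local_value ) const {
//         T global_value = 0;
//         MPI_Allreduce( &local_value, &global_value, 1, getMPIType<T>(),
//                        MPI_SUM, communicator );
//         return global_value;
//     }
//
// Also note that the original reductions of vaw/van/vawn (count 3) and Gwn/Gns/Gws
// (count 6) are converted to single-element sumReduce calls on component 0 only; if
// the full vectors are needed, an array overload along the lines of
// comm.sumReduce( &vaw(0), &vaw_global(0), 3 ) would presumably be required.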
- MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); - //MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm); - // MPI_Bcast(&Restart,1,MPI_LOGICAL,0,comm); - MPI_Bcast(&din,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); - MPI_Bcast(×tepMax,1,MPI_INT,0,comm); - MPI_Bcast(&interval,1,MPI_INT,0,comm); - MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); + comm.bcast(&tau,1,0); + //comm.bcast(&pBC,1,0); + //comm.bcast(&Restart,1,0); + comm.bcast(&din,1,0); + comm.bcast(&dout,1,0); + comm.bcast(&Fx,1,0); + comm.bcast(&Fy,1,0); + comm.bcast(&Fz,1,0); + comm.bcast(×tepMax,1,0); + comm.bcast(&interval,1,0); + comm.bcast(&tol,1,0); // Computational domain - MPI_Bcast(&Nx,1,MPI_INT,0,comm); - MPI_Bcast(&Ny,1,MPI_INT,0,comm); - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - //MPI_Bcast(&nspheres,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&Nx,1,0); + comm.bcast(&Ny,1,0); + comm.bcast(&Nz,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + //comm.bcast(&nspheres,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. comm.barrier(); @@ -249,7 +249,7 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,comm); + sum = comm.sumReduce( sum_local ); porosity = sum*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); @@ -331,7 +331,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; comm.barrier(); - starttime = MPI_Wtime(); + starttime = Utilities::MPI::time(); //......................................... double D32,Fo,Re,velocity,err1D,mag_force,vel_prev; @@ -410,7 +410,7 @@ int main(int argc, char **argv) //************************************************************************/ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = MPI_Wtime(); + stoptime = Utilities::MPI::time(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/tests/lbpm_color_macro_simulator.cpp b/tests/lbpm_color_macro_simulator.cpp index 97df6812..c92b0c45 100644 --- a/tests/lbpm_color_macro_simulator.cpp +++ b/tests/lbpm_color_macro_simulator.cpp @@ -39,9 +39,6 @@ int main(int argc, char **argv) int nprocx,nprocy,nprocz; int iproc,jproc,kproc; - MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; - if (rank == 0){ printf("********************************************************\n"); printf("Running Color LBM \n"); @@ -172,32 +169,32 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); //................................................. 
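// The parameter broadcasts follow the same pattern: comm.bcast(ptr,count,root) is a
// typed wrapper that drops the explicit MPI datatype and communicator arguments.  A
// sketch under the same assumptions as the sumReduce example above:
//
//     template<class T>
//     void Utilities::MPI::bcast( T *buf, int count, int root ) const {
//         MPI_Bcast( buf, count, getMPIType<T>(), root, communicator );
//     }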
- MPI_Bcast(&tauA,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&tauB,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&rhoA,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&rhoB,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&alpha,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&beta,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&BoundaryCondition,1,MPI_INT,0,comm); - MPI_Bcast(&InitialCondition,1,MPI_INT,0,comm); - MPI_Bcast(&din,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); - MPI_Bcast(×tepMax,1,MPI_INT,0,comm); - MPI_Bcast(&RESTART_INTERVAL,1,MPI_INT,0,comm); - MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); + comm.bcast(&tauA,1,0); + comm.bcast(&tauB,1,0); + comm.bcast(&rhoA,1,0); + comm.bcast(&rhoB,1,0); + comm.bcast(&alpha,1,0); + comm.bcast(&beta,1,0); + comm.bcast(&BoundaryCondition,1,0); + comm.bcast(&InitialCondition,1,0); + comm.bcast(&din,1,0); + comm.bcast(&dout,1,0); + comm.bcast(&Fx,1,0); + comm.bcast(&Fy,1,0); + comm.bcast(&Fz,1,0); + comm.bcast(×tepMax,1,0); + comm.bcast(&RESTART_INTERVAL,1,0); + comm.bcast(&tol,1,0); // Computational domain - MPI_Bcast(&Nx,1,MPI_INT,0,comm); - MPI_Bcast(&Ny,1,MPI_INT,0,comm); - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&Nx,1,0); + comm.bcast(&Ny,1,0); + comm.bcast(&Nz,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. flux = 0.f; @@ -322,7 +319,7 @@ int main(int argc, char **argv) timestep=0; } } - MPI_Bcast(×tep,1,MPI_INT,0,comm); + comm.bcast(×tep,1,0); FILE *RESTART = fopen(LocalRestartFile,"rb"); if (IDFILE==NULL) ERROR("lbpm_color_simulator: Error opening file: Restart.xxxxx"); readID=fread(id,1,N,RESTART); @@ -361,7 +358,7 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,comm); + sum - comm.sumReduce( sum_local ); porosity = sum*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); //......................................................... @@ -537,7 +534,7 @@ int main(int argc, char **argv) double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); comm.barrier(); - starttime = MPI_Wtime(); + starttime = Utilities::MPI::time(); //......................................... 
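// Minor issue in the lbpm_color_macro_simulator hunk above: the converted porosity
// reduction reads "sum - comm.sumReduce( sum_local );", which computes a difference
// and discards it.  The assignment "sum = comm.sumReduce( sum_local );" is presumably
// what was intended, matching the other conversions in this patch.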
err = 1.0; @@ -637,7 +634,7 @@ int main(int argc, char **argv) //************************************************************************ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = MPI_Wtime(); + stoptime = Utilities::MPI::time(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/tests/lbpm_disc_pp.cpp b/tests/lbpm_disc_pp.cpp index 20d41884..41825c7d 100644 --- a/tests/lbpm_disc_pp.cpp +++ b/tests/lbpm_disc_pp.cpp @@ -9,7 +9,7 @@ #include "analysis/pmmc.h" #include "common/Domain.h" #include "common/Communication.h" -#include "common/MPI.h" // This includes mpi.h +#include "common/MPI.h" #include "common/SpherePack.h" /* @@ -147,8 +147,6 @@ int main(int argc, char **argv) int rank_xz,rank_XZ,rank_xZ,rank_Xz; int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** - MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; int depth; @@ -189,16 +187,16 @@ int main(int argc, char **argv) comm.barrier(); //................................................. // Computational domain - MPI_Bcast(&Nx,1,MPI_INT,0,comm); - MPI_Bcast(&Ny,1,MPI_INT,0,comm); - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - MPI_Bcast(&ndiscs,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&Nx,1,0); + comm.bcast(&Ny,1,0); + comm.bcast(&Nz,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + comm.bcast(&ndiscs,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. comm.barrier(); @@ -275,9 +273,9 @@ int main(int argc, char **argv) if (rank == 0) ReadDiscPacking(ndiscs,cx,cy,rad); comm.barrier(); // Broadcast the sphere packing to all processes - MPI_Bcast(cx,ndiscs,MPI_DOUBLE,0,comm); - MPI_Bcast(cy,ndiscs,MPI_DOUBLE,0,comm); - MPI_Bcast(rad,ndiscs,MPI_DOUBLE,0,comm); + comm.bcast(cx,ndiscs,0); + comm.bcast(cy,ndiscs,0); + comm.bcast(rad,ndiscs,0); //........................................................................... comm.barrier(); if (rank == 0){ @@ -346,7 +344,7 @@ int main(int argc, char **argv) } } sum_local = 1.0*sum; - MPI_Allreduce(&sum_local,&porosity,1,MPI_DOUBLE,MPI_SUM,comm); + porosity = comm.sumReduce( sum_local ); porosity = porosity*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); @@ -362,7 +360,7 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); + pore_vol = comm.sumReduce( sum_local ); //......................................................... 
// don't perform computations at the eight corners diff --git a/tests/lbpm_inkbottle_pp.cpp b/tests/lbpm_inkbottle_pp.cpp index 669ab8c0..ca188633 100644 --- a/tests/lbpm_inkbottle_pp.cpp +++ b/tests/lbpm_inkbottle_pp.cpp @@ -81,16 +81,16 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); // Computational domain - MPI_Bcast(&Nx,1,MPI_INT,0,comm); - MPI_Bcast(&Ny,1,MPI_INT,0,comm); - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - MPI_Bcast(&nspheres,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&Nx,1,0); + comm.bcast(&Ny,1,0); + comm.bcast(&Nz,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + comm.bcast(&nspheres,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. comm.barrier(); @@ -197,7 +197,7 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); + pore_vol = comm.sumReduce( sum_local ); //......................................................... // don't perform computations at the eight corners diff --git a/tests/lbpm_juanes_bench_disc_pp.cpp b/tests/lbpm_juanes_bench_disc_pp.cpp index 47d8cb84..a90d43f8 100644 --- a/tests/lbpm_juanes_bench_disc_pp.cpp +++ b/tests/lbpm_juanes_bench_disc_pp.cpp @@ -9,7 +9,7 @@ #include "analysis/pmmc.h" #include "common/Domain.h" #include "common/Communication.h" -#include "common/MPI.h" // This includes mpi.h +#include "common/MPI.h" #include "common/SpherePack.h" /* @@ -147,9 +147,6 @@ int main(int argc, char **argv) int rank_xz,rank_XZ,rank_xZ,rank_Xz; int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** - MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; - if (rank == 0){ printf("********************************************************\n"); @@ -193,16 +190,16 @@ int main(int argc, char **argv) comm.barrier(); //................................................. // Computational domain - MPI_Bcast(&Nx,1,MPI_INT,0,comm); - MPI_Bcast(&Ny,1,MPI_INT,0,comm); - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - MPI_Bcast(&ndiscs,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&Nx,1,0); + comm.bcast(&Ny,1,0); + comm.bcast(&Nz,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + comm.bcast(&ndiscs,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. comm.barrier(); @@ -292,9 +289,9 @@ int main(int argc, char **argv) if (rank == 0) ReadDiscPacking(ndiscs,cx,cy,rad); comm.barrier(); // Broadcast the sphere packing to all processes - MPI_Bcast(cx,ndiscs,MPI_DOUBLE,0,comm); - MPI_Bcast(cy,ndiscs,MPI_DOUBLE,0,comm); - MPI_Bcast(rad,ndiscs,MPI_DOUBLE,0,comm); + comm.bcast(cx,ndiscs,0); + comm.bcast(cy,ndiscs,0); + comm.bcast(rad,ndiscs,0); //........................................................................... 
comm.barrier(); /* if (rank == 0){ @@ -436,7 +433,7 @@ int main(int argc, char **argv) } } sum_local = 1.0*sum; - MPI_Allreduce(&sum_local,&porosity,1,MPI_DOUBLE,MPI_SUM,comm); + porosity = comm.sumReduce( sum_local ); porosity = porosity*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); @@ -452,7 +449,7 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); + pore_vol = comm.sumReduce( sum_local ); //......................................................... // don't perform computations at the eight corners diff --git a/tests/lbpm_nondarcy_simulator.cpp b/tests/lbpm_nondarcy_simulator.cpp index 096dc790..a25fef69 100644 --- a/tests/lbpm_nondarcy_simulator.cpp +++ b/tests/lbpm_nondarcy_simulator.cpp @@ -94,8 +94,6 @@ int main(int argc, char **argv) int rank_xz,rank_XZ,rank_xZ,rank_Xz; int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** - MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; double REYNOLDS_NUMBER = 100.f; if (argc > 1){ @@ -158,28 +156,28 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); //................................................. - MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); - //MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm); - // MPI_Bcast(&Restart,1,MPI_LOGICAL,0,comm); - MPI_Bcast(&din,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); - MPI_Bcast(×tepMax,1,MPI_INT,0,comm); - MPI_Bcast(&interval,1,MPI_INT,0,comm); - MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); + comm.bcast(&tau,1,0); + //comm.bcast(&pBC,1,0); + //comm.bcast(&Restart,1,0); + comm.bcast(&din,1,0); + comm.bcast(&dout,1,0); + comm.bcast(&Fx,1,0); + comm.bcast(&Fy,1,0); + comm.bcast(&Fz,1,0); + comm.bcast(×tepMax,1,0); + comm.bcast(&interval,1,0); + comm.bcast(&tol,1,0); // Computational domain - MPI_Bcast(&Nx,1,MPI_INT,0,comm); - MPI_Bcast(&Ny,1,MPI_INT,0,comm); - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - MPI_Bcast(&nspheres,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&Nx,1,0); + comm.bcast(&Ny,1,0); + comm.bcast(&Nz,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + comm.bcast(&nspheres,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. comm.barrier(); @@ -308,8 +306,8 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); - // MPI_Allreduce(&sum_local,&porosity,1,MPI_DOUBLE,MPI_SUM,comm); + por_vol = comm.sumReduce( sum_local ); + //porosity = comm.sumReduce( sum_local ); porosity = pore_vol*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); //......................................................... @@ -433,7 +431,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; comm.barrier(); - starttime = MPI_Wtime(); + starttime = Utilities::MPI::time(); //......................................... 
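// Similarly, in the lbpm_nondarcy_simulator hunk above, the converted line
// "por_vol = comm.sumReduce( sum_local );" assigns to a name that does not appear
// elsewhere; the surrounding code uses pore_vol, so
// "pore_vol = comm.sumReduce( sum_local );" appears to be the intended conversion.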
double D32,vawx,vawy,vawz,Fo,Re,velocity,err1D,mag_force,vel_prev; @@ -554,7 +552,7 @@ int main(int argc, char **argv) fclose(NONDARCY); ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = MPI_Wtime(); + stoptime = Utilities::MPI::time(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/tests/lbpm_nonnewtonian_simulator.cpp b/tests/lbpm_nonnewtonian_simulator.cpp index ff8792e7..bea3a814 100644 --- a/tests/lbpm_nonnewtonian_simulator.cpp +++ b/tests/lbpm_nonnewtonian_simulator.cpp @@ -124,8 +124,6 @@ int main(int argc, char **argv) // int rank_xz,rank_XZ,rank_xZ,rank_Xz; // int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** - MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; if (rank == 0){ printf("********************************************************\n"); @@ -428,8 +426,8 @@ int main(int argc, char **argv) } } - MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); /* 6 */ - //MPI_Allreduce(&sum_local,&porosity,1,MPI_DOUBLE,MPI_SUM,comm); + pore_vol = comm.sumReduce( sum_local ); /* 6 */ + //porosity = comm.sumReduce( sum_local ); porosity = pore_vol*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); @@ -574,7 +572,7 @@ int main(int argc, char **argv) timestep=5; } } - MPI_Bcast(×tep,1,MPI_INT,0,comm); + comm.bcast(×tep,1,0); // Read in the restart file to CPU buffers double *cDen = new double[2*N]; @@ -662,7 +660,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; comm.barrier(); - starttime = MPI_Wtime(); + starttime = Utilities::MPI::time(); /* * Create the thread pool @@ -810,7 +808,7 @@ int main(int argc, char **argv) //************************************************************************/ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = MPI_Wtime(); + stoptime = Utilities::MPI::time(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; @@ -835,20 +833,6 @@ int main(int argc, char **argv) - - - - - - - - - - - - - - // Scrap // if (rank==0){ diff --git a/tests/lbpm_plates_pp.cpp b/tests/lbpm_plates_pp.cpp index acd64f52..37191979 100644 --- a/tests/lbpm_plates_pp.cpp +++ b/tests/lbpm_plates_pp.cpp @@ -31,8 +31,6 @@ int main(int argc, char **argv) int rank_xz,rank_XZ,rank_xZ,rank_Xz; int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** - MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; double TubeRadius =15.0; double WIDTH; @@ -77,16 +75,16 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); // Computational domain - MPI_Bcast(&Nx,1,MPI_INT,0,comm); - MPI_Bcast(&Ny,1,MPI_INT,0,comm); - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - MPI_Bcast(&nspheres,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&Nx,1,0); + comm.bcast(&Ny,1,0); + comm.bcast(&Nz,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + comm.bcast(&nspheres,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. 
comm.barrier(); @@ -176,7 +174,7 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); + pore_vol = comm.sumReduce( sum_local ); //......................................................... // don't perform computations at the eight corners diff --git a/tests/lbpm_porenetwork_pp.cpp b/tests/lbpm_porenetwork_pp.cpp index 4a6ccda7..1715811f 100644 --- a/tests/lbpm_porenetwork_pp.cpp +++ b/tests/lbpm_porenetwork_pp.cpp @@ -24,9 +24,6 @@ int main(int argc, char **argv) int iproc,jproc,kproc; int sendtag,recvtag; //***************************************** - MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; - //********************************** int nsph,ncyl, BC; nsph = atoi(argv[1]); @@ -67,16 +64,16 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); // Computational domain - MPI_Bcast(&Nx,1,MPI_INT,0,comm); - MPI_Bcast(&Ny,1,MPI_INT,0,comm); - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - MPI_Bcast(&nspheres,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&Nx,1,0); + comm.bcast(&Ny,1,0); + comm.bcast(&Nz,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + comm.bcast(&nspheres,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. comm.barrier(); @@ -269,7 +266,7 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); + pore_vol = comm.sumReduce( sum_local ); if (rank==0) printf("Pore volume = %f \n",pore_vol/double(Nx*Ny*Nz)); //......................................................... // don't perform computations at the eight corners diff --git a/tests/lbpm_random_pp.cpp b/tests/lbpm_random_pp.cpp index ad4b83cc..8318f50f 100644 --- a/tests/lbpm_random_pp.cpp +++ b/tests/lbpm_random_pp.cpp @@ -98,16 +98,16 @@ int main(int argc, char **argv) } comm.barrier(); // Computational domain - MPI_Bcast(&nx,1,MPI_INT,0,comm); - MPI_Bcast(&ny,1,MPI_INT,0,comm); - MPI_Bcast(&nz,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - MPI_Bcast(&nspheres,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&nx,1,0); + comm.bcast(&ny,1,0); + comm.bcast(&nz,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + comm.bcast(&nspheres,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. 
comm.barrier(); @@ -166,7 +166,7 @@ int main(int argc, char **argv) } } // total Global is the number of nodes in the pore-space - MPI_Allreduce(&count,&totalGlobal,1,MPI_INT,MPI_SUM,comm); + totalGlobal = sumReduce( count ); float porosity=float(totalGlobal)/(nprocx*nprocy*nprocz*(nx-2)*(ny-2)*(nz-2)); if (rank==0) printf("Media Porosity: %f \n",porosity); @@ -216,12 +216,12 @@ int main(int argc, char **argv) sizeY = SizeY[bin]; sizeZ = SizeZ[bin]; } - MPI_Bcast(&x,1,MPI_INT,0,comm); - MPI_Bcast(&y,1,MPI_INT,0,comm); - MPI_Bcast(&z,1,MPI_INT,0,comm); - MPI_Bcast(&sizeX,1,MPI_INT,0,comm); - MPI_Bcast(&sizeY,1,MPI_INT,0,comm); - MPI_Bcast(&sizeZ,1,MPI_INT,0,comm); + comm.bcast(&x,1,0); + comm.bcast(&y,1,0); + comm.bcast(&z,1,0); + comm.bcast(&sizeX,1,0); + comm.bcast(&sizeY,1,0); + comm.bcast(&sizeZ,1,0); //if (rank==0) printf("Broadcast block at %i,%i,%i \n",x,y,z); @@ -269,7 +269,7 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&count,&countGlobal,1,MPI_INT,MPI_SUM,comm); + countGlobal = sumReduce( count ); sat = float(countGlobal)/totalGlobal; //if (rank==0) printf("New count=%i\n",countGlobal); //if (rank==0) printf("New saturation=%f\n",sat); @@ -345,42 +345,24 @@ int main(int argc, char **argv) PackID(Dm.sendList_yZ, Dm.sendCount_yZ ,sendID_yZ, id); PackID(Dm.sendList_YZ, Dm.sendCount_YZ ,sendID_YZ, id); //...................................................................................... - MPI_Sendrecv(sendID_x,Dm.sendCount_x,MPI_CHAR,Dm.rank_x(),sendtag, - recvID_X,Dm.recvCount_X,MPI_CHAR,Dm.rank_X(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_X,Dm.sendCount_X,MPI_CHAR,Dm.rank_X(),sendtag, - recvID_x,Dm.recvCount_x,MPI_CHAR,Dm.rank_x(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_y,Dm.sendCount_y,MPI_CHAR,Dm.rank_y(),sendtag, - recvID_Y,Dm.recvCount_Y,MPI_CHAR,Dm.rank_Y(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Y,Dm.sendCount_Y,MPI_CHAR,Dm.rank_Y(),sendtag, - recvID_y,Dm.recvCount_y,MPI_CHAR,Dm.rank_y(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_z,Dm.sendCount_z,MPI_CHAR,Dm.rank_z(),sendtag, - recvID_Z,Dm.recvCount_Z,MPI_CHAR,Dm.rank_Z(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Z,Dm.sendCount_Z,MPI_CHAR,Dm.rank_Z(),sendtag, - recvID_z,Dm.recvCount_z,MPI_CHAR,Dm.rank_z(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xy,Dm.sendCount_xy,MPI_CHAR,Dm.rank_xy(),sendtag, - recvID_XY,Dm.recvCount_XY,MPI_CHAR,Dm.rank_XY(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_XY,Dm.sendCount_XY,MPI_CHAR,Dm.rank_XY(),sendtag, - recvID_xy,Dm.recvCount_xy,MPI_CHAR,Dm.rank_xy(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Xy,Dm.sendCount_Xy,MPI_CHAR,Dm.rank_Xy(),sendtag, - recvID_xY,Dm.recvCount_xY,MPI_CHAR,Dm.rank_xY(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xY,Dm.sendCount_xY,MPI_CHAR,Dm.rank_xY(),sendtag, - recvID_Xy,Dm.recvCount_Xy,MPI_CHAR,Dm.rank_Xy(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xz,Dm.sendCount_xz,MPI_CHAR,Dm.rank_xz(),sendtag, - recvID_XZ,Dm.recvCount_XZ,MPI_CHAR,Dm.rank_XZ(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_XZ,Dm.sendCount_XZ,MPI_CHAR,Dm.rank_XZ(),sendtag, - recvID_xz,Dm.recvCount_xz,MPI_CHAR,Dm.rank_xz(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Xz,Dm.sendCount_Xz,MPI_CHAR,Dm.rank_Xz(),sendtag, - recvID_xZ,Dm.recvCount_xZ,MPI_CHAR,Dm.rank_xZ(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_xZ,Dm.sendCount_xZ,MPI_CHAR,Dm.rank_xZ(),sendtag, - 
recvID_Xz,Dm.recvCount_Xz,MPI_CHAR,Dm.rank_Xz(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_yz,Dm.sendCount_yz,MPI_CHAR,Dm.rank_yz(),sendtag, - recvID_YZ,Dm.recvCount_YZ,MPI_CHAR,Dm.rank_YZ(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_YZ,Dm.sendCount_YZ,MPI_CHAR,Dm.rank_YZ(),sendtag, - recvID_yz,Dm.recvCount_yz,MPI_CHAR,Dm.rank_yz(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_Yz,Dm.sendCount_Yz,MPI_CHAR,Dm.rank_Yz(),sendtag, - recvID_yZ,Dm.recvCount_yZ,MPI_CHAR,Dm.rank_yZ(),recvtag,comm,MPI_STATUS_IGNORE); - MPI_Sendrecv(sendID_yZ,Dm.sendCount_yZ,MPI_CHAR,Dm.rank_yZ(),sendtag, - recvID_Yz,Dm.recvCount_Yz,MPI_CHAR,Dm.rank_Yz(),recvtag,comm,MPI_STATUS_IGNORE); + comm.sendrecv(sendID_x,Dm.sendCount_x,Dm.rank_x(),sendtag,recvID_X,Dm.recvCount_X,Dm.rank_X(),recvtag); + comm.sendrecv(sendID_X,Dm.sendCount_X,Dm.rank_X(),sendtag,recvID_x,Dm.recvCount_x,Dm.rank_x(),recvtag); + comm.sendrecv(sendID_y,Dm.sendCount_y,Dm.rank_y(),sendtag,recvID_Y,Dm.recvCount_Y,Dm.rank_Y(),recvtag); + comm.sendrecv(sendID_Y,Dm.sendCount_Y,Dm.rank_Y(),sendtag,recvID_y,Dm.recvCount_y,Dm.rank_y(),recvtag); + comm.sendrecv(sendID_z,Dm.sendCount_z,Dm.rank_z(),sendtag,recvID_Z,Dm.recvCount_Z,Dm.rank_Z(),recvtag); + comm.sendrecv(sendID_Z,Dm.sendCount_Z,Dm.rank_Z(),sendtag,recvID_z,Dm.recvCount_z,Dm.rank_z(),recvtag); + comm.sendrecv(sendID_xy,Dm.sendCount_xy,Dm.rank_xy(),sendtag,recvID_XY,Dm.recvCount_XY,Dm.rank_XY(),recvtag); + comm.sendrecv(sendID_XY,Dm.sendCount_XY,Dm.rank_XY(),sendtag,recvID_xy,Dm.recvCount_xy,Dm.rank_xy(),recvtag); + comm.sendrecv(sendID_Xy,Dm.sendCount_Xy,Dm.rank_Xy(),sendtag,recvID_xY,Dm.recvCount_xY,Dm.rank_xY(),recvtag); + comm.sendrecv(sendID_xY,Dm.sendCount_xY,Dm.rank_xY(),sendtag,recvID_Xy,Dm.recvCount_Xy,Dm.rank_Xy(),recvtag); + comm.sendrecv(sendID_xz,Dm.sendCount_xz,Dm.rank_xz(),sendtag,recvID_XZ,Dm.recvCount_XZ,Dm.rank_XZ(),recvtag); + comm.sendrecv(sendID_XZ,Dm.sendCount_XZ,Dm.rank_XZ(),sendtag,recvID_xz,Dm.recvCount_xz,Dm.rank_xz(),recvtag); + comm.sendrecv(sendID_Xz,Dm.sendCount_Xz,Dm.rank_Xz(),sendtag,recvID_xZ,Dm.recvCount_xZ,Dm.rank_xZ(),recvtag); + comm.sendrecv(sendID_xZ,Dm.sendCount_xZ,Dm.rank_xZ(),sendtag,recvID_Xz,Dm.recvCount_Xz,Dm.rank_Xz(),recvtag); + comm.sendrecv(sendID_yz,Dm.sendCount_yz,Dm.rank_yz(),sendtag,recvID_YZ,Dm.recvCount_YZ,Dm.rank_YZ(),recvtag); + comm.sendrecv(sendID_YZ,Dm.sendCount_YZ,Dm.rank_YZ(),sendtag,recvID_yz,Dm.recvCount_yz,Dm.rank_yz(),recvtag); + comm.sendrecv(sendID_Yz,Dm.sendCount_Yz,Dm.rank_Yz(),sendtag,recvID_yZ,Dm.recvCount_yZ,Dm.rank_yZ(),recvtag); + comm.sendrecv(sendID_yZ,Dm.sendCount_yZ,Dm.rank_yZ(),sendtag,recvID_Yz,Dm.recvCount_Yz,Dm.rank_Yz(),recvtag); //...................................................................................... UnpackID(Dm.recvList_x, Dm.recvCount_x ,recvID_x, id); UnpackID(Dm.recvList_X, Dm.recvCount_X ,recvID_X, id); @@ -412,7 +394,7 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&count,&countGlobal,1,MPI_INT,MPI_SUM,comm); + countGlobal = comm.sumReduce( count ); sat = float(countGlobal)/totalGlobal; if (rank==0) printf("Final saturation=%f\n",sat); diff --git a/tests/lbpm_segmented_decomp.cpp b/tests/lbpm_segmented_decomp.cpp index 1bc89adb..65b8576f 100644 --- a/tests/lbpm_segmented_decomp.cpp +++ b/tests/lbpm_segmented_decomp.cpp @@ -85,23 +85,23 @@ int main(int argc, char **argv) comm.barrier(); // Computational domain //................................................. 
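// In the lbpm_random_pp hunks above, the first two reductions are written as
// "totalGlobal = sumReduce( count );" and "countGlobal = sumReduce( count );" without
// the comm. qualifier, while the final hunk uses "countGlobal = comm.sumReduce( count );".
// Unless a free-function overload of sumReduce is provided elsewhere, the unqualified
// calls would presumably need the same comm. prefix to compile.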
- MPI_Bcast(&nx,1,MPI_INT,0,comm); - MPI_Bcast(&ny,1,MPI_INT,0,comm); - MPI_Bcast(&nz,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - MPI_Bcast(&nspheres,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&nx,1,0); + comm.bcast(&ny,1,0); + comm.bcast(&nz,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + comm.bcast(&nspheres,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. - MPI_Bcast(&Nx,1,MPI_INT,0,comm); - MPI_Bcast(&Ny,1,MPI_INT,0,comm); - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&xStart,1,MPI_INT,0,comm); - MPI_Bcast(&yStart,1,MPI_INT,0,comm); - MPI_Bcast(&zStart,1,MPI_INT,0,comm); + comm.bcast(&Nx,1,0); + comm.bcast(&Ny,1,0); + comm.bcast(&Nz,1,0); + comm.bcast(&xStart,1,0); + comm.bcast(&yStart,1,0); + comm.bcast(&zStart,1,0); //................................................. comm.barrier(); @@ -191,7 +191,7 @@ int main(int argc, char **argv) } else{ printf("Sending data to process %i \n", rnk); - MPI_Send(tmp,N,MPI_CHAR,rnk,15,comm); + comm.send(tmp,N,rnk,15); } } } @@ -200,7 +200,7 @@ int main(int argc, char **argv) else{ // Recieve the subdomain from rank = 0 printf("Ready to recieve data %i at process %i \n", N,rank); - MPI_Recv(Dm.id,N,MPI_CHAR,0,15,comm,MPI_STATUS_IGNORE); + comm.recv(Dm.id,N,0,15); } comm.barrier(); @@ -243,8 +243,8 @@ int main(int argc, char **argv) printf("Original label=%i, New label=%i \n",oldlabel,newlabel); } } - MPI_Barrier(MPI_COMM_WORLD); - MPI_Bcast(LabelList,2*NLABELS,MPI_INT,0,MPI_COMM_WORLD); + comm.barrier(); + comm.bcast(LabelList,2*NLABELS,0); char *newIDs; newIDs= new char [nx*ny*nz]; @@ -278,8 +278,8 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&count,&countGlobal,1,MPI_INT,MPI_SUM,comm); - MPI_Allreduce(&total,&totalGlobal,1,MPI_INT,MPI_SUM,comm); + countGlobal = comm.sumReduce( count ); + totalGlobal = comm.sumReduce( total ); float porosity = float(totalGlobal-countGlobal)/totalGlobal; @@ -321,8 +321,8 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&count,&countGlobal,1,MPI_INT,MPI_SUM,comm); - MPI_Allreduce(&total,&totalGlobal,1,MPI_INT,MPI_SUM,comm); + countGlobal = comm.sumReduce( count ); + totalGlobal = comm.sumReduce( total ); float saturation = float(countGlobal)/totalGlobal; if (rank==0) printf("wetting phase saturation=%f\n",saturation); diff --git a/tests/lbpm_segmented_pp.cpp b/tests/lbpm_segmented_pp.cpp index 39cf0bd1..484a11e2 100644 --- a/tests/lbpm_segmented_pp.cpp +++ b/tests/lbpm_segmented_pp.cpp @@ -180,7 +180,7 @@ int main(int argc, char **argv) fflush(stdout); porosity = ReadFromBlock(Dm->id,Dm->iproc(),Dm->jproc(),Dm->kproc(),nx,ny,nz); - MPI_Barrier(MPI_COMM_WORLD); + comm.barrier(); if (rank==0) printf("Writing local ID files (poros=%f) \n",porosity); fflush(stdout); FILE *ID = fopen(LocalRankFilename,"wb"); diff --git a/tests/lbpm_sphere_pp.cpp b/tests/lbpm_sphere_pp.cpp index 2e053eed..0df11b96 100644 --- a/tests/lbpm_sphere_pp.cpp +++ b/tests/lbpm_sphere_pp.cpp @@ -38,8 +38,6 @@ int main(int argc, char **argv) int rank_xz,rank_XZ,rank_xZ,rank_Xz; int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** - MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; if (rank == 0){ printf("********************************************************\n"); @@ -125,10 +123,10 
@@ int main(int argc, char **argv) if (rank == 0) ReadSpherePacking(nspheres,cx,cy,cz,rad); comm.barrier(); // Broadcast the sphere packing to all processes - MPI_Bcast(cx,nspheres,MPI_DOUBLE,0,comm); - MPI_Bcast(cy,nspheres,MPI_DOUBLE,0,comm); - MPI_Bcast(cz,nspheres,MPI_DOUBLE,0,comm); - MPI_Bcast(rad,nspheres,MPI_DOUBLE,0,comm); + comm.bcast(cx,nspheres,0); + comm.bcast(cy,nspheres,0); + comm.bcast(cz,nspheres,0); + comm.bcast(rad,nspheres,0); //........................................................................... comm.barrier(); if (rank == 0) cout << "Domain set." << endl; @@ -144,7 +142,7 @@ int main(int argc, char **argv) D = 6.0*(Nx-2)*nprocx*totVol / totArea / Lx; printf("Sauter Mean Diameter (computed from sphere packing) = %f \n",D); } - MPI_Bcast(&D,1,MPI_DOUBLE,0,comm); + comm.bcast(&D,1,0); //....................................................................... SignedDistance(SignDist.data(),nspheres,cx,cy,cz,rad,Lx,Ly,Lz,Nx,Ny,Nz, @@ -177,7 +175,7 @@ int main(int argc, char **argv) } } sum_local = 1.0*sum; - MPI_Allreduce(&sum_local,&porosity,1,MPI_DOUBLE,MPI_SUM,comm); + porosity = comm.sumReduce( sum_local ); porosity = porosity*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); @@ -193,7 +191,7 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); + pore_vol = comm.sumReduce( sum_local ); //......................................................... // don't perform computations at the eight corners diff --git a/tests/lbpm_squaretube_pp.cpp b/tests/lbpm_squaretube_pp.cpp index c1f05aee..a4ee5f60 100644 --- a/tests/lbpm_squaretube_pp.cpp +++ b/tests/lbpm_squaretube_pp.cpp @@ -30,9 +30,6 @@ int main(int argc, char **argv) int rank_xy,rank_XY,rank_xY,rank_Xy; int rank_xz,rank_XZ,rank_xZ,rank_Xz; int rank_yz,rank_YZ,rank_yZ,rank_Yz; - //********************************** - MPI_Request req1[18],req2[18]; - MPI_Status stat1[18],stat2[18]; int ORIENTATION=2; //default: the tube is aligned with Z axis //ORIENTATION = 0: tube is aligned with X axis @@ -83,16 +80,16 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); // Computational domain - MPI_Bcast(&Nx,1,MPI_INT,0,comm); - MPI_Bcast(&Ny,1,MPI_INT,0,comm); - MPI_Bcast(&Nz,1,MPI_INT,0,comm); - MPI_Bcast(&nprocx,1,MPI_INT,0,comm); - MPI_Bcast(&nprocy,1,MPI_INT,0,comm); - MPI_Bcast(&nprocz,1,MPI_INT,0,comm); - MPI_Bcast(&nspheres,1,MPI_INT,0,comm); - MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); - MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); + comm.bcast(&Nx,1,0); + comm.bcast(&Ny,1,0); + comm.bcast(&Nz,1,0); + comm.bcast(&nprocx,1,0); + comm.bcast(&nprocy,1,0); + comm.bcast(&nprocz,1,0); + comm.bcast(&nspheres,1,0); + comm.bcast(&Lx,1,0); + comm.bcast(&Ly,1,0); + comm.bcast(&Lz,1,0); //................................................. comm.barrier(); @@ -235,7 +232,7 @@ int main(int argc, char **argv) } } } - MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); + pore_vol = comm.sumReduce( sum_local ); //......................................................... 
// don't perform computations at the eight corners From 2a66e63672084b687677c891dc4b44001f92188b Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Wed, 29 Jan 2020 17:14:48 -0500 Subject: [PATCH 031/121] add pressure BC for abs-perm simulator; need validation test for this --- models/GreyscaleModel.cpp | 62 +++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/models/GreyscaleModel.cpp b/models/GreyscaleModel.cpp index 36f853b1..0499951f 100644 --- a/models/GreyscaleModel.cpp +++ b/models/GreyscaleModel.cpp @@ -44,7 +44,7 @@ void ScaLBL_GreyscaleModel::ReadParams(string filename){ flux=0.0; dp = 10.0; //unit of 'dp': voxel - // Greyscale Model parameters + // ---------------------- Greyscale Model parameters -----------------------// if (greyscale_db->keyExists( "timestepMax" )){ timestepMax = greyscale_db->getScalar( "timestepMax" ); } @@ -77,10 +77,14 @@ void ScaLBL_GreyscaleModel::ReadParams(string filename){ if (greyscale_db->keyExists( "tolerance" )){ tolerance = greyscale_db->getScalar( "tolerance" ); } + // ------------------------------------------------------------------------// + + //------------------------ Other Domain parameters ------------------------// BoundaryCondition = 0; if (domain_db->keyExists( "BC" )){ BoundaryCondition = domain_db->getScalar( "BC" ); } + // ------------------------------------------------------------------------// } void ScaLBL_GreyscaleModel::SetDomain(){ @@ -366,6 +370,9 @@ void ScaLBL_GreyscaleModel::Create(){ void ScaLBL_GreyscaleModel::Initialize(){ if (rank==0) printf ("Initializing distributions \n"); + //TODO: for BGK, you need to consider voxel porosity + // for IMRT, the whole set of feq is different + // if in the future you have different collison mode, need to write two set of initialization functions ScaLBL_D3Q19_Init(fq, Np); if (Restart == true){ @@ -431,21 +438,36 @@ void ScaLBL_GreyscaleModel::Run(){ double flow_rate_previous = 0.0; while (timestep < timestepMax && error > tolerance) { //************************************************************************/ + // *************ODD TIMESTEP*************// timestep++; ScaLBL_Comm->SendD3Q19AA(fq); //READ FROM NORMAL - //ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); - ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); + ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); + //ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE - //ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); - ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); + ScaLBL_DeviceBarrier(); + // Set BCs + if (BoundaryCondition == 3){ + ScaLBL_Comm->D3Q19_Pressure_BC_z(NeighborList, fq, din, timestep); + ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); + } + ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); + 
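// With BoundaryCondition == 3 the updated Run() loop applies constant-pressure
// boundaries on the z faces using din and dout.  A minimal input-database sketch to
// exercise this path; only "BC", "timestepMax" and "tolerance" are confirmed by
// ReadParams above, the remaining key names (including "din" and "dout") are assumed
// for illustration:
//
//     Domain {
//         BC = 3               // pressure boundary condition on the z inlet/outlet
//     }
//     Greyscale {
//         timestepMax = 100000
//         tolerance   = 1.0e-6
//         din  = 1.001         // inlet value passed to D3Q19_Pressure_BC_z (assumed key)
//         dout = 0.999         // outlet value passed to D3Q19_Pressure_BC_Z (assumed key)
//     }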
//ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + + // *************EVEN TIMESTEP*************// timestep++; ScaLBL_Comm->SendD3Q19AA(fq); //READ FORM NORMAL - //ScaLBL_D3Q19_AAeven_Greyscale(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); - ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); + ScaLBL_D3Q19_AAeven_Greyscale(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); + //ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE - //ScaLBL_D3Q19_AAeven_Greyscale(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); - ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); + ScaLBL_DeviceBarrier(); + // Set BCs + if (BoundaryCondition == 3){ + ScaLBL_Comm->D3Q19_Pressure_BC_z(NeighborList, fq, din, timestep); + ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); + } + ScaLBL_D3Q19_AAeven_Greyscale(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); + //ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); ScaLBL_DeviceBarrier(); MPI_Barrier(comm); //************************************************************************/ @@ -461,12 +483,28 @@ void ScaLBL_GreyscaleModel::Run(){ double px_loc,py_loc,pz_loc; double px,py,pz; double mass_loc,mass_glb; + + //parameters for domain average + int64_t i,j,k,n,imin,jmin,kmin,kmax; + // If external boundary conditions are set, do not average over the inlet and outlet + kmin=1; kmax=Nz-1; + //In case user forgets to specify the inlet/outlet buffer layers for BC>0 + if (BoundaryCondition > 0 && Dm->kproc() == 0) kmin=4; + if (BoundaryCondition > 0 && Dm->kproc() == Dm->nprocz()-1) kmax=Nz-4; + + imin=jmin=1; + // If inlet/outlet layers exist use these as default + //if (Dm->inlet_layers_x > 0) imin = Dm->inlet_layers_x; + //if (Dm->inlet_layers_y > 0) jmin = Dm->inlet_layers_y; + if (BoundaryCondition > 0 && Dm->inlet_layers_z > 0 && Dm->kproc() == 0) kmin = 1 + Dm->inlet_layers_z;//"1" indicates the halo layer + if (BoundaryCondition > 0 && Dm->outlet_layers_z > 0 && Dm->kproc() == Dm->nprocz()-1) kmax = Nz-1 - Dm->outlet_layers_z; + px_loc = py_loc = pz_loc = 0.f; mass_loc = 0.f; - for (int k=1; k 0){ px_loc += Velocity_x(i,j,k)*Den*PorosityMap(i,j,k); py_loc += Velocity_y(i,j,k)*Den*PorosityMap(i,j,k); From 6e7cb832546f5064551019cc713e8f7c63ee9203 Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Wed, 29 Jan 2020 23:49:36 -0500 Subject: [PATCH 032/121] add pressure to output data --- common/ScaLBL.h | 8 ++++---- cpu/Greyscale.cpp | 20 ++++++++++++++++---- gpu/Greyscale.cu | 36 ++++++++++++++++++++++++------------ models/GreyscaleModel.cpp | 35 ++++++++++++++++++++++++++--------- models/GreyscaleModel.h | 3 ++- 5 files changed, 72 insertions(+), 30 deletions(-) diff --git a/common/ScaLBL.h b/common/ScaLBL.h index d2495e3f..04cfbd97 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -57,16 +57,16 @@ 
extern "C" void ScaLBL_D3Q19_AAodd_BGK(int *neighborList, double *dist, int star // GREYSCALE MODEL extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz, - double *Poros,double *Perm, double *Velocity); + double *Poros,double *Perm, double *Velocity,double *Pressure); extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz, - double *Poros,double *Perm, double *Velocity); + double *Poros,double *Perm, double *Velocity,double *Pressure); extern "C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz, - double *Poros,double *Perm, double *Velocity,double Den); + double *Poros,double *Perm, double *Velocity,double Den,double *Pressure); extern "C" void ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz, - double *Poros,double *Perm, double *Velocity,double Den); + double *Poros,double *Perm, double *Velocity,double Den,double *Pressure); // MRT MODEL diff --git a/cpu/Greyscale.cpp b/cpu/Greyscale.cpp index f2be769e..d1bde7f2 100644 --- a/cpu/Greyscale.cpp +++ b/cpu/Greyscale.cpp @@ -1,11 +1,12 @@ #include extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Gx, double Gy, double Gz, - double *Poros,double *Perm, double *Velocity){ + double *Poros,double *Perm, double *Velocity, double *Pressure){ int n; // conserved momemnts double rho,vx,vy,vz,v_mag; double ux,uy,uz,u_mag; + double pressure; //double uu; // non-conserved moments double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18; @@ -48,6 +49,7 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finis if (porosity==1.0) c1 = 0.0;//i.e. apparent pore nodes rho = f0+f2+f1+f4+f3+f6+f5+f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18+f17; + pressure = rho/porosity/3.0; vx = (f1-f2+f7-f8+f9-f10+f11-f12+f13-f14)/rho+0.5*porosity*Gx; vy = (f3-f4+f7-f8-f9+f10+f15-f16+f17-f18)/rho+0.5*porosity*Gy; vz = (f5-f6+f11-f12-f13+f14+f15-f16-f17+f18)/rho+0.5*porosity*Gz; @@ -159,15 +161,18 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finis Velocity[0*Np+n] = ux; Velocity[1*Np+n] = uy; Velocity[2*Np+n] = uz; + //Update pressure on device + Pressure[n] = pressure; } } extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Gx, double Gy, double Gz, - double *Poros,double *Perm, double *Velocity){ + double *Poros,double *Perm, double *Velocity,double *Pressure){ int n; // conserved momemnts double rho,vx,vy,vz,v_mag; double ux,uy,uz,u_mag; + double pressure; //double uu; // non-conserved moments double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18; @@ -266,6 +271,7 @@ extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, in if (porosity==1.0) c1 = 0.0;//i.e. 
apparent pore nodes rho = f0+f2+f1+f4+f3+f6+f5+f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18+f17; + pressure = rho/porosity/3.0; vx = (f1-f2+f7-f8+f9-f10+f11-f12+f13-f14)/rho+0.5*porosity*Gx; vy = (f3-f4+f7-f8-f9+f10+f15-f16+f17-f18)/rho+0.5*porosity*Gy; vz = (f5-f6+f11-f12-f13+f14+f15-f16-f17+f18)/rho+0.5*porosity*Gz; @@ -377,12 +383,14 @@ extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, in Velocity[0*Np+n] = ux; Velocity[1*Np+n] = uy; Velocity[2*Np+n] = uz; + //Update pressure on device + Pressure[n] = pressure; } } extern "C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int finish, int Np, double rlx, double Gx, double Gy, double Gz, - double *Poros,double *Perm, double *Velocity, double Den){ + double *Poros,double *Perm, double *Velocity, double Den,double *Pressure){ int n; double vx,vy,vz,v_mag; double ux,uy,uz,u_mag; @@ -837,11 +845,13 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int Velocity[0*Np+n] = ux; Velocity[1*Np+n] = uy; Velocity[2*Np+n] = uz; + //Update pressure on device + Pressure[n] = pressure; } } extern "C" void ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Gx, double Gy, double Gz, - double *Poros,double *Perm, double *Velocity, double Den){ + double *Poros,double *Perm, double *Velocity, double Den,double *Pressure){ int n, nread; double vx,vy,vz,v_mag; double ux,uy,uz,u_mag; @@ -1332,6 +1342,8 @@ extern "C" void ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double *dis Velocity[0*Np+n] = ux; Velocity[1*Np+n] = uy; Velocity[2*Np+n] = uz; + //Update pressure on device + Pressure[n] = pressure; } } diff --git a/gpu/Greyscale.cu b/gpu/Greyscale.cu index 5b8273fe..12ef6f17 100644 --- a/gpu/Greyscale.cu +++ b/gpu/Greyscale.cu @@ -4,11 +4,12 @@ #define NTHREADS 256 __global__ void dvc_ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Gx, double Gy, double Gz, - double *Poros,double *Perm, double *Velocity){ + double *Poros,double *Perm, double *Velocity, double *Pressure){ int n; // conserved momemnts double rho,vx,vy,vz,v_mag; double ux,uy,uz,u_mag; + double pressure; //double uu; // non-conserved moments double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18; @@ -56,6 +57,7 @@ __global__ void dvc_ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int f if (porosity==1.0) c1 = 0.0;//i.e. 
apparent pore nodes rho = f0+f2+f1+f4+f3+f6+f5+f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18+f17; + pressure = rho/porosity/3.0; vx = (f1-f2+f7-f8+f9-f10+f11-f12+f13-f14)/rho+0.5*porosity*Gx; vy = (f3-f4+f7-f8-f9+f10+f15-f16+f17-f18)/rho+0.5*porosity*Gy; vz = (f5-f6+f11-f12-f13+f14+f15-f16-f17+f18)/rho+0.5*porosity*Gz; @@ -167,17 +169,20 @@ __global__ void dvc_ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int f Velocity[0*Np+n] = ux; Velocity[1*Np+n] = uy; Velocity[2*Np+n] = uz; + //Update pressure on device + Pressure[n] = pressure; } } } __global__ void dvc_ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Gx, double Gy, double Gz, - double *Poros,double *Perm, double *Velocity){ + double *Poros,double *Perm, double *Velocity, double *Pressure){ int n; // conserved momemnts double rho,vx,vy,vz,v_mag; double ux,uy,uz,u_mag; + double pressure; //double uu; // non-conserved moments double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18; @@ -279,6 +284,7 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist if (porosity==1.0) c1 = 0.0;//i.e. apparent pore nodes rho = f0+f2+f1+f4+f3+f6+f5+f8+f7+f10+f9+f12+f11+f14+f13+f16+f15+f18+f17; + pressure = rho/porosity/3.0; vx = (f1-f2+f7-f8+f9-f10+f11-f12+f13-f14)/rho+0.5*porosity*Gx; vy = (f3-f4+f7-f8-f9+f10+f15-f16+f17-f18)/rho+0.5*porosity*Gy; vz = (f5-f6+f11-f12-f13+f14+f15-f16-f17+f18)/rho+0.5*porosity*Gz; @@ -390,12 +396,14 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist Velocity[0*Np+n] = ux; Velocity[1*Np+n] = uy; Velocity[2*Np+n] = uz; + //Update pressure on device + Pressure[n] = pressure; } } } __global__ void dvc_ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int finish, int Np, double rlx, double Gx, double Gy, double Gz, - double *Poros,double *Perm, double *Velocity, double Den){ + double *Poros,double *Perm, double *Velocity, double Den, double *Pressure){ int n; double vx,vy,vz,v_mag; @@ -857,6 +865,8 @@ __global__ void dvc_ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, Velocity[0*Np+n] = ux; Velocity[1*Np+n] = uy; Velocity[2*Np+n] = uz; + //Update pressure on device + Pressure[n] = pressure; } } @@ -864,7 +874,7 @@ __global__ void dvc_ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, __global__ void dvc_ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Gx, double Gy, double Gz, - double *Poros,double *Perm, double *Velocity,double Den){ + double *Poros,double *Perm, double *Velocity,double Den, double *Pressure){ int n, nread; double vx,vy,vz,v_mag; @@ -1361,15 +1371,17 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double Velocity[0*Np+n] = ux; Velocity[1*Np+n] = uy; Velocity[2*Np+n] = uz; + //Update pressure on device + Pressure[n] = pressure; } } } -extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity){ +extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity,double *Pressure){ - dvc_ScaLBL_D3Q19_AAeven_Greyscale<<>>(dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity); + dvc_ScaLBL_D3Q19_AAeven_Greyscale<<>>(dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity,Pressure); cudaError_t err = 
cudaGetLastError(); if (cudaSuccess != err){ @@ -1377,9 +1389,9 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finis } } -extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity){ +extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity,double *Pressure){ - dvc_ScaLBL_D3Q19_AAodd_Greyscale<<>>(neighborList,dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity); + dvc_ScaLBL_D3Q19_AAodd_Greyscale<<>>(neighborList,dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity,Pressure); cudaError_t err = cudaGetLastError(); if (cudaSuccess != err){ @@ -1387,9 +1399,9 @@ extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, in } } -extern "C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity,double Den){ +extern "C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity,double Den,double *Pressure){ - dvc_ScaLBL_D3Q19_AAeven_Greyscale_IMRT<<>>(dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity,Den); + dvc_ScaLBL_D3Q19_AAeven_Greyscale_IMRT<<>>(dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity,Den,Pressure); cudaError_t err = cudaGetLastError(); if (cudaSuccess != err){ @@ -1397,9 +1409,9 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int } } -extern "C" void ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity,double Den){ +extern "C" void ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity,double Den,double *Pressure){ - dvc_ScaLBL_D3Q19_AAodd_Greyscale_IMRT<<>>(neighborList,dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity,Den); + dvc_ScaLBL_D3Q19_AAodd_Greyscale_IMRT<<>>(neighborList,dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity,Den,Pressure); cudaError_t err = cudaGetLastError(); if (cudaSuccess != err){ diff --git a/models/GreyscaleModel.cpp b/models/GreyscaleModel.cpp index 0499951f..018af5ec 100644 --- a/models/GreyscaleModel.cpp +++ b/models/GreyscaleModel.cpp @@ -104,6 +104,7 @@ void ScaLBL_GreyscaleModel::SetDomain(){ Velocity_y.resize(Nx,Ny,Nz); Velocity_z.resize(Nx,Ny,Nz); PorosityMap.resize(Nx,Ny,Nz); + Pressure.resize(Nx,Ny,Nz); id = new signed char [N]; for (int i=0; iid[i] = 1; // initialize this way @@ -320,7 +321,7 @@ void ScaLBL_GreyscaleModel::Create(){ ScaLBL_AllocateDeviceMemory((void **) &fq, 19*dist_mem_size); ScaLBL_AllocateDeviceMemory((void **) &Permeability, sizeof(double)*Np); ScaLBL_AllocateDeviceMemory((void **) &Porosity, sizeof(double)*Np); - ScaLBL_AllocateDeviceMemory((void **) &Pressure, sizeof(double)*Np); + ScaLBL_AllocateDeviceMemory((void **) &Pressure_dvc, sizeof(double)*Np); ScaLBL_AllocateDeviceMemory((void **) &Velocity, 3*sizeof(double)*Np); //........................................................................... 
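// In the updated kernels the new Pressure field is computed as
// pressure = rho/porosity/3.0, i.e. p = cs^2 * rho / phi with the D3Q19 sound speed
// cs^2 = 1/3 and phi the local grey-voxel porosity.  For example, a node with
// rho = 1.0 and porosity = 0.5 reports p = 1.0/(3*0.5) ~= 0.67 in lattice units,
// while an open pore (porosity = 1.0) reduces to the usual p = rho/3.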
// Update GPU data structures @@ -441,8 +442,8 @@ void ScaLBL_GreyscaleModel::Run(){ // *************ODD TIMESTEP*************// timestep++; ScaLBL_Comm->SendD3Q19AA(fq); //READ FROM NORMAL - ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); - //ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); + //ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); + ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE ScaLBL_DeviceBarrier(); // Set BCs @@ -450,15 +451,15 @@ void ScaLBL_GreyscaleModel::Run(){ ScaLBL_Comm->D3Q19_Pressure_BC_z(NeighborList, fq, din, timestep); ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); } - ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); - //ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); + //ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); + ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); ScaLBL_DeviceBarrier(); MPI_Barrier(comm); // *************EVEN TIMESTEP*************// timestep++; ScaLBL_Comm->SendD3Q19AA(fq); //READ FORM NORMAL - ScaLBL_D3Q19_AAeven_Greyscale(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); - //ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); + //ScaLBL_D3Q19_AAeven_Greyscale(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); + ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE ScaLBL_DeviceBarrier(); // Set BCs @@ -466,8 +467,8 @@ void ScaLBL_GreyscaleModel::Run(){ ScaLBL_Comm->D3Q19_Pressure_BC_z(NeighborList, fq, din, timestep); ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); } - ScaLBL_D3Q19_AAeven_Greyscale(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity); - //ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den); + //ScaLBL_D3Q19_AAeven_Greyscale(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); + ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); ScaLBL_DeviceBarrier(); MPI_Barrier(comm); //************************************************************************/ @@ -476,6 +477,7 @@ void ScaLBL_GreyscaleModel::Run(){ 
ScaLBL_Comm->RegularLayout(Map,&Velocity[Np],Velocity_y); ScaLBL_Comm->RegularLayout(Map,&Velocity[2*Np],Velocity_z); ScaLBL_Comm->RegularLayout(Map,Porosity,PorosityMap); + //ScaLBL_Comm->RegularLayout(Map,Pressure_dvc,Pressure); double count_loc=0; double count; @@ -678,6 +680,7 @@ void ScaLBL_GreyscaleModel::VelocityField(){ auto VyVar = std::make_shared(); auto VzVar = std::make_shared(); auto SignDistVar = std::make_shared(); + auto PressureVar = std::make_shared(); IO::initialize("","silo","false"); // Create the MeshDataStruct @@ -706,20 +709,34 @@ void ScaLBL_GreyscaleModel::VelocityField(){ VzVar->data.resize(Dm->Nx-2,Dm->Ny-2,Dm->Nz-2); visData[0].vars.push_back(VzVar); + PressureVar->name = "Pressure"; + PressureVar->type = IO::VariableType::VolumeVariable; + PressureVar->dim = 1; + PressureVar->data.resize(Dm->Nx-2,Dm->Ny-2,Dm->Nz-2); + visData[0].vars.push_back(PressureVar); + Array& SignData = visData[0].vars[0]->data; Array& VelxData = visData[0].vars[1]->data; Array& VelyData = visData[0].vars[2]->data; Array& VelzData = visData[0].vars[3]->data; + Array& PressureData = visData[0].vars[4]->data; ASSERT(visData[0].vars[0]->name=="SignDist"); ASSERT(visData[0].vars[1]->name=="Velocity_x"); ASSERT(visData[0].vars[2]->name=="Velocity_y"); ASSERT(visData[0].vars[3]->name=="Velocity_z"); + ASSERT(visData[0].vars[4]->name=="Pressure"); + ScaLBL_Comm->RegularLayout(Map,&Velocity[0],Velocity_x); + ScaLBL_Comm->RegularLayout(Map,&Velocity[Np],Velocity_y); + ScaLBL_Comm->RegularLayout(Map,&Velocity[2*Np],Velocity_z); + ScaLBL_Comm->RegularLayout(Map,Pressure_dvc,Pressure); + fillData.copy(SignDist,SignData); fillData.copy(Velocity_x,VelxData); fillData.copy(Velocity_y,VelyData); fillData.copy(Velocity_z,VelzData); + fillData.copy(Pressure,PressureData); IO::writeData( timestep, visData, Dm->Comm ); diff --git a/models/GreyscaleModel.h b/models/GreyscaleModel.h index d1399053..a7a5f528 100644 --- a/models/GreyscaleModel.h +++ b/models/GreyscaleModel.h @@ -65,13 +65,14 @@ public: double *Permeability;//grey voxel permeability double *Porosity; double *Velocity; - double *Pressure; + double *Pressure_dvc; IntArray Map; DoubleArray SignDist; DoubleArray Velocity_x; DoubleArray Velocity_y; DoubleArray Velocity_z; DoubleArray PorosityMap; + DoubleArray Pressure; private: MPI_Comm comm; From 25df1e0f3522ffcd1ad51e3de90892e2894ac51c Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Thu, 30 Jan 2020 13:23:27 -0500 Subject: [PATCH 033/121] add a few print-out to make the program output more verbose --- common/Domain.cpp | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/common/Domain.cpp b/common/Domain.cpp index 48bfed15..1be64859 100644 --- a/common/Domain.cpp +++ b/common/Domain.cpp @@ -434,7 +434,7 @@ void Domain::Decomp(std::string Filename) } if (inlet_layers_z > 0){ - printf("Checkerboard pattern at z inlet for %i layers \n",inlet_layers_z); + printf("Checkerboard pattern at z inlet for %i layers, saturated with phase label=%i \n",inlet_layers_z,inlet_layers_phase); // use checkerboard pattern for (int k = zStart; k < zStart+inlet_layers_z; k++){ for (int j = 0; j 0){ - printf("Checkerboard pattern at z outlet for %i layers \n",outlet_layers_z); + printf("Checkerboard pattern at z outlet for %i layers, saturated with phase label=%i \n",outlet_layers_z,outlet_layers_phase); // use checkerboard pattern for (int k = zStart + nz*nprocz - outlet_layers_z; k < zStart + nz*nprocz; k++){ for (int j = 0; j 0 && kproc() == 0){ - if (inlet_layers_z < 4) 
inlet_layers_z=4; + if (inlet_layers_z < 4){ + inlet_layers_z=4; + if(RANK==0){ + printf("NOTE:Non-periodic BC is applied, but the number of Z-inlet layers is not specified (or is smaller than 3 voxels) \n"); + printf(" the number of Z-inlet layer is reset to %i voxels, saturated with phase label=%i",inlet_layers_z-1,inlet_layers_phase); + } + } for (int k=0; k 0 && kproc() == nprocz-1){ - if (outlet_layers_z < 4) outlet_layers_z=4; + if (outlet_layers_z < 4){ + outlet_layers_z=4; + if(RANK==0){ + printf("NOTE:Non-periodic BC is applied, but the number of Z-outlet layers is not specified (or is smaller than 3 voxels) \n"); + printf(" the number of Z-outlet layer is reset to %i voxels, saturated with phase label=%i",outlet_layers_z-1,outlet_layers_phase); + } + } for (int k=Nz-outlet_layers_z; k Date: Thu, 30 Jan 2020 17:57:56 -0500 Subject: [PATCH 034/121] fix printf bug --- common/Domain.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/common/Domain.cpp b/common/Domain.cpp index 1be64859..1028a0ef 100644 --- a/common/Domain.cpp +++ b/common/Domain.cpp @@ -597,8 +597,7 @@ void Domain::Decomp(std::string Filename) if (inlet_layers_z < 4){ inlet_layers_z=4; if(RANK==0){ - printf("NOTE:Non-periodic BC is applied, but the number of Z-inlet layers is not specified (or is smaller than 3 voxels) \n"); - printf(" the number of Z-inlet layer is reset to %i voxels, saturated with phase label=%i",inlet_layers_z-1,inlet_layers_phase); + printf("NOTE:Non-periodic BC is applied, but the number of Z-inlet layers is not specified (or is smaller than 3 voxels) \n the number of Z-inlet layer is reset to %i voxels, saturated with phase label=%i \n",inlet_layers_z-1,inlet_layers_phase); } } for (int k=0; k 0 && kproc() == nprocz-1){ if (outlet_layers_z < 4){ outlet_layers_z=4; - if(RANK==0){ - printf("NOTE:Non-periodic BC is applied, but the number of Z-outlet layers is not specified (or is smaller than 3 voxels) \n"); - printf(" the number of Z-outlet layer is reset to %i voxels, saturated with phase label=%i",outlet_layers_z-1,outlet_layers_phase); + if(RANK==nprocs-1){ + printf("NOTE:Non-periodic BC is applied, but the number of Z-outlet layers is not specified (or is smaller than 3 voxels) \n the number of Z-outlet layer is reset to %i voxels, saturated with phase label=%i \n",outlet_layers_z-1,outlet_layers_phase); } } for (int k=Nz-outlet_layers_z; k Date: Fri, 31 Jan 2020 15:15:26 -0500 Subject: [PATCH 035/121] Some updates:(1)add different fq initialization for BGK and IMRT;(2)user can choose collision model --- common/ScaLBL.h | 4 +++ cpu/D3Q19.cpp | 27 ++++++++++++++ gpu/D3Q19.cu | 40 ++++++++++++++++++++- models/GreyscaleModel.cpp | 76 +++++++++++++++++++++++++++++++++------ models/GreyscaleModel.h | 1 + 5 files changed, 136 insertions(+), 12 deletions(-) diff --git a/common/ScaLBL.h b/common/ScaLBL.h index 04cfbd97..007fda34 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -46,6 +46,7 @@ extern "C" void ScaLBL_UnpackDenD3Q7(int *list, int count, double *recvbuf, int extern "C" void ScaLBL_D3Q19_Init(double *Dist, int Np); + extern "C" void ScaLBL_D3Q19_Momentum(double *dist, double *vel, int Np); extern "C" void ScaLBL_D3Q19_Pressure(double *dist, double *press, int Np); @@ -56,6 +57,9 @@ extern "C" void ScaLBL_D3Q19_AAeven_BGK(double *dist, int start, int finish, int extern "C" void ScaLBL_D3Q19_AAodd_BGK(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz); // GREYSCALE MODEL + +extern "C" void 
ScaLBL_D3Q19_GreyIMRT_Init(double *Dist, int Np, double Den); + extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz, double *Poros,double *Perm, double *Velocity,double *Pressure); diff --git a/cpu/D3Q19.cpp b/cpu/D3Q19.cpp index 2af59883..244bb3d2 100644 --- a/cpu/D3Q19.cpp +++ b/cpu/D3Q19.cpp @@ -84,6 +84,33 @@ extern "C" void ScaLBL_D3Q19_Init(double *dist, int Np) } } + +extern "C" void ScaLBL_D3Q19_GreyIMRT_Init(double *dist, int Np, double Den) +{ + int n; + for (n=0; n>>(dist, Np); cudaError_t err = cudaGetLastError(); if (cudaSuccess != err){ - printf("CUDA error in ScaLBL_D3Q19_AA_Init: %s \n",cudaGetErrorString(err)); + printf("CUDA error in ScaLBL_D3Q19_Init: %s \n",cudaGetErrorString(err)); + } +} + +extern "C" void ScaLBL_D3Q19_GreyIMRT_Init(double *dist, int Np, double Den){ + dvc_ScaLBL_D3Q19_GreyIMRT_Init<<>>(dist, Np, Den); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_GreyIMRT_Init: %s \n",cudaGetErrorString(err)); } } diff --git a/models/GreyscaleModel.cpp b/models/GreyscaleModel.cpp index 018af5ec..5f8e4e36 100644 --- a/models/GreyscaleModel.cpp +++ b/models/GreyscaleModel.cpp @@ -43,6 +43,7 @@ void ScaLBL_GreyscaleModel::ReadParams(string filename){ din=dout=1.0; flux=0.0; dp = 10.0; //unit of 'dp': voxel + CollisionType = 1; //1: IMRT; 2: BGK // ---------------------- Greyscale Model parameters -----------------------// if (greyscale_db->keyExists( "timestepMax" )){ @@ -77,6 +78,10 @@ void ScaLBL_GreyscaleModel::ReadParams(string filename){ if (greyscale_db->keyExists( "tolerance" )){ tolerance = greyscale_db->getScalar( "tolerance" ); } + auto collision = greyscale_db->getWithDefault( "collision", "IMRT" ); + if (collision == "BGK"){ + CollisionType=2; + } // ------------------------------------------------------------------------// //------------------------ Other Domain parameters ------------------------// @@ -374,7 +379,20 @@ void ScaLBL_GreyscaleModel::Initialize(){ //TODO: for BGK, you need to consider voxel porosity // for IMRT, the whole set of feq is different // if in the future you have different collison mode, need to write two set of initialization functions - ScaLBL_D3Q19_Init(fq, Np); + if (CollisionType==1){ + ScaLBL_D3Q19_GreyIMRT_Init(fq, Np, Den); + if (rank==0) printf("Collision model: Incompressible MRT.\n"); + } + else if (CollisionType==2){ + ScaLBL_D3Q19_Init(fq, Np); + if (rank==0) printf("Collision model: BGK.\n"); + } + else{ + if (rank==0) printf("Unknown collison type! 
IMRT collision is used.\n"); + ScaLBL_D3Q19_GreyIMRT_Init(fq, Np, Den); + CollisionType=1; + greyscale_db->putScalar( "collision", "IMRT" ); + } if (Restart == true){ if (rank==0){ @@ -442,8 +460,17 @@ void ScaLBL_GreyscaleModel::Run(){ // *************ODD TIMESTEP*************// timestep++; ScaLBL_Comm->SendD3Q19AA(fq); //READ FROM NORMAL - //ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); - ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + switch (CollisionType){ + case 1: + ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + break; + case 2: + ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); + break; + default: + ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + break; + } ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE ScaLBL_DeviceBarrier(); // Set BCs @@ -451,25 +478,52 @@ void ScaLBL_GreyscaleModel::Run(){ ScaLBL_Comm->D3Q19_Pressure_BC_z(NeighborList, fq, din, timestep); ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); } - //ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); - ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + switch (CollisionType){ + case 1: + ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + break; + case 2: + ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); + break; + default: + ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + break; + } ScaLBL_DeviceBarrier(); MPI_Barrier(comm); // *************EVEN TIMESTEP*************// timestep++; ScaLBL_Comm->SendD3Q19AA(fq); //READ FORM NORMAL - //ScaLBL_D3Q19_AAeven_Greyscale(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); - ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); - ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE + switch (CollisionType){ + case 1: + ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + break; + case 2: + ScaLBL_D3Q19_AAeven_Greyscale(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); + break; + default: + ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, 
Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + break; + } + ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE ScaLBL_DeviceBarrier(); // Set BCs if (BoundaryCondition == 3){ ScaLBL_Comm->D3Q19_Pressure_BC_z(NeighborList, fq, din, timestep); ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); } - //ScaLBL_D3Q19_AAeven_Greyscale(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); - ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + switch (CollisionType){ + case 1: + ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + break; + case 2: + ScaLBL_D3Q19_AAeven_Greyscale(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); + break; + default: + ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + break; + } + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); //************************************************************************/ if (timestep%analysis_interval==0){ diff --git a/models/GreyscaleModel.h b/models/GreyscaleModel.h index a7a5f528..792e87ea 100644 --- a/models/GreyscaleModel.h +++ b/models/GreyscaleModel.h @@ -35,6 +35,7 @@ public: bool Restart,pBC; int timestep,timestepMax; int BoundaryCondition; + int CollisionType; double tau; double Den;//constant density double tolerance; From ea8fceda8c40f7e904c6999aba617d2e20a78451 Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Fri, 31 Jan 2020 15:42:27 -0500 Subject: [PATCH 036/121] revert to the old velocity averaging method as it is more accurate --- models/GreyscaleModel.cpp | 63 ++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/models/GreyscaleModel.cpp b/models/GreyscaleModel.cpp index 5f8e4e36..4b803272 100644 --- a/models/GreyscaleModel.cpp +++ b/models/GreyscaleModel.cpp @@ -530,15 +530,16 @@ void ScaLBL_GreyscaleModel::Run(){ ScaLBL_Comm->RegularLayout(Map,&Velocity[0],Velocity_x); ScaLBL_Comm->RegularLayout(Map,&Velocity[Np],Velocity_y); ScaLBL_Comm->RegularLayout(Map,&Velocity[2*Np],Velocity_z); - ScaLBL_Comm->RegularLayout(Map,Porosity,PorosityMap); + //ScaLBL_Comm->RegularLayout(Map,Porosity,PorosityMap); //ScaLBL_Comm->RegularLayout(Map,Pressure_dvc,Pressure); double count_loc=0; double count; double vax,vay,vaz; - double px_loc,py_loc,pz_loc; - double px,py,pz; - double mass_loc,mass_glb; + double vax_loc,vay_loc,vaz_loc; + //double px_loc,py_loc,pz_loc; + //double px,py,pz; + //double mass_loc,mass_glb; //parameters for domain average int64_t i,j,k,n,imin,jmin,kmin,kmax; @@ -555,30 +556,51 @@ void ScaLBL_GreyscaleModel::Run(){ if (BoundaryCondition > 0 && Dm->inlet_layers_z > 0 && Dm->kproc() == 0) kmin = 1 + Dm->inlet_layers_z;//"1" indicates the halo layer if (BoundaryCondition > 0 && Dm->outlet_layers_z > 0 && Dm->kproc() == Dm->nprocz()-1) kmax = Nz-1 - Dm->outlet_layers_z; - - px_loc = py_loc = pz_loc = 0.f; - mass_loc = 0.f; +// px_loc = py_loc = pz_loc = 0.f; +// mass_loc = 0.f; +// for (int k=kmin; k 0){ +// px_loc += Velocity_x(i,j,k)*Den*PorosityMap(i,j,k); +// py_loc += Velocity_y(i,j,k)*Den*PorosityMap(i,j,k); +// pz_loc += Velocity_z(i,j,k)*Den*PorosityMap(i,j,k); +// mass_loc += Den*PorosityMap(i,j,k); +// } +// } +// } +// } +// 
MPI_Allreduce(&px_loc, &px, 1,MPI_DOUBLE,MPI_SUM,Mask->Comm); +// MPI_Allreduce(&py_loc, &py, 1,MPI_DOUBLE,MPI_SUM,Mask->Comm); +// MPI_Allreduce(&pz_loc, &pz, 1,MPI_DOUBLE,MPI_SUM,Mask->Comm); +// MPI_Allreduce(&mass_loc,&mass_glb,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); +// +// vax = px/mass_glb; +// vay = py/mass_glb; +// vaz = pz/mass_glb; + + vax_loc = vay_loc = vaz_loc = 0.f; for (int k=kmin; k 0){ - px_loc += Velocity_x(i,j,k)*Den*PorosityMap(i,j,k); - py_loc += Velocity_y(i,j,k)*Den*PorosityMap(i,j,k); - pz_loc += Velocity_z(i,j,k)*Den*PorosityMap(i,j,k); - mass_loc += Den*PorosityMap(i,j,k); + vax_loc += Velocity_x(i,j,k); + vay_loc += Velocity_y(i,j,k); + vaz_loc += Velocity_z(i,j,k); + count_loc+=1.0; } } } } - MPI_Allreduce(&px_loc, &px, 1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - MPI_Allreduce(&py_loc, &py, 1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - MPI_Allreduce(&pz_loc, &pz, 1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - MPI_Allreduce(&mass_loc,&mass_glb,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - - vax = px/mass_glb; - vay = py/mass_glb; - vaz = pz/mass_glb; + MPI_Allreduce(&vax_loc,&vax,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&vay_loc,&vay,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&vaz_loc,&vaz,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&count_loc,&count,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + vax /= count; + vay /= count; + vaz /= count; + double force_mag = sqrt(Fx*Fx+Fy*Fy+Fz*Fz); double dir_x = Fx/force_mag; double dir_y = Fy/force_mag; @@ -590,7 +612,8 @@ void ScaLBL_GreyscaleModel::Run(){ dir_z = 1.0; force_mag = 1.0; } - double flow_rate = (px*dir_x + py*dir_y + pz*dir_z)/mass_glb; + //double flow_rate = (px*dir_x + py*dir_y + pz*dir_z)/mass_glb; + double flow_rate = (vax*dir_x + vay*dir_y + vaz*dir_z); error = fabs(flow_rate - flow_rate_previous) / fabs(flow_rate); flow_rate_previous = flow_rate; From 50e4b5a9baf3a2767c1b69e8ef70a5a4377dbd48 Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Sat, 1 Feb 2020 14:04:39 -0500 Subject: [PATCH 037/121] add the greyscale effective viscosity back, but by default it is set equal to the normal viscosity --- common/ScaLBL.h | 8 ++--- cpu/Greyscale.cpp | 48 ++++++++++++++--------------- gpu/Greyscale.cu | 64 +++++++++++++++++++-------------------- models/GreyscaleModel.cpp | 29 ++++++++++-------- models/GreyscaleModel.h | 1 + 5 files changed, 77 insertions(+), 73 deletions(-) diff --git a/common/ScaLBL.h b/common/ScaLBL.h index 007fda34..447a9b14 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -60,16 +60,16 @@ extern "C" void ScaLBL_D3Q19_AAodd_BGK(int *neighborList, double *dist, int star extern "C" void ScaLBL_D3Q19_GreyIMRT_Init(double *Dist, int Np, double Den); -extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz, +extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double rlx_eff, double Fx, double Fy, double Fz, double *Poros,double *Perm, double *Velocity,double *Pressure); -extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz, +extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double rlx_eff, double Fx, double Fy, double Fz, double *Poros,double *Perm, double *Velocity,double *Pressure); -extern "C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz, +extern 
"C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int finish, int Np, double rlx, double rlx_eff, double Fx, double Fy, double Fz, double *Poros,double *Perm, double *Velocity,double Den,double *Pressure); -extern "C" void ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz, +extern "C" void ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double rlx_eff, double Fx, double Fy, double Fz, double *Poros,double *Perm, double *Velocity,double Den,double *Pressure); diff --git a/cpu/Greyscale.cpp b/cpu/Greyscale.cpp index d1bde7f2..16fad1e0 100644 --- a/cpu/Greyscale.cpp +++ b/cpu/Greyscale.cpp @@ -1,6 +1,6 @@ #include -extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double Gx, double Gy, double Gz, +extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finish, int Np, double rlx, double rlx_eff, double Gx, double Gy, double Gz, double *Poros,double *Perm, double *Velocity, double *Pressure){ int n; // conserved momemnts @@ -14,7 +14,7 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finis double porosity; double perm;//voxel permeability double c0, c1; //Guo's model parameters - double mu = (1.0/rlx-0.5)/3.0;//kinematic viscosity + double mu_eff = (1.0/rlx_eff-0.5)/3.0;//kinematic viscosity double Fx, Fy, Fz;//The total body force including Brinkman force and user-specified (Gx,Gy,Gz) for (int n=start; n>>(dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity,Pressure); + dvc_ScaLBL_D3Q19_AAeven_Greyscale<<>>(dist,start,finish,Np,rlx,rlx_eff,Fx,Fy,Fz,Poros,Perm,Velocity,Pressure); cudaError_t err = cudaGetLastError(); if (cudaSuccess != err){ @@ -1389,9 +1389,9 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finis } } -extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity,double *Pressure){ +extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double rlx_eff, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity,double *Pressure){ - dvc_ScaLBL_D3Q19_AAodd_Greyscale<<>>(neighborList,dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity,Pressure); + dvc_ScaLBL_D3Q19_AAodd_Greyscale<<>>(neighborList,dist,start,finish,Np,rlx,rlx_eff,Fx,Fy,Fz,Poros,Perm,Velocity,Pressure); cudaError_t err = cudaGetLastError(); if (cudaSuccess != err){ @@ -1399,9 +1399,9 @@ extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, in } } -extern "C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity,double Den,double *Pressure){ +extern "C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int finish, int Np, double rlx, double rlx_eff, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity,double Den,double *Pressure){ - dvc_ScaLBL_D3Q19_AAeven_Greyscale_IMRT<<>>(dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity,Den,Pressure); + dvc_ScaLBL_D3Q19_AAeven_Greyscale_IMRT<<>>(dist,start,finish,Np,rlx,rlx_eff,Fx,Fy,Fz,Poros,Perm,Velocity,Den,Pressure); cudaError_t err = cudaGetLastError(); if (cudaSuccess != 
err){ @@ -1409,9 +1409,9 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int } } -extern "C" void ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity,double Den,double *Pressure){ +extern "C" void ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double rlx_eff, double Fx, double Fy, double Fz,double *Poros,double *Perm, double *Velocity,double Den,double *Pressure){ - dvc_ScaLBL_D3Q19_AAodd_Greyscale_IMRT<<>>(neighborList,dist,start,finish,Np,rlx,Fx,Fy,Fz,Poros,Perm,Velocity,Den,Pressure); + dvc_ScaLBL_D3Q19_AAodd_Greyscale_IMRT<<>>(neighborList,dist,start,finish,Np,rlx,rlx_eff,Fx,Fy,Fz,Poros,Perm,Velocity,Den,Pressure); cudaError_t err = cudaGetLastError(); if (cudaSuccess != err){ diff --git a/models/GreyscaleModel.cpp b/models/GreyscaleModel.cpp index 4b803272..79b7a9c7 100644 --- a/models/GreyscaleModel.cpp +++ b/models/GreyscaleModel.cpp @@ -14,7 +14,7 @@ void DeleteArray( const TYPE *p ) } ScaLBL_GreyscaleModel::ScaLBL_GreyscaleModel(int RANK, int NP, MPI_Comm COMM): -rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tau(0),Den(0),Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0),GreyPorosity(0), +rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tau(0),tau_eff(0),Den(0),Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0),GreyPorosity(0), Nx(0),Ny(0),Nz(0),N(0),Np(0),nprocx(0),nprocy(0),nprocz(0),BoundaryCondition(0),Lx(0),Ly(0),Lz(0),comm(COMM) { SignDist.resize(Nx,Ny,Nz); @@ -36,6 +36,7 @@ void ScaLBL_GreyscaleModel::ReadParams(string filename){ // set defaults timestepMax = 100000; tau = 1.0; + tau_eff = tau; Den = 1.0;//constant density tolerance = 0.01; Fx = Fy = Fz = 0.0; @@ -52,6 +53,7 @@ void ScaLBL_GreyscaleModel::ReadParams(string filename){ if (greyscale_db->keyExists( "tau" )){ tau = greyscale_db->getScalar( "tau" ); } + tau_eff = greyscale_db->getWithDefault( "tau_eff", tau ); if (greyscale_db->keyExists( "Den" )){ Den = greyscale_db->getScalar( "Den" ); } @@ -453,6 +455,7 @@ void ScaLBL_GreyscaleModel::Run(){ PROFILE_START("Loop"); auto current_db = db->cloneDatabase(); double rlx = 1.0/tau; + double rlx_eff = 1.0/tau_eff; double error = 1.0; double flow_rate_previous = 0.0; while (timestep < timestepMax && error > tolerance) { @@ -462,13 +465,13 @@ void ScaLBL_GreyscaleModel::Run(){ ScaLBL_Comm->SendD3Q19AA(fq); //READ FROM NORMAL switch (CollisionType){ case 1: - ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, rlx_eff, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); break; case 2: - ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); + ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, rlx_eff, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); break; default: - ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, 
ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, rlx_eff, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); break; } ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE @@ -480,13 +483,13 @@ void ScaLBL_GreyscaleModel::Run(){ } switch (CollisionType){ case 1: - ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, rlx_eff, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); break; case 2: - ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); + ScaLBL_D3Q19_AAodd_Greyscale(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, rlx_eff, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); break; default: - ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + ScaLBL_D3Q19_AAodd_Greyscale_IMRT(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, rlx_eff, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); break; } ScaLBL_DeviceBarrier(); MPI_Barrier(comm); @@ -496,13 +499,13 @@ void ScaLBL_GreyscaleModel::Run(){ ScaLBL_Comm->SendD3Q19AA(fq); //READ FORM NORMAL switch (CollisionType){ case 1: - ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, rlx_eff, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); break; case 2: - ScaLBL_D3Q19_AAeven_Greyscale(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); + ScaLBL_D3Q19_AAeven_Greyscale(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, rlx_eff, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); break; default: - ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx, rlx_eff, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); break; } ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE @@ -514,13 +517,13 @@ void ScaLBL_GreyscaleModel::Run(){ } switch (CollisionType){ case 1: - ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, rlx_eff, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); break; case 2: - ScaLBL_D3Q19_AAeven_Greyscale(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); + ScaLBL_D3Q19_AAeven_Greyscale(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, rlx_eff, Fx, Fy, Fz,Porosity,Permeability,Velocity,Pressure_dvc); break; default: - ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, Fx, Fy, Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); + ScaLBL_D3Q19_AAeven_Greyscale_IMRT(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx, rlx_eff, Fx, Fy, 
Fz,Porosity,Permeability,Velocity,Den,Pressure_dvc); break; } ScaLBL_DeviceBarrier(); MPI_Barrier(comm); diff --git a/models/GreyscaleModel.h b/models/GreyscaleModel.h index 792e87ea..c670239f 100644 --- a/models/GreyscaleModel.h +++ b/models/GreyscaleModel.h @@ -37,6 +37,7 @@ public: int BoundaryCondition; int CollisionType; double tau; + double tau_eff; double Den;//constant density double tolerance; double Fx,Fy,Fz,flux; From 793d294aa33550aa7cefc063f321306bea2c512b Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Sat, 1 Feb 2020 17:03:42 -0500 Subject: [PATCH 038/121] CPU version update: remove the higher-order terms in body force --- cpu/Greyscale.cpp | 422 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 320 insertions(+), 102 deletions(-) diff --git a/cpu/Greyscale.cpp b/cpu/Greyscale.cpp index 16fad1e0..b4b017c8 100644 --- a/cpu/Greyscale.cpp +++ b/cpu/Greyscale.cpp @@ -69,94 +69,173 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int finis Fz=Gz; } + //------------------------ BGK collison where body force has higher-order terms ----------------------------------------------------------// +// // q=0 +// dist[n] = f0*(1.0-rlx)+ rlx*0.3333333333333333*rho*(1. - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// + 0.3333333333333333*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q = 1 +// dist[1*Np+n] = f1*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q=2 +// dist[2*Np+n] = f2*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(-3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q = 3 +// dist[3*Np+n] = f3*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q = 4 +// dist[4*Np+n] = f4*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q = 5 +// dist[5*Np+n] = f5*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(3. + (6.*uz)/porosity)); +// +// // q = 6 +// dist[6*Np+n] = f6*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux+ uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(-3. + (6.*uz)/porosity)); +// +// // q = 7 +// dist[7*Np+n] = f7*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uy) + (4.5*(ux + uy)*(ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(ux + uy))/porosity) + +// Fz*(0. 
- (3.*uz)/porosity)); +// +// // q = 8 +// dist[8*Np+n] = f8*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uy) + (4.5*(-ux - uy)*(-ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uy))/porosity) + Fy*(-3. - (9.*(-ux - uy))/porosity - (3.*uy)/porosity) + +// Fz*(0. - (3.*uz)/porosity)); +// +// // q = 9 +// dist[9*Np+n] = f9*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uy) + (4.5*(ux - uy)*(ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux - uy))/porosity) + Fy*(-3. - (9.*(ux - uy))/porosity - (3.*uy)/porosity) + +// Fz*(0. - (3.*uz)/porosity)); +// +// // q = 10 +// dist[10*Np+n] = f10*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uy) + (4.5*(-ux + uy)*(-ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(-ux + uy))/porosity) + +// Fz*(0. - (3.*uz)/porosity)); +// +// // q = 11 +// dist[11*Np+n] = f11*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uz) + (4.5*(ux + uz)*(ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux + uz))/porosity) + +// Fz*(3. - (3.*uz)/porosity + (9.*(ux + uz))/porosity)); +// +// // q = 12 +// dist[12*Np+n] = f12*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uz) + (4.5*(-ux - uz)*(-ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uz))/porosity) + +// Fz*(-3. - (9.*(-ux - uz))/porosity - (3.*uz)/porosity)); +// +// // q = 13 +// dist[13*Np+n] = f13*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uz) + (4.5*(ux - uz)*(ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux - uz))/porosity) + +// Fz*(-3. - (9.*(ux - uz))/porosity - (3.*uz)/porosity)); +// +// // q= 14 +// dist[14*Np+n] = f14*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uz) + (4.5*(-ux + uz)*(-ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uz))/porosity) + +// Fz*(3. - (3.*uz)/porosity + (9.*(-ux + uz))/porosity)); +// +// // q = 15 +// dist[15*Np+n] = f15*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy + uz) + (4.5*(uy + uz)*(uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy + uz))/porosity) + +// Fz*(3. - (3.*uz)/porosity + (9.*(uy + uz))/porosity)); +// +// // q = 16 +// dist[16*Np+n] = f16*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy - uz) + (4.5*(-uy - uz)*(-uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy - uz))/porosity) + +// Fz*(-3. 
- (9.*(-uy - uz))/porosity - (3.*uz)/porosity)); +// +// // q = 17 +// dist[17*Np+n] = f17*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy - uz) + (4.5*(uy - uz)*(uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy - uz))/porosity) + +// Fz*(-3. - (9.*(uy - uz))/porosity - (3.*uz)/porosity)); +// +// // q = 18 +// dist[18*Np+n] = f18*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy + uz) + (4.5*(-uy + uz)*(-uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy + uz))/porosity) + +// Fz*(3. - (3.*uz)/porosity + (9.*(-uy + uz))/porosity)); + //----------------------------------------------------------------------------------------------------------------------------------------// + + //------------------------ BGK collison where body force has NO higher-order terms ----------------------------------------------------------// // q=0 - dist[n] = f0*(1.0-rlx)+ rlx*0.3333333333333333*rho*(1. - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - + 0.3333333333333333*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + dist[n] = f0*(1.0-rlx)+ rlx*0.3333333333333333*rho*(1. - (1.5*(ux*ux + uy*uy + uz*uz))/porosity); // q = 1 dist[1*Np+n] = f1*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(3.)); // q=2 dist[2*Np+n] = f2*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(-3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(-3.)); // q = 3 dist[3*Np+n] = f3*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fy*(3.)); // q = 4 dist[4*Np+n] = f4*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fy*(-3.)); // q = 5 dist[5*Np+n] = f5*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(3. + (6.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fz*(3.)); // q = 6 dist[6*Np+n] = f6*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux+ uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(-3. + (6.*uz)/porosity)); + +0.05555555555555555*rho*(1. 
- 0.5*rlx)*(Fz*(-3.)); // q = 7 dist[7*Np+n] = f7*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uy) + (4.5*(ux + uy)*(ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(ux + uy))/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3.) + Fy*(3.)); // q = 8 dist[8*Np+n] = f8*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uy) + (4.5*(-ux - uy)*(-ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uy))/porosity) + Fy*(-3. - (9.*(-ux - uy))/porosity - (3.*uy)/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) + Fy*(-3.)); // q = 9 dist[9*Np+n] = f9*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uy) + (4.5*(ux - uy)*(ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux - uy))/porosity) + Fy*(-3. - (9.*(ux - uy))/porosity - (3.*uy)/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3.) + Fy*(-3.)); // q = 10 dist[10*Np+n] = f10*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uy) + (4.5*(-ux + uy)*(-ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(-ux + uy))/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) + Fy*(3.)); // q = 11 dist[11*Np+n] = f11*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uz) + (4.5*(ux + uz)*(ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(ux + uz))/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3.) + Fz*(3.)); // q = 12 dist[12*Np+n] = f12*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uz) + (4.5*(-ux - uz)*(-ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uz))/porosity) + - Fz*(-3. - (9.*(-ux - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) + Fz*(-3.)); // q = 13 dist[13*Np+n] = f13*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uz) + (4.5*(ux - uz)*(ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux - uz))/porosity) + - Fz*(-3. - (9.*(ux - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3.) + Fz*(-3.)); // q= 14 dist[14*Np+n] = f14*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uz) + (4.5*(-ux + uz)*(-ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(-ux + uz))/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) 
+ Fz*(3.)); // q = 15 dist[15*Np+n] = f15*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy + uz) + (4.5*(uy + uz)*(uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(uy + uz))/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(3.) + Fz*(3.)); // q = 16 dist[16*Np+n] = f16*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy - uz) + (4.5*(-uy - uz)*(-uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy - uz))/porosity) + - Fz*(-3. - (9.*(-uy - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(-3.) + Fz*(-3.)); // q = 17 dist[17*Np+n] = f17*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy - uz) + (4.5*(uy - uz)*(uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy - uz))/porosity) + - Fz*(-3. - (9.*(uy - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(3.) + Fz*(-3.)); // q = 18 dist[18*Np+n] = f18*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy + uz) + (4.5*(-uy + uz)*(-uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(-uy + uz))/porosity)); - + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(-3.) + Fz*(3.)); + //-------------------------------------------------------------------------------------------------------------------------------------------// + //Update velocity on device Velocity[0*Np+n] = ux; Velocity[1*Np+n] = uy; @@ -291,93 +370,176 @@ extern "C" void ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist, in Fz=Gz; } + //------------------------ BGK collison where body force has higher-order terms ----------------------------------------------------------// +// // q=0 +// dist[n] = f0*(1.0-rlx) + rlx*0.3333333333333333*rho*(1. - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// + 0.3333333333333333*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q = 1 +// dist[nr2] = f1*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q=2 +// dist[nr1] = f2*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(-3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q = 3 +// dist[nr4] = f3*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q = 4 +// dist[nr3] = f4*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. + (6.*uy)/porosity) + Fz*(0. 
- (3.*uz)/porosity)); +// +// // q = 5 +// dist[nr6] = f5*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(3. + (6.*uz)/porosity)); +// +// // q = 6 +// dist[nr5] = f6*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux+ uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(-3. + (6.*uz)/porosity)); +// +// // q = 7 +// dist[nr8] = f7*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uy) + (4.5*(ux + uy)*(ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(ux + uy))/porosity) + +// Fz*(0. - (3.*uz)/porosity)); +// +// // q = 8 +// dist[nr7] = f8*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uy) + (4.5*(-ux - uy)*(-ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uy))/porosity) + Fy*(-3. - (9.*(-ux - uy))/porosity - (3.*uy)/porosity) + +// Fz*(0. - (3.*uz)/porosity)); +// +// // q = 9 +// dist[nr10] = f9*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uy) + (4.5*(ux - uy)*(ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux - uy))/porosity) + Fy*(-3. - (9.*(ux - uy))/porosity - (3.*uy)/porosity) + +// Fz*(0. - (3.*uz)/porosity)); +// +// // q = 10 +// dist[nr9] = f10*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uy) + (4.5*(-ux + uy)*(-ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(-ux + uy))/porosity) + +// Fz*(0. - (3.*uz)/porosity)); +// +// // q = 11 +// dist[nr12] = f11*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uz) + (4.5*(ux + uz)*(ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux + uz))/porosity) + +// Fz*(3. - (3.*uz)/porosity + (9.*(ux + uz))/porosity)); +// +// // q = 12 +// dist[nr11] = f12*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uz) + (4.5*(-ux - uz)*(-ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uz))/porosity) + +// Fz*(-3. - (9.*(-ux - uz))/porosity - (3.*uz)/porosity)); +// +// // q = 13 +// dist[nr14] = f13*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uz) + (4.5*(ux - uz)*(ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux - uz))/porosity) + +// Fz*(-3. - (9.*(ux - uz))/porosity - (3.*uz)/porosity)); +// +// // q= 14 +// dist[nr13] = f14*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uz) + (4.5*(-ux + uz)*(-ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uz))/porosity) + +// Fz*(3. 
- (3.*uz)/porosity + (9.*(-ux + uz))/porosity)); +// +// // q = 15 +// dist[nr16] = f15*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy + uz) + (4.5*(uy + uz)*(uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy + uz))/porosity) + +// Fz*(3. - (3.*uz)/porosity + (9.*(uy + uz))/porosity)); +// +// // q = 16 +// dist[nr15] = f16*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy - uz) + (4.5*(-uy - uz)*(-uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy - uz))/porosity) + +// Fz*(-3. - (9.*(-uy - uz))/porosity - (3.*uz)/porosity)); +// +// // q = 17 +// dist[nr18] = f17*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy - uz) + (4.5*(uy - uz)*(uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy - uz))/porosity) + +// Fz*(-3. - (9.*(uy - uz))/porosity - (3.*uz)/porosity)); +// +// // q = 18 +// dist[nr17] = f18*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy + uz) + (4.5*(-uy + uz)*(-uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy + uz))/porosity) + +// Fz*(3. - (3.*uz)/porosity + (9.*(-uy + uz))/porosity)); + //----------------------------------------------------------------------------------------------------------------------------------------// + + + + //------------------------ BGK collison where body force has NO higher-order terms ----------------------------------------------------------// // q=0 - dist[n] = f0*(1.0-rlx) + rlx*0.3333333333333333*rho*(1. - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - + 0.3333333333333333*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + dist[n] = f0*(1.0-rlx) + rlx*0.3333333333333333*rho*(1. - (1.5*(ux*ux + uy*uy + uz*uz))/porosity); // q = 1 dist[nr2] = f1*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(3.)); // q=2 dist[nr1] = f2*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(-3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(-3.)); // q = 3 dist[nr4] = f3*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fy*(3.)); // q = 4 dist[nr3] = f4*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. 
- 0.5*rlx)*(Fy*(-3.)); // q = 5 dist[nr6] = f5*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(3. + (6.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fz*(3.)); // q = 6 dist[nr5] = f6*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux+ uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(-3. + (6.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fz*(-3.)); // q = 7 dist[nr8] = f7*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uy) + (4.5*(ux + uy)*(ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(ux + uy))/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3.) + Fy*(3.)); // q = 8 dist[nr7] = f8*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uy) + (4.5*(-ux - uy)*(-ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uy))/porosity) + Fy*(-3. - (9.*(-ux - uy))/porosity - (3.*uy)/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) + Fy*(-3.)); // q = 9 dist[nr10] = f9*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uy) + (4.5*(ux - uy)*(ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux - uy))/porosity) + Fy*(-3. - (9.*(ux - uy))/porosity - (3.*uy)/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3.) + Fy*(-3.)); // q = 10 dist[nr9] = f10*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uy) + (4.5*(-ux + uy)*(-ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(-ux + uy))/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) + Fy*(3.)); // q = 11 dist[nr12] = f11*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uz) + (4.5*(ux + uz)*(ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(ux + uz))/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3.) + Fz*(3.)); // q = 12 dist[nr11] = f12*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uz) + (4.5*(-ux - uz)*(-ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uz))/porosity) + - Fz*(-3. - (9.*(-ux - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) + Fz*(-3.)); // q = 13 dist[nr14] = f13*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uz) + (4.5*(ux - uz)*(ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux - uz))/porosity) + - Fz*(-3. - (9.*(ux - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. 
- 0.5*rlx)*(Fx*(3.) + Fz*(-3.)); // q= 14 dist[nr13] = f14*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uz) + (4.5*(-ux + uz)*(-ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(-ux + uz))/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) + Fz*(3.)); // q = 15 dist[nr16] = f15*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy + uz) + (4.5*(uy + uz)*(uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(uy + uz))/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(3.) + Fz*(3.)); // q = 16 dist[nr15] = f16*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy - uz) + (4.5*(-uy - uz)*(-uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy - uz))/porosity) + - Fz*(-3. - (9.*(-uy - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(-3.) + Fz*(-3.)); // q = 17 dist[nr18] = f17*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy - uz) + (4.5*(uy - uz)*(uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy - uz))/porosity) + - Fz*(-3. - (9.*(uy - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(3.) + Fz*(-3.)); // q = 18 dist[nr17] = f18*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy + uz) + (4.5*(-uy + uz)*(-uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(-uy + uz))/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(-3.) + Fz*(3.)); + //-------------------------------------------------------------------------------------------------------------------------------------------// + + //Update velocity on device Velocity[0*Np+n] = ux; @@ -730,11 +892,45 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int //Calculate pressure for Incompressible-MRT model pressure=0.5/porosity*(pressure-0.5*Den*u_mag*u_mag/porosity); + //-------------------- IMRT collison where body force has higher-order terms -------------// +// //..............carry out relaxation process............................................... 
+// m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1) +// + (1-0.5*rlx_setA)*38*(Fx*ux+Fy*uy+Fz*uz)/porosity; +// m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2) +// + (1-0.5*rlx_setA)*11*(-Fx*ux-Fy*uy-Fz*uz)/porosity; +// jx = jx + Fx; +// m4 = m4 + rlx_setB*((-0.6666666666666666*ux*Den) - m4) +// + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx); +// jy = jy + Fy; +// m6 = m6 + rlx_setB*((-0.6666666666666666*uy*Den) - m6) +// + (1-0.5*rlx_setB)*(-0.6666666666666666*Fy); +// jz = jz + Fz; +// m8 = m8 + rlx_setB*((-0.6666666666666666*uz*Den) - m8) +// + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz); +// m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9) +// + (1-0.5*rlx_setA)*(4*Fx*ux-2*Fy*uy-2*Fz*uz)/porosity; +// m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10) +// + (1-0.5*rlx_setA)*(-2*Fx*ux+Fy*uy+Fz*uz)/porosity; +// m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11) +// + (1-0.5*rlx_setA)*(2*Fy*uy-2*Fz*uz)/porosity; +// m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12) +// + (1-0.5*rlx_setA)*(-Fy*uy+Fz*uz)/porosity; +// m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13) +// + (1-0.5*rlx_setA)*(Fy*ux+Fx*uy)/porosity; +// m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14) +// + (1-0.5*rlx_setA)*(Fz*uy+Fy*uz)/porosity; +// m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15) +// + (1-0.5*rlx_setA)*(Fz*ux+Fx*uz)/porosity; +// m16 = m16 + rlx_setB*( - m16); +// m17 = m17 + rlx_setB*( - m17); +// m18 = m18 + rlx_setB*( - m18); +// //....................................................................................................... + + + //-------------------- IMRT collison where body force has NO higher-order terms -------------// //..............carry out relaxation process............................................... 
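    // A brief sketch of the reduced scheme that follows (assuming the usual half-force
    // correction is retained only at leading order): the body force now enters solely
    // through the momentum moments,
    //   j_a       <- j_a + F_a                                           (a = x, y, z)
    //   m4,m6,m8  <- m + rlx_setB*((-2/3)*Den*u_a - m) + (1 - 0.5*rlx_setB)*(-2/3)*F_a
    // while m1, m2 and the stress moments m9..m15 relax toward their equilibria with no
    // velocity-dependent force term (those are exactly the pieces commented out above).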
- m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1) - + (1-0.5*rlx_setA)*38*(Fx*ux+Fy*uy+Fz*uz)/porosity; - m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2) - + (1-0.5*rlx_setA)*11*(-Fx*ux-Fy*uy-Fz*uz)/porosity; + m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1); + m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2); jx = jx + Fx; m4 = m4 + rlx_setB*((-0.6666666666666666*ux*Den) - m4) + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx); @@ -744,25 +940,19 @@ extern "C" void ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, int jz = jz + Fz; m8 = m8 + rlx_setB*((-0.6666666666666666*uz*Den) - m8) + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz); - m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9) - + (1-0.5*rlx_setA)*(4*Fx*ux-2*Fy*uy-2*Fz*uz)/porosity; - m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10) - + (1-0.5*rlx_setA)*(-2*Fx*ux+Fy*uy+Fz*uz)/porosity; - m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11) - + (1-0.5*rlx_setA)*(2*Fy*uy-2*Fz*uz)/porosity; - m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12) - + (1-0.5*rlx_setA)*(-Fy*uy+Fz*uz)/porosity; - m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13) - + (1-0.5*rlx_setA)*(Fy*ux+Fx*uy)/porosity; - m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14) - + (1-0.5*rlx_setA)*(Fz*uy+Fy*uz)/porosity; - m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15) - + (1-0.5*rlx_setA)*(Fz*ux+Fx*uz)/porosity; + m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9); + m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10); + m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11); + m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12); + m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13); + m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14); + m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15); m16 = m16 + rlx_setB*( - m16); m17 = m17 + rlx_setB*( - m17); m18 = m18 + rlx_setB*( - m18); //....................................................................................................... + //.................inverse transformation...................................................... // q=0 fq = mrt_V1*Den-mrt_V2*m1+mrt_V3*m2; @@ -1209,11 +1399,45 @@ extern "C" void ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double *dis //Calculate pressure for Incompressible-MRT model pressure=0.5/porosity*(pressure-0.5*Den*u_mag*u_mag/porosity); + //-------------------- IMRT collison where body force has higher-order terms -------------// +// //..............carry out relaxation process............................................... 
+// m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1) +// + (1-0.5*rlx_setA)*38*(Fx*ux+Fy*uy+Fz*uz)/porosity; +// m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2) +// + (1-0.5*rlx_setA)*11*(-Fx*ux-Fy*uy-Fz*uz)/porosity; +// jx = jx + Fx; +// m4 = m4 + rlx_setB*((-0.6666666666666666*ux*Den) - m4) +// + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx); +// jy = jy + Fy; +// m6 = m6 + rlx_setB*((-0.6666666666666666*uy*Den) - m6) +// + (1-0.5*rlx_setB)*(-0.6666666666666666*Fy); +// jz = jz + Fz; +// m8 = m8 + rlx_setB*((-0.6666666666666666*uz*Den) - m8) +// + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz); +// m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9) +// + (1-0.5*rlx_setA)*(4*Fx*ux-2*Fy*uy-2*Fz*uz)/porosity; +// m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10) +// + (1-0.5*rlx_setA)*(-2*Fx*ux+Fy*uy+Fz*uz)/porosity; +// m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11) +// + (1-0.5*rlx_setA)*(2*Fy*uy-2*Fz*uz)/porosity; +// m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12) +// + (1-0.5*rlx_setA)*(-Fy*uy+Fz*uz)/porosity; +// m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13) +// + (1-0.5*rlx_setA)*(Fy*ux+Fx*uy)/porosity; +// m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14) +// + (1-0.5*rlx_setA)*(Fz*uy+Fy*uz)/porosity; +// m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15) +// + (1-0.5*rlx_setA)*(Fz*ux+Fx*uz)/porosity; +// m16 = m16 + rlx_setB*( - m16); +// m17 = m17 + rlx_setB*( - m17); +// m18 = m18 + rlx_setB*( - m18); +// //....................................................................................................... + + + //-------------------- IMRT collison where body force has NO higher-order terms -------------// //..............carry out relaxation process............................................... 
- m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1) - + (1-0.5*rlx_setA)*38*(Fx*ux+Fy*uy+Fz*uz)/porosity; - m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2) - + (1-0.5*rlx_setA)*11*(-Fx*ux-Fy*uy-Fz*uz)/porosity; + m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1); + m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2); jx = jx + Fx; m4 = m4 + rlx_setB*((-0.6666666666666666*ux*Den) - m4) + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx); @@ -1223,25 +1447,19 @@ extern "C" void ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double *dis jz = jz + Fz; m8 = m8 + rlx_setB*((-0.6666666666666666*uz*Den) - m8) + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz); - m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9) - + (1-0.5*rlx_setA)*(4*Fx*ux-2*Fy*uy-2*Fz*uz)/porosity; - m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10) - + (1-0.5*rlx_setA)*(-2*Fx*ux+Fy*uy+Fz*uz)/porosity; - m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11) - + (1-0.5*rlx_setA)*(2*Fy*uy-2*Fz*uz)/porosity; - m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12) - + (1-0.5*rlx_setA)*(-Fy*uy+Fz*uz)/porosity; - m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13) - + (1-0.5*rlx_setA)*(Fy*ux+Fx*uy)/porosity; - m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14) - + (1-0.5*rlx_setA)*(Fz*uy+Fy*uz)/porosity; - m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15) - + (1-0.5*rlx_setA)*(Fz*ux+Fx*uz)/porosity; + m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9); + m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10); + m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11); + m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12); + m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13); + m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14); + m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15); m16 = m16 + rlx_setB*( - m16); m17 = m17 + rlx_setB*( - m17); m18 = m18 + rlx_setB*( - m18); //....................................................................................................... - + + //.................inverse transformation...................................................... // q=0 fq = mrt_V1*Den-mrt_V2*m1+mrt_V3*m2; From 46b8c1de7fb0253c3a14414a5b1036f0a3f4b972 Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Sat, 1 Feb 2020 17:22:13 -0500 Subject: [PATCH 039/121] GPU version update: remove higher-order terms in body force --- gpu/Greyscale.cu | 414 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 313 insertions(+), 101 deletions(-) diff --git a/gpu/Greyscale.cu b/gpu/Greyscale.cu index d3fd52ab..0a9a63e0 100644 --- a/gpu/Greyscale.cu +++ b/gpu/Greyscale.cu @@ -77,93 +77,173 @@ __global__ void dvc_ScaLBL_D3Q19_AAeven_Greyscale(double *dist, int start, int f Fz=Gz; } + //------------------------ BGK collison where body force has higher-order terms ----------------------------------------------------------// +// // q=0 +// dist[n] = f0*(1.0-rlx)+ rlx*0.3333333333333333*rho*(1. - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// + 0.3333333333333333*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q = 1 +// dist[1*Np+n] = f1*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(3. + (6.*ux)/porosity) + Fy*(0. 
- (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q=2 +// dist[2*Np+n] = f2*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(-3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q = 3 +// dist[3*Np+n] = f3*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q = 4 +// dist[4*Np+n] = f4*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q = 5 +// dist[5*Np+n] = f5*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(3. + (6.*uz)/porosity)); +// +// // q = 6 +// dist[6*Np+n] = f6*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux+ uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(-3. + (6.*uz)/porosity)); +// +// // q = 7 +// dist[7*Np+n] = f7*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uy) + (4.5*(ux + uy)*(ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(ux + uy))/porosity) + +// Fz*(0. - (3.*uz)/porosity)); +// +// // q = 8 +// dist[8*Np+n] = f8*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uy) + (4.5*(-ux - uy)*(-ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uy))/porosity) + Fy*(-3. - (9.*(-ux - uy))/porosity - (3.*uy)/porosity) + +// Fz*(0. - (3.*uz)/porosity)); +// +// // q = 9 +// dist[9*Np+n] = f9*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uy) + (4.5*(ux - uy)*(ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux - uy))/porosity) + Fy*(-3. - (9.*(ux - uy))/porosity - (3.*uy)/porosity) + +// Fz*(0. - (3.*uz)/porosity)); +// +// // q = 10 +// dist[10*Np+n] = f10*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uy) + (4.5*(-ux + uy)*(-ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(-ux + uy))/porosity) + +// Fz*(0. - (3.*uz)/porosity)); +// +// // q = 11 +// dist[11*Np+n] = f11*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uz) + (4.5*(ux + uz)*(ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux + uz))/porosity) + +// Fz*(3. - (3.*uz)/porosity + (9.*(ux + uz))/porosity)); +// +// // q = 12 +// dist[12*Np+n] = f12*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uz) + (4.5*(-ux - uz)*(-ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. 
- 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uz))/porosity) + +// Fz*(-3. - (9.*(-ux - uz))/porosity - (3.*uz)/porosity)); +// +// // q = 13 +// dist[13*Np+n] = f13*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uz) + (4.5*(ux - uz)*(ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux - uz))/porosity) + +// Fz*(-3. - (9.*(ux - uz))/porosity - (3.*uz)/porosity)); +// +// // q= 14 +// dist[14*Np+n] = f14*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uz) + (4.5*(-ux + uz)*(-ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uz))/porosity) + +// Fz*(3. - (3.*uz)/porosity + (9.*(-ux + uz))/porosity)); +// +// // q = 15 +// dist[15*Np+n] = f15*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy + uz) + (4.5*(uy + uz)*(uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy + uz))/porosity) + +// Fz*(3. - (3.*uz)/porosity + (9.*(uy + uz))/porosity)); +// +// // q = 16 +// dist[16*Np+n] = f16*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy - uz) + (4.5*(-uy - uz)*(-uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy - uz))/porosity) + +// Fz*(-3. - (9.*(-uy - uz))/porosity - (3.*uz)/porosity)); +// +// // q = 17 +// dist[17*Np+n] = f17*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy - uz) + (4.5*(uy - uz)*(uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy - uz))/porosity) + +// Fz*(-3. - (9.*(uy - uz))/porosity - (3.*uz)/porosity)); +// +// // q = 18 +// dist[18*Np+n] = f18*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy + uz) + (4.5*(-uy + uz)*(-uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy + uz))/porosity) + +// Fz*(3. - (3.*uz)/porosity + (9.*(-uy + uz))/porosity)); + //----------------------------------------------------------------------------------------------------------------------------------------// + + + //------------------------ BGK collison where body force has NO higher-order terms ----------------------------------------------------------// // q=0 - dist[n] = f0*(1.0-rlx)+ rlx*0.3333333333333333*rho*(1. - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - + 0.3333333333333333*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + dist[n] = f0*(1.0-rlx)+ rlx*0.3333333333333333*rho*(1. - (1.5*(ux*ux + uy*uy + uz*uz))/porosity); // q = 1 dist[1*Np+n] = f1*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(3.)); // q=2 dist[2*Np+n] = f2*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(-3. 
+ (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(-3.)); // q = 3 dist[3*Np+n] = f3*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fy*(3.)); // q = 4 dist[4*Np+n] = f4*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fy*(-3.)); // q = 5 dist[5*Np+n] = f5*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(3. + (6.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fz*(3.)); // q = 6 dist[6*Np+n] = f6*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux+ uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(-3. + (6.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fz*(-3.)); // q = 7 dist[7*Np+n] = f7*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uy) + (4.5*(ux + uy)*(ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(ux + uy))/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3.) + Fy*(3.)); // q = 8 dist[8*Np+n] = f8*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uy) + (4.5*(-ux - uy)*(-ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uy))/porosity) + Fy*(-3. - (9.*(-ux - uy))/porosity - (3.*uy)/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) + Fy*(-3.)); // q = 9 dist[9*Np+n] = f9*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uy) + (4.5*(ux - uy)*(ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux - uy))/porosity) + Fy*(-3. - (9.*(ux - uy))/porosity - (3.*uy)/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3.) + Fy*(-3.)); // q = 10 dist[10*Np+n] = f10*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uy) + (4.5*(-ux + uy)*(-ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(-ux + uy))/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) + Fy*(3.)); // q = 11 dist[11*Np+n] = f11*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uz) + (4.5*(ux + uz)*(ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(ux + uz))/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3.) 
+ Fz*(3.)); // q = 12 dist[12*Np+n] = f12*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uz) + (4.5*(-ux - uz)*(-ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uz))/porosity) + - Fz*(-3. - (9.*(-ux - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) + Fz*(-3.)); // q = 13 dist[13*Np+n] = f13*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uz) + (4.5*(ux - uz)*(ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux - uz))/porosity) + - Fz*(-3. - (9.*(ux - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3.) + Fz*(-3.)); // q= 14 dist[14*Np+n] = f14*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uz) + (4.5*(-ux + uz)*(-ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(-ux + uz))/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) + Fz*(3.)); // q = 15 dist[15*Np+n] = f15*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy + uz) + (4.5*(uy + uz)*(uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(uy + uz))/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(3.) + Fz*(3.)); // q = 16 dist[16*Np+n] = f16*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy - uz) + (4.5*(-uy - uz)*(-uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy - uz))/porosity) + - Fz*(-3. - (9.*(-uy - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(-3.) + Fz*(-3.)); // q = 17 dist[17*Np+n] = f17*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy - uz) + (4.5*(uy - uz)*(uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy - uz))/porosity) + - Fz*(-3. - (9.*(uy - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(3.) + Fz*(-3.)); // q = 18 dist[18*Np+n] = f18*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy + uz) + (4.5*(-uy + uz)*(-uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(-uy + uz))/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(-3.) + Fz*(3.)); + //-------------------------------------------------------------------------------------------------------------------------------------------// //Update velocity on device Velocity[0*Np+n] = ux; @@ -304,93 +384,174 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_Greyscale(int *neighborList, double *dist Fz=Gz; } + //------------------------ BGK collison where body force has higher-order terms ----------------------------------------------------------// +// // q=0 +// dist[n] = f0*(1.0-rlx) + rlx*0.3333333333333333*rho*(1. 
- (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// + 0.3333333333333333*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q = 1 +// dist[nr2] = f1*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q=2 +// dist[nr1] = f2*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(-3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q = 3 +// dist[nr4] = f3*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q = 4 +// dist[nr3] = f4*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); +// +// // q = 5 +// dist[nr6] = f5*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(3. + (6.*uz)/porosity)); +// +// // q = 6 +// dist[nr5] = f6*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux+ uy*uy + uz*uz))/porosity) +// +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(-3. + (6.*uz)/porosity)); +// +// // q = 7 +// dist[nr8] = f7*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uy) + (4.5*(ux + uy)*(ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(ux + uy))/porosity) + +// Fz*(0. - (3.*uz)/porosity)); +// +// // q = 8 +// dist[nr7] = f8*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uy) + (4.5*(-ux - uy)*(-ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uy))/porosity) + Fy*(-3. - (9.*(-ux - uy))/porosity - (3.*uy)/porosity) + +// Fz*(0. - (3.*uz)/porosity)); +// +// // q = 9 +// dist[nr10] = f9*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uy) + (4.5*(ux - uy)*(ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux - uy))/porosity) + Fy*(-3. - (9.*(ux - uy))/porosity - (3.*uy)/porosity) + +// Fz*(0. - (3.*uz)/porosity)); +// +// // q = 10 +// dist[nr9] = f10*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uy) + (4.5*(-ux + uy)*(-ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(-ux + uy))/porosity) + +// Fz*(0. 
- (3.*uz)/porosity)); +// +// // q = 11 +// dist[nr12] = f11*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uz) + (4.5*(ux + uz)*(ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux + uz))/porosity) + +// Fz*(3. - (3.*uz)/porosity + (9.*(ux + uz))/porosity)); +// +// // q = 12 +// dist[nr11] = f12*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uz) + (4.5*(-ux - uz)*(-ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uz))/porosity) + +// Fz*(-3. - (9.*(-ux - uz))/porosity - (3.*uz)/porosity)); +// +// // q = 13 +// dist[nr14] = f13*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uz) + (4.5*(ux - uz)*(ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux - uz))/porosity) + +// Fz*(-3. - (9.*(ux - uz))/porosity - (3.*uz)/porosity)); +// +// // q= 14 +// dist[nr13] = f14*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uz) + (4.5*(-ux + uz)*(-ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uz))/porosity) + +// Fz*(3. - (3.*uz)/porosity + (9.*(-ux + uz))/porosity)); +// +// // q = 15 +// dist[nr16] = f15*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy + uz) + (4.5*(uy + uz)*(uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy + uz))/porosity) + +// Fz*(3. - (3.*uz)/porosity + (9.*(uy + uz))/porosity)); +// +// // q = 16 +// dist[nr15] = f16*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy - uz) + (4.5*(-uy - uz)*(-uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy - uz))/porosity) + +// Fz*(-3. - (9.*(-uy - uz))/porosity - (3.*uz)/porosity)); +// +// // q = 17 +// dist[nr18] = f17*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy - uz) + (4.5*(uy - uz)*(uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy - uz))/porosity) + +// Fz*(-3. - (9.*(uy - uz))/porosity - (3.*uz)/porosity)); +// +// // q = 18 +// dist[nr17] = f18*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy + uz) + (4.5*(-uy + uz)*(-uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) +// +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy + uz))/porosity) + +// Fz*(3. - (3.*uz)/porosity + (9.*(-uy + uz))/porosity)); + //----------------------------------------------------------------------------------------------------------------------------------------// + + + //------------------------ BGK collison where body force has NO higher-order terms ----------------------------------------------------------// // q=0 - dist[n] = f0*(1.0-rlx) + rlx*0.3333333333333333*rho*(1. - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - + 0.3333333333333333*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. 
- (3.*uz)/porosity)); + dist[n] = f0*(1.0-rlx) + rlx*0.3333333333333333*rho*(1. - (1.5*(ux*ux + uy*uy + uz*uz))/porosity); // q = 1 dist[nr2] = f1*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(3.)); // q=2 dist[nr1] = f2*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*ux + (4.5*ux*ux)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(-3. + (6.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(-3.)); // q = 3 dist[nr4] = f3*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fy*(3.)); // q = 4 dist[nr3] = f4*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uy + (4.5*uy*uy)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. + (6.*uy)/porosity) + Fz*(0. - (3.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fy*(-3.)); // q = 5 dist[nr6] = f5*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 + 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(3. + (6.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fz*(3.)); // q = 6 dist[nr5] = f6*(1.0-rlx) + rlx*0.05555555555555555*rho*(1 - 3.*uz + (4.5*uz*uz)/porosity - (1.5*(ux*ux+ uy*uy + uz*uz))/porosity) - +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(0. - (3.*uy)/porosity) + Fz*(-3. + (6.*uz)/porosity)); + +0.05555555555555555*rho*(1. - 0.5*rlx)*(Fz*(-3.)); // q = 7 dist[nr8] = f7*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uy) + (4.5*(ux + uy)*(ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(ux + uy))/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3.) + Fy*(3.)); // q = 8 dist[nr7] = f8*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uy) + (4.5*(-ux - uy)*(-ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uy))/porosity) + Fy*(-3. - (9.*(-ux - uy))/porosity - (3.*uy)/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) + Fy*(-3.)); // q = 9 dist[nr10] = f9*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uy) + (4.5*(ux - uy)*(ux - uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3. - (3.*ux)/porosity + (9.*(ux - uy))/porosity) + Fy*(-3. - (9.*(ux - uy))/porosity - (3.*uy)/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3.) + Fy*(-3.)); // q = 10 dist[nr9] = f10*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uy) + (4.5*(-ux + uy)*(-ux + uy))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3. 
- (3.*ux)/porosity - (9.*(-ux + uy))/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(-ux + uy))/porosity) + - Fz*(0. - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) + Fy*(3.)); // q = 11 dist[nr12] = f11*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux + uz) + (4.5*(ux + uz)*(ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(ux + uz))/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3.) + Fz*(3.)); // q = 12 dist[nr11] = f12*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux - uz) + (4.5*(-ux - uz)*(-ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux - uz))/porosity) + - Fz*(-3. - (9.*(-ux - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) + Fz*(-3.)); // q = 13 dist[nr14] = f13*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(ux - uz) + (4.5*(ux - uz)*(ux - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(3. - (3.*ux)/porosity + (9.*(ux - uz))/porosity) + - Fz*(-3. - (9.*(ux - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(3.) + Fz*(-3.)); // q= 14 dist[nr13] = f14*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-ux + uz) + (4.5*(-ux + uz)*(-ux + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(0. - (3.*uy)/porosity) + Fx*(-3. - (3.*ux)/porosity - (9.*(-ux + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(-ux + uz))/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(-3.) + Fz*(3.)); // q = 15 dist[nr16] = f15*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy + uz) + (4.5*(uy + uz)*(uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(uy + uz))/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(3.) + Fz*(3.)); // q = 16 dist[nr15] = f16*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy - uz) + (4.5*(-uy - uz)*(-uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy - uz))/porosity) + - Fz*(-3. - (9.*(-uy - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(-3.) + Fz*(-3.)); // q = 17 dist[nr18] = f17*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(uy - uz) + (4.5*(uy - uz)*(uy - uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(3. - (3.*uy)/porosity + (9.*(uy - uz))/porosity) + - Fz*(-3. - (9.*(uy - uz))/porosity - (3.*uz)/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(3.) + Fz*(-3.)); // q = 18 dist[nr17] = f18*(1.0-rlx) + rlx*0.027777777777777776*rho*(1 + 3.*(-uy + uz) + (4.5*(-uy + uz)*(-uy + uz))/porosity - (1.5*(ux*ux + uy*uy + uz*uz))/porosity) - +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fx*(0. - (3.*ux)/porosity) + Fy*(-3. - (3.*uy)/porosity - (9.*(-uy + uz))/porosity) + - Fz*(3. - (3.*uz)/porosity + (9.*(-uy + uz))/porosity)); + +0.027777777777777776*rho*(1. - 0.5*rlx)*(Fy*(-3.) 
+ Fz*(3.)); + //-------------------------------------------------------------------------------------------------------------------------------------------// + //Update velocity on device Velocity[0*Np+n] = ux; @@ -750,11 +911,43 @@ __global__ void dvc_ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, //Calculate pressure for Incompressible-MRT model pressure=0.5/porosity*(pressure-0.5*Den*u_mag*u_mag/porosity); +// //..............carry out relaxation process............................................... +// m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1) +// + (1-0.5*rlx_setA)*38*(Fx*ux+Fy*uy+Fz*uz)/porosity; +// m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2) +// + (1-0.5*rlx_setA)*11*(-Fx*ux-Fy*uy-Fz*uz)/porosity; +// jx = jx + Fx; +// m4 = m4 + rlx_setB*((-0.6666666666666666*ux*Den) - m4) +// + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx); +// jy = jy + Fy; +// m6 = m6 + rlx_setB*((-0.6666666666666666*uy*Den) - m6) +// + (1-0.5*rlx_setB)*(-0.6666666666666666*Fy); +// jz = jz + Fz; +// m8 = m8 + rlx_setB*((-0.6666666666666666*uz*Den) - m8) +// + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz); +// m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9) +// + (1-0.5*rlx_setA)*(4*Fx*ux-2*Fy*uy-2*Fz*uz)/porosity; +// m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10) +// + (1-0.5*rlx_setA)*(-2*Fx*ux+Fy*uy+Fz*uz)/porosity; +// m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11) +// + (1-0.5*rlx_setA)*(2*Fy*uy-2*Fz*uz)/porosity; +// m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12) +// + (1-0.5*rlx_setA)*(-Fy*uy+Fz*uz)/porosity; +// m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13) +// + (1-0.5*rlx_setA)*(Fy*ux+Fx*uy)/porosity; +// m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14) +// + (1-0.5*rlx_setA)*(Fz*uy+Fy*uz)/porosity; +// m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15) +// + (1-0.5*rlx_setA)*(Fz*ux+Fx*uz)/porosity; +// m16 = m16 + rlx_setB*( - m16); +// m17 = m17 + rlx_setB*( - m17); +// m18 = m18 + rlx_setB*( - m18); +// //....................................................................................................... + + //-------------------- IMRT collison where body force has NO higher-order terms -------------// //..............carry out relaxation process............................................... 
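    // The GPU kernel applies the same reduction as the CPU version: the force acts only
    // on the momentum moments (j_a <- j_a + F_a, with the (1 - 0.5*rlx_setB) correction
    // on m4, m6, m8), and every other moment follows the plain relaxation
    // m <- m + rlx*(m_eq - m).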
- m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1) - + (1-0.5*rlx_setA)*38*(Fx*ux+Fy*uy+Fz*uz)/porosity; - m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2) - + (1-0.5*rlx_setA)*11*(-Fx*ux-Fy*uy-Fz*uz)/porosity; + m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1); + m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2); jx = jx + Fx; m4 = m4 + rlx_setB*((-0.6666666666666666*ux*Den) - m4) + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx); @@ -764,20 +957,13 @@ __global__ void dvc_ScaLBL_D3Q19_AAeven_Greyscale_IMRT(double *dist, int start, jz = jz + Fz; m8 = m8 + rlx_setB*((-0.6666666666666666*uz*Den) - m8) + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz); - m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9) - + (1-0.5*rlx_setA)*(4*Fx*ux-2*Fy*uy-2*Fz*uz)/porosity; - m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10) - + (1-0.5*rlx_setA)*(-2*Fx*ux+Fy*uy+Fz*uz)/porosity; - m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11) - + (1-0.5*rlx_setA)*(2*Fy*uy-2*Fz*uz)/porosity; - m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12) - + (1-0.5*rlx_setA)*(-Fy*uy+Fz*uz)/porosity; - m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13) - + (1-0.5*rlx_setA)*(Fy*ux+Fx*uy)/porosity; - m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14) - + (1-0.5*rlx_setA)*(Fz*uy+Fy*uz)/porosity; - m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15) - + (1-0.5*rlx_setA)*(Fz*ux+Fx*uz)/porosity; + m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9); + m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10); + m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11); + m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12); + m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13); + m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14); + m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15); m16 = m16 + rlx_setB*( - m16); m17 = m17 + rlx_setB*( - m17); m18 = m18 + rlx_setB*( - m18); @@ -1238,11 +1424,43 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double //Calculate pressure for Incompressible-MRT model pressure=0.5/porosity*(pressure-0.5*Den*u_mag*u_mag/porosity); +// //..............carry out relaxation process............................................... 
+// m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1) +// + (1-0.5*rlx_setA)*38*(Fx*ux+Fy*uy+Fz*uz)/porosity; +// m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2) +// + (1-0.5*rlx_setA)*11*(-Fx*ux-Fy*uy-Fz*uz)/porosity; +// jx = jx + Fx; +// m4 = m4 + rlx_setB*((-0.6666666666666666*ux*Den) - m4) +// + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx); +// jy = jy + Fy; +// m6 = m6 + rlx_setB*((-0.6666666666666666*uy*Den) - m6) +// + (1-0.5*rlx_setB)*(-0.6666666666666666*Fy); +// jz = jz + Fz; +// m8 = m8 + rlx_setB*((-0.6666666666666666*uz*Den) - m8) +// + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz); +// m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9) +// + (1-0.5*rlx_setA)*(4*Fx*ux-2*Fy*uy-2*Fz*uz)/porosity; +// m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10) +// + (1-0.5*rlx_setA)*(-2*Fx*ux+Fy*uy+Fz*uz)/porosity; +// m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11) +// + (1-0.5*rlx_setA)*(2*Fy*uy-2*Fz*uz)/porosity; +// m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12) +// + (1-0.5*rlx_setA)*(-Fy*uy+Fz*uz)/porosity; +// m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13) +// + (1-0.5*rlx_setA)*(Fy*ux+Fx*uy)/porosity; +// m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14) +// + (1-0.5*rlx_setA)*(Fz*uy+Fy*uz)/porosity; +// m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15) +// + (1-0.5*rlx_setA)*(Fz*ux+Fx*uz)/porosity; +// m16 = m16 + rlx_setB*( - m16); +// m17 = m17 + rlx_setB*( - m17); +// m18 = m18 + rlx_setB*( - m18); +// //....................................................................................................... + + //-------------------- IMRT collison where body force has NO higher-order terms -------------// //..............carry out relaxation process............................................... 
- m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1) - + (1-0.5*rlx_setA)*38*(Fx*ux+Fy*uy+Fz*uz)/porosity; - m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2) - + (1-0.5*rlx_setA)*11*(-Fx*ux-Fy*uy-Fz*uz)/porosity; + m1 = m1 + rlx_setA*((-30*Den+19*(ux*ux+uy*uy+uz*uz)/porosity + 57*pressure*porosity) - m1); + m2 = m2 + rlx_setA*((12*Den - 5.5*(ux*ux+uy*uy+uz*uz)/porosity-27*pressure*porosity) - m2); jx = jx + Fx; m4 = m4 + rlx_setB*((-0.6666666666666666*ux*Den) - m4) + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx); @@ -1252,25 +1470,19 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_Greyscale_IMRT(int *neighborList, double jz = jz + Fz; m8 = m8 + rlx_setB*((-0.6666666666666666*uz*Den) - m8) + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz); - m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9) - + (1-0.5*rlx_setA)*(4*Fx*ux-2*Fy*uy-2*Fz*uz)/porosity; - m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10) - + (1-0.5*rlx_setA)*(-2*Fx*ux+Fy*uy+Fz*uz)/porosity; - m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11) - + (1-0.5*rlx_setA)*(2*Fy*uy-2*Fz*uz)/porosity; - m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12) - + (1-0.5*rlx_setA)*(-Fy*uy+Fz*uz)/porosity; - m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13) - + (1-0.5*rlx_setA)*(Fy*ux+Fx*uy)/porosity; - m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14) - + (1-0.5*rlx_setA)*(Fz*uy+Fy*uz)/porosity; - m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15) - + (1-0.5*rlx_setA)*(Fz*ux+Fx*uz)/porosity; + m9 = m9 + rlx_setA*((Den*(2*ux*ux-uy*uy-uz*uz)/porosity) - m9); + m10 = m10 + rlx_setA*(-0.5*Den*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10); + m11 = m11 + rlx_setA*((Den*(uy*uy-uz*uz)/porosity) - m11); + m12 = m12 + rlx_setA*(-0.5*(Den*(uy*uy-uz*uz)/porosity)- m12); + m13 = m13 + rlx_setA*((Den*ux*uy/porosity) - m13); + m14 = m14 + rlx_setA*((Den*uy*uz/porosity) - m14); + m15 = m15 + rlx_setA*((Den*ux*uz/porosity) - m15); m16 = m16 + rlx_setB*( - m16); m17 = m17 + rlx_setB*( - m17); m18 = m18 + rlx_setB*( - m18); //....................................................................................................... - + + //.................inverse transformation...................................................... 
// q=0 fq = mrt_V1*Den-mrt_V2*m1+mrt_V3*m2; From 8751fa245bbe3bad5c0d2a0799d57a943faf14ba Mon Sep 17 00:00:00 2001 From: Mark Berrill Date: Mon, 3 Feb 2020 12:41:09 -0500 Subject: [PATCH 040/121] Fixing minor issues with some operating systems --- common/Utilities.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/common/Utilities.cpp b/common/Utilities.cpp index 11d2b261..723b34f8 100644 --- a/common/Utilities.cpp +++ b/common/Utilities.cpp @@ -16,6 +16,20 @@ #include +// OS specific includes / definitions +// clang-format off +#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 ) + #define USE_WINDOWS +#elif defined( __APPLE__ ) + #define USE_MAC +#elif defined( __linux ) || defined( __linux__ ) || defined( __unix ) || defined( __posix ) + #define USE_LINUX +#else + #error Unknown OS +#endif +// clang-format on + + // Mutex for Utility functions static std::mutex Utilities_mutex; From 6ed57841b87dbc0893e21521609181c348e4ae41 Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Mon, 3 Feb 2020 12:59:52 -0500 Subject: [PATCH 041/121] update TwoPhase analysis for vector / tensor objects --- analysis/TwoPhase.cpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/analysis/TwoPhase.cpp b/analysis/TwoPhase.cpp index 1dbdfbfa..d878a663 100644 --- a/analysis/TwoPhase.cpp +++ b/analysis/TwoPhase.cpp @@ -1098,13 +1098,19 @@ void TwoPhase::Reduce() vol_n_global = Dm->Comm.sumReduce( vol_n ); paw_global = Dm->Comm.sumReduce( paw ); pan_global = Dm->Comm.sumReduce( pan ); - vaw_global(0) = Dm->Comm.sumReduce( vaw(0) ); - van_global(0) = Dm->Comm.sumReduce( van(0) ); - vawn_global(0) = Dm->Comm.sumReduce( vawn(0) ); - vawns_global(0) = Dm->Comm.sumReduce( vawns(0) ); - Gwn_global(0) = Dm->Comm.sumReduce( Gwn(0) ); - Gns_global(0) = Dm->Comm.sumReduce( Gns(0) ); - Gws_global(0) = Dm->Comm.sumReduce( Gws(0) ); + for (int idx=0; idx<3; idx++) + vaw_global(idx) = Dm->Comm.sumReduce( vaw(idx) ); + for (int idx=0; idx<3; idx++) + van_global(idx) = Dm->Comm.sumReduce( van(idx)); + for (int idx=0; idx<3; idx++) + vawn_global(idx) = Dm->Comm.sumReduce( vawn(idx) ); + for (int idx=0; idx<3; idx++) + vawns_global(idx) = Dm->Comm.sumReduce( vawns(idx) ); + for (int idx=0; idx<6; idx++){ + Gwn_global(idx) = Dm->Comm.sumReduce( Gwn(idx) ); + Gns_global(idx) = Dm->Comm.sumReduce( Gns(idx) ); + Gws_global(idx) = Dm->Comm.sumReduce( Gws(idx) ); + } trawn_global = Dm->Comm.sumReduce( trawn ); trJwn_global = Dm->Comm.sumReduce( trJwn ); trRwn_global = Dm->Comm.sumReduce( trRwn ); From b73208b4718604c4e95a0593bd83531d9b9b9971 Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Mon, 3 Feb 2020 14:05:23 -0500 Subject: [PATCH 042/121] fix water seed --- models/ColorModel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 5a9c56d4..3b58fff1 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -937,7 +937,7 @@ void ScaLBL_ColorModel::Run(){ else if (USE_SEED){ delta_volume = volA*Dm->Volume - initial_volume; CURRENT_MORPH_TIMESTEPS += analysis_interval; - //double massChange = SeedPhaseField(seed_water); + double massChange = SeedPhaseField(seed_water); if (rank==0) printf("***Seed water in oil %f, volume change %f / %f ***\n", seed_water, delta_volume, delta_volume_target); } else if (USE_MORPHOPEN_OIL){ From c426aa7d1db9fd637f72308d88ed5b89158aeb83 Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Mon, 3 Feb 2020 15:13:45 -0500 Subject: [PATCH 043/121] remove 
deprecated pressure BC routines --- common/ScaLBL.h | 5 -- cpu/D3Q19.cpp | 139 ------------------------------------------------ 2 files changed, 144 deletions(-) diff --git a/common/ScaLBL.h b/common/ScaLBL.h index d7f012d1..610fce5d 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -101,11 +101,6 @@ extern "C" void ScaLBL_D3Q19_Gradient_DFH(int *NeighborList, double *Phi, double // BOUNDARY CONDITION ROUTINES -//extern "C" void ScaLBL_D3Q19_Pressure_BC_z(double *disteven, double *distodd, double din, -// int Nx, int Ny, int Nz); -//extern "C" void ScaLBL_D3Q19_Pressure_BC_Z(double *disteven, double *distodd, double dout, -// int Nx, int Ny, int Nz, int outlet); - extern "C" void ScaLBL_D3Q19_AAodd_Pressure_BC_z(int *neighborList, int *list, double *dist, double din, int count, int Np); extern "C" void ScaLBL_D3Q19_AAodd_Pressure_BC_Z(int *neighborList, int *list, double *dist, double dout, int count, int Np); diff --git a/cpu/D3Q19.cpp b/cpu/D3Q19.cpp index be081528..564eb96d 100644 --- a/cpu/D3Q19.cpp +++ b/cpu/D3Q19.cpp @@ -680,145 +680,6 @@ extern "C" void ScaLBL_D3Q19_AAodd_Pressure_BC_Z(int *d_neighborList, int *list, } } -extern "C" void ScaLBL_D3Q19_Pressure_BC_z(int *list, double *dist, double din, int count, int Np) -{ - int n; - // distributions - double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9; - double f10,f11,f12,f13,f14,f15,f16,f17,f18; - double ux,uy,uz; - double Cxz,Cyz; - - for (int idx=0; idx Date: Mon, 3 Feb 2020 17:22:13 -0500 Subject: [PATCH 044/121] add a weighting factor to the water seeding method --- models/ColorModel.cpp | 133 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 110 insertions(+), 23 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index df4afab9..bcffa9df 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -1202,35 +1202,26 @@ double ScaLBL_ColorModel::SeedPhaseField(const double seed_water_in_oil){ double mass_loss =0.f; double count =0.f; double *Aq_tmp, *Bq_tmp; + double *Vel_tmp; - Aq_tmp = new double [7*Np]; - Bq_tmp = new double [7*Np]; + Aq_tmp = new double [7*Np]; + Bq_tmp = new double [7*Np]; + Vel_tmp = new double [3*Np]; ScaLBL_CopyToHost(Aq_tmp, Aq, 7*Np*sizeof(double)); ScaLBL_CopyToHost(Bq_tmp, Bq, 7*Np*sizeof(double)); + ScaLBL_CopyToHost(Vel_tmp, Velocity, 7*Np*sizeof(double)); -/* for (int k=1; kgnb.Px+Averages->gwb.Px)/(Averages->gnb.M+Averages->gwb.M); + double vy_glb = (Averages->gnb.Py+Averages->gwb.Py)/(Averages->gnb.M+Averages->gwb.M); + double vz_glb = (Averages->gnb.Pz+Averages->gwb.Pz)/(Averages->gnb.M+Averages->gwb.M); + double v_mag_glb = sqrt(vx_glb*vx_glb+vy_glb*vy_glb+vz_glb*vz_glb); - if (Averages->SDs(i,j,k) < 0.f){ - // skip - } - else if (phase(i,j,k) > 0.f ){ - phase(i,j,k) -= random_value*seed_water_in_oil; - mass_loss += random_value*seed_water_in_oil; - count++; - } - else { - - } - } - } - } - */ for (int n=0; n < ScaLBL_Comm->LastExterior(); n++){ - double random_value = seed_water_in_oil*double(rand())/ RAND_MAX; + double v_mag_local = sqrt(Vel_tmp[n]*Vel_tmp[n]+Vel_tmp[n+1*Np]*Vel_tmp[n+1*Np]+Vel_tmp[n+2*Np]*Vel_tmp[n+2*Np]); + double weight = (v_mag_localFirstInterior(); n < ScaLBL_Comm->LastInterior(); n++){ - double random_value = seed_water_in_oil*double(rand())/ RAND_MAX; + double v_mag_local = sqrt(Vel_tmp[n]*Vel_tmp[n]+Vel_tmp[n+1*Np]*Vel_tmp[n+1*Np]+Vel_tmp[n+2*Np]*Vel_tmp[n+2*Np]); + double weight = (v_mag_localSDs(i,j,k) < 0.f){ +// // skip +// } +// else if (phase(i,j,k) > 0.f ){ +// phase(i,j,k) -= random_value*seed_water_in_oil; +// mass_loss += 
random_value*seed_water_in_oil; +// count++; +// } +// else { +// +// } +// } +// } +// } +// */ +// for (int n=0; n < ScaLBL_Comm->LastExterior(); n++){ +// double random_value = seed_water_in_oil*double(rand())/ RAND_MAX; +// double dA = Aq_tmp[n] + Aq_tmp[n+Np] + Aq_tmp[n+2*Np] + Aq_tmp[n+3*Np] + Aq_tmp[n+4*Np] + Aq_tmp[n+5*Np] + Aq_tmp[n+6*Np]; +// double dB = Bq_tmp[n] + Bq_tmp[n+Np] + Bq_tmp[n+2*Np] + Bq_tmp[n+3*Np] + Bq_tmp[n+4*Np] + Bq_tmp[n+5*Np] + Bq_tmp[n+6*Np]; +// double phase_id = (dA - dB) / (dA + dB); +// if (phase_id > 0.0){ +// Aq_tmp[n] -= 0.3333333333333333*random_value; +// Aq_tmp[n+Np] -= 0.1111111111111111*random_value; +// Aq_tmp[n+2*Np] -= 0.1111111111111111*random_value; +// Aq_tmp[n+3*Np] -= 0.1111111111111111*random_value; +// Aq_tmp[n+4*Np] -= 0.1111111111111111*random_value; +// Aq_tmp[n+5*Np] -= 0.1111111111111111*random_value; +// Aq_tmp[n+6*Np] -= 0.1111111111111111*random_value; +// +// Bq_tmp[n] += 0.3333333333333333*random_value; +// Bq_tmp[n+Np] += 0.1111111111111111*random_value; +// Bq_tmp[n+2*Np] += 0.1111111111111111*random_value; +// Bq_tmp[n+3*Np] += 0.1111111111111111*random_value; +// Bq_tmp[n+4*Np] += 0.1111111111111111*random_value; +// Bq_tmp[n+5*Np] += 0.1111111111111111*random_value; +// Bq_tmp[n+6*Np] += 0.1111111111111111*random_value; +// } +// mass_loss += random_value*seed_water_in_oil; +// } +// +// for (int n=ScaLBL_Comm->FirstInterior(); n < ScaLBL_Comm->LastInterior(); n++){ +// double random_value = seed_water_in_oil*double(rand())/ RAND_MAX; +// double dA = Aq_tmp[n] + Aq_tmp[n+Np] + Aq_tmp[n+2*Np] + Aq_tmp[n+3*Np] + Aq_tmp[n+4*Np] + Aq_tmp[n+5*Np] + Aq_tmp[n+6*Np]; +// double dB = Bq_tmp[n] + Bq_tmp[n+Np] + Bq_tmp[n+2*Np] + Bq_tmp[n+3*Np] + Bq_tmp[n+4*Np] + Bq_tmp[n+5*Np] + Bq_tmp[n+6*Np]; +// double phase_id = (dA - dB) / (dA + dB); +// if (phase_id > 0.0){ +// Aq_tmp[n] -= 0.3333333333333333*random_value; +// Aq_tmp[n+Np] -= 0.1111111111111111*random_value; +// Aq_tmp[n+2*Np] -= 0.1111111111111111*random_value; +// Aq_tmp[n+3*Np] -= 0.1111111111111111*random_value; +// Aq_tmp[n+4*Np] -= 0.1111111111111111*random_value; +// Aq_tmp[n+5*Np] -= 0.1111111111111111*random_value; +// Aq_tmp[n+6*Np] -= 0.1111111111111111*random_value; +// +// Bq_tmp[n] += 0.3333333333333333*random_value; +// Bq_tmp[n+Np] += 0.1111111111111111*random_value; +// Bq_tmp[n+2*Np] += 0.1111111111111111*random_value; +// Bq_tmp[n+3*Np] += 0.1111111111111111*random_value; +// Bq_tmp[n+4*Np] += 0.1111111111111111*random_value; +// Bq_tmp[n+5*Np] += 0.1111111111111111*random_value; +// Bq_tmp[n+6*Np] += 0.1111111111111111*random_value; +// } +// mass_loss += random_value*seed_water_in_oil; +// } +// +// count = Dm->Comm.sumReduce( count ); +// mass_loss = Dm->Comm.sumReduce( mass_loss ); +// if (rank == 0) printf("Remove mass %f from %f voxels \n",mass_loss,count); +// +// // Need to initialize Aq, Bq, Den, Phi directly +// //ScaLBL_CopyToDevice(Phi,phase.data(),7*Np*sizeof(double)); +// ScaLBL_CopyToDevice(Aq, Aq_tmp, 7*Np*sizeof(double)); +// ScaLBL_CopyToDevice(Bq, Bq_tmp, 7*Np*sizeof(double)); +// +// return(mass_loss); +//} + double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta_volume){ const RankInfoStruct rank_info(rank,nprocx,nprocy,nprocz); From a372d604503739f3e1564c7bec4bc683cfbb189a Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Tue, 4 Feb 2020 13:58:06 -0500 Subject: [PATCH 045/121] resolve some minor issues after the MPI backend updates --- models/GreyscaleModel.cpp | 24 +++++++++++++++--------- models/GreyscaleModel.h 
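Note on PATCH 044 above: it replaces the uniform random water seeding with a weighting based on the local velocity magnitude relative to the domain-averaged velocity, but the comparison expression is truncated in the hunk as recorded here. The following is only a minimal sketch of one plausible weighting; the function name and the linear ramp are illustrative assumptions, not taken from the source.

    // Hypothetical sketch of a velocity-weighted seed amount, assuming slow-moving
    // (trapped) fluid receives the full seed and fast-moving fluid receives none.
    #include <cstdlib>

    static double seed_amount(double seed_water_in_oil, double v_mag_local, double v_mag_glb)
    {
        // linear ramp: weight -> 1 as the local speed drops below the global mean speed
        double weight = (v_mag_local < v_mag_glb) ? (1.0 - v_mag_local / v_mag_glb) : 0.0;
        // the returned value is subtracted from the Aq moments and added to Bq, as in the patch
        return weight * seed_water_in_oil * double(rand()) / double(RAND_MAX);
    }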
| 2 +- tests/lbpm_greyscale_simulator.cpp | 2 +- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/models/GreyscaleModel.cpp b/models/GreyscaleModel.cpp index 79b7a9c7..11d92c80 100644 --- a/models/GreyscaleModel.cpp +++ b/models/GreyscaleModel.cpp @@ -261,7 +261,7 @@ void ScaLBL_GreyscaleModel::AssignComponentLabels(double *Porosity, double *Perm // Set Dm to match Mask for (int i=0; iid[i] = Mask->id[i]; - for (int idx=0; idxComm, label_count[idx]); + for (int idx=0; idxComm.sumReduce(label_count[idx]); //Initialize a weighted porosity after considering grey voxels GreyPorosity=0.0; @@ -595,11 +595,16 @@ void ScaLBL_GreyscaleModel::Run(){ } } } - MPI_Allreduce(&vax_loc,&vax,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - MPI_Allreduce(&vay_loc,&vay,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - MPI_Allreduce(&vaz_loc,&vaz,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - MPI_Allreduce(&count_loc,&count,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + //MPI_Allreduce(&vax_loc,&vax,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + //MPI_Allreduce(&vay_loc,&vay,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + //MPI_Allreduce(&vaz_loc,&vaz,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + //MPI_Allreduce(&count_loc,&count,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + vax = Mask->Comm.sumReduce( vax_loc ); + vay = Mask->Comm.sumReduce( vay_loc ); + vaz = Mask->Comm.sumReduce( vaz_loc ); + count = Mask->Comm.sumReduce( count_loc ); + vax /= count; vay /= count; vaz /= count; @@ -629,10 +634,11 @@ void ScaLBL_GreyscaleModel::Run(){ double As = Morphology.A(); double Hs = Morphology.H(); double Xs = Morphology.X(); - Vs=sumReduce( Dm->Comm, Vs); - As=sumReduce( Dm->Comm, As); - Hs=sumReduce( Dm->Comm, Hs); - Xs=sumReduce( Dm->Comm, Xs); + Vs = Dm->Comm.sumReduce( Vs); + As = Dm->Comm.sumReduce( As); + Hs = Dm->Comm.sumReduce( Hs); + Xs = Dm->Comm.sumReduce( Xs); + double h = Dm->voxel_length; //double absperm = h*h*mu*Mask->Porosity()*flow_rate / force_mag; double absperm = h*h*mu*GreyPorosity*flow_rate / force_mag; diff --git a/models/GreyscaleModel.h b/models/GreyscaleModel.h index c670239f..a99925b1 100644 --- a/models/GreyscaleModel.h +++ b/models/GreyscaleModel.h @@ -10,7 +10,7 @@ Implementation of color lattice boltzmann model #include #include "common/Communication.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "common/Database.h" #include "common/ScaLBL.h" #include "ProfilerApp.h" diff --git a/tests/lbpm_greyscale_simulator.cpp b/tests/lbpm_greyscale_simulator.cpp index b7ed442e..a54b6fc4 100644 --- a/tests/lbpm_greyscale_simulator.cpp +++ b/tests/lbpm_greyscale_simulator.cpp @@ -8,7 +8,7 @@ #include "common/ScaLBL.h" #include "common/Communication.h" -#include "common/MPI_Helpers.h" +#include "common/MPI.h" #include "models/GreyscaleModel.h" //#define WRITE_SURFACES From 6d4e68d8b8b870519862a07584e3cfcb3808b554 Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Tue, 4 Feb 2020 14:02:49 -0500 Subject: [PATCH 046/121] set morphological target from kr --- models/ColorModel.cpp | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index bcffa9df..b8578f4e 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -524,6 +524,24 @@ void ScaLBL_ColorModel::Run(){ int RESCALE_FORCE_COUNT = 0; int RESCALE_FORCE_MAX = 0; + /* history for morphological algoirthm */ + double KRA_MORPH_FACTOR=0.8; + double volA_prev = 0.0; + double log_krA_prev = 1.0; + double log_krA_target = 1.0; + double log_krA = 0.0; + double slope_krA_volume = 0.0; + if (color_db->keyExists( 
"vol_A_previous" )){ + volA_prev = color_db->getScalar( "vol_A_previous" ); + } + if (color_db->keyExists( "log_krA_previous" )){ + log_krA_prev = color_db->getScalar( "log_krA_previous" ); + } + if (color_db->keyExists( "krA_morph_factor" )){ + KRA_MORPH_FACTOR = color_db->getScalar( "krA_morph_factor" ); + } + + /* defaults for simulation protocols */ auto protocol = color_db->getWithDefault( "protocol", "none" ); if (protocol == "image sequence"){ // Get the list of images @@ -811,7 +829,17 @@ void ScaLBL_ColorModel::Run(){ if ( isSteady ){ MORPH_ADAPT = true; CURRENT_MORPH_TIMESTEPS=0; - delta_volume_target = Dm->Volume*volA *morph_delta; // set target volume change + //delta_volume_target = Dm->Volume*volA *morph_delta; // set target volume change + /** morphological target based on relative permeability for A **/ + double krA_TMP= fabs(muA*flow_rate_A / force_mag); + log_krA = log(krA_TMP); + log_krA_target = log(KRA_MORPH_FACTOR*(krA_TMP)); + slope_krA_volume = (log_krA - log_krA_prev)/(Dm->Volume*(volA - volA_prev)); + delta_volume_target=Dm->Volume*(volA+(log_krA_target - log_krA)/slope_krA_volume); + log_krA_prev = log_krA; + volA_prev = volA; + printf(" ",log_krA, log_krA_target, vol_A, ); + /** compute averages & write data **/ Averages->Full(); Averages->Write(timestep); analysis.WriteVisData(timestep, current_db, *Averages, Phi, Pressure, Velocity, fq, Den ); @@ -1279,7 +1307,7 @@ double ScaLBL_ColorModel::SeedPhaseField(const double seed_water_in_oil){ // Need to initialize Aq, Bq, Den, Phi directly //ScaLBL_CopyToDevice(Phi,phase.data(),7*Np*sizeof(double)); ScaLBL_CopyToDevice(Aq, Aq_tmp, 7*Np*sizeof(double)); - ScaLBL_CopyToDevice(Bq, Bq_tmp, 7*Np*sizeof(double)); + ScaLBL_CopyToDevice(Bq, Bq_tmp, 7*Np*sizeof(double)); return(mass_loss); } From 57156d16fca13963fa85ed4167911bdca6465b86 Mon Sep 17 00:00:00 2001 From: Mark Berrill Date: Wed, 5 Feb 2020 07:35:13 -0500 Subject: [PATCH 047/121] Fixing build issue --- cmake/libraries.cmake | 1 + common/MPI.cpp | 2 +- gpu/Color.cu | 6 +----- gpu/D3Q19.cu | 1 - 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/cmake/libraries.cmake b/cmake/libraries.cmake index dca31ea9..43d2726e 100644 --- a/cmake/libraries.cmake +++ b/cmake/libraries.cmake @@ -308,5 +308,6 @@ MACRO ( CONFIGURE_LBPM ) # Suppress some common warnings IF ( USING_GCC ) SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-reorder -Wno-unused-parameter") + SET( CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -Wno-reorder,-Wno-unused-parameter") ENDIF() ENDMACRO () diff --git a/common/MPI.cpp b/common/MPI.cpp index 9495372d..8b09bc49 100644 --- a/common/MPI.cpp +++ b/common/MPI.cpp @@ -35,7 +35,7 @@ // Using MAC #define USE_MAC #include -#elif defined( __linux ) || defined( __unix ) || defined( __posix ) +#elif defined( __linux ) || defined( __linux__ ) || defined( __unix ) || defined( __posix ) // We are using linux #define USE_LINUX #include diff --git a/gpu/Color.cu b/gpu/Color.cu index 347858b9..30c16b51 100644 --- a/gpu/Color.cu +++ b/gpu/Color.cu @@ -128,7 +128,7 @@ __global__ void dvc_ScaLBL_Color_InitDistance(char *ID, double *Den, double *Ph __global__ void dvc_ScaLBL_Color_BC(int *list, int *Map, double *Phi, double *Den, double vA, double vB, int count, int Np) { - int idx,n,nm; + int idx,n; // Fill the outlet with component b idx = blockIdx.x*blockDim.x + threadIdx.x; if (idx < count){ @@ -3471,13 +3471,11 @@ __global__ void dvc_ScaLBL_D3Q19_AAeven_ColorMass(double *Aq, double *Bq, double double *Velocity, double *ColorGrad, double beta, int 
start, int finish, int Np){ int n; - double fq; // non-conserved moments double nA,nB; // number density double a1,b1,a2,b2,nAB,delta; double C,nx,ny,nz; //color gradient magnitude and direction double ux,uy,uz; - double phi,tau,rho0,rlx_setA,rlx_setB; int S = Np/NBLOCKS/NTHREADS + 1; for (int s=0; s Date: Wed, 12 Feb 2020 14:19:16 -0500 Subject: [PATCH 048/121] support for grid file in MRT model --- models/MRTModel.cpp | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/models/MRTModel.cpp b/models/MRTModel.cpp index 2ddba403..60847e54 100644 --- a/models/MRTModel.cpp +++ b/models/MRTModel.cpp @@ -3,6 +3,7 @@ */ #include "models/MRTModel.h" #include "analysis/distance.h" +#include "common/ReadMicroCT.h" ScaLBL_MRTModel::ScaLBL_MRTModel(int RANK, int NP, const Utilities::MPI& COMM): rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tau(0), @@ -98,15 +99,29 @@ void ScaLBL_MRTModel::ReadInput(){ sprintf(LocalRankFilename,"%s%s","ID.",LocalRankString); sprintf(LocalRestartFile,"%s%s","Restart.",LocalRankString); - if (domain_db->keyExists( "Filename" )){ - auto Filename = domain_db->getScalar( "Filename" ); - Mask->Decomp(Filename); - } - else{ - Mask->ReadIDs(); - } + + if (domain_db->keyExists( "Filename" )){ + auto Filename = domain_db->getScalar( "Filename" ); + Mask->Decomp(Filename); + } + else if (domain_db->keyExists( "GridFile" )){ + // Read the local domain data + auto input_id = readMicroCT( *domain_db, comm ); + // Fill the halo (assuming GCW of 1) + array size0 = { (int) input_id.size(0), (int) input_id.size(1), (int) input_id.size(2) }; + ArraySize size1 = { (size_t) Mask->Nx, (size_t) Mask->Ny, (size_t) Mask->Nz }; + ASSERT( (int) size1[0] == size0[0]+2 && (int) size1[1] == size0[1]+2 && (int) size1[2] == size0[2]+2 ); + fillHalo fill( comm, Mask->rank_info, size0, { 1, 1, 1 }, 0, 1 ); + Array id_view; + id_view.viewRaw( size1, Mask->id ); + fill.copy( input_id, id_view ); + fill.fill( id_view ); + } + else{ + Mask->ReadIDs(); + } - // Generate the signed distance map + // Generate the signed distance map // Initialize the domain and communication Array id_solid(Nx,Ny,Nz); // Solve for the position of the solid phase From 46c407695620df3b45f3eabba073576b643c453f Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Mon, 17 Feb 2020 12:06:58 -0500 Subject: [PATCH 049/121] fix some typo --- models/ColorModel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index b8578f4e..36c40224 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -838,7 +838,7 @@ void ScaLBL_ColorModel::Run(){ delta_volume_target=Dm->Volume*(volA+(log_krA_target - log_krA)/slope_krA_volume); log_krA_prev = log_krA; volA_prev = volA; - printf(" ",log_krA, log_krA_target, vol_A, ); + printf(" ",log_krA, log_krA_target, volA); /** compute averages & write data **/ Averages->Full(); Averages->Write(timestep); From 586bc09f842efac51a399a5842e5469399f8a4eb Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Fri, 21 Feb 2020 11:11:59 -0500 Subject: [PATCH 050/121] fix print bug --- models/ColorModel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index b8578f4e..097d53e4 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -838,7 +838,7 @@ void ScaLBL_ColorModel::Run(){ delta_volume_target=Dm->Volume*(volA+(log_krA_target - log_krA)/slope_krA_volume); log_krA_prev = log_krA; volA_prev = volA; - printf(" 
",log_krA, log_krA_target, vol_A, ); + printf(" log(kr)=%f, TARGET log(kr)=%f, volume=%f \n",log_krA, log_krA_target, vol_A, ); /** compute averages & write data **/ Averages->Full(); Averages->Write(timestep); From a42a0c84408d652d382cf8b5c66910a89718d44c Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Fri, 21 Feb 2020 11:16:26 -0500 Subject: [PATCH 051/121] fix print bug --- models/ColorModel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 097d53e4..1f695bed 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -838,7 +838,7 @@ void ScaLBL_ColorModel::Run(){ delta_volume_target=Dm->Volume*(volA+(log_krA_target - log_krA)/slope_krA_volume); log_krA_prev = log_krA; volA_prev = volA; - printf(" log(kr)=%f, TARGET log(kr)=%f, volume=%f \n",log_krA, log_krA_target, vol_A, ); + printf(" log(kr)=%f, TARGET log(kr)=%f, volume=%f \n",log_krA, log_krA_target, vol_A); /** compute averages & write data **/ Averages->Full(); Averages->Write(timestep); From b99d32ef0c120e9fa7d9399d7a68b4c4a5b85779 Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Fri, 21 Feb 2020 11:28:18 -0500 Subject: [PATCH 052/121] fix print --- models/ColorModel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 82680a1c..8293e09f 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -838,7 +838,7 @@ void ScaLBL_ColorModel::Run(){ delta_volume_target=Dm->Volume*(volA+(log_krA_target - log_krA)/slope_krA_volume); log_krA_prev = log_krA; volA_prev = volA; - printf(" log(kr)=%f, TARGET log(kr)=%f, volume=%f \n",log_krA, log_krA_target, volA); + printf(" log(kr)=%f, volume=%f, TARGET log(kr)=%f, volume change=%f \n",log_krA, volA, log_krA_target, delta_volume_target/(volA*Dm->Volume)); /** compute averages & write data **/ Averages->Full(); Averages->Write(timestep); From 81a25b99977f03f729fffd20d9d8938ca455eeb6 Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Fri, 21 Feb 2020 11:43:58 -0500 Subject: [PATCH 053/121] try for better Ca target --- models/ColorModel.cpp | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 8293e09f..2c773c8e 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -521,8 +521,6 @@ void ScaLBL_ColorModel::Run(){ double NOISE_THRESHOLD = 0.0; double BUMP_RATE = 2.0; bool USE_BUMP_RATE = false; - int RESCALE_FORCE_COUNT = 0; - int RESCALE_FORCE_MAX = 0; /* history for morphological algoirthm */ double KRA_MORPH_FACTOR=0.8; @@ -801,6 +799,20 @@ void ScaLBL_ColorModel::Run(){ double flow_rate_B = volB*(vB_x*dir_x + vB_y*dir_y + vB_z*dir_z); double Ca = fabs(muA*flow_rate_A + muB*flow_rate_B)/(5.796*alpha); + if (SET_CAPILLARY_NUMBER && CURRENT_STEADY_TIMESTEPS%MIN_STEADY_TIMESTEPS < analysis_interval ){ + Fx *= capillary_number / Ca; + Fy *= capillary_number / Ca; + Fz *= capillary_number / Ca; + if (force_mag > 1e-3){ + Fx *= 1e-3/force_mag; // impose ceiling for stability + Fy *= 1e-3/force_mag; + Fz *= 1e-3/force_mag; + } + if (rank == 0) printf(" -- adjust force by factor %f \n ",capillary_number / Ca); + Averages->SetParams(rhoA,rhoB,tauA,tauB,Fx,Fy,Fz,alpha,beta); + color_db->putVector("F",{Fx,Fy,Fz}); + } + if ( morph_timesteps > morph_interval ){ bool isSteady = false; @@ -808,23 +820,6 @@ void ScaLBL_ColorModel::Run(){ isSteady = true; if (CURRENT_STEADY_TIMESTEPS > MAX_STEADY_TIMESTEPS) isSteady = true; - - if 
(SET_CAPILLARY_NUMBER && RESCALE_FORCE_COUNT < RESCALE_FORCE_MAX){ - RESCALE_FORCE_COUNT++; - Fx *= capillary_number / Ca; - Fy *= capillary_number / Ca; - Fz *= capillary_number / Ca; - - if (force_mag > 1e-3){ - Fx *= 1e-3/force_mag; // impose ceiling for stability - Fy *= 1e-3/force_mag; - Fz *= 1e-3/force_mag; - } - - if (rank == 0) printf(" -- adjust force by factor %f \n ",capillary_number / Ca); - Averages->SetParams(rhoA,rhoB,tauA,tauB,Fx,Fy,Fz,alpha,beta); - color_db->putVector("F",{Fx,Fy,Fz}); - } if ( isSteady ){ MORPH_ADAPT = true; @@ -913,7 +908,6 @@ void ScaLBL_ColorModel::Run(){ Fx *= capillary_number / Ca; Fy *= capillary_number / Ca; Fz *= capillary_number / Ca; - RESCALE_FORCE_COUNT = 1; if (force_mag > 1e-3){ Fx *= 1e-3/force_mag; // impose ceiling for stability Fy *= 1e-3/force_mag; @@ -933,6 +927,7 @@ void ScaLBL_ColorModel::Run(){ Averages->SetParams(rhoA,rhoB,tauA,tauB,Fx,Fy,Fz,alpha,beta); color_db->putVector("F",{Fx,Fy,Fz}); } + CURRENT_STEADY_TIMESTEPS = 0; } else{ From 1694f4530ccaeb93aa484ad8883a320c0c714b4c Mon Sep 17 00:00:00 2001 From: Rex Zhe Li Date: Fri, 21 Feb 2020 21:22:54 -0500 Subject: [PATCH 054/121] comment out the variable rescale_force_count that was deprecated --- models/ColorModel.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 2c773c8e..3fef03d1 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -583,9 +583,9 @@ void ScaLBL_ColorModel::Run(){ SET_CAPILLARY_NUMBER=true; //RESCALE_FORCE_MAX = 1; } - if (analysis_db->keyExists( "rescale_force_count" )){ - RESCALE_FORCE_MAX = analysis_db->getScalar( "rescale_force_count" ); - } +// if (analysis_db->keyExists( "rescale_force_count" )){ +// RESCALE_FORCE_MAX = analysis_db->getScalar( "rescale_force_count" ); +// } if (color_db->keyExists( "timestep" )){ timestep = color_db->getScalar( "timestep" ); } From fa61d19095187f8fac5c1671ee756eedda565c4d Mon Sep 17 00:00:00 2001 From: James McClure Date: Wed, 4 Mar 2020 14:50:53 -0500 Subject: [PATCH 055/121] Update helper functions to read input database --- example/Workflow/HelperFunctions.R | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/example/Workflow/HelperFunctions.R b/example/Workflow/HelperFunctions.R index 497cb262..6c8bd903 100644 --- a/example/Workflow/HelperFunctions.R +++ b/example/Workflow/HelperFunctions.R @@ -2,6 +2,31 @@ require("ggplot2") GG_THEME=theme_bw()+theme(panel.grid.major = element_blank(),panel.grid.minor = element_blank()) +ReadDatabase<-function(FILE){ + + INPUT<-gsub(';','',readLines(FILE)) + + S<-gsub('tauA = ','',gsub("\\s+"," ",(grep("tauA",INPUT,value=TRUE)))) + TAU_A = as.numeric(S) + S<-gsub('tauB = ','',gsub("\\s+"," ",(grep("tauB",INPUT,value=TRUE)))) + TAU_B = as.numeric(S) + S<-gsub('rhoA = ','',gsub("\\s+"," ",(grep("rhoA",INPUT,value=TRUE)))) + RHO_A = as.numeric(S) + S<-gsub('rhoB = ','',gsub("\\s+"," ",(grep("rhoB",INPUT,value=TRUE)))) + RHO_B = as.numeric(S) + + S<-gsub('alpha = ','',gsub("\\s+"," ",(grep("alpha",INPUT,value=TRUE)))) + ALPHA = as.numeric(S) + + # Read the affinity + S<-gsub('ComponentAffinity = ','',gsub("\\s+"," ",(grep("ComponentAffinity",INPUT,value=TRUE)))) + AFFINITY<-as.numeric(unlist(strsplit(S,", "))) + + PARAMETERS<-c(TAU_A,TAU_B,RHO_A,RHO_B,ALPHA,AFFINITY) + + return(PARAMETERS) +} + ReadSubphase<-function(PATH){ FILE=paste0(PATH,"/subphase.csv") S<-read.csv(FILE,head=TRUE,sep=" ") From 7bb01557d838a1231bc5780a5737f0af4da5d43c Mon Sep 17 00:00:00 2001 From: 
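PATCH 053 above moves the capillary-number rescaling out of the steady-state branch so the body force is adjusted every analysis interval during the early transient. A minimal self-contained sketch of that adjustment follows; the function name is illustrative, and the 1e-3 ceiling mirrors the stability limit that appears in the patch.

    #include <cmath>

    // Rescale the body force so the measured capillary number tracks the target,
    // then cap the force magnitude for numerical stability (cf. PATCH 053).
    static void rescale_force(double &Fx, double &Fy, double &Fz,
                              double Ca_measured, double Ca_target)
    {
        double scale = Ca_target / Ca_measured;   // proportional adjustment
        Fx *= scale; Fy *= scale; Fz *= scale;
        double force_mag = sqrt(Fx*Fx + Fy*Fy + Fz*Fz);
        if (force_mag > 1.0e-3) {                 // impose ceiling for stability
            Fx *= 1.0e-3 / force_mag;
            Fy *= 1.0e-3 / force_mag;
            Fz *= 1.0e-3 / force_mag;
        }
    }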
JamesEMcclure Date: Tue, 17 Mar 2020 13:45:51 -0400 Subject: [PATCH 056/121] updated bugfix with old ScaLBL --- common/ScaLBL.cpp | 251 +++++++++++++++++++++++----------------------- common/ScaLBL.h | 9 +- 2 files changed, 134 insertions(+), 126 deletions(-) diff --git a/common/ScaLBL.cpp b/common/ScaLBL.cpp index 6f2966e7..21656757 100644 --- a/common/ScaLBL.cpp +++ b/common/ScaLBL.cpp @@ -5,7 +5,9 @@ ScaLBL_Communicator::ScaLBL_Communicator(std::shared_ptr Dm){ Lock=false; // unlock the communicator //...................................................................................... // Create a separate copy of the communicator for the device - MPI_COMM_SCALBL = Dm->Comm.dup(); + //MPI_Comm_group(Dm->Comm,&Group); + //MPI_Comm_create(Dm->Comm,Group,&MPI_COMM_SCALBL); + MPI_Comm_dup(Dm->Comm,&MPI_COMM_SCALBL); //...................................................................................... // Copy the domain size and communication information directly from Dm Nx = Dm->Nx; @@ -213,7 +215,7 @@ ScaLBL_Communicator::ScaLBL_Communicator(std::shared_ptr Dm){ ScaLBL_CopyToZeroCopy(dvcRecvList_Yz,Dm->recvList_Yz,recvCount_Yz*sizeof(int)); //...................................................................................... - MPI_COMM_SCALBL.barrier(); + MPI_Barrier(MPI_COMM_SCALBL); //................................................................................... // Set up the recieve distribution lists @@ -286,7 +288,7 @@ ScaLBL_Communicator::ScaLBL_Communicator(std::shared_ptr Dm){ //................................................................................... //...................................................................................... - MPI_COMM_SCALBL.barrier(); + MPI_Barrier(MPI_COMM_SCALBL); ScaLBL_DeviceBarrier(); //...................................................................................... SendCount = sendCount_x+sendCount_X+sendCount_y+sendCount_Y+sendCount_z+sendCount_Z+ @@ -363,7 +365,7 @@ int ScaLBL_Communicator::MemoryOptimizedLayoutAA(IntArray &Map, int *neighborLis int idx,i,j,k,n; // Check that Map has size matching sub-domain - if ( (int) Map.size(0) != Nx) + if (Map.size(0) != Nx) ERROR("ScaLBL_Communicator::MemoryOptimizedLayout: Map array dimensions do not match! \n"); // Initialize Map @@ -867,8 +869,8 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(12,dvcSendList_x,3*sendCount_x,sendCount_x,sendbuf_x,dist,N); ScaLBL_D3Q19_Pack(14,dvcSendList_x,4*sendCount_x,sendCount_x,sendbuf_x,dist,N); - req1[0] = MPI_COMM_SCALBL.Isend(sendbuf_x, 5*sendCount_x,rank_x,sendtag); - req2[0] = MPI_COMM_SCALBL.Irecv(recvbuf_X, 5*recvCount_X,rank_X,recvtag); + MPI_Isend(sendbuf_x, 5*sendCount_x,MPI_DOUBLE,rank_x,sendtag,MPI_COMM_SCALBL,&req1[0]); + MPI_Irecv(recvbuf_X, 5*recvCount_X,MPI_DOUBLE,rank_X,recvtag,MPI_COMM_SCALBL,&req2[0]); //...Packing for X face(1,7,9,11,13)................................ 
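The first hunk of PATCH 056 above restores a raw MPI_Comm_dup for the communicator held by ScaLBL_Communicator. The point of the duplication is isolation: messages posted on the duplicated communicator can never match sends or receives posted on the parent by other parts of the application. A minimal sketch, with an illustrative helper name:

    #include <mpi.h>

    // Give ScaLBL its own communication context so its halo traffic cannot be
    // intercepted by unrelated point-to-point traffic on the parent communicator.
    static MPI_Comm make_private_comm(MPI_Comm parent)
    {
        MPI_Comm comm_private;
        MPI_Comm_dup(parent, &comm_private);   // collective over 'parent'
        return comm_private;
    }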
ScaLBL_D3Q19_Pack(1,dvcSendList_X,0,sendCount_X,sendbuf_X,dist,N); ScaLBL_D3Q19_Pack(7,dvcSendList_X,sendCount_X,sendCount_X,sendbuf_X,dist,N); @@ -876,8 +878,8 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(11,dvcSendList_X,3*sendCount_X,sendCount_X,sendbuf_X,dist,N); ScaLBL_D3Q19_Pack(13,dvcSendList_X,4*sendCount_X,sendCount_X,sendbuf_X,dist,N); - req1[1] = MPI_COMM_SCALBL.Isend(sendbuf_X, 5*sendCount_X,rank_X,sendtag); - req2[1] = MPI_COMM_SCALBL.Irecv(recvbuf_x, 5*recvCount_x,rank_x,recvtag); + MPI_Isend(sendbuf_X, 5*sendCount_X,MPI_DOUBLE,rank_X,sendtag,MPI_COMM_SCALBL,&req1[1]); + MPI_Irecv(recvbuf_x, 5*recvCount_x,MPI_DOUBLE,rank_x,recvtag,MPI_COMM_SCALBL,&req2[1]); //...Packing for y face(4,8,9,16,18)................................. ScaLBL_D3Q19_Pack(4,dvcSendList_y,0,sendCount_y,sendbuf_y,dist,N); ScaLBL_D3Q19_Pack(8,dvcSendList_y,sendCount_y,sendCount_y,sendbuf_y,dist,N); @@ -885,8 +887,8 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(16,dvcSendList_y,3*sendCount_y,sendCount_y,sendbuf_y,dist,N); ScaLBL_D3Q19_Pack(18,dvcSendList_y,4*sendCount_y,sendCount_y,sendbuf_y,dist,N); - req1[2] = MPI_COMM_SCALBL.Isend(sendbuf_y, 5*sendCount_y,rank_y,sendtag); - req2[2] = MPI_COMM_SCALBL.Irecv(recvbuf_Y, 5*recvCount_Y,rank_Y,recvtag); + MPI_Isend(sendbuf_y, 5*sendCount_y,MPI_DOUBLE,rank_y,sendtag,MPI_COMM_SCALBL,&req1[2]); + MPI_Irecv(recvbuf_Y, 5*recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,MPI_COMM_SCALBL,&req2[2]); //...Packing for Y face(3,7,10,15,17)................................. ScaLBL_D3Q19_Pack(3,dvcSendList_Y,0,sendCount_Y,sendbuf_Y,dist,N); ScaLBL_D3Q19_Pack(7,dvcSendList_Y,sendCount_Y,sendCount_Y,sendbuf_Y,dist,N); @@ -894,8 +896,8 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(15,dvcSendList_Y,3*sendCount_Y,sendCount_Y,sendbuf_Y,dist,N); ScaLBL_D3Q19_Pack(17,dvcSendList_Y,4*sendCount_Y,sendCount_Y,sendbuf_Y,dist,N); - req1[3] = MPI_COMM_SCALBL.Isend(sendbuf_Y, 5*sendCount_Y,rank_Y,sendtag); - req2[3] = MPI_COMM_SCALBL.Irecv(recvbuf_y, 5*recvCount_y,rank_y,recvtag); + MPI_Isend(sendbuf_Y, 5*sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,MPI_COMM_SCALBL,&req1[3]); + MPI_Irecv(recvbuf_y, 5*recvCount_y,MPI_DOUBLE,rank_y,recvtag,MPI_COMM_SCALBL,&req2[3]); //...Packing for z face(6,12,13,16,17)................................ ScaLBL_D3Q19_Pack(6,dvcSendList_z,0,sendCount_z,sendbuf_z,dist,N); ScaLBL_D3Q19_Pack(12,dvcSendList_z,sendCount_z,sendCount_z,sendbuf_z,dist,N); @@ -903,8 +905,8 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(16,dvcSendList_z,3*sendCount_z,sendCount_z,sendbuf_z,dist,N); ScaLBL_D3Q19_Pack(17,dvcSendList_z,4*sendCount_z,sendCount_z,sendbuf_z,dist,N); - req1[4] = MPI_COMM_SCALBL.Isend(sendbuf_z, 5*sendCount_z,rank_z,sendtag); - req2[4] = MPI_COMM_SCALBL.Irecv(recvbuf_Z, 5*recvCount_Z,rank_Z,recvtag); + MPI_Isend(sendbuf_z, 5*sendCount_z,MPI_DOUBLE,rank_z,sendtag,MPI_COMM_SCALBL,&req1[4]); + MPI_Irecv(recvbuf_Z, 5*recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,MPI_COMM_SCALBL,&req2[4]); //...Packing for Z face(5,11,14,15,18)................................ 
ScaLBL_D3Q19_Pack(5,dvcSendList_Z,0,sendCount_Z,sendbuf_Z,dist,N); @@ -913,57 +915,57 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(15,dvcSendList_Z,3*sendCount_Z,sendCount_Z,sendbuf_Z,dist,N); ScaLBL_D3Q19_Pack(18,dvcSendList_Z,4*sendCount_Z,sendCount_Z,sendbuf_Z,dist,N); - req1[5] = MPI_COMM_SCALBL.Isend(sendbuf_Z, 5*sendCount_Z,rank_Z,sendtag); - req2[5] = MPI_COMM_SCALBL.Irecv(recvbuf_z, 5*recvCount_z,rank_z,recvtag); + MPI_Isend(sendbuf_Z, 5*sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,MPI_COMM_SCALBL,&req1[5]); + MPI_Irecv(recvbuf_z, 5*recvCount_z,MPI_DOUBLE,rank_z,recvtag,MPI_COMM_SCALBL,&req2[5]); //...Pack the xy edge (8)................................ ScaLBL_D3Q19_Pack(8,dvcSendList_xy,0,sendCount_xy,sendbuf_xy,dist,N); - req1[6] = MPI_COMM_SCALBL.Isend(sendbuf_xy, sendCount_xy,rank_xy,sendtag); - req2[6] = MPI_COMM_SCALBL.Irecv(recvbuf_XY, recvCount_XY,rank_XY,recvtag); + MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_xy,sendtag,MPI_COMM_SCALBL,&req1[6]); + MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,MPI_COMM_SCALBL,&req2[6]); //...Pack the Xy edge (9)................................ ScaLBL_D3Q19_Pack(9,dvcSendList_Xy,0,sendCount_Xy,sendbuf_Xy,dist,N); - req1[8] = MPI_COMM_SCALBL.Isend(sendbuf_Xy, sendCount_Xy,rank_Xy,sendtag); - req2[8] = MPI_COMM_SCALBL.Irecv(recvbuf_xY, recvCount_xY,rank_xY,recvtag); + MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_Xy,sendtag,MPI_COMM_SCALBL,&req1[8]); + MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,MPI_COMM_SCALBL,&req2[8]); //...Pack the xY edge (10)................................ ScaLBL_D3Q19_Pack(10,dvcSendList_xY,0,sendCount_xY,sendbuf_xY,dist,N); - req1[9] = MPI_COMM_SCALBL.Isend(sendbuf_xY, sendCount_xY,rank_xY,sendtag); - req2[9] = MPI_COMM_SCALBL.Irecv(recvbuf_Xy, recvCount_Xy,rank_Xy,recvtag); + MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_xY,sendtag,MPI_COMM_SCALBL,&req1[9]); + MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,MPI_COMM_SCALBL,&req2[9]); //...Pack the XY edge (7)................................ ScaLBL_D3Q19_Pack(7,dvcSendList_XY,0,sendCount_XY,sendbuf_XY,dist,N); - req1[7] = MPI_COMM_SCALBL.Isend(sendbuf_XY, sendCount_XY,rank_XY,sendtag); - req2[7] = MPI_COMM_SCALBL.Irecv(recvbuf_xy, recvCount_xy,rank_xy,recvtag); + MPI_Isend(sendbuf_XY, sendCount_XY,MPI_DOUBLE,rank_XY,sendtag,MPI_COMM_SCALBL,&req1[7]); + MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,MPI_COMM_SCALBL,&req2[7]); //...Pack the xz edge (12)................................ ScaLBL_D3Q19_Pack(12,dvcSendList_xz,0,sendCount_xz,sendbuf_xz,dist,N); - req1[10] = MPI_COMM_SCALBL.Isend(sendbuf_xz, sendCount_xz,rank_xz,sendtag); - req2[10] = MPI_COMM_SCALBL.Irecv(recvbuf_XZ, recvCount_XZ,rank_XZ,recvtag); + MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_xz,sendtag,MPI_COMM_SCALBL,&req1[10]); + MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,MPI_COMM_SCALBL,&req2[10]); //...Pack the xZ edge (14)................................ ScaLBL_D3Q19_Pack(14,dvcSendList_xZ,0,sendCount_xZ,sendbuf_xZ,dist,N); - req1[13] = MPI_COMM_SCALBL.Isend(sendbuf_xZ, sendCount_xZ,rank_xZ,sendtag); - req2[13] = MPI_COMM_SCALBL.Irecv(recvbuf_Xz, recvCount_Xz,rank_Xz,recvtag); + MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_xZ,sendtag,MPI_COMM_SCALBL,&req1[13]); + MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,MPI_COMM_SCALBL,&req2[13]); //...Pack the Xz edge (13)................................ 
ScaLBL_D3Q19_Pack(13,dvcSendList_Xz,0,sendCount_Xz,sendbuf_Xz,dist,N); - req1[12] = MPI_COMM_SCALBL.Isend(sendbuf_Xz, sendCount_Xz,rank_Xz,sendtag); - req2[12] = MPI_COMM_SCALBL.Irecv(recvbuf_xZ, recvCount_xZ,rank_xZ,recvtag); + MPI_Isend(sendbuf_Xz, sendCount_Xz,MPI_DOUBLE,rank_Xz,sendtag,MPI_COMM_SCALBL,&req1[12]); + MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,MPI_COMM_SCALBL,&req2[12]); //...Pack the XZ edge (11)................................ ScaLBL_D3Q19_Pack(11,dvcSendList_XZ,0,sendCount_XZ,sendbuf_XZ,dist,N); - req1[11] = MPI_COMM_SCALBL.Isend(sendbuf_XZ, sendCount_XZ,rank_XZ,sendtag); - req2[11] = MPI_COMM_SCALBL.Irecv(recvbuf_xz, recvCount_xz,rank_xz,recvtag); + MPI_Isend(sendbuf_XZ, sendCount_XZ,MPI_DOUBLE,rank_XZ,sendtag,MPI_COMM_SCALBL,&req1[11]); + MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,MPI_COMM_SCALBL,&req2[11]); //...Pack the yz edge (16)................................ ScaLBL_D3Q19_Pack(16,dvcSendList_yz,0,sendCount_yz,sendbuf_yz,dist,N); - req1[14] = MPI_COMM_SCALBL.Isend(sendbuf_yz, sendCount_yz,rank_yz,sendtag); - req2[14] = MPI_COMM_SCALBL.Irecv(recvbuf_YZ, recvCount_YZ,rank_YZ,recvtag); + MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_yz,sendtag,MPI_COMM_SCALBL,&req1[14]); + MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,MPI_COMM_SCALBL,&req2[14]); //...Pack the yZ edge (18)................................ ScaLBL_D3Q19_Pack(18,dvcSendList_yZ,0,sendCount_yZ,sendbuf_yZ,dist,N); - req1[17] = MPI_COMM_SCALBL.Isend(sendbuf_yZ, sendCount_yZ,rank_yZ,sendtag); - req2[17] = MPI_COMM_SCALBL.Irecv(recvbuf_Yz, recvCount_Yz,rank_Yz,recvtag); + MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_yZ,sendtag,MPI_COMM_SCALBL,&req1[17]); + MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,MPI_COMM_SCALBL,&req2[17]); //...Pack the Yz edge (17)................................ ScaLBL_D3Q19_Pack(17,dvcSendList_Yz,0,sendCount_Yz,sendbuf_Yz,dist,N); - req1[16] = MPI_COMM_SCALBL.Isend(sendbuf_Yz, sendCount_Yz,rank_Yz,sendtag); - req2[16] = MPI_COMM_SCALBL.Irecv(recvbuf_yZ, recvCount_yZ,rank_yZ,recvtag); + MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_Yz,sendtag,MPI_COMM_SCALBL,&req1[16]); + MPI_Irecv(recvbuf_yZ, recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,MPI_COMM_SCALBL,&req2[16]); //...Pack the YZ edge (15)................................ ScaLBL_D3Q19_Pack(15,dvcSendList_YZ,0,sendCount_YZ,sendbuf_YZ,dist,N); - req1[15] = MPI_COMM_SCALBL.Isend(sendbuf_YZ, sendCount_YZ,rank_YZ,sendtag); - req2[15] = MPI_COMM_SCALBL.Irecv(recvbuf_yz, recvCount_yz,rank_yz,recvtag); + MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_YZ,sendtag,MPI_COMM_SCALBL,&req1[15]); + MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,MPI_COMM_SCALBL,&req2[15]); //................................................................................... } @@ -973,8 +975,8 @@ void ScaLBL_Communicator::RecvD3Q19AA(double *dist){ // NOTE: the center distribution f0 must NOT be at the start of feven, provide offset to start of f2 //................................................................................... // Wait for completion of D3Q19 communication - MPI_COMM_SCALBL.waitAll(18,req1); - MPI_COMM_SCALBL.waitAll(18,req2); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); ScaLBL_DeviceBarrier(); //................................................................................... 
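The hunks above and below convert every halo exchange in ScaLBL.cpp from the Utilities::MPI wrapper back to raw MPI calls. The pattern being preserved is the standard pack / MPI_Isend / MPI_Irecv / MPI_Waitall sequence; stripped of the ScaLBL buffer bookkeeping it reduces to the sketch below, where the buffer and rank names are placeholders.

    #include <mpi.h>
    #include <vector>

    // Post the send and receive for one face, then wait. In ScaLBL the waits are
    // deferred so that all 18 faces/edges/corners are in flight at once.
    static void exchange_face(MPI_Comm comm, int rank_to, int rank_from, int tag,
                              const std::vector<double> &sendbuf, std::vector<double> &recvbuf)
    {
        MPI_Request req[2];
        MPI_Status  stat[2];
        MPI_Isend(sendbuf.data(), (int) sendbuf.size(), MPI_DOUBLE, rank_to,   tag, comm, &req[0]);
        MPI_Irecv(recvbuf.data(), (int) recvbuf.size(), MPI_DOUBLE, rank_from, tag, comm, &req[1]);
        MPI_Waitall(2, req, stat);
    }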
@@ -1057,8 +1059,8 @@ void ScaLBL_Communicator::RecvGrad(double *phi, double *grad){ // Recieves halo and incorporates into D3Q19 based stencil gradient computation //................................................................................... // Wait for completion of D3Q19 communication - MPI_COMM_SCALBL.waitAll(18,req1); - MPI_COMM_SCALBL.waitAll(18,req2); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); ScaLBL_DeviceBarrier(); //................................................................................... @@ -1151,36 +1153,36 @@ void ScaLBL_Communicator::BiSendD3Q7AA(double *Aq, double *Bq){ ScaLBL_D3Q19_Pack(2,dvcSendList_x,0,sendCount_x,sendbuf_x,Aq,N); ScaLBL_D3Q19_Pack(2,dvcSendList_x,sendCount_x,sendCount_x,sendbuf_x,Bq,N); - req1[0] = MPI_COMM_SCALBL.Isend(sendbuf_x, 2*sendCount_x,rank_x,sendtag); - req2[0] = MPI_COMM_SCALBL.Irecv(recvbuf_X, 2*recvCount_X,rank_X,recvtag); + MPI_Isend(sendbuf_x, 2*sendCount_x,MPI_DOUBLE,rank_x,sendtag,MPI_COMM_SCALBL,&req1[0]); + MPI_Irecv(recvbuf_X, 2*recvCount_X,MPI_DOUBLE,rank_X,recvtag,MPI_COMM_SCALBL,&req2[0]); //...Packing for X face(1,7,9,11,13)................................ ScaLBL_D3Q19_Pack(1,dvcSendList_X,0,sendCount_X,sendbuf_X,Aq,N); ScaLBL_D3Q19_Pack(1,dvcSendList_X,sendCount_X,sendCount_X,sendbuf_X,Bq,N); - req1[1] = MPI_COMM_SCALBL.Isend(sendbuf_X, 2*sendCount_X,rank_X,sendtag); - req2[1] = MPI_COMM_SCALBL.Irecv(recvbuf_x, 2*recvCount_x,rank_x,recvtag); + MPI_Isend(sendbuf_X, 2*sendCount_X,MPI_DOUBLE,rank_X,sendtag,MPI_COMM_SCALBL,&req1[1]); + MPI_Irecv(recvbuf_x, 2*recvCount_x,MPI_DOUBLE,rank_x,recvtag,MPI_COMM_SCALBL,&req2[1]); //...Packing for y face(4,8,9,16,18)................................. ScaLBL_D3Q19_Pack(4,dvcSendList_y,0,sendCount_y,sendbuf_y,Aq,N); ScaLBL_D3Q19_Pack(4,dvcSendList_y,sendCount_y,sendCount_y,sendbuf_y,Bq,N); - req1[2] = MPI_COMM_SCALBL.Isend(sendbuf_y, 2*sendCount_y,rank_y,sendtag); - req2[2] = MPI_COMM_SCALBL.Irecv(recvbuf_Y, 2*recvCount_Y,rank_Y,recvtag); + MPI_Isend(sendbuf_y, 2*sendCount_y,MPI_DOUBLE,rank_y,sendtag,MPI_COMM_SCALBL,&req1[2]); + MPI_Irecv(recvbuf_Y, 2*recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,MPI_COMM_SCALBL,&req2[2]); //...Packing for Y face(3,7,10,15,17)................................. ScaLBL_D3Q19_Pack(3,dvcSendList_Y,0,sendCount_Y,sendbuf_Y,Aq,N); ScaLBL_D3Q19_Pack(3,dvcSendList_Y,sendCount_Y,sendCount_Y,sendbuf_Y,Bq,N); - req1[3] = MPI_COMM_SCALBL.Isend(sendbuf_Y, 2*sendCount_Y,rank_Y,sendtag); - req2[3] = MPI_COMM_SCALBL.Irecv(recvbuf_y, 2*recvCount_y,rank_y,recvtag); + MPI_Isend(sendbuf_Y, 2*sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,MPI_COMM_SCALBL,&req1[3]); + MPI_Irecv(recvbuf_y, 2*recvCount_y,MPI_DOUBLE,rank_y,recvtag,MPI_COMM_SCALBL,&req2[3]); //...Packing for z face(6,12,13,16,17)................................ ScaLBL_D3Q19_Pack(6,dvcSendList_z,0,sendCount_z,sendbuf_z,Aq,N); ScaLBL_D3Q19_Pack(6,dvcSendList_z,sendCount_z,sendCount_z,sendbuf_z,Bq,N); - req1[4] = MPI_COMM_SCALBL.Isend(sendbuf_z, 2*sendCount_z,rank_z,sendtag); - req2[4] = MPI_COMM_SCALBL.Irecv(recvbuf_Z, 2*recvCount_Z,rank_Z,recvtag); + MPI_Isend(sendbuf_z, 2*sendCount_z,MPI_DOUBLE,rank_z,sendtag,MPI_COMM_SCALBL,&req1[4]); + MPI_Irecv(recvbuf_Z, 2*recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,MPI_COMM_SCALBL,&req2[4]); //...Packing for Z face(5,11,14,15,18)................................ 
ScaLBL_D3Q19_Pack(5,dvcSendList_Z,0,sendCount_Z,sendbuf_Z,Aq,N); @@ -1188,8 +1190,8 @@ void ScaLBL_Communicator::BiSendD3Q7AA(double *Aq, double *Bq){ //................................................................................... // Send all the distributions - req1[5] = MPI_COMM_SCALBL.Isend(sendbuf_Z, 2*sendCount_Z,rank_Z,sendtag); - req2[5] = MPI_COMM_SCALBL.Irecv(recvbuf_z, 2*recvCount_z,rank_z,recvtag); + MPI_Isend(sendbuf_Z, 2*sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,MPI_COMM_SCALBL,&req1[5]); + MPI_Irecv(recvbuf_z, 2*recvCount_z,MPI_DOUBLE,rank_z,recvtag,MPI_COMM_SCALBL,&req2[5]); } @@ -1199,8 +1201,8 @@ void ScaLBL_Communicator::BiRecvD3Q7AA(double *Aq, double *Bq){ // NOTE: the center distribution f0 must NOT be at the start of feven, provide offset to start of f2 //................................................................................... // Wait for completion of D3Q19 communication - MPI_COMM_SCALBL.waitAll(6,req1); - MPI_COMM_SCALBL.waitAll(6,req2); + MPI_Waitall(6,req1,stat1); + MPI_Waitall(6,req2,stat2); ScaLBL_DeviceBarrier(); //................................................................................... @@ -1291,18 +1293,18 @@ void ScaLBL_Communicator::TriSendD3Q7AA(double *Aq, double *Bq, double *Cq){ //................................................................................... // Send all the distributions - req1[0] = MPI_COMM_SCALBL.Isend(sendbuf_x, 3*sendCount_x,rank_x,sendtag); - req2[0] = MPI_COMM_SCALBL.Irecv(recvbuf_X, 3*recvCount_X,rank_X,recvtag); - req1[1] = MPI_COMM_SCALBL.Isend(sendbuf_X, 3*sendCount_X,rank_X,sendtag); - req2[1] = MPI_COMM_SCALBL.Irecv(recvbuf_x, 3*recvCount_x,rank_x,recvtag); - req1[2] = MPI_COMM_SCALBL.Isend(sendbuf_y, 3*sendCount_y,rank_y,sendtag); - req2[2] = MPI_COMM_SCALBL.Irecv(recvbuf_Y, 3*recvCount_Y,rank_Y,recvtag); - req1[3] = MPI_COMM_SCALBL.Isend(sendbuf_Y, 3*sendCount_Y,rank_Y,sendtag); - req2[3] = MPI_COMM_SCALBL.Irecv(recvbuf_y, 3*recvCount_y,rank_y,recvtag); - req1[4] = MPI_COMM_SCALBL.Isend(sendbuf_z, 3*sendCount_z,rank_z,sendtag); - req2[4] = MPI_COMM_SCALBL.Irecv(recvbuf_Z, 3*recvCount_Z,rank_Z,recvtag); - req1[5] = MPI_COMM_SCALBL.Isend(sendbuf_Z, 3*sendCount_Z,rank_Z,sendtag); - req2[5] = MPI_COMM_SCALBL.Irecv(recvbuf_z, 3*recvCount_z,rank_z,recvtag); + MPI_Isend(sendbuf_x, 3*sendCount_x,MPI_DOUBLE,rank_x,sendtag,MPI_COMM_SCALBL,&req1[0]); + MPI_Irecv(recvbuf_X, 3*recvCount_X,MPI_DOUBLE,rank_X,recvtag,MPI_COMM_SCALBL,&req2[0]); + MPI_Isend(sendbuf_X, 3*sendCount_X,MPI_DOUBLE,rank_X,sendtag,MPI_COMM_SCALBL,&req1[1]); + MPI_Irecv(recvbuf_x, 3*recvCount_x,MPI_DOUBLE,rank_x,recvtag,MPI_COMM_SCALBL,&req2[1]); + MPI_Isend(sendbuf_y, 3*sendCount_y,MPI_DOUBLE,rank_y,sendtag,MPI_COMM_SCALBL,&req1[2]); + MPI_Irecv(recvbuf_Y, 3*recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,MPI_COMM_SCALBL,&req2[2]); + MPI_Isend(sendbuf_Y, 3*sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,MPI_COMM_SCALBL,&req1[3]); + MPI_Irecv(recvbuf_y, 3*recvCount_y,MPI_DOUBLE,rank_y,recvtag,MPI_COMM_SCALBL,&req2[3]); + MPI_Isend(sendbuf_z, 3*sendCount_z,MPI_DOUBLE,rank_z,sendtag,MPI_COMM_SCALBL,&req1[4]); + MPI_Irecv(recvbuf_Z, 3*recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,MPI_COMM_SCALBL,&req2[4]); + MPI_Isend(sendbuf_Z, 3*sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,MPI_COMM_SCALBL,&req1[5]); + MPI_Irecv(recvbuf_z, 3*recvCount_z,MPI_DOUBLE,rank_z,recvtag,MPI_COMM_SCALBL,&req2[5]); } @@ -1312,8 +1314,8 @@ void ScaLBL_Communicator::TriRecvD3Q7AA(double *Aq, double *Bq, double *Cq){ // NOTE: the center distribution f0 must NOT be at the start of feven, provide offset 
to start of f2 //................................................................................... // Wait for completion of D3Q19 communication - MPI_COMM_SCALBL.waitAll(6,req1); - MPI_COMM_SCALBL.waitAll(6,req2); + MPI_Waitall(6,req1,stat1); + MPI_Waitall(6,req2,stat2); ScaLBL_DeviceBarrier(); //................................................................................... @@ -1407,49 +1409,49 @@ void ScaLBL_Communicator::SendHalo(double *data){ // Send / Recv all the phase indcator field values //................................................................................... - req1[0] = MPI_COMM_SCALBL.Isend(sendbuf_x, sendCount_x,rank_x,sendtag); - req2[0] = MPI_COMM_SCALBL.Irecv(recvbuf_X, recvCount_X,rank_X,recvtag); - req1[1] = MPI_COMM_SCALBL.Isend(sendbuf_X, sendCount_X,rank_X,sendtag); - req2[1] = MPI_COMM_SCALBL.Irecv(recvbuf_x, recvCount_x,rank_x,recvtag); - req1[2] = MPI_COMM_SCALBL.Isend(sendbuf_y, sendCount_y,rank_y,sendtag); - req2[2] = MPI_COMM_SCALBL.Irecv(recvbuf_Y, recvCount_Y,rank_Y,recvtag); - req1[3] = MPI_COMM_SCALBL.Isend(sendbuf_Y, sendCount_Y,rank_Y,sendtag); - req2[3] = MPI_COMM_SCALBL.Irecv(recvbuf_y, recvCount_y,rank_y,recvtag); - req1[4] = MPI_COMM_SCALBL.Isend(sendbuf_z, sendCount_z,rank_z,sendtag); - req2[4] = MPI_COMM_SCALBL.Irecv(recvbuf_Z, recvCount_Z,rank_Z,recvtag); - req1[5] = MPI_COMM_SCALBL.Isend(sendbuf_Z, sendCount_Z,rank_Z,sendtag); - req2[5] = MPI_COMM_SCALBL.Irecv(recvbuf_z, recvCount_z,rank_z,recvtag); - req1[6] = MPI_COMM_SCALBL.Isend(sendbuf_xy, sendCount_xy,rank_xy,sendtag); - req2[6] = MPI_COMM_SCALBL.Irecv(recvbuf_XY, recvCount_XY,rank_XY,recvtag); - req1[7] = MPI_COMM_SCALBL.Isend(sendbuf_XY, sendCount_XY,rank_XY,sendtag); - req2[7] = MPI_COMM_SCALBL.Irecv(recvbuf_xy, recvCount_xy,rank_xy,recvtag); - req1[8] = MPI_COMM_SCALBL.Isend(sendbuf_Xy, sendCount_Xy,rank_Xy,sendtag); - req2[8] = MPI_COMM_SCALBL.Irecv(recvbuf_xY, recvCount_xY,rank_xY,recvtag); - req1[9] = MPI_COMM_SCALBL.Isend(sendbuf_xY, sendCount_xY,rank_xY,sendtag); - req2[9] = MPI_COMM_SCALBL.Irecv(recvbuf_Xy, recvCount_Xy,rank_Xy,recvtag); - req1[10] = MPI_COMM_SCALBL.Isend(sendbuf_xz, sendCount_xz,rank_xz,sendtag); - req2[10] = MPI_COMM_SCALBL.Irecv(recvbuf_XZ, recvCount_XZ,rank_XZ,recvtag); - req1[11] = MPI_COMM_SCALBL.Isend(sendbuf_XZ, sendCount_XZ,rank_XZ,sendtag); - req2[11] = MPI_COMM_SCALBL.Irecv(recvbuf_xz, recvCount_xz,rank_xz,recvtag); - req1[12] = MPI_COMM_SCALBL.Isend(sendbuf_Xz, sendCount_Xz,rank_Xz,sendtag); - req2[12] = MPI_COMM_SCALBL.Irecv(recvbuf_xZ, recvCount_xZ,rank_xZ,recvtag); - req1[13] = MPI_COMM_SCALBL.Isend(sendbuf_xZ, sendCount_xZ,rank_xZ,sendtag); - req2[13] = MPI_COMM_SCALBL.Irecv(recvbuf_Xz, recvCount_Xz,rank_Xz,recvtag); - req1[14] = MPI_COMM_SCALBL.Isend(sendbuf_yz, sendCount_yz,rank_yz,sendtag); - req2[14] = MPI_COMM_SCALBL.Irecv(recvbuf_YZ, recvCount_YZ,rank_YZ,recvtag); - req1[15] = MPI_COMM_SCALBL.Isend(sendbuf_YZ, sendCount_YZ,rank_YZ,sendtag); - req2[15] = MPI_COMM_SCALBL.Irecv(recvbuf_yz, recvCount_yz,rank_yz,recvtag); - req1[16] = MPI_COMM_SCALBL.Isend(sendbuf_Yz, sendCount_Yz,rank_Yz,sendtag); - req2[16] = MPI_COMM_SCALBL.Irecv(recvbuf_yZ, recvCount_yZ,rank_yZ,recvtag); - req1[17] = MPI_COMM_SCALBL.Isend(sendbuf_yZ, sendCount_yZ,rank_yZ,sendtag); - req2[17] = MPI_COMM_SCALBL.Irecv(recvbuf_Yz, recvCount_Yz,rank_Yz,recvtag); + MPI_Isend(sendbuf_x, sendCount_x,MPI_DOUBLE,rank_x,sendtag,MPI_COMM_SCALBL,&req1[0]); + MPI_Irecv(recvbuf_X, recvCount_X,MPI_DOUBLE,rank_X,recvtag,MPI_COMM_SCALBL,&req2[0]); + MPI_Isend(sendbuf_X, 
sendCount_X,MPI_DOUBLE,rank_X,sendtag,MPI_COMM_SCALBL,&req1[1]); + MPI_Irecv(recvbuf_x, recvCount_x,MPI_DOUBLE,rank_x,recvtag,MPI_COMM_SCALBL,&req2[1]); + MPI_Isend(sendbuf_y, sendCount_y,MPI_DOUBLE,rank_y,sendtag,MPI_COMM_SCALBL,&req1[2]); + MPI_Irecv(recvbuf_Y, recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,MPI_COMM_SCALBL,&req2[2]); + MPI_Isend(sendbuf_Y, sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,MPI_COMM_SCALBL,&req1[3]); + MPI_Irecv(recvbuf_y, recvCount_y,MPI_DOUBLE,rank_y,recvtag,MPI_COMM_SCALBL,&req2[3]); + MPI_Isend(sendbuf_z, sendCount_z,MPI_DOUBLE,rank_z,sendtag,MPI_COMM_SCALBL,&req1[4]); + MPI_Irecv(recvbuf_Z, recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,MPI_COMM_SCALBL,&req2[4]); + MPI_Isend(sendbuf_Z, sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,MPI_COMM_SCALBL,&req1[5]); + MPI_Irecv(recvbuf_z, recvCount_z,MPI_DOUBLE,rank_z,recvtag,MPI_COMM_SCALBL,&req2[5]); + MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_xy,sendtag,MPI_COMM_SCALBL,&req1[6]); + MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,MPI_COMM_SCALBL,&req2[6]); + MPI_Isend(sendbuf_XY, sendCount_XY,MPI_DOUBLE,rank_XY,sendtag,MPI_COMM_SCALBL,&req1[7]); + MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,MPI_COMM_SCALBL,&req2[7]); + MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_Xy,sendtag,MPI_COMM_SCALBL,&req1[8]); + MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,MPI_COMM_SCALBL,&req2[8]); + MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_xY,sendtag,MPI_COMM_SCALBL,&req1[9]); + MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,MPI_COMM_SCALBL,&req2[9]); + MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_xz,sendtag,MPI_COMM_SCALBL,&req1[10]); + MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,MPI_COMM_SCALBL,&req2[10]); + MPI_Isend(sendbuf_XZ, sendCount_XZ,MPI_DOUBLE,rank_XZ,sendtag,MPI_COMM_SCALBL,&req1[11]); + MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,MPI_COMM_SCALBL,&req2[11]); + MPI_Isend(sendbuf_Xz, sendCount_Xz,MPI_DOUBLE,rank_Xz,sendtag,MPI_COMM_SCALBL,&req1[12]); + MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,MPI_COMM_SCALBL,&req2[12]); + MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_xZ,sendtag,MPI_COMM_SCALBL,&req1[13]); + MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,MPI_COMM_SCALBL,&req2[13]); + MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_yz,sendtag,MPI_COMM_SCALBL,&req1[14]); + MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,MPI_COMM_SCALBL,&req2[14]); + MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_YZ,sendtag,MPI_COMM_SCALBL,&req1[15]); + MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,MPI_COMM_SCALBL,&req2[15]); + MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_Yz,sendtag,MPI_COMM_SCALBL,&req1[16]); + MPI_Irecv(recvbuf_yZ, recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,MPI_COMM_SCALBL,&req2[16]); + MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_yZ,sendtag,MPI_COMM_SCALBL,&req1[17]); + MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,MPI_COMM_SCALBL,&req2[17]); //................................................................................... } void ScaLBL_Communicator::RecvHalo(double *data){ //................................................................................... - MPI_COMM_SCALBL.waitAll(18,req1); - MPI_COMM_SCALBL.waitAll(18,req2); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); ScaLBL_DeviceBarrier(); //................................................................................... 
//................................................................................... @@ -1478,6 +1480,7 @@ void ScaLBL_Communicator::RecvHalo(double *data){ void ScaLBL_Communicator::RegularLayout(IntArray map, const double *data, DoubleArray ®data){ // Gets data from the device and stores in regular layout + int i,j,k,n,idx; int Nx = map.size(0); int Ny = map.size(1); int Nz = map.size(2); @@ -1489,10 +1492,11 @@ void ScaLBL_Communicator::RegularLayout(IntArray map, const double *data, Double double value; TmpDat = new double [N]; ScaLBL_CopyToHost(&TmpDat[0],&data[0], N*sizeof(double)); - for (int k=0; k Date: Tue, 17 Mar 2020 21:23:18 -0400 Subject: [PATCH 057/121] Revert "Moving more MPI calls to the wrapper" This reverts commit 0f91767b6c870101084fbae0978280c04c85a004. --- IO/netcdf.cpp | 2 +- StackTrace/ErrorHandlers.h | 2 +- StackTrace/Utilities.cpp | 2 +- analysis/TwoPhase.cpp | 7 +- analysis/morphology.cpp | 132 ++--- common/Communication.h | 216 ++++---- common/Domain.cpp | 206 ++++---- common/Domain.h | 3 + common/MPI.I | 33 -- common/MPI.cpp | 48 -- common/MPI.h | 7 - common/ScaLBL.h | 1 + common/Utilities.cpp | 2 +- cpu/exe/lb2_Color_mpi.cpp | 538 +++++++++---------- cpu/exe/lb2_Color_wia_mpi_bubble.cpp | 711 ++++++++++++++------------ gpu/exe/lb1_MRT_mpi.cpp | 348 +++++++------ gpu/exe/lb1_MRT_mpi.cu | 352 +++++++------ gpu/exe/lb2_Color.cu | 65 ++- gpu/exe/lb2_Color_mpi.cpp | 541 ++++++++++---------- gpu/exe/lb2_Color_pBC_wia_mpi.cpp | 621 ++++++++++++---------- models/ColorModel.cpp | 8 +- models/DFHModel.cpp | 4 +- models/MRTModel.cpp | 4 +- tests/BlobAnalyzeParallel.cpp | 22 +- tests/GenerateSphereTest.cpp | 54 +- tests/TestBlobAnalyze.cpp | 28 +- tests/TestBubble.cpp | 41 +- tests/TestBubbleDFH.cpp | 4 +- tests/TestColorGrad.cpp | 20 +- tests/TestCommD3Q19.cpp | 4 +- tests/TestForceD3Q19.cpp | 4 +- tests/TestForceMoments.cpp | 4 +- tests/TestMRT.cpp | 28 +- tests/TestMicroCTReader.cpp | 1 + tests/TestMomentsD3Q19.cpp | 2 +- tests/TestNetcdf.cpp | 2 +- tests/TestSegDist.cpp | 4 +- tests/lb2_CMT_wia.cpp | 30 +- tests/lb2_Color_blob_wia_mpi.cpp | 427 ++++++++-------- tests/lbpm_BGK_simulator.cpp | 48 +- tests/lbpm_color_macro_simulator.cpp | 61 +-- tests/lbpm_disc_pp.cpp | 34 +- tests/lbpm_inkbottle_pp.cpp | 22 +- tests/lbpm_juanes_bench_disc_pp.cpp | 35 +- tests/lbpm_nondarcy_simulator.cpp | 52 +- tests/lbpm_nonnewtonian_simulator.cpp | 26 +- tests/lbpm_plates_pp.cpp | 24 +- tests/lbpm_porenetwork_pp.cpp | 25 +- tests/lbpm_random_pp.cpp | 92 ++-- tests/lbpm_segmented_decomp.cpp | 48 +- tests/lbpm_segmented_pp.cpp | 2 +- tests/lbpm_sphere_pp.cpp | 16 +- tests/lbpm_squaretube_pp.cpp | 25 +- 53 files changed, 2678 insertions(+), 2360 deletions(-) diff --git a/IO/netcdf.cpp b/IO/netcdf.cpp index 6c3773e3..e061579a 100644 --- a/IO/netcdf.cpp +++ b/IO/netcdf.cpp @@ -119,7 +119,7 @@ std::string VariableTypeName( VariableType type ) int open( const std::string& filename, FileMode mode, const Utilities::MPI& comm ) { int fid = 0; - if ( comm.isNull() ) { + if ( comm == MPI_COMM_NULL ) { if ( mode == READ ) { int err = nc_open( filename.c_str(), NC_NOWRITE, &fid ); CHECK_NC_ERR( err ); diff --git a/StackTrace/ErrorHandlers.h b/StackTrace/ErrorHandlers.h index e43a4688..12b8d7de 100644 --- a/StackTrace/ErrorHandlers.h +++ b/StackTrace/ErrorHandlers.h @@ -6,7 +6,7 @@ #include -#include "common/MPI.h" +#include "mpi.h" namespace StackTrace diff --git a/StackTrace/Utilities.cpp b/StackTrace/Utilities.cpp index 5fb8e9b8..11f05777 100644 --- a/StackTrace/Utilities.cpp +++ 
b/StackTrace/Utilities.cpp @@ -14,7 +14,7 @@ #include #ifdef USE_MPI -#include "common/MPI.h" +#include "mpi.h" #endif #ifdef USE_TIMER diff --git a/analysis/TwoPhase.cpp b/analysis/TwoPhase.cpp index d878a663..812490e7 100644 --- a/analysis/TwoPhase.cpp +++ b/analysis/TwoPhase.cpp @@ -890,14 +890,14 @@ void TwoPhase::ComponentAverages() RecvBuffer.resize(BLOB_AVG_COUNT,NumberComponents_NWP); /* for (int b=0; bComm.barrier(); - Dm->Comm.sumReduce(&ComponentAverages_NWP(0,b),&RecvBuffer(0),BLOB_AVG_COUNT); + MPI_Barrier(Dm->Comm); + MPI_Allreduce(&ComponentAverages_NWP(0,b),&RecvBuffer(0),BLOB_AVG_COUNT,MPI_DOUBLE,MPI_SUM,Dm->Comm); for (int idx=0; idxComm.barrier(); Dm->Comm.sumReduce(ComponentAverages_NWP.data(),RecvBuffer.data(),BLOB_AVG_COUNT*NumberComponents_NWP); - // Dm->Comm.sumReduce(ComponentAverages_NWP.data(),RecvBuffer.data(),BLOB_AVG_COUNT); + // MPI_Reduce(ComponentAverages_NWP.data(),RecvBuffer.data(),BLOB_AVG_COUNT,MPI_DOUBLE,MPI_SUM,0,Dm->Comm); if (Dm->rank()==0){ printf("rescaling... \n"); @@ -994,6 +994,7 @@ void TwoPhase::ComponentAverages() // reduce the wetting phase averages for (int b=0; bComm.barrier(); +// MPI_Allreduce(&ComponentAverages_WP(0,b),RecvBuffer.data(),BLOB_AVG_COUNT,MPI_DOUBLE,MPI_SUM,Dm->Comm); Dm->Comm.sumReduce(&ComponentAverages_WP(0,b),RecvBuffer.data(),BLOB_AVG_COUNT); for (int idx=0; idx PackID(Dm->sendList_yZ, Dm->sendCount_yZ ,sendID_yZ, id); PackID(Dm->sendList_YZ, Dm->sendCount_YZ ,sendID_YZ, id); //...................................................................................... - Dm->Comm.sendrecv(sendID_x,Dm->sendCount_x,Dm->rank_x(),sendtag,recvID_X,Dm->recvCount_X,Dm->rank_X(),recvtag); - Dm->Comm.sendrecv(sendID_X,Dm->sendCount_X,Dm->rank_X(),sendtag,recvID_x,Dm->recvCount_x,Dm->rank_x(),recvtag); - Dm->Comm.sendrecv(sendID_y,Dm->sendCount_y,Dm->rank_y(),sendtag,recvID_Y,Dm->recvCount_Y,Dm->rank_Y(),recvtag); - Dm->Comm.sendrecv(sendID_Y,Dm->sendCount_Y,Dm->rank_Y(),sendtag,recvID_y,Dm->recvCount_y,Dm->rank_y(),recvtag); - Dm->Comm.sendrecv(sendID_z,Dm->sendCount_z,Dm->rank_z(),sendtag,recvID_Z,Dm->recvCount_Z,Dm->rank_Z(),recvtag); - Dm->Comm.sendrecv(sendID_Z,Dm->sendCount_Z,Dm->rank_Z(),sendtag,recvID_z,Dm->recvCount_z,Dm->rank_z(),recvtag); - Dm->Comm.sendrecv(sendID_xy,Dm->sendCount_xy,Dm->rank_xy(),sendtag,recvID_XY,Dm->recvCount_XY,Dm->rank_XY(),recvtag); - Dm->Comm.sendrecv(sendID_XY,Dm->sendCount_XY,Dm->rank_XY(),sendtag,recvID_xy,Dm->recvCount_xy,Dm->rank_xy(),recvtag); - Dm->Comm.sendrecv(sendID_Xy,Dm->sendCount_Xy,Dm->rank_Xy(),sendtag,recvID_xY,Dm->recvCount_xY,Dm->rank_xY(),recvtag); - Dm->Comm.sendrecv(sendID_xY,Dm->sendCount_xY,Dm->rank_xY(),sendtag,recvID_Xy,Dm->recvCount_Xy,Dm->rank_Xy(),recvtag); - Dm->Comm.sendrecv(sendID_xz,Dm->sendCount_xz,Dm->rank_xz(),sendtag,recvID_XZ,Dm->recvCount_XZ,Dm->rank_XZ(),recvtag); - Dm->Comm.sendrecv(sendID_XZ,Dm->sendCount_XZ,Dm->rank_XZ(),sendtag,recvID_xz,Dm->recvCount_xz,Dm->rank_xz(),recvtag); - Dm->Comm.sendrecv(sendID_Xz,Dm->sendCount_Xz,Dm->rank_Xz(),sendtag,recvID_xZ,Dm->recvCount_xZ,Dm->rank_xZ(),recvtag); - Dm->Comm.sendrecv(sendID_xZ,Dm->sendCount_xZ,Dm->rank_xZ(),sendtag,recvID_Xz,Dm->recvCount_Xz,Dm->rank_Xz(),recvtag); - Dm->Comm.sendrecv(sendID_yz,Dm->sendCount_yz,Dm->rank_yz(),sendtag,recvID_YZ,Dm->recvCount_YZ,Dm->rank_YZ(),recvtag); - Dm->Comm.sendrecv(sendID_YZ,Dm->sendCount_YZ,Dm->rank_YZ(),sendtag,recvID_yz,Dm->recvCount_yz,Dm->rank_yz(),recvtag); - 
Dm->Comm.sendrecv(sendID_Yz,Dm->sendCount_Yz,Dm->rank_Yz(),sendtag,recvID_yZ,Dm->recvCount_yZ,Dm->rank_yZ(),recvtag); - Dm->Comm.sendrecv(sendID_yZ,Dm->sendCount_yZ,Dm->rank_yZ(),sendtag,recvID_Yz,Dm->recvCount_Yz,Dm->rank_Yz(),recvtag); + MPI_Sendrecv(sendID_x,Dm->sendCount_x,MPI_CHAR,Dm->rank_x(),sendtag, + recvID_X,Dm->recvCount_X,MPI_CHAR,Dm->rank_X(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_X,Dm->sendCount_X,MPI_CHAR,Dm->rank_X(),sendtag, + recvID_x,Dm->recvCount_x,MPI_CHAR,Dm->rank_x(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_y,Dm->sendCount_y,MPI_CHAR,Dm->rank_y(),sendtag, + recvID_Y,Dm->recvCount_Y,MPI_CHAR,Dm->rank_Y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Y,Dm->sendCount_Y,MPI_CHAR,Dm->rank_Y(),sendtag, + recvID_y,Dm->recvCount_y,MPI_CHAR,Dm->rank_y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_z,Dm->sendCount_z,MPI_CHAR,Dm->rank_z(),sendtag, + recvID_Z,Dm->recvCount_Z,MPI_CHAR,Dm->rank_Z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Z,Dm->sendCount_Z,MPI_CHAR,Dm->rank_Z(),sendtag, + recvID_z,Dm->recvCount_z,MPI_CHAR,Dm->rank_z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xy,Dm->sendCount_xy,MPI_CHAR,Dm->rank_xy(),sendtag, + recvID_XY,Dm->recvCount_XY,MPI_CHAR,Dm->rank_XY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_XY,Dm->sendCount_XY,MPI_CHAR,Dm->rank_XY(),sendtag, + recvID_xy,Dm->recvCount_xy,MPI_CHAR,Dm->rank_xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Xy,Dm->sendCount_Xy,MPI_CHAR,Dm->rank_Xy(),sendtag, + recvID_xY,Dm->recvCount_xY,MPI_CHAR,Dm->rank_xY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xY,Dm->sendCount_xY,MPI_CHAR,Dm->rank_xY(),sendtag, + recvID_Xy,Dm->recvCount_Xy,MPI_CHAR,Dm->rank_Xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xz,Dm->sendCount_xz,MPI_CHAR,Dm->rank_xz(),sendtag, + recvID_XZ,Dm->recvCount_XZ,MPI_CHAR,Dm->rank_XZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_XZ,Dm->sendCount_XZ,MPI_CHAR,Dm->rank_XZ(),sendtag, + recvID_xz,Dm->recvCount_xz,MPI_CHAR,Dm->rank_xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Xz,Dm->sendCount_Xz,MPI_CHAR,Dm->rank_Xz(),sendtag, + recvID_xZ,Dm->recvCount_xZ,MPI_CHAR,Dm->rank_xZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xZ,Dm->sendCount_xZ,MPI_CHAR,Dm->rank_xZ(),sendtag, + recvID_Xz,Dm->recvCount_Xz,MPI_CHAR,Dm->rank_Xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_yz,Dm->sendCount_yz,MPI_CHAR,Dm->rank_yz(),sendtag, + recvID_YZ,Dm->recvCount_YZ,MPI_CHAR,Dm->rank_YZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_YZ,Dm->sendCount_YZ,MPI_CHAR,Dm->rank_YZ(),sendtag, + recvID_yz,Dm->recvCount_yz,MPI_CHAR,Dm->rank_yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Yz,Dm->sendCount_Yz,MPI_CHAR,Dm->rank_Yz(),sendtag, + recvID_yZ,Dm->recvCount_yZ,MPI_CHAR,Dm->rank_yZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_yZ,Dm->sendCount_yZ,MPI_CHAR,Dm->rank_yZ(),sendtag, + recvID_Yz,Dm->recvCount_Yz,MPI_CHAR,Dm->rank_Yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); //...................................................................................... 
UnpackID(Dm->recvList_x, Dm->recvCount_x ,recvID_x, id); UnpackID(Dm->recvList_X, Dm->recvCount_X ,recvID_X, id); @@ -285,7 +303,7 @@ double morph_open() fillHalo fillChar(Dm->Comm,Dm->rank_info,{Nx-2,Ny-2,Nz-2},{1,1,1},0,1); - GlobalNumber = Dm->Comm.sumReduce( LocalNumber ); + MPI_Allreduce(&LocalNumber,&GlobalNumber,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); count = 0.f; for (int k=1; kComm.sumReduce( count ); + MPI_Allreduce(&count,&countGlobal,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); return countGlobal; } */ @@ -488,42 +506,42 @@ double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptrsendList_yZ, Dm->sendCount_yZ ,sendID_yZ, id); PackID(Dm->sendList_YZ, Dm->sendCount_YZ ,sendID_YZ, id); //...................................................................................... - Dm->Comm.sendrecv(sendID_x,Dm->sendCount_x,Dm->rank_x(),sendtag, - recvID_X,Dm->recvCount_X,Dm->rank_X(),recvtag); - Dm->Comm.sendrecv(sendID_X,Dm->sendCount_X,Dm->rank_X(),sendtag, - recvID_x,Dm->recvCount_x,Dm->rank_x(),recvtag); - Dm->Comm.sendrecv(sendID_y,Dm->sendCount_y,Dm->rank_y(),sendtag, - recvID_Y,Dm->recvCount_Y,Dm->rank_Y(),recvtag); - Dm->Comm.sendrecv(sendID_Y,Dm->sendCount_Y,Dm->rank_Y(),sendtag, - recvID_y,Dm->recvCount_y,Dm->rank_y(),recvtag); - Dm->Comm.sendrecv(sendID_z,Dm->sendCount_z,Dm->rank_z(),sendtag, - recvID_Z,Dm->recvCount_Z,Dm->rank_Z(),recvtag); - Dm->Comm.sendrecv(sendID_Z,Dm->sendCount_Z,Dm->rank_Z(),sendtag, - recvID_z,Dm->recvCount_z,Dm->rank_z(),recvtag); - Dm->Comm.sendrecv(sendID_xy,Dm->sendCount_xy,Dm->rank_xy(),sendtag, - recvID_XY,Dm->recvCount_XY,Dm->rank_XY(),recvtag); - Dm->Comm.sendrecv(sendID_XY,Dm->sendCount_XY,Dm->rank_XY(),sendtag, - recvID_xy,Dm->recvCount_xy,Dm->rank_xy(),recvtag); - Dm->Comm.sendrecv(sendID_Xy,Dm->sendCount_Xy,Dm->rank_Xy(),sendtag, - recvID_xY,Dm->recvCount_xY,Dm->rank_xY(),recvtag); - Dm->Comm.sendrecv(sendID_xY,Dm->sendCount_xY,Dm->rank_xY(),sendtag, - recvID_Xy,Dm->recvCount_Xy,Dm->rank_Xy(),recvtag); - Dm->Comm.sendrecv(sendID_xz,Dm->sendCount_xz,Dm->rank_xz(),sendtag, - recvID_XZ,Dm->recvCount_XZ,Dm->rank_XZ(),recvtag); - Dm->Comm.sendrecv(sendID_XZ,Dm->sendCount_XZ,Dm->rank_XZ(),sendtag, - recvID_xz,Dm->recvCount_xz,Dm->rank_xz(),recvtag); - Dm->Comm.sendrecv(sendID_Xz,Dm->sendCount_Xz,Dm->rank_Xz(),sendtag, - recvID_xZ,Dm->recvCount_xZ,Dm->rank_xZ(),recvtag); - Dm->Comm.sendrecv(sendID_xZ,Dm->sendCount_xZ,Dm->rank_xZ(),sendtag, - recvID_Xz,Dm->recvCount_Xz,Dm->rank_Xz(),recvtag); - Dm->Comm.sendrecv(sendID_yz,Dm->sendCount_yz,Dm->rank_yz(),sendtag, - recvID_YZ,Dm->recvCount_YZ,Dm->rank_YZ(),recvtag); - Dm->Comm.sendrecv(sendID_YZ,Dm->sendCount_YZ,Dm->rank_YZ(),sendtag, - recvID_yz,Dm->recvCount_yz,Dm->rank_yz(),recvtag); - Dm->Comm.sendrecv(sendID_Yz,Dm->sendCount_Yz,Dm->rank_Yz(),sendtag, - recvID_yZ,Dm->recvCount_yZ,Dm->rank_yZ(),recvtag); - Dm->Comm.sendrecv(sendID_yZ,Dm->sendCount_yZ,Dm->rank_yZ(),sendtag, - recvID_Yz,Dm->recvCount_Yz,Dm->rank_Yz(),recvtag); + MPI_Sendrecv(sendID_x,Dm->sendCount_x,MPI_CHAR,Dm->rank_x(),sendtag, + recvID_X,Dm->recvCount_X,MPI_CHAR,Dm->rank_X(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_X,Dm->sendCount_X,MPI_CHAR,Dm->rank_X(),sendtag, + recvID_x,Dm->recvCount_x,MPI_CHAR,Dm->rank_x(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_y,Dm->sendCount_y,MPI_CHAR,Dm->rank_y(),sendtag, + recvID_Y,Dm->recvCount_Y,MPI_CHAR,Dm->rank_Y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + 
MPI_Sendrecv(sendID_Y,Dm->sendCount_Y,MPI_CHAR,Dm->rank_Y(),sendtag, + recvID_y,Dm->recvCount_y,MPI_CHAR,Dm->rank_y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_z,Dm->sendCount_z,MPI_CHAR,Dm->rank_z(),sendtag, + recvID_Z,Dm->recvCount_Z,MPI_CHAR,Dm->rank_Z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Z,Dm->sendCount_Z,MPI_CHAR,Dm->rank_Z(),sendtag, + recvID_z,Dm->recvCount_z,MPI_CHAR,Dm->rank_z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xy,Dm->sendCount_xy,MPI_CHAR,Dm->rank_xy(),sendtag, + recvID_XY,Dm->recvCount_XY,MPI_CHAR,Dm->rank_XY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_XY,Dm->sendCount_XY,MPI_CHAR,Dm->rank_XY(),sendtag, + recvID_xy,Dm->recvCount_xy,MPI_CHAR,Dm->rank_xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Xy,Dm->sendCount_Xy,MPI_CHAR,Dm->rank_Xy(),sendtag, + recvID_xY,Dm->recvCount_xY,MPI_CHAR,Dm->rank_xY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xY,Dm->sendCount_xY,MPI_CHAR,Dm->rank_xY(),sendtag, + recvID_Xy,Dm->recvCount_Xy,MPI_CHAR,Dm->rank_Xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xz,Dm->sendCount_xz,MPI_CHAR,Dm->rank_xz(),sendtag, + recvID_XZ,Dm->recvCount_XZ,MPI_CHAR,Dm->rank_XZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_XZ,Dm->sendCount_XZ,MPI_CHAR,Dm->rank_XZ(),sendtag, + recvID_xz,Dm->recvCount_xz,MPI_CHAR,Dm->rank_xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Xz,Dm->sendCount_Xz,MPI_CHAR,Dm->rank_Xz(),sendtag, + recvID_xZ,Dm->recvCount_xZ,MPI_CHAR,Dm->rank_xZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xZ,Dm->sendCount_xZ,MPI_CHAR,Dm->rank_xZ(),sendtag, + recvID_Xz,Dm->recvCount_Xz,MPI_CHAR,Dm->rank_Xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_yz,Dm->sendCount_yz,MPI_CHAR,Dm->rank_yz(),sendtag, + recvID_YZ,Dm->recvCount_YZ,MPI_CHAR,Dm->rank_YZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_YZ,Dm->sendCount_YZ,MPI_CHAR,Dm->rank_YZ(),sendtag, + recvID_yz,Dm->recvCount_yz,MPI_CHAR,Dm->rank_yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Yz,Dm->sendCount_Yz,MPI_CHAR,Dm->rank_Yz(),sendtag, + recvID_yZ,Dm->recvCount_yZ,MPI_CHAR,Dm->rank_yZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_yZ,Dm->sendCount_yZ,MPI_CHAR,Dm->rank_yZ(),sendtag, + recvID_Yz,Dm->recvCount_Yz,MPI_CHAR,Dm->rank_Yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); //...................................................................................... 
UnpackID(Dm->recvList_x, Dm->recvCount_x ,recvID_x, id); UnpackID(Dm->recvList_X, Dm->recvCount_X ,recvID_X, id); @@ -599,7 +617,7 @@ double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptrrank_info,phase,SignDist,vF,vS,phase_label,Dm->Comm); - Dm->Comm.barrier(); + MPI_Barrier(Dm->Comm); for (int k=1; k -void MPI_CLASS::sendrecv( const char*, int, int, int, char*, int, int, int ) const; -template<> -void MPI_CLASS::sendrecv( const int*, int, int, int, int*, int, int, int ) const; -template<> -void MPI_CLASS::sendrecv( const float*, int, int, int, float*, int, int, int ) const; -template<> -void MPI_CLASS::sendrecv( const double*, int, int, int, double*, int, int, int ) const; -template -void MPI_CLASS::sendrecv( const TYPE *sendbuf, int sendcount, int dest, int sendtag, - TYPE *recvbuf, int recvcount, int source, int recvtag ) const -{ - ERROR( "Not implimented" ); -} -#else -template -void MPI_CLASS::sendrecv( const TYPE *sendbuf, int sendcount, int dest, int sendtag, - TYPE *recvbuf, int recvcount, int source, int recvtag ) const -{ - ASSERT( dest == 0 ); - ASSERT( source == 0 ); - ASSERT( sendcount == recvcount ); - ASSERT( sendtag == recvtag ); - memcpy( recvbuf, sendbuf, sendcount * sizeof( TYPE ) ); -} -#endif - - - /************************************************************************ * allGather * ************************************************************************/ diff --git a/common/MPI.cpp b/common/MPI.cpp index 8b09bc49..73932d03 100644 --- a/common/MPI.cpp +++ b/common/MPI.cpp @@ -2805,54 +2805,6 @@ MPI_Request MPI_CLASS::IrecvBytes( } - -/************************************************************************ - * sendrecv * - ************************************************************************/ -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::sendrecv( const char* sendbuf, int sendcount, int dest, int sendtag, - char* recvbuf, int recvcount, int source, int recvtag ) const -{ - PROFILE_START( "sendrecv", profile_level ); - MPI_Sendrecv( sendbuf, sendcount, MPI_CHAR, dest, sendtag, - recvbuf, recvcount, MPI_CHAR, source, recvtag, - communicator, MPI_STATUS_IGNORE ); - PROFILE_STOP( "sendrecv", profile_level ); -} -template<> -void MPI_CLASS::sendrecv( const int* sendbuf, int sendcount, int dest, int sendtag, - int* recvbuf, int recvcount, int source, int recvtag ) const -{ - PROFILE_START( "sendrecv", profile_level ); - MPI_Sendrecv( sendbuf, sendcount, MPI_INT, dest, sendtag, - recvbuf, recvcount, MPI_INT, source, recvtag, - communicator, MPI_STATUS_IGNORE ); - PROFILE_STOP( "sendrecv", profile_level ); -} -template<> -void MPI_CLASS::sendrecv( const float* sendbuf, int sendcount, int dest, int sendtag, - float* recvbuf, int recvcount, int source, int recvtag ) const -{ - PROFILE_START( "sendrecv", profile_level ); - MPI_Sendrecv( sendbuf, sendcount, MPI_FLOAT, dest, sendtag, - recvbuf, recvcount, MPI_FLOAT, source, recvtag, - communicator, MPI_STATUS_IGNORE ); - PROFILE_STOP( "sendrecv", profile_level ); -} -template<> -void MPI_CLASS::sendrecv( const double* sendbuf, int sendcount, int dest, int sendtag, - double* recvbuf, int recvcount, int source, int recvtag ) const -{ - PROFILE_START( "sendrecv", profile_level ); - MPI_Sendrecv( sendbuf, sendcount, MPI_DOUBLE, dest, sendtag, - recvbuf, recvcount, MPI_DOUBLE, source, recvtag, - communicator, MPI_STATUS_IGNORE ); - PROFILE_STOP( "sendrecv", profile_level ); -} -#endif - - /************************************************************************ * allGather * * 
Note: these specializations are only called when using MPI. * diff --git a/common/MPI.h b/common/MPI.h index 4161d6a7..e3fd3e13 100644 --- a/common/MPI.h +++ b/common/MPI.h @@ -792,13 +792,6 @@ public: // Member functions void *buf, const int N_bytes, const int send_proc, const int tag ) const; - /*! - * @brief This function sends and recieves data using a blocking call - */ - template - void sendrecv( const type *sendbuf, int sendcount, int dest, int sendtag, type *recvbuf, int recvcount, int source, int recvtag ) const; - - /*! * Each processor sends every other processor a single value. * @param[in] x Input value for allGather diff --git a/common/ScaLBL.h b/common/ScaLBL.h index 610fce5d..51195f5a 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -201,6 +201,7 @@ private: int sendtag,recvtag; // Give the object it's own MPI communicator RankInfoStruct rank_info; + MPI_Group Group; // Group of processors associated with this domain Utilities::MPI MPI_COMM_SCALBL; // MPI Communicator for this domain MPI_Request req1[18],req2[18]; //...................................................................................... diff --git a/common/Utilities.cpp b/common/Utilities.cpp index 723b34f8..9c89e024 100644 --- a/common/Utilities.cpp +++ b/common/Utilities.cpp @@ -8,7 +8,7 @@ #endif #ifdef USE_MPI -#include "common/MPI.h" +#include "mpi.h" #endif #include diff --git a/cpu/exe/lb2_Color_mpi.cpp b/cpu/exe/lb2_Color_mpi.cpp index cdf56af9..0cade21e 100644 --- a/cpu/exe/lb2_Color_mpi.cpp +++ b/cpu/exe/lb2_Color_mpi.cpp @@ -36,11 +36,15 @@ inline void UnpackID(int *list, int count, char *recvbuf, char *ID){ //*************************************************************************************** int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; int iproc,jproc,kproc; @@ -54,6 +58,7 @@ int main(int argc, char **argv) int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; if (rank == 0){ printf("********************************************************\n"); @@ -110,30 +115,31 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - comm.barrier(); + MPI_Barrier(comm); //................................................. 
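A minimal sketch of the raw initialization pattern these drivers move to, with rank and size queried directly from MPI_COMM_WORLD instead of through a Utilities::MPI object (error handling omitted; this is a compile-able skeleton, not the full driver):

    #include <mpi.h>
    #include <cstdio>

    int main(int argc, char **argv)
    {
        // Initialize MPI and query rank/size on MPI_COMM_WORLD directly.
        int rank, nprocs;
        MPI_Init(&argc, &argv);
        MPI_Comm comm = MPI_COMM_WORLD;
        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &nprocs);

        if (rank == 0) printf("Running with %i MPI processes\n", nprocs);

        MPI_Barrier(comm);
        MPI_Finalize();
        return 0;
    }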
- comm.bcast(&Nz,1,0); - comm.bcast(&nBlocks,1,0); - comm.bcast(&nthreads,1,0); - comm.bcast(&Fx,1,0); - comm.bcast(&Fy,1,0); - comm.bcast(&Fz,1,0); - comm.bcast(&tau,1,0); - comm.bcast(&alpha,1,0); - comm.bcast(&beta,1,0); - comm.bcast(&das,1,0); - comm.bcast(&dbs,1,0); - comm.bcast(&pBC,1,0); - comm.bcast(&din,1,0); - comm.bcast(&dout,1,0); - comm.bcast(×tepMax,1,0); - comm.bcast(&interval,1,0); - comm.bcast(&tol,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nBlocks,1,MPI_INT,0,comm); + MPI_Bcast(&nthreads,1,MPI_INT,0,comm); + MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&alpha,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&beta,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&das,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&dbs,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm); + MPI_Bcast(&din,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm); + MPI_Bcast(×tepMax,1,MPI_INT,0,comm); + MPI_Bcast(&interval,1,MPI_INT,0,comm); + MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); + + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); //................................................. - comm.barrier(); + MPI_Barrier(comm); // ************************************************************** // ************************************************************** @@ -163,7 +169,7 @@ int main(int argc, char **argv) } - comm.barrier(); + MPI_Barrier(comm); kproc = rank/(nprocx*nprocy); jproc = (rank-nprocx*nprocy*kproc)/nprocx; iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -445,7 +451,7 @@ int main(int argc, char **argv) PM.close(); // printf("File porosity = %f\n", double(sum)/N); //........................................................................... - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; //........................................................................... // Write the communcation structure into a file for debugging @@ -582,7 +588,7 @@ int main(int argc, char **argv) } } } - comm.barrier(); + MPI_Barrier(comm); if (rank==0) printf ("SendLists are ready on host\n"); //...................................................................................... 
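The broadcast conversion above maps each comm.bcast(ptr, count, root) onto MPI_Bcast with an explicit datatype. A small sketch, assuming representative parameter variables (Nz, tau, Fx stand in for the full parameter set used in the driver):

    #include <mpi.h>

    // Rank 0 reads the input deck; every other rank receives the values.
    // The wrapper's comm.bcast(&x,1,0) becomes MPI_Bcast(&x,1,<type>,0,comm),
    // so the MPI datatype must now match the C type of each variable.
    void broadcast_parameters(int *Nz, double *tau, double *Fx, MPI_Comm comm)
    {
        MPI_Barrier(comm);
        MPI_Bcast(Nz,  1, MPI_INT,    0, comm);
        MPI_Bcast(tau, 1, MPI_DOUBLE, 0, comm);
        MPI_Bcast(Fx,  1, MPI_DOUBLE, 0, comm);
        MPI_Barrier(comm);
    }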
// Use MPI to fill in the recvCounts form the associated processes @@ -593,46 +599,46 @@ int main(int argc, char **argv) //********************************************************************************** // Fill in the recieve counts using MPI sendtag = recvtag = 3; - comm.Send(&sendCount_x,1,rank_X,sendtag); - comm.Recv(&recvCount_X,1,rank_x,recvtag); - comm.Send(&sendCount_X,1,rank_x,sendtag); - comm.Recv(&recvCount_x,1,rank_X,recvtag); - comm.Send(&sendCount_y,1,rank_Y,sendtag); - comm.Recv(&recvCount_Y,1,rank_y,recvtag); - comm.Send(&sendCount_Y,1,rank_y,sendtag); - comm.Recv(&recvCount_y,1,rank_Y,recvtag); - comm.Send(&sendCount_z,1,rank_Z,sendtag); - comm.Recv(&recvCount_Z,1,rank_z,recvtag); - comm.Send(&sendCount_Z,1,rank_z,sendtag); - comm.Recv(&recvCount_z,1,rank_Z,recvtag); + MPI_Send(&sendCount_x,1,MPI_INT,rank_X,sendtag,comm); + MPI_Recv(&recvCount_X,1,MPI_INT,rank_x,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_X,1,MPI_INT,rank_x,sendtag,comm); + MPI_Recv(&recvCount_x,1,MPI_INT,rank_X,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_y,1,MPI_INT,rank_Y,sendtag,comm); + MPI_Recv(&recvCount_Y,1,MPI_INT,rank_y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Y,1,MPI_INT,rank_y,sendtag,comm); + MPI_Recv(&recvCount_y,1,MPI_INT,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_z,1,MPI_INT,rank_Z,sendtag,comm); + MPI_Recv(&recvCount_Z,1,MPI_INT,rank_z,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Z,1,MPI_INT,rank_z,sendtag,comm); + MPI_Recv(&recvCount_z,1,MPI_INT,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); - comm.Send(&sendCount_xy,1,rank_XY,sendtag); - comm.Recv(&recvCount_XY,1,rank_xy,recvtag); - comm.Send(&sendCount_XY,1,rank_xy,sendtag); - comm.Recv(&recvCount_xy,1,rank_XY,recvtag); - comm.Send(&sendCount_Xy,1,rank_xY,sendtag); - comm.Recv(&recvCount_xY,1,rank_Xy,recvtag); - comm.Send(&sendCount_xY,1,rank_Xy,sendtag); - comm.Recv(&recvCount_Xy,1,rank_xY,recvtag); + MPI_Send(&sendCount_xy,1,MPI_INT,rank_XY,sendtag,comm); + MPI_Recv(&recvCount_XY,1,MPI_INT,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_XY,1,MPI_INT,rank_xy,sendtag,comm); + MPI_Recv(&recvCount_xy,1,MPI_INT,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Xy,1,MPI_INT,rank_xY,sendtag,comm); + MPI_Recv(&recvCount_xY,1,MPI_INT,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_xY,1,MPI_INT,rank_Xy,sendtag,comm); + MPI_Recv(&recvCount_Xy,1,MPI_INT,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); - comm.Send(&sendCount_xz,1,rank_XZ,sendtag); - comm.Recv(&recvCount_XZ,1,rank_xz,recvtag); - comm.Send(&sendCount_XZ,1,rank_xz,sendtag); - comm.Recv(&recvCount_xz,1,rank_XZ,recvtag); - comm.Send(&sendCount_Xz,1,rank_xZ,sendtag); - comm.Recv(&recvCount_xZ,1,rank_Xz,recvtag); - comm.Send(&sendCount_xZ,1,rank_Xz,sendtag); - comm.Recv(&recvCount_Xz,1,rank_xZ,recvtag); + MPI_Send(&sendCount_xz,1,MPI_INT,rank_XZ,sendtag,comm); + MPI_Recv(&recvCount_XZ,1,MPI_INT,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_XZ,1,MPI_INT,rank_xz,sendtag,comm); + MPI_Recv(&recvCount_xz,1,MPI_INT,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Xz,1,MPI_INT,rank_xZ,sendtag,comm); + MPI_Recv(&recvCount_xZ,1,MPI_INT,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_xZ,1,MPI_INT,rank_Xz,sendtag,comm); + MPI_Recv(&recvCount_Xz,1,MPI_INT,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); - comm.Send(&sendCount_yz,1,rank_YZ,sendtag); - comm.Recv(&recvCount_YZ,1,rank_yz,recvtag); - comm.Send(&sendCount_YZ,1,rank_yz,sendtag); - 
comm.Recv(&recvCount_yz,1,rank_YZ,recvtag); - comm.Send(&sendCount_Yz,1,rank_yZ,sendtag); - comm.Recv(&recvCount_yZ,1,rank_Yz,recvtag); - comm.Send(&sendCount_yZ,1,rank_Yz,sendtag); - comm.Recv(&recvCount_Yz,1,rank_yZ,recvtag); - comm.barrier(); + MPI_Send(&sendCount_yz,1,MPI_INT,rank_YZ,sendtag,comm); + MPI_Recv(&recvCount_YZ,1,MPI_INT,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_YZ,1,MPI_INT,rank_yz,sendtag,comm); + MPI_Recv(&recvCount_yz,1,MPI_INT,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Yz,1,MPI_INT,rank_yZ,sendtag,comm); + MPI_Recv(&recvCount_yZ,1,MPI_INT,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_yZ,1,MPI_INT,rank_Yz,sendtag,comm); + MPI_Recv(&recvCount_Yz,1,MPI_INT,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Barrier(comm); //********************************************************************************** //...................................................................................... int *recvList_x, *recvList_y, *recvList_z, *recvList_X, *recvList_Y, *recvList_Z; @@ -663,48 +669,48 @@ int main(int argc, char **argv) // Use MPI to fill in the appropriate values for recvList // Fill in the recieve lists using MPI sendtag = recvtag = 4; - req1[0] = comm.Isend(sendList_x,sendCount_x,rank_X,sendtag); - req2[0] = comm.Irecv(recvList_X,recvCount_X,rank_x,recvtag); - req1[1] = comm.Isend(sendList_X,sendCount_X,rank_x,sendtag); - req2[1] = comm.Irecv(recvList_x,recvCount_x,rank_X,recvtag); - req1[2] = comm.Isend(sendList_y,sendCount_y,rank_Y,sendtag); - req2[2] = comm.Irecv(recvList_Y,recvCount_Y,rank_y,recvtag); - req1[3] = comm.Isend(sendList_Y,sendCount_Y,rank_y,sendtag); - req2[3] = comm.Irecv(recvList_y,recvCount_y,rank_Y,recvtag); - req1[4] = comm.Isend(sendList_z,sendCount_z,rank_Z,sendtag); - req2[4] = comm.Irecv(recvList_Z,recvCount_Z,rank_z,recvtag); - req1[5] = comm.Isend(sendList_Z,sendCount_Z,rank_z,sendtag); - req2[5] = comm.Irecv(recvList_z,recvCount_z,rank_Z,recvtag); + MPI_Isend(sendList_x, sendCount_x,MPI_INT,rank_X,sendtag,comm,&req1[0]); + MPI_Irecv(recvList_X, recvCount_X,MPI_INT,rank_x,recvtag,comm,&req2[0]); + MPI_Isend(sendList_X, sendCount_X,MPI_INT,rank_x,sendtag,comm,&req1[1]); + MPI_Irecv(recvList_x, recvCount_x,MPI_INT,rank_X,recvtag,comm,&req2[1]); + MPI_Isend(sendList_y, sendCount_y,MPI_INT,rank_Y,sendtag,comm,&req1[2]); + MPI_Irecv(recvList_Y, recvCount_Y,MPI_INT,rank_y,recvtag,comm,&req2[2]); + MPI_Isend(sendList_Y, sendCount_Y,MPI_INT,rank_y,sendtag,comm,&req1[3]); + MPI_Irecv(recvList_y, recvCount_y,MPI_INT,rank_Y,recvtag,comm,&req2[3]); + MPI_Isend(sendList_z, sendCount_z,MPI_INT,rank_Z,sendtag,comm,&req1[4]); + MPI_Irecv(recvList_Z, recvCount_Z,MPI_INT,rank_z,recvtag,comm,&req2[4]); + MPI_Isend(sendList_Z, sendCount_Z,MPI_INT,rank_z,sendtag,comm,&req1[5]); + MPI_Irecv(recvList_z, recvCount_z,MPI_INT,rank_Z,recvtag,comm,&req2[5]); - req1[6] = comm.Isend(sendList_xy,sendCount_xy,rank_XY,sendtag); - req2[6] = comm.Irecv(recvList_XY,recvCount_XY,rank_xy,recvtag); - req1[7] = comm.Isend(sendList_XY,sendCount_XY,rank_xy,sendtag); - req2[7] = comm.Irecv(recvList_xy,recvCount_xy,rank_XY,recvtag); - req1[8] = comm.Isend(sendList_Xy,sendCount_Xy,rank_xY,sendtag); - req2[8] = comm.Irecv(recvList_xY,recvCount_xY,rank_Xy,recvtag); - req1[9] = comm.Isend(sendList_xY,sendCount_xY,rank_Xy,sendtag); - req2[9] = comm.Irecv(recvList_Xy,recvCount_Xy,rank_xY,recvtag); + MPI_Isend(sendList_xy, sendCount_xy,MPI_INT,rank_XY,sendtag,comm,&req1[6]); + MPI_Irecv(recvList_XY, 
recvCount_XY,MPI_INT,rank_xy,recvtag,comm,&req2[6]); + MPI_Isend(sendList_XY, sendCount_XY,MPI_INT,rank_xy,sendtag,comm,&req1[7]); + MPI_Irecv(recvList_xy, recvCount_xy,MPI_INT,rank_XY,recvtag,comm,&req2[7]); + MPI_Isend(sendList_Xy, sendCount_Xy,MPI_INT,rank_xY,sendtag,comm,&req1[8]); + MPI_Irecv(recvList_xY, recvCount_xY,MPI_INT,rank_Xy,recvtag,comm,&req2[8]); + MPI_Isend(sendList_xY, sendCount_xY,MPI_INT,rank_Xy,sendtag,comm,&req1[9]); + MPI_Irecv(recvList_Xy, recvCount_Xy,MPI_INT,rank_xY,recvtag,comm,&req2[9]); - req1[10] = comm.Isend(sendList_xz,sendCount_xz,rank_XZ,sendtag); - req2[10] = comm.Irecv(recvList_XZ,recvCount_XZ,rank_xz,recvtag); - req1[11] = comm.Isend(sendList_XZ,sendCount_XZ,rank_xz,sendtag); - req2[11] = comm.Irecv(recvList_xz,recvCount_xz,rank_XZ,recvtag); - req1[12] = comm.Isend(sendList_Xz,sendCount_Xz,rank_xZ,sendtag); - req2[12] = comm.Irecv(recvList_xZ,recvCount_xZ,rank_Xz,recvtag); - req1[13] = comm.Isend(sendList_xZ,sendCount_xZ,rank_Xz,sendtag); - req2[13] = comm.Irecv(recvList_Xz,recvCount_Xz,rank_xZ,recvtag); + MPI_Isend(sendList_xz, sendCount_xz,MPI_INT,rank_XZ,sendtag,comm,&req1[10]); + MPI_Irecv(recvList_XZ, recvCount_XZ,MPI_INT,rank_xz,recvtag,comm,&req2[10]); + MPI_Isend(sendList_XZ, sendCount_XZ,MPI_INT,rank_xz,sendtag,comm,&req1[11]); + MPI_Irecv(recvList_xz, recvCount_xz,MPI_INT,rank_XZ,recvtag,comm,&req2[11]); + MPI_Isend(sendList_Xz, sendCount_Xz,MPI_INT,rank_xZ,sendtag,comm,&req1[12]); + MPI_Irecv(recvList_xZ, recvCount_xZ,MPI_INT,rank_Xz,recvtag,comm,&req2[12]); + MPI_Isend(sendList_xZ, sendCount_xZ,MPI_INT,rank_Xz,sendtag,comm,&req1[13]); + MPI_Irecv(recvList_Xz, recvCount_Xz,MPI_INT,rank_xZ,recvtag,comm,&req2[13]); - req1[14] = comm.Isend(sendList_yz,sendCount_yz,rank_YZ,sendtag); - req2[14] = comm.Irecv(recvList_YZ,recvCount_YZ,rank_yz,recvtag); - req1[15] = comm.Isend(sendList_YZ,sendCount_YZ,rank_yz,sendtag); - req2[15] = comm.Irecv(recvList_yz,recvCount_yz,rank_YZ,recvtag); - req1[16] = comm.Isend(sendList_Yz,sendCount_Yz,rank_yZ,sendtag); - req2[16] = comm.Irecv(recvList_yZ,recvCount_yZ,rank_Yz,recvtag); - req1[17] = comm.Isend(sendList_yZ,sendCount_yZ,rank_Yz,sendtag); - req2[17] = comm.Irecv(recvList_Yz,recvCount_Yz,rank_yZ,recvtag); - comm.waitAll(18,req1); - comm.waitAll(18,req2); - comm.barrier(); + MPI_Isend(sendList_yz, sendCount_yz,MPI_INT,rank_YZ,sendtag,comm,&req1[14]); + MPI_Irecv(recvList_YZ, recvCount_YZ,MPI_INT,rank_yz,recvtag,comm,&req2[14]); + MPI_Isend(sendList_YZ, sendCount_YZ,MPI_INT,rank_yz,sendtag,comm,&req1[15]); + MPI_Irecv(recvList_yz, recvCount_yz,MPI_INT,rank_YZ,recvtag,comm,&req2[15]); + MPI_Isend(sendList_Yz, sendCount_Yz,MPI_INT,rank_yZ,sendtag,comm,&req1[16]); + MPI_Irecv(recvList_yZ, recvCount_yZ,MPI_INT,rank_Yz,recvtag,comm,&req2[16]); + MPI_Isend(sendList_yZ, sendCount_yZ,MPI_INT,rank_Yz,sendtag,comm,&req1[17]); + MPI_Irecv(recvList_Yz, recvCount_Yz,MPI_INT,rank_yZ,recvtag,comm,&req2[17]); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); + MPI_Barrier(comm); //...................................................................................... 
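A compact sketch of the nonblocking exchange these hunks adopt: post every MPI_Isend/MPI_Irecv pair first, then complete them all with MPI_Waitall. The two-request case below is a stand-in for the 18-direction exchange; exchange_lists and its arguments are placeholder names.

    #include <mpi.h>

    // Exchange integer lists with two neighbors without ordering constraints:
    // all sends and receives are posted, then completed together.
    void exchange_lists(const int *send_a, int count_a, int rank_a,
                        int *recv_b, int count_b, int rank_b,
                        int tag, MPI_Comm comm)
    {
        MPI_Request req[2];
        MPI_Status  stat[2];
        MPI_Isend(send_a, count_a, MPI_INT, rank_a, tag, comm, &req[0]);
        MPI_Irecv(recv_b, count_b, MPI_INT, rank_b, tag, comm, &req[1]);
        MPI_Waitall(2, req, stat);  // MPI_STATUSES_IGNORE also works if the
                                    // statuses are never inspected
    }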
for (int idx=0; idx #include #include -#include "common/MPI.h" +#include #include using namespace std; @@ -64,11 +64,15 @@ inline void UnpackID(int *list, int count, char *recvbuf, char *ID){ int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; int iproc,jproc,kproc; @@ -82,6 +86,7 @@ int main(int argc, char **argv) int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; //********************************** //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! //!!!!!!!!!!! Random debugging communications!!!!!!!!!!!!!!!!!!!!!!!!!!!!! @@ -131,23 +136,24 @@ int main(int argc, char **argv) // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - comm.barrier(); + MPI_Barrier(comm); //................................................. - comm.bcast(&Nz,1,0); - comm.bcast(&nBlocks,1,0); - comm.bcast(&nthreads,1,0); - comm.bcast(&tau,1,0); - comm.bcast(&Fx,1,0); - comm.bcast(&Fy,1,0); - comm.bcast(&Fz,1,0); - comm.bcast(×tepMax,1,0); - comm.bcast(&interval,1,0); - comm.bcast(&tol,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nBlocks,1,MPI_INT,0,comm); + MPI_Bcast(&nthreads,1,MPI_INT,0,comm); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); + MPI_Bcast(×tepMax,1,MPI_INT,0,comm); + MPI_Bcast(&interval,1,MPI_INT,0,comm); + MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); + + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); //................................................. - comm.barrier(); + MPI_Barrier(comm); // ************************************************************** double rlx_setA = 1.f/tau; @@ -170,7 +176,7 @@ int main(int argc, char **argv) printf("Sub-domain size = %i x %i x %i\n",Nz,Nz,Nz); } - comm.barrier(); + MPI_Barrier(comm); kproc = rank/(nprocx*nprocy); jproc = (rank-nprocx*nprocy*kproc)/nprocx; iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -451,7 +457,7 @@ int main(int argc, char **argv) PM.close(); // printf("File porosity = %f\n", double(sum)/N); //........................................................................... - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; //........................................................................... // Write the communcation structure into a file for debugging @@ -588,7 +594,7 @@ int main(int argc, char **argv) } } } - comm.barrier(); + MPI_Barrier(comm); if (rank==0) printf ("SendLists are ready on host\n"); //...................................................................................... 
// Use MPI to fill in the recvCounts form the associated processes @@ -599,46 +605,46 @@ int main(int argc, char **argv) //********************************************************************************** // Fill in the recieve counts using MPI sendtag = recvtag = 3; - comm.send(&sendCount_x,1,rank_X,sendtag); - comm.recv(&recvCount_X,1,rank_x,recvtag); - comm.send(&sendCount_X,1,rank_x,sendtag); - comm.recv(&recvCount_x,1,rank_X,recvtag); - comm.send(&sendCount_y,1,rank_Y,sendtag); - comm.recv(&recvCount_Y,1,rank_y,recvtag); - comm.send(&sendCount_Y,1,rank_y,sendtag); - comm.recv(&recvCount_y,1,rank_Y,recvtag); - comm.send(&sendCount_z,1,rank_Z,sendtag); - comm.recv(&recvCount_Z,1,rank_z,recvtag); - comm.send(&sendCount_Z,1,rank_z,sendtag); - comm.recv(&recvCount_z,1,rank_Z,recvtag); + MPI_Send(&sendCount_x,1,MPI_INT,rank_X,sendtag,comm); + MPI_Recv(&recvCount_X,1,MPI_INT,rank_x,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_X,1,MPI_INT,rank_x,sendtag,comm); + MPI_Recv(&recvCount_x,1,MPI_INT,rank_X,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_y,1,MPI_INT,rank_Y,sendtag,comm); + MPI_Recv(&recvCount_Y,1,MPI_INT,rank_y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Y,1,MPI_INT,rank_y,sendtag,comm); + MPI_Recv(&recvCount_y,1,MPI_INT,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_z,1,MPI_INT,rank_Z,sendtag,comm); + MPI_Recv(&recvCount_Z,1,MPI_INT,rank_z,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Z,1,MPI_INT,rank_z,sendtag,comm); + MPI_Recv(&recvCount_z,1,MPI_INT,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); - comm.send(&sendCount_xy,1,rank_XY,sendtag); - comm.recv(&recvCount_XY,1,rank_xy,recvtag); - comm.send(&sendCount_XY,1,rank_xy,sendtag); - comm.recv(&recvCount_xy,1,rank_XY,recvtag); - comm.send(&sendCount_Xy,1,rank_xY,sendtag); - comm.recv(&recvCount_xY,1,rank_Xy,recvtag); - comm.send(&sendCount_xY,1,rank_Xy,sendtag); - comm.recv(&recvCount_Xy,1,rank_xY,recvtag); + MPI_Send(&sendCount_xy,1,MPI_INT,rank_XY,sendtag,comm); + MPI_Recv(&recvCount_XY,1,MPI_INT,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_XY,1,MPI_INT,rank_xy,sendtag,comm); + MPI_Recv(&recvCount_xy,1,MPI_INT,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Xy,1,MPI_INT,rank_xY,sendtag,comm); + MPI_Recv(&recvCount_xY,1,MPI_INT,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_xY,1,MPI_INT,rank_Xy,sendtag,comm); + MPI_Recv(&recvCount_Xy,1,MPI_INT,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); - comm.send(&sendCount_xz,1,rank_XZ,sendtag); - comm.recv(&recvCount_XZ,1,rank_xz,recvtag); - comm.send(&sendCount_XZ,1,rank_xz,sendtag); - comm.recv(&recvCount_xz,1,rank_XZ,recvtag); - comm.send(&sendCount_Xz,1,rank_xZ,sendtag); - comm.recv(&recvCount_xZ,1,rank_Xz,recvtag); - comm.send(&sendCount_xZ,1,rank_Xz,sendtag); - comm.recv(&recvCount_Xz,1,rank_xZ,recvtag); + MPI_Send(&sendCount_xz,1,MPI_INT,rank_XZ,sendtag,comm); + MPI_Recv(&recvCount_XZ,1,MPI_INT,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_XZ,1,MPI_INT,rank_xz,sendtag,comm); + MPI_Recv(&recvCount_xz,1,MPI_INT,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Xz,1,MPI_INT,rank_xZ,sendtag,comm); + MPI_Recv(&recvCount_xZ,1,MPI_INT,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_xZ,1,MPI_INT,rank_Xz,sendtag,comm); + MPI_Recv(&recvCount_Xz,1,MPI_INT,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); - comm.send(&sendCount_yz,1,rank_YZ,sendtag); - comm.recv(&recvCount_YZ,1,rank_yz,recvtag); - comm.send(&sendCount_YZ,1,rank_yz,sendtag); - 
comm.recv(&recvCount_yz,1,rank_YZ,recvtag); - comm.send(&sendCount_Yz,1,rank_yZ,sendtag); - comm.recv(&recvCount_yZ,1,rank_Yz,recvtag); - comm.send(&sendCount_yZ,1,rank_Yz,sendtag); - comm.recv(&recvCount_Yz,1,rank_yZ,recvtag); - comm.barrier(); + MPI_Send(&sendCount_yz,1,MPI_INT,rank_YZ,sendtag,comm); + MPI_Recv(&recvCount_YZ,1,MPI_INT,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_YZ,1,MPI_INT,rank_yz,sendtag,comm); + MPI_Recv(&recvCount_yz,1,MPI_INT,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Yz,1,MPI_INT,rank_yZ,sendtag,comm); + MPI_Recv(&recvCount_yZ,1,MPI_INT,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_yZ,1,MPI_INT,rank_Yz,sendtag,comm); + MPI_Recv(&recvCount_Yz,1,MPI_INT,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Barrier(comm); //********************************************************************************** //...................................................................................... int *recvList_x, *recvList_y, *recvList_z, *recvList_X, *recvList_Y, *recvList_Z; @@ -669,48 +675,48 @@ int main(int argc, char **argv) // Use MPI to fill in the appropriate values for recvList // Fill in the recieve lists using MPI sendtag = recvtag = 4; - req1[0] = comm.Isend(sendList_x,sendCount_x,rank_X,sendtag); - req2[0] = comm.Irecv(recvList_X,recvCount_X,rank_x,recvtag); - req1[1] = comm.Isend(sendList_X,sendCount_X,rank_x,sendtag); - req2[1] = comm.Irecv(recvList_x,recvCount_x,rank_X,recvtag); - req1[2] = comm.Isend(sendList_y,sendCount_y,rank_Y,sendtag); - req2[2] = comm.Irecv(recvList_Y,recvCount_Y,rank_y,recvtag); - req1[3] = comm.Isend(sendList_Y,sendCount_Y,rank_y,sendtag); - req2[3] = comm.Irecv(recvList_y,recvCount_y,rank_Y,recvtag); - req1[4] = comm.Isend(sendList_z,sendCount_z,rank_Z,sendtag); - req2[4] = comm.Irecv(recvList_Z,recvCount_Z,rank_z,recvtag); - req1[5] = comm.Isend(sendList_Z,sendCount_Z,rank_z,sendtag); - req2[5] = comm.Irecv(recvList_z,recvCount_z,rank_Z,recvtag); + MPI_Isend(sendList_x, sendCount_x,MPI_INT,rank_X,sendtag,comm,&req1[0]); + MPI_Irecv(recvList_X, recvCount_X,MPI_INT,rank_x,recvtag,comm,&req2[0]); + MPI_Isend(sendList_X, sendCount_X,MPI_INT,rank_x,sendtag,comm,&req1[1]); + MPI_Irecv(recvList_x, recvCount_x,MPI_INT,rank_X,recvtag,comm,&req2[1]); + MPI_Isend(sendList_y, sendCount_y,MPI_INT,rank_Y,sendtag,comm,&req1[2]); + MPI_Irecv(recvList_Y, recvCount_Y,MPI_INT,rank_y,recvtag,comm,&req2[2]); + MPI_Isend(sendList_Y, sendCount_Y,MPI_INT,rank_y,sendtag,comm,&req1[3]); + MPI_Irecv(recvList_y, recvCount_y,MPI_INT,rank_Y,recvtag,comm,&req2[3]); + MPI_Isend(sendList_z, sendCount_z,MPI_INT,rank_Z,sendtag,comm,&req1[4]); + MPI_Irecv(recvList_Z, recvCount_Z,MPI_INT,rank_z,recvtag,comm,&req2[4]); + MPI_Isend(sendList_Z, sendCount_Z,MPI_INT,rank_z,sendtag,comm,&req1[5]); + MPI_Irecv(recvList_z, recvCount_z,MPI_INT,rank_Z,recvtag,comm,&req2[5]); - req1[6] = comm.Isend(sendList_xy,sendCount_xy,rank_XY,sendtag); - req2[6] = comm.Irecv(recvList_XY,recvCount_XY,rank_xy,recvtag); - req1[7] = comm.Isend(sendList_XY,sendCount_XY,rank_xy,sendtag); - req2[7] = comm.Irecv(recvList_xy,recvCount_xy,rank_XY,recvtag); - req1[8] = comm.Isend(sendList_Xy,sendCount_Xy,rank_xY,sendtag); - req2[8] = comm.Irecv(recvList_xY,recvCount_xY,rank_Xy,recvtag); - req1[9] = comm.Isend(sendList_xY,sendCount_xY,rank_Xy,sendtag); - req2[9] = comm.Irecv(recvList_Xy,recvCount_Xy,rank_xY,recvtag); + MPI_Isend(sendList_xy, sendCount_xy,MPI_INT,rank_XY,sendtag,comm,&req1[6]); + MPI_Irecv(recvList_XY, 
recvCount_XY,MPI_INT,rank_xy,recvtag,comm,&req2[6]); + MPI_Isend(sendList_XY, sendCount_XY,MPI_INT,rank_xy,sendtag,comm,&req1[7]); + MPI_Irecv(recvList_xy, recvCount_xy,MPI_INT,rank_XY,recvtag,comm,&req2[7]); + MPI_Isend(sendList_Xy, sendCount_Xy,MPI_INT,rank_xY,sendtag,comm,&req1[8]); + MPI_Irecv(recvList_xY, recvCount_xY,MPI_INT,rank_Xy,recvtag,comm,&req2[8]); + MPI_Isend(sendList_xY, sendCount_xY,MPI_INT,rank_Xy,sendtag,comm,&req1[9]); + MPI_Irecv(recvList_Xy, recvCount_Xy,MPI_INT,rank_xY,recvtag,comm,&req2[9]); - req1[10] = comm.Isend(sendList_xz,sendCount_xz,rank_XZ,sendtag); - req2[10] = comm.Irecv(recvList_XZ,recvCount_XZ,rank_xz,recvtag); - req1[11] = comm.Isend(sendList_XZ,sendCount_XZ,rank_xz,sendtag); - req2[11] = comm.Irecv(recvList_xz,recvCount_xz,rank_XZ,recvtag); - req1[12] = comm.Isend(sendList_Xz,sendCount_Xz,rank_xZ,sendtag); - req2[12] = comm.Irecv(recvList_xZ,recvCount_xZ,rank_Xz,recvtag); - req1[13] = comm.Isend(sendList_xZ,sendCount_xZ,rank_Xz,sendtag); - req2[13] = comm.Irecv(recvList_Xz,recvCount_Xz,rank_xZ,recvtag); + MPI_Isend(sendList_xz, sendCount_xz,MPI_INT,rank_XZ,sendtag,comm,&req1[10]); + MPI_Irecv(recvList_XZ, recvCount_XZ,MPI_INT,rank_xz,recvtag,comm,&req2[10]); + MPI_Isend(sendList_XZ, sendCount_XZ,MPI_INT,rank_xz,sendtag,comm,&req1[11]); + MPI_Irecv(recvList_xz, recvCount_xz,MPI_INT,rank_XZ,recvtag,comm,&req2[11]); + MPI_Isend(sendList_Xz, sendCount_Xz,MPI_INT,rank_xZ,sendtag,comm,&req1[12]); + MPI_Irecv(recvList_xZ, recvCount_xZ,MPI_INT,rank_Xz,recvtag,comm,&req2[12]); + MPI_Isend(sendList_xZ, sendCount_xZ,MPI_INT,rank_Xz,sendtag,comm,&req1[13]); + MPI_Irecv(recvList_Xz, recvCount_Xz,MPI_INT,rank_xZ,recvtag,comm,&req2[13]); - req1[14] = comm.Isend(sendList_yz,sendCount_yz,rank_YZ,sendtag); - req2[14] = comm.Irecv(recvList_YZ,recvCount_YZ,rank_yz,recvtag); - req1[15] = comm.Isend(sendList_YZ,sendCount_YZ,rank_yz,sendtag); - req2[15] = comm.Irecv(recvList_yz,recvCount_yz,rank_YZ,recvtag); - req1[16] = comm.Isend(sendList_Yz,sendCount_Yz,rank_yZ,sendtag); - req2[16] = comm.Irecv(recvList_yZ,recvCount_yZ,rank_Yz,recvtag); - req1[17] = comm.Isend(sendList_yZ,sendCount_yZ,rank_Yz,sendtag); - req2[17] = comm.Irecv(recvList_Yz,recvCount_Yz,rank_yZ,recvtag); - comm.waitAll(18,req1); - comm.waitAll(18,req2); - comm.barrier(); + MPI_Isend(sendList_yz, sendCount_yz,MPI_INT,rank_YZ,sendtag,comm,&req1[14]); + MPI_Irecv(recvList_YZ, recvCount_YZ,MPI_INT,rank_yz,recvtag,comm,&req2[14]); + MPI_Isend(sendList_YZ, sendCount_YZ,MPI_INT,rank_yz,sendtag,comm,&req1[15]); + MPI_Irecv(recvList_yz, recvCount_yz,MPI_INT,rank_YZ,recvtag,comm,&req2[15]); + MPI_Isend(sendList_Yz, sendCount_Yz,MPI_INT,rank_yZ,sendtag,comm,&req1[16]); + MPI_Irecv(recvList_yZ, recvCount_yZ,MPI_INT,rank_Yz,recvtag,comm,&req2[16]); + MPI_Isend(sendList_yZ, sendCount_yZ,MPI_INT,rank_Yz,sendtag,comm,&req1[17]); + MPI_Irecv(recvList_Yz, recvCount_Yz,MPI_INT,rank_yZ,recvtag,comm,&req2[17]); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); + MPI_Barrier(comm); //...................................................................................... double *sendbuf_x, *sendbuf_y, *sendbuf_z, *sendbuf_X, *sendbuf_Y, *sendbuf_Z; double *sendbuf_xy, *sendbuf_yz, *sendbuf_xz, *sendbuf_Xy, *sendbuf_Yz, *sendbuf_xZ; @@ -909,24 +915,42 @@ int main(int argc, char **argv) PackID(sendList_yZ, sendCount_yZ ,sendID_yZ, id); PackID(sendList_YZ, sendCount_YZ ,sendID_YZ, id); //...................................................................................... 
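The PackID / MPI_Sendrecv / UnpackID sequence that follows is easiest to read as one unit. Below is a sketch with simplified pack and unpack helpers; the real PackID/UnpackID operate on the per-face send and receive index lists shown above, and halo_update_id is a placeholder name for one direction pair.

    #include <mpi.h>

    // Simplified stand-ins for PackID/UnpackID: gather the ID values named by
    // a list of site indices into a contiguous buffer, and scatter them back.
    static void pack_id(const int *list, int count, char *buf, const char *id) {
        for (int i = 0; i < count; i++) buf[i] = id[list[i]];
    }
    static void unpack_id(const int *list, int count, const char *buf, char *id) {
        for (int i = 0; i < count; i++) id[list[i]] = buf[i];
    }

    void halo_update_id(const int *send_list, int send_count, int dest,
                        const int *recv_list, int recv_count, int source,
                        char *send_buf, char *recv_buf, char *id,
                        int tag, MPI_Comm comm)
    {
        pack_id(send_list, send_count, send_buf, id);
        MPI_Sendrecv(send_buf, send_count, MPI_CHAR, dest,   tag,
                     recv_buf, recv_count, MPI_CHAR, source, tag,
                     comm, MPI_STATUS_IGNORE);
        unpack_id(recv_list, recv_count, recv_buf, id);
    }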
- comm.sendrecv(sendID_x,sendCount_x,rank_X,sendtag,recvID_X,recvCount_X,rank_x,recvtag); - comm.sendrecv(sendID_X,sendCount_X,rank_x,sendtag,recvID_x,recvCount_x,rank_X,recvtag); - comm.sendrecv(sendID_y,sendCount_y,rank_Y,sendtag,recvID_Y,recvCount_Y,rank_y,recvtag); - comm.sendrecv(sendID_Y,sendCount_Y,rank_y,sendtag,recvID_y,recvCount_y,rank_Y,recvtag); - comm.sendrecv(sendID_z,sendCount_z,rank_Z,sendtag,recvID_Z,recvCount_Z,rank_z,recvtag); - comm.sendrecv(sendID_Z,sendCount_Z,rank_z,sendtag,recvID_z,recvCount_z,rank_Z,recvtag); - comm.sendrecv(sendID_xy,sendCount_xy,rank_XY,sendtag,recvID_XY,recvCount_XY,rank_xy,recvtag); - comm.sendrecv(sendID_XY,sendCount_XY,rank_xy,sendtag,recvID_xy,recvCount_xy,rank_XY,recvtag); - comm.sendrecv(sendID_Xy,sendCount_Xy,rank_xY,sendtag,recvID_xY,recvCount_xY,rank_Xy,recvtag); - comm.sendrecv(sendID_xY,sendCount_xY,rank_Xy,sendtag,recvID_Xy,recvCount_Xy,rank_xY,recvtag); - comm.sendrecv(sendID_xz,sendCount_xz,rank_XZ,sendtag,recvID_XZ,recvCount_XZ,rank_xz,recvtag); - comm.sendrecv(sendID_XZ,sendCount_XZ,rank_xz,sendtag,recvID_xz,recvCount_xz,rank_XZ,recvtag); - comm.sendrecv(sendID_Xz,sendCount_Xz,rank_xZ,sendtag,recvID_xZ,recvCount_xZ,rank_Xz,recvtag); - comm.sendrecv(sendID_xZ,sendCount_xZ,rank_Xz,sendtag,recvID_Xz,recvCount_Xz,rank_xZ,recvtag); - comm.sendrecv(sendID_yz,sendCount_yz,rank_YZ,sendtag,recvID_YZ,recvCount_YZ,rank_yz,recvtag); - comm.sendrecv(sendID_YZ,sendCount_YZ,rank_yz,sendtag,recvID_yz,recvCount_yz,rank_YZ,recvtag); - comm.sendrecv(sendID_Yz,sendCount_Yz,rank_yZ,sendtag,recvID_yZ,recvCount_yZ,rank_Yz,recvtag); - comm.sendrecv(sendID_yZ,sendCount_yZ,rank_Yz,sendtag,recvID_Yz,recvCount_Yz,rank_yZ,recvtag); + MPI_Sendrecv(sendID_x,sendCount_x,MPI_CHAR,rank_X,sendtag, + recvID_X,recvCount_X,MPI_CHAR,rank_x,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_X,sendCount_X,MPI_CHAR,rank_x,sendtag, + recvID_x,recvCount_x,MPI_CHAR,rank_X,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_y,sendCount_y,MPI_CHAR,rank_Y,sendtag, + recvID_Y,recvCount_Y,MPI_CHAR,rank_y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Y,sendCount_Y,MPI_CHAR,rank_y,sendtag, + recvID_y,recvCount_y,MPI_CHAR,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_z,sendCount_z,MPI_CHAR,rank_Z,sendtag, + recvID_Z,recvCount_Z,MPI_CHAR,rank_z,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Z,sendCount_Z,MPI_CHAR,rank_z,sendtag, + recvID_z,recvCount_z,MPI_CHAR,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xy,sendCount_xy,MPI_CHAR,rank_XY,sendtag, + recvID_XY,recvCount_XY,MPI_CHAR,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_XY,sendCount_XY,MPI_CHAR,rank_xy,sendtag, + recvID_xy,recvCount_xy,MPI_CHAR,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Xy,sendCount_Xy,MPI_CHAR,rank_xY,sendtag, + recvID_xY,recvCount_xY,MPI_CHAR,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xY,sendCount_xY,MPI_CHAR,rank_Xy,sendtag, + recvID_Xy,recvCount_Xy,MPI_CHAR,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xz,sendCount_xz,MPI_CHAR,rank_XZ,sendtag, + recvID_XZ,recvCount_XZ,MPI_CHAR,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_XZ,sendCount_XZ,MPI_CHAR,rank_xz,sendtag, + recvID_xz,recvCount_xz,MPI_CHAR,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Xz,sendCount_Xz,MPI_CHAR,rank_xZ,sendtag, + recvID_xZ,recvCount_xZ,MPI_CHAR,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xZ,sendCount_xZ,MPI_CHAR,rank_Xz,sendtag, + 
recvID_Xz,recvCount_Xz,MPI_CHAR,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_yz,sendCount_yz,MPI_CHAR,rank_YZ,sendtag, + recvID_YZ,recvCount_YZ,MPI_CHAR,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_YZ,sendCount_YZ,MPI_CHAR,rank_yz,sendtag, + recvID_yz,recvCount_yz,MPI_CHAR,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Yz,sendCount_Yz,MPI_CHAR,rank_yZ,sendtag, + recvID_yZ,recvCount_yZ,MPI_CHAR,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_yZ,sendCount_yZ,MPI_CHAR,rank_Yz,sendtag, + recvID_Yz,recvCount_Yz,MPI_CHAR,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); //...................................................................................... UnpackID(recvList_x, recvCount_x ,recvID_x, id); UnpackID(recvList_X, recvCount_X ,recvID_X, id); @@ -959,7 +983,7 @@ int main(int argc, char **argv) free(recvID_yz); free(recvID_YZ); free(recvID_yZ); free(recvID_Yz); //...................................................................................... if (rank==0) printf ("Devices are ready to communicate. \n"); - comm.barrier(); + MPI_Barrier(comm); //...........device phase ID................................................. if (rank==0) printf ("Copying phase ID to device \n"); @@ -999,8 +1023,8 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; - comm.barrier(); - starttime = Utilities::MPI::time(); + MPI_Barrier(comm); + starttime = MPI_Wtime(); // Old cuda timer is below // cudaEvent_t start, stop; // float time; @@ -1112,48 +1136,48 @@ int main(int argc, char **argv) //................................................................................... // Send all the distributions - req1[0] = comm.Isend(sendbuf_x,5*sendCount_x,rank_X,sendtag); - req2[0] = comm.Irecv(recvbuf_X,5*recvCount_X,rank_x,recvtag); - req1[1] = comm.Isend(sendbuf_X,5*sendCount_X,rank_x,sendtag); - req2[1] = comm.Irecv(recvbuf_x,5*recvCount_x,rank_X,recvtag); - req1[2] = comm.Isend(sendbuf_y,5*sendCount_y,rank_Y,sendtag); - req2[2] = comm.Irecv(recvbuf_Y,5*recvCount_Y,rank_y,recvtag); - req1[3] = comm.Isend(sendbuf_Y,5*sendCount_Y,rank_y,sendtag); - req2[3] = comm.Irecv(recvbuf_y,5*recvCount_y,rank_Y,recvtag); - req1[4] = comm.Isend(sendbuf_z,5*sendCount_z,rank_Z,sendtag); - req2[4] = comm.Irecv(recvbuf_Z,5*recvCount_Z,rank_z,recvtag); - req1[5] = comm.Isend(sendbuf_Z,5*sendCount_Z,rank_z,sendtag); - req2[5] = comm.Irecv(recvbuf_z,5*recvCount_z,rank_Z,recvtag); - req1[6] = comm.Isend(sendbuf_xy,sendCount_xy,rank_XY,sendtag); - req2[6] = comm.Irecv(recvbuf_XY,recvCount_XY,rank_xy,recvtag); - req1[7] = comm.Isend(sendbuf_XY,sendCount_XY,rank_xy,sendtag); - req2[7] = comm.Irecv(recvbuf_xy,recvCount_xy,rank_XY,recvtag); - req1[8] = comm.Isend(sendbuf_Xy,sendCount_Xy,rank_xY,sendtag); - req2[8] = comm.Irecv(recvbuf_xY,recvCount_xY,rank_Xy,recvtag); - req1[9] = comm.Isend(sendbuf_xY,sendCount_xY,rank_Xy,sendtag); - req2[9] = comm.Irecv(recvbuf_Xy,recvCount_Xy,rank_xY,recvtag); - req1[10] = comm.Isend(sendbuf_xz,sendCount_xz,rank_XZ,sendtag); - req2[10] = comm.Irecv(recvbuf_XZ,recvCount_XZ,rank_xz,recvtag); - req1[11] = comm.Isend(sendbuf_XZ,sendCount_XZ,rank_xz,sendtag); - req2[11] = comm.Irecv(recvbuf_xz,recvCount_xz,rank_XZ,recvtag); - req1[12] = comm.Isend(sendbuf_Xz,sendCount_Xz,rank_xZ,sendtag); - req2[12] = comm.Irecv(recvbuf_xZ,recvCount_xZ,rank_Xz,recvtag); - req1[13] = comm.Isend(sendbuf_xZ,sendCount_xZ,rank_Xz,sendtag); - req2[13] = comm.Irecv(recvbuf_Xz,recvCount_Xz,rank_xZ,recvtag); - 
req1[14] = comm.Isend(sendbuf_yz,sendCount_yz,rank_YZ,sendtag); - req2[14] = comm.Irecv(recvbuf_YZ,recvCount_YZ,rank_yz,recvtag); - req1[15] = comm.Isend(sendbuf_YZ,sendCount_YZ,rank_yz,sendtag); - req2[15] = comm.Irecv(recvbuf_yz,recvCount_yz,rank_YZ,recvtag); - req1[16] = comm.Isend(sendbuf_Yz,sendCount_Yz,rank_yZ,sendtag); - req2[16] = comm.Irecv(recvbuf_yZ,recvCount_yZ,rank_Yz,recvtag); - req1[17] = comm.Isend(sendbuf_yZ,sendCount_yZ,rank_Yz,sendtag); - req2[17] = comm.Irecv(recvbuf_Yz,recvCount_Yz,rank_yZ,recvtag); + MPI_Isend(sendbuf_x, 5*sendCount_x,MPI_DOUBLE,rank_X,sendtag,comm,&req1[0]); + MPI_Irecv(recvbuf_X, 5*recvCount_X,MPI_DOUBLE,rank_x,recvtag,comm,&req2[0]); + MPI_Isend(sendbuf_X, 5*sendCount_X,MPI_DOUBLE,rank_x,sendtag,comm,&req1[1]); + MPI_Irecv(recvbuf_x, 5*recvCount_x,MPI_DOUBLE,rank_X,recvtag,comm,&req2[1]); + MPI_Isend(sendbuf_y, 5*sendCount_y,MPI_DOUBLE,rank_Y,sendtag,comm,&req1[2]); + MPI_Irecv(recvbuf_Y, 5*recvCount_Y,MPI_DOUBLE,rank_y,recvtag,comm,&req2[2]); + MPI_Isend(sendbuf_Y, 5*sendCount_Y,MPI_DOUBLE,rank_y,sendtag,comm,&req1[3]); + MPI_Irecv(recvbuf_y, 5*recvCount_y,MPI_DOUBLE,rank_Y,recvtag,comm,&req2[3]); + MPI_Isend(sendbuf_z, 5*sendCount_z,MPI_DOUBLE,rank_Z,sendtag,comm,&req1[4]); + MPI_Irecv(recvbuf_Z, 5*recvCount_Z,MPI_DOUBLE,rank_z,recvtag,comm,&req2[4]); + MPI_Isend(sendbuf_Z, 5*sendCount_Z,MPI_DOUBLE,rank_z,sendtag,comm,&req1[5]); + MPI_Irecv(recvbuf_z, 5*recvCount_z,MPI_DOUBLE,rank_Z,recvtag,comm,&req2[5]); + MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_XY,sendtag,comm,&req1[6]); + MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_xy,recvtag,comm,&req2[6]); + MPI_Isend(sendbuf_XY, sendCount_XY,MPI_DOUBLE,rank_xy,sendtag,comm,&req1[7]); + MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_XY,recvtag,comm,&req2[7]); + MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_xY,sendtag,comm,&req1[8]); + MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_Xy,recvtag,comm,&req2[8]); + MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_Xy,sendtag,comm,&req1[9]); + MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_xY,recvtag,comm,&req2[9]); + MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_XZ,sendtag,comm,&req1[10]); + MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_xz,recvtag,comm,&req2[10]); + MPI_Isend(sendbuf_XZ, sendCount_XZ,MPI_DOUBLE,rank_xz,sendtag,comm,&req1[11]); + MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_XZ,recvtag,comm,&req2[11]); + MPI_Isend(sendbuf_Xz, sendCount_Xz,MPI_DOUBLE,rank_xZ,sendtag,comm,&req1[12]); + MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_Xz,recvtag,comm,&req2[12]); + MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_Xz,sendtag,comm,&req1[13]); + MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_xZ,recvtag,comm,&req2[13]); + MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_YZ,sendtag,comm,&req1[14]); + MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_yz,recvtag,comm,&req2[14]); + MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_yz,sendtag,comm,&req1[15]); + MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_YZ,recvtag,comm,&req2[15]); + MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_yZ,sendtag,comm,&req1[16]); + MPI_Irecv(recvbuf_yZ, recvCount_yZ,MPI_DOUBLE,rank_Yz,recvtag,comm,&req2[16]); + MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_Yz,sendtag,comm,&req1[17]); + MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_yZ,recvtag,comm,&req2[17]); //................................................................................... 
//................................................................................... // Wait for completion of D3Q19 communication - comm.waitAll(18,req1); - comm.waitAll(18,req2); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); //................................................................................... // Unpack the distributions on the device //................................................................................... @@ -1236,7 +1260,7 @@ int main(int argc, char **argv) //***************************************************************************** //***************************************************************************** - comm.barrier(); + MPI_Barrier(comm); // Iteration completed! timestep++; //................................................................... @@ -1245,8 +1269,8 @@ int main(int argc, char **argv) // cudaThreadSynchronize(); dvc_Barrier(); - comm.barrier(); - stoptime = Utilities::MPI::time(); + MPI_Barrier(comm); + stoptime = MPI_Wtime(); // cout << "CPU time: " << (stoptime - starttime) << " seconds" << endl; cputime = stoptime - starttime; // cout << "Lattice update rate: "<< double(Nx*Ny*Nz*timestep)/cputime/1000000 << " MLUPS" << endl; @@ -1280,7 +1304,7 @@ int main(int argc, char **argv) // dvc_CopyToDevice(velocity, vel, 3*dist_mem_size, dvc_CopyToDeviceDeviceToHost); //.............................................................................. // cudaThreadSynchronize(); -// comm.barrier(); +// MPI_Barrier(comm); //............................................................ //....Write the z-velocity to test poiseuille flow............ // double vz,vz_avg; @@ -1309,7 +1333,7 @@ int main(int argc, char **argv) // free (velocity); free(id); // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** } diff --git a/gpu/exe/lb1_MRT_mpi.cu b/gpu/exe/lb1_MRT_mpi.cu index 776ea29f..0c0863c7 100644 --- a/gpu/exe/lb1_MRT_mpi.cu +++ b/gpu/exe/lb1_MRT_mpi.cu @@ -1,10 +1,8 @@ -#include "common/MPI.h" - #include #include #include #include - +#include inline void PackID(int *list, int count, char *sendbuf, char *ID){ // Fill in the phase ID values from neighboring processors @@ -555,11 +553,15 @@ void Write_Out(double *array, int Nx, int Ny, int Nz){ int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; int iproc,jproc,kproc; @@ -573,6 +575,7 @@ int main(int argc, char **argv) int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; //********************************** //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! //!!!!!!!!!!! Random debugging communications!!!!!!!!!!!!!!!!!!!!!!!!!!!!! @@ -622,21 +625,24 @@ int main(int argc, char **argv) // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - comm.barrier(); + MPI_Barrier(comm); //................................................. 
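Timing now calls MPI_Wtime directly instead of Utilities::MPI::time. A sketch of the barrier-bracketed measurement and the lattice-update-rate figure the drivers report; timed_loop is a placeholder and the loop body stands in for the collision, streaming, and halo-exchange steps.

    #include <mpi.h>
    #include <cstdio>

    // Wall-clock timing of the time-stepping loop, synchronized so every rank
    // starts and stops the clock at the same point.
    double timed_loop(int timesteps, int Nx, int Ny, int Nz, MPI_Comm comm)
    {
        MPI_Barrier(comm);
        double starttime = MPI_Wtime();
        for (int t = 0; t < timesteps; t++) {
            /* ... collision, streaming and halo exchange ... */
        }
        MPI_Barrier(comm);
        double cputime = MPI_Wtime() - starttime;
        // Million lattice-site updates per second for this sub-domain.
        double mlups = double(Nx) * Ny * Nz * timesteps / cputime / 1.0e6;
        if (cputime > 0.0) printf("Lattice update rate: %f MLUPS\n", mlups);
        return cputime;
    }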
- comm.bcast(&Nz,1,0); - comm.bcast(&nBlocks,1,0); - comm.bcast(&nthreads,1,0); - comm.bcast(&tau,1,0); - comm.bcast(&Fx,1,0); - comm.bcast(&Fy,1,0); - comm.bcast(&Fz,1,0); - comm.bcast(&iterMax,1,0); - comm.bcast(&interval,1,0); - comm.bcast(&tol,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nBlocks,1,MPI_INT,0,comm); + MPI_Bcast(&nthreads,1,MPI_INT,0,comm); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&iterMax,1,MPI_INT,0,comm); + MPI_Bcast(&interval,1,MPI_INT,0,comm); + MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); + + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + //................................................. + MPI_Barrier(comm); // ************************************************************** double rlx_setA = 1.f/tau; @@ -659,7 +665,7 @@ int main(int argc, char **argv) printf("Sub-domain size = %i x %i x %i\n",Nz,Nz,Nz); } - comm.barrier(); + MPI_Barrier(comm); kproc = rank/(nprocx*nprocy); jproc = (rank-nprocx*nprocy*kproc)/nprocx; iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -940,7 +946,7 @@ int main(int argc, char **argv) PM.close(); // printf("File porosity = %f\n", double(sum)/N); //........................................................................... - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; //........................................................................... // Write the communcation structure into a file for debugging @@ -1077,7 +1083,7 @@ int main(int argc, char **argv) } } } - comm.barrier(); + MPI_Barrier(comm); if (rank==0) printf ("SendLists are ready on host\n"); //...................................................................................... 
// Use MPI to fill in the recvCounts form the associated processes @@ -1088,46 +1094,46 @@ int main(int argc, char **argv) //********************************************************************************** // Fill in the recieve counts using MPI sendtag = recvtag = 3; - comm.send(&sendCount_x,1,rank_X,sendtag); - comm.recv(&recvCount_X,1,rank_x,recvtag); - comm.send(&sendCount_X,1,rank_x,sendtag); - comm.recv(&recvCount_x,1,rank_X,recvtag); - comm.send(&sendCount_y,1,rank_Y,sendtag); - comm.recv(&recvCount_Y,1,rank_y,recvtag); - comm.send(&sendCount_Y,1,rank_y,sendtag); - comm.recv(&recvCount_y,1,rank_Y,recvtag); - comm.send(&sendCount_z,1,rank_Z,sendtag); - comm.recv(&recvCount_Z,1,rank_z,recvtag); - comm.send(&sendCount_Z,1,rank_z,sendtag); - comm.recv(&recvCount_z,1,rank_Z,recvtag); + MPI_Send(&sendCount_x,1,MPI_INT,rank_X,sendtag,comm); + MPI_Recv(&recvCount_X,1,MPI_INT,rank_x,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_X,1,MPI_INT,rank_x,sendtag,comm); + MPI_Recv(&recvCount_x,1,MPI_INT,rank_X,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_y,1,MPI_INT,rank_Y,sendtag,comm); + MPI_Recv(&recvCount_Y,1,MPI_INT,rank_y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Y,1,MPI_INT,rank_y,sendtag,comm); + MPI_Recv(&recvCount_y,1,MPI_INT,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_z,1,MPI_INT,rank_Z,sendtag,comm); + MPI_Recv(&recvCount_Z,1,MPI_INT,rank_z,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Z,1,MPI_INT,rank_z,sendtag,comm); + MPI_Recv(&recvCount_z,1,MPI_INT,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); - comm.send(&sendCount_xy,1,rank_XY,sendtag); - comm.recv(&recvCount_XY,1,rank_xy,recvtag); - comm.send(&sendCount_XY,1,rank_xy,sendtag); - comm.recv(&recvCount_xy,1,rank_XY,recvtag); - comm.send(&sendCount_Xy,1,rank_xY,sendtag); - comm.recv(&recvCount_xY,1,rank_Xy,recvtag); - comm.send(&sendCount_xY,1,rank_Xy,sendtag); - comm.recv(&recvCount_Xy,1,rank_xY,recvtag); + MPI_Send(&sendCount_xy,1,MPI_INT,rank_XY,sendtag,comm); + MPI_Recv(&recvCount_XY,1,MPI_INT,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_XY,1,MPI_INT,rank_xy,sendtag,comm); + MPI_Recv(&recvCount_xy,1,MPI_INT,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Xy,1,MPI_INT,rank_xY,sendtag,comm); + MPI_Recv(&recvCount_xY,1,MPI_INT,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_xY,1,MPI_INT,rank_Xy,sendtag,comm); + MPI_Recv(&recvCount_Xy,1,MPI_INT,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); - comm.send(&sendCount_xz,1,rank_XZ,sendtag); - comm.recv(&recvCount_XZ,1,rank_xz,recvtag); - comm.send(&sendCount_XZ,1,rank_xz,sendtag); - comm.recv(&recvCount_xz,1,rank_XZ,recvtag); - comm.send(&sendCount_Xz,1,rank_xZ,sendtag); - comm.recv(&recvCount_xZ,1,rank_Xz,recvtag); - comm.send(&sendCount_xZ,1,rank_Xz,sendtag); - comm.recv(&recvCount_Xz,1,rank_xZ,recvtag); + MPI_Send(&sendCount_xz,1,MPI_INT,rank_XZ,sendtag,comm); + MPI_Recv(&recvCount_XZ,1,MPI_INT,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_XZ,1,MPI_INT,rank_xz,sendtag,comm); + MPI_Recv(&recvCount_xz,1,MPI_INT,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Xz,1,MPI_INT,rank_xZ,sendtag,comm); + MPI_Recv(&recvCount_xZ,1,MPI_INT,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_xZ,1,MPI_INT,rank_Xz,sendtag,comm); + MPI_Recv(&recvCount_Xz,1,MPI_INT,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); - comm.send(&sendCount_yz,1,rank_YZ,sendtag); - comm.recv(&recvCount_YZ,1,rank_yz,recvtag); - comm.send(&sendCount_YZ,1,rank_yz,sendtag); - 
comm.recv(&recvCount_yz,1,rank_YZ,recvtag); - comm.send(&sendCount_Yz,1,rank_yZ,sendtag); - comm.recv(&recvCount_yZ,1,rank_Yz,recvtag); - comm.send(&sendCount_yZ,1,rank_Yz,sendtag); - comm.recv(&recvCount_Yz,1,rank_yZ,recvtag); - comm.barrier(); + MPI_Send(&sendCount_yz,1,MPI_INT,rank_YZ,sendtag,comm); + MPI_Recv(&recvCount_YZ,1,MPI_INT,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_YZ,1,MPI_INT,rank_yz,sendtag,comm); + MPI_Recv(&recvCount_yz,1,MPI_INT,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Yz,1,MPI_INT,rank_yZ,sendtag,comm); + MPI_Recv(&recvCount_yZ,1,MPI_INT,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_yZ,1,MPI_INT,rank_Yz,sendtag,comm); + MPI_Recv(&recvCount_Yz,1,MPI_INT,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Barrier(comm); //********************************************************************************** //recvCount_x = sendCount_x; //recvCount_X = sendCount_X; @@ -1151,7 +1157,7 @@ int main(int argc, char **argv) //...................................................................................... // Use MPI to fill in the appropriate values // int tag = 5; - // Mcomm.sendrecv(sendCount_x,1,rank_x,tag,sendCount_X,1); + // MPI_Sendrecv(sendCount_x,1,MPI_INT,rank_x,tag,sendCount_X,1,MPI_INT,comm,req); //...................................................................................... int *recvList_x, *recvList_y, *recvList_z, *recvList_X, *recvList_Y, *recvList_Z; int *recvList_xy, *recvList_yz, *recvList_xz, *recvList_Xy, *recvList_Yz, *recvList_xZ; @@ -1181,48 +1187,48 @@ int main(int argc, char **argv) // Use MPI to fill in the appropriate values for recvList // Fill in the recieve lists using MPI sendtag = recvtag = 4; - req1[0] = comm.Isend(sendList_x,sendCount_x,rank_X,sendtag); - req2[0] = comm.Irecv(recvList_X,recvCount_X,rank_x,recvtag); - req1[1] = comm.Isend(sendList_X,sendCount_X,rank_x,sendtag); - req2[1] = comm.Irecv(recvList_x,recvCount_x,rank_X,recvtag); - req1[2] = comm.Isend(sendList_y,sendCount_y,rank_Y,sendtag); - req2[2] = comm.Irecv(recvList_Y,recvCount_Y,rank_y,recvtag); - req1[3] = comm.Isend(sendList_Y,sendCount_Y,rank_y,sendtag); - req2[3] = comm.Irecv(recvList_y,recvCount_y,rank_Y,recvtag); - req1[4] = comm.Isend(sendList_z,sendCount_z,rank_Z,sendtag); - req2[4] = comm.Irecv(recvList_Z,recvCount_Z,rank_z,recvtag); - req1[5] = comm.Isend(sendList_Z,sendCount_Z,rank_z,sendtag); - req2[5] = comm.Irecv(recvList_z,recvCount_z,rank_Z,recvtag); + MPI_Isend(sendList_x, sendCount_x,MPI_INT,rank_X,sendtag,comm,&req1[0]); + MPI_Irecv(recvList_X, recvCount_X,MPI_INT,rank_x,recvtag,comm,&req2[0]); + MPI_Isend(sendList_X, sendCount_X,MPI_INT,rank_x,sendtag,comm,&req1[1]); + MPI_Irecv(recvList_x, recvCount_x,MPI_INT,rank_X,recvtag,comm,&req2[1]); + MPI_Isend(sendList_y, sendCount_y,MPI_INT,rank_Y,sendtag,comm,&req1[2]); + MPI_Irecv(recvList_Y, recvCount_Y,MPI_INT,rank_y,recvtag,comm,&req2[2]); + MPI_Isend(sendList_Y, sendCount_Y,MPI_INT,rank_y,sendtag,comm,&req1[3]); + MPI_Irecv(recvList_y, recvCount_y,MPI_INT,rank_Y,recvtag,comm,&req2[3]); + MPI_Isend(sendList_z, sendCount_z,MPI_INT,rank_Z,sendtag,comm,&req1[4]); + MPI_Irecv(recvList_Z, recvCount_Z,MPI_INT,rank_z,recvtag,comm,&req2[4]); + MPI_Isend(sendList_Z, sendCount_Z,MPI_INT,rank_z,sendtag,comm,&req1[5]); + MPI_Irecv(recvList_z, recvCount_z,MPI_INT,rank_Z,recvtag,comm,&req2[5]); - req1[6] = comm.Isend(sendList_xy,sendCount_xy,rank_XY,sendtag); - req2[6] = comm.Irecv(recvList_XY,recvCount_XY,rank_xy,recvtag); - req1[7] = 
comm.Isend(sendList_XY,sendCount_XY,rank_xy,sendtag); - req2[7] = comm.Irecv(recvList_xy,recvCount_xy,rank_XY,recvtag); - req1[8] = comm.Isend(sendList_Xy,sendCount_Xy,rank_xY,sendtag); - req2[8] = comm.Irecv(recvList_xY,recvCount_xY,rank_Xy,recvtag); - req1[9] = comm.Isend(sendList_xY,sendCount_xY,rank_Xy,sendtag); - req2[9] = comm.Irecv(recvList_Xy,recvCount_Xy,rank_xY,recvtag); + MPI_Isend(sendList_xy, sendCount_xy,MPI_INT,rank_XY,sendtag,comm,&req1[6]); + MPI_Irecv(recvList_XY, recvCount_XY,MPI_INT,rank_xy,recvtag,comm,&req2[6]); + MPI_Isend(sendList_XY, sendCount_XY,MPI_INT,rank_xy,sendtag,comm,&req1[7]); + MPI_Irecv(recvList_xy, recvCount_xy,MPI_INT,rank_XY,recvtag,comm,&req2[7]); + MPI_Isend(sendList_Xy, sendCount_Xy,MPI_INT,rank_xY,sendtag,comm,&req1[8]); + MPI_Irecv(recvList_xY, recvCount_xY,MPI_INT,rank_Xy,recvtag,comm,&req2[8]); + MPI_Isend(sendList_xY, sendCount_xY,MPI_INT,rank_Xy,sendtag,comm,&req1[9]); + MPI_Irecv(recvList_Xy, recvCount_Xy,MPI_INT,rank_xY,recvtag,comm,&req2[9]); - req1[10] = comm.Isend(sendList_xz,sendCount_xz,rank_XZ,sendtag); - req2[10] = comm.Irecv(recvList_XZ,recvCount_XZ,rank_xz,recvtag); - req1[11] = comm.Isend(sendList_XZ,sendCount_XZ,rank_xz,sendtag); - req2[11] = comm.Irecv(recvList_xz,recvCount_xz,rank_XZ,recvtag); - req1[12] = comm.Isend(sendList_Xz,sendCount_Xz,rank_xZ,sendtag); - req2[12] = comm.Irecv(recvList_xZ,recvCount_xZ,rank_Xz,recvtag); - req1[13] = comm.Isend(sendList_xZ,sendCount_xZ,rank_Xz,sendtag); - req2[13] = comm.Irecv(recvList_Xz,recvCount_Xz,rank_xZ,recvtag); + MPI_Isend(sendList_xz, sendCount_xz,MPI_INT,rank_XZ,sendtag,comm,&req1[10]); + MPI_Irecv(recvList_XZ, recvCount_XZ,MPI_INT,rank_xz,recvtag,comm,&req2[10]); + MPI_Isend(sendList_XZ, sendCount_XZ,MPI_INT,rank_xz,sendtag,comm,&req1[11]); + MPI_Irecv(recvList_xz, recvCount_xz,MPI_INT,rank_XZ,recvtag,comm,&req2[11]); + MPI_Isend(sendList_Xz, sendCount_Xz,MPI_INT,rank_xZ,sendtag,comm,&req1[12]); + MPI_Irecv(recvList_xZ, recvCount_xZ,MPI_INT,rank_Xz,recvtag,comm,&req2[12]); + MPI_Isend(sendList_xZ, sendCount_xZ,MPI_INT,rank_Xz,sendtag,comm,&req1[13]); + MPI_Irecv(recvList_Xz, recvCount_Xz,MPI_INT,rank_xZ,recvtag,comm,&req2[13]); - req1[14] = comm.Isend(sendList_yz,sendCount_yz,rank_YZ,sendtag); - req2[14] = comm.Irecv(recvList_YZ,recvCount_YZ,rank_yz,recvtag); - req1[15] = comm.Isend(sendList_YZ,sendCount_YZ,rank_yz,sendtag); - req2[15] = comm.Irecv(recvList_yz,recvCount_yz,rank_YZ,recvtag); - req1[16] = comm.Isend(sendList_Yz,sendCount_Yz,rank_yZ,sendtag); - req2[16] = comm.Irecv(recvList_yZ,recvCount_yZ,rank_Yz,recvtag); - req1[17] = comm.Isend(sendList_yZ,sendCount_yZ,rank_Yz,sendtag); - req2[17] = comm.Irecv(recvList_Yz,recvCount_Yz,rank_yZ,recvtag); - comm.waitAll(18,req1); - comm.waitAll(18,req2); - comm.barrier(); + MPI_Isend(sendList_yz, sendCount_yz,MPI_INT,rank_YZ,sendtag,comm,&req1[14]); + MPI_Irecv(recvList_YZ, recvCount_YZ,MPI_INT,rank_yz,recvtag,comm,&req2[14]); + MPI_Isend(sendList_YZ, sendCount_YZ,MPI_INT,rank_yz,sendtag,comm,&req1[15]); + MPI_Irecv(recvList_yz, recvCount_yz,MPI_INT,rank_YZ,recvtag,comm,&req2[15]); + MPI_Isend(sendList_Yz, sendCount_Yz,MPI_INT,rank_yZ,sendtag,comm,&req1[16]); + MPI_Irecv(recvList_yZ, recvCount_yZ,MPI_INT,rank_Yz,recvtag,comm,&req2[16]); + MPI_Isend(sendList_yZ, sendCount_yZ,MPI_INT,rank_Yz,sendtag,comm,&req1[17]); + MPI_Irecv(recvList_Yz, recvCount_Yz,MPI_INT,rank_yZ,recvtag,comm,&req2[17]); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); + MPI_Barrier(comm); 
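The request-returning wrapper calls (req1[i] = comm.Isend(...)) become MPI_Isend/MPI_Irecv with the request passed by address, and comm.waitAll(18,req) becomes MPI_Waitall, which additionally takes an array of MPI_Status (the stat1/stat2 arrays used above) or MPI_STATUSES_IGNORE. A reduced sketch of the same pattern for a single neighbor pair, with illustrative names:

    #include <mpi.h>

    // Post the send and the receive for one neighbor pair, then wait on both.
    void exchange_lists(MPI_Comm comm, int tag,
                        int *sendList, int sendCount, int rank_send_to,
                        int *recvList, int recvCount, int rank_recv_from)
    {
        MPI_Request req[2];
        MPI_Status  stat[2];
        MPI_Isend(sendList, sendCount, MPI_INT, rank_send_to,   tag, comm, &req[0]);
        MPI_Irecv(recvList, recvCount, MPI_INT, rank_recv_from, tag, comm, &req[1]);
        MPI_Waitall(2, req, stat);   // MPI_STATUSES_IGNORE is equally valid here
    }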
//...................................................................................... double *sendbuf_x, *sendbuf_y, *sendbuf_z, *sendbuf_X, *sendbuf_Y, *sendbuf_Z; double *sendbuf_xy, *sendbuf_yz, *sendbuf_xz, *sendbuf_Xy, *sendbuf_Yz, *sendbuf_xZ; @@ -1421,24 +1427,42 @@ int main(int argc, char **argv) PackID(sendList_yZ, sendCount_yZ ,sendID_yZ, id); PackID(sendList_YZ, sendCount_YZ ,sendID_YZ, id); //...................................................................................... - comm.sendrecv(sendID_x,sendCount_x,rank_X,sendtag,recvID_X,recvCount_X,rank_x,recvtag); - comm.sendrecv(sendID_X,sendCount_X,rank_x,sendtag,recvID_x,recvCount_x,rank_X,recvtag); - comm.sendrecv(sendID_y,sendCount_y,rank_Y,sendtag,recvID_Y,recvCount_Y,rank_y,recvtag); - comm.sendrecv(sendID_Y,sendCount_Y,rank_y,sendtag,recvID_y,recvCount_y,rank_Y,recvtag); - comm.sendrecv(sendID_z,sendCount_z,rank_Z,sendtag,recvID_Z,recvCount_Z,rank_z,recvtag); - comm.sendrecv(sendID_Z,sendCount_Z,rank_z,sendtag,recvID_z,recvCount_z,rank_Z,recvtag); - comm.sendrecv(sendID_xy,sendCount_xy,rank_XY,sendtag,recvID_XY,recvCount_XY,rank_xy,recvtag); - comm.sendrecv(sendID_XY,sendCount_XY,rank_xy,sendtag,recvID_xy,recvCount_xy,rank_XY,recvtag); - comm.sendrecv(sendID_Xy,sendCount_Xy,rank_xY,sendtag,recvID_xY,recvCount_xY,rank_Xy,recvtag); - comm.sendrecv(sendID_xY,sendCount_xY,rank_Xy,sendtag,recvID_Xy,recvCount_Xy,rank_xY,recvtag); - comm.sendrecv(sendID_xz,sendCount_xz,rank_XZ,sendtag,recvID_XZ,recvCount_XZ,rank_xz,recvtag); - comm.sendrecv(sendID_XZ,sendCount_XZ,rank_xz,sendtag,recvID_xz,recvCount_xz,rank_XZ,recvtag); - comm.sendrecv(sendID_Xz,sendCount_Xz,rank_xZ,sendtag,recvID_xZ,recvCount_xZ,rank_Xz,recvtag); - comm.sendrecv(sendID_xZ,sendCount_xZ,rank_Xz,sendtag,recvID_Xz,recvCount_Xz,rank_xZ,recvtag); - comm.sendrecv(sendID_yz,sendCount_yz,rank_YZ,sendtag,recvID_YZ,recvCount_YZ,rank_yz,recvtag); - comm.sendrecv(sendID_YZ,sendCount_YZ,rank_yz,sendtag,recvID_yz,recvCount_yz,rank_YZ,recvtag); - comm.sendrecv(sendID_Yz,sendCount_Yz,rank_yZ,sendtag,recvID_yZ,recvCount_yZ,rank_Yz,recvtag); - comm.sendrecv(sendID_yZ,sendCount_yZ,rank_Yz,sendtag,recvID_Yz,recvCount_Yz,rank_yZ,recvtag); + MPI_Sendrecv(sendID_x,sendCount_x,MPI_CHAR,rank_X,sendtag, + recvID_X,recvCount_X,MPI_CHAR,rank_x,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_X,sendCount_X,MPI_CHAR,rank_x,sendtag, + recvID_x,recvCount_x,MPI_CHAR,rank_X,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_y,sendCount_y,MPI_CHAR,rank_Y,sendtag, + recvID_Y,recvCount_Y,MPI_CHAR,rank_y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Y,sendCount_Y,MPI_CHAR,rank_y,sendtag, + recvID_y,recvCount_y,MPI_CHAR,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_z,sendCount_z,MPI_CHAR,rank_Z,sendtag, + recvID_Z,recvCount_Z,MPI_CHAR,rank_z,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Z,sendCount_Z,MPI_CHAR,rank_z,sendtag, + recvID_z,recvCount_z,MPI_CHAR,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xy,sendCount_xy,MPI_CHAR,rank_XY,sendtag, + recvID_XY,recvCount_XY,MPI_CHAR,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_XY,sendCount_XY,MPI_CHAR,rank_xy,sendtag, + recvID_xy,recvCount_xy,MPI_CHAR,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Xy,sendCount_Xy,MPI_CHAR,rank_xY,sendtag, + recvID_xY,recvCount_xY,MPI_CHAR,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xY,sendCount_xY,MPI_CHAR,rank_Xy,sendtag, + recvID_Xy,recvCount_Xy,MPI_CHAR,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); + 
MPI_Sendrecv(sendID_xz,sendCount_xz,MPI_CHAR,rank_XZ,sendtag, + recvID_XZ,recvCount_XZ,MPI_CHAR,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_XZ,sendCount_XZ,MPI_CHAR,rank_xz,sendtag, + recvID_xz,recvCount_xz,MPI_CHAR,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Xz,sendCount_Xz,MPI_CHAR,rank_xZ,sendtag, + recvID_xZ,recvCount_xZ,MPI_CHAR,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xZ,sendCount_xZ,MPI_CHAR,rank_Xz,sendtag, + recvID_Xz,recvCount_Xz,MPI_CHAR,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_yz,sendCount_yz,MPI_CHAR,rank_YZ,sendtag, + recvID_YZ,recvCount_YZ,MPI_CHAR,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_YZ,sendCount_YZ,MPI_CHAR,rank_yz,sendtag, + recvID_yz,recvCount_yz,MPI_CHAR,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Yz,sendCount_Yz,MPI_CHAR,rank_yZ,sendtag, + recvID_yZ,recvCount_yZ,MPI_CHAR,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_yZ,sendCount_yZ,MPI_CHAR,rank_Yz,sendtag, + recvID_Yz,recvCount_Yz,MPI_CHAR,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); //...................................................................................... UnpackID(recvList_x, recvCount_x ,recvID_x, id); UnpackID(recvList_X, recvCount_X ,recvID_X, id); @@ -1471,7 +1495,7 @@ int main(int argc, char **argv) free(recvID_yz); free(recvID_YZ); free(recvID_yZ); free(recvID_Yz); //...................................................................................... if (rank==0) printf ("Devices are ready to communicate. \n"); - comm.barrier(); + MPI_Barrier(comm); //...........device phase ID................................................. if (rank==0) printf ("Copying phase ID to device \n"); @@ -1511,8 +1535,8 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; - comm.barrier(); - starttime = Utilities::MPI::time(); + MPI_Barrier(comm); + starttime = MPI_Wtime(); // Old cuda timer is below // cudaEvent_t start, stop; // float time; @@ -1609,48 +1633,48 @@ int main(int argc, char **argv) //................................................................................... 
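The packed ID halos above are exchanged with MPI_Sendrecv, which couples the send and the matching receive in a single deadlock-free call; MPI_CHAR matches the char ID buffers and MPI_STATUS_IGNORE discards the status. One direction of that exchange, reduced to a sketch with illustrative names:

    #include <mpi.h>

    // Swap one face of packed char IDs: send toward +x, receive from -x.
    void swap_ids_x(MPI_Comm comm, int tag,
                    char *send_ids, int send_count, int rank_plus_x,
                    char *recv_ids, int recv_count, int rank_minus_x)
    {
        MPI_Sendrecv(send_ids, send_count, MPI_CHAR, rank_plus_x,  tag,
                     recv_ids, recv_count, MPI_CHAR, rank_minus_x, tag,
                     comm, MPI_STATUS_IGNORE);
    }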
// Send all the distributions - req1[0] = comm.Isend(sendbuf_x,5*sendCount_x,rank_X,sendtag); - req2[0] = comm.Irecv(recvbuf_X,5*recvCount_X,rank_x,recvtag); - req1[1] = comm.Isend(sendbuf_X,5*sendCount_X,rank_x,sendtag); - req2[1] = comm.Irecv(recvbuf_x,5*recvCount_x,rank_X,recvtag); - req1[2] = comm.Isend(sendbuf_y,5*sendCount_y,rank_Y,sendtag); - req2[2] = comm.Irecv(recvbuf_Y,5*recvCount_Y,rank_y,recvtag); - req1[3] = comm.Isend(sendbuf_Y,5*sendCount_Y,rank_y,sendtag); - req2[3] = comm.Irecv(recvbuf_y,5*recvCount_y,rank_Y,recvtag); - req1[4] = comm.Isend(sendbuf_z,5*sendCount_z,rank_Z,sendtag); - req2[4] = comm.Irecv(recvbuf_Z,5*recvCount_Z,rank_z,recvtag); - req1[5] = comm.Isend(sendbuf_Z,5*sendCount_Z,rank_z,sendtag); - req2[5] = comm.Irecv(recvbuf_z,5*recvCount_z,rank_Z,recvtag); - req1[6] = comm.Isend(sendbuf_xy,sendCount_xy,rank_XY,sendtag); - req2[6] = comm.Irecv(recvbuf_XY,recvCount_XY,rank_xy,recvtag); - req1[7] = comm.Isend(sendbuf_XY,sendCount_XY,rank_xy,sendtag); - req2[7] = comm.Irecv(recvbuf_xy,recvCount_xy,rank_XY,recvtag); - req1[8] = comm.Isend(sendbuf_Xy,sendCount_Xy,rank_xY,sendtag); - req2[8] = comm.Irecv(recvbuf_xY,recvCount_xY,rank_Xy,recvtag); - req1[9] = comm.Isend(sendbuf_xY,sendCount_xY,rank_Xy,sendtag); - req2[9] = comm.Irecv(recvbuf_Xy,recvCount_Xy,rank_xY,recvtag); - req1[10] = comm.Isend(sendbuf_xz,sendCount_xz,rank_XZ,sendtag); - req2[10] = comm.Irecv(recvbuf_XZ,recvCount_XZ,rank_xz,recvtag); - req1[11] = comm.Isend(sendbuf_XZ,sendCount_XZ,rank_xz,sendtag); - req2[11] = comm.Irecv(recvbuf_xz,recvCount_xz,rank_XZ,recvtag); - req1[12] = comm.Isend(sendbuf_Xz,sendCount_Xz,rank_xZ,sendtag); - req2[12] = comm.Irecv(recvbuf_xZ,recvCount_xZ,rank_Xz,recvtag); - req1[13] = comm.Isend(sendbuf_xZ,sendCount_xZ,rank_Xz,sendtag); - req2[13] = comm.Irecv(recvbuf_Xz,recvCount_Xz,rank_xZ,recvtag); - req1[14] = comm.Isend(sendbuf_yz,sendCount_yz,rank_YZ,sendtag); - req2[14] = comm.Irecv(recvbuf_YZ,recvCount_YZ,rank_yz,recvtag); - req1[15] = comm.Isend(sendbuf_YZ,sendCount_YZ,rank_yz,sendtag); - req2[15] = comm.Irecv(recvbuf_yz,recvCount_yz,rank_YZ,recvtag); - req1[16] = comm.Isend(sendbuf_Yz,sendCount_Yz,rank_yZ,sendtag); - req2[16] = comm.Irecv(recvbuf_yZ,recvCount_yZ,rank_Yz,recvtag); - req1[17] = comm.Isend(sendbuf_yZ,sendCount_yZ,rank_Yz,sendtag); - req2[17] = comm.Irecv(recvbuf_Yz,recvCount_Yz,rank_yZ,recvtag); + MPI_Isend(sendbuf_x, 5*sendCount_x,MPI_DOUBLE,rank_X,sendtag,comm,&req1[0]); + MPI_Irecv(recvbuf_X, 5*recvCount_X,MPI_DOUBLE,rank_x,recvtag,comm,&req2[0]); + MPI_Isend(sendbuf_X, 5*sendCount_X,MPI_DOUBLE,rank_x,sendtag,comm,&req1[1]); + MPI_Irecv(recvbuf_x, 5*recvCount_x,MPI_DOUBLE,rank_X,recvtag,comm,&req2[1]); + MPI_Isend(sendbuf_y, 5*sendCount_y,MPI_DOUBLE,rank_Y,sendtag,comm,&req1[2]); + MPI_Irecv(recvbuf_Y, 5*recvCount_Y,MPI_DOUBLE,rank_y,recvtag,comm,&req2[2]); + MPI_Isend(sendbuf_Y, 5*sendCount_Y,MPI_DOUBLE,rank_y,sendtag,comm,&req1[3]); + MPI_Irecv(recvbuf_y, 5*recvCount_y,MPI_DOUBLE,rank_Y,recvtag,comm,&req2[3]); + MPI_Isend(sendbuf_z, 5*sendCount_z,MPI_DOUBLE,rank_Z,sendtag,comm,&req1[4]); + MPI_Irecv(recvbuf_Z, 5*recvCount_Z,MPI_DOUBLE,rank_z,recvtag,comm,&req2[4]); + MPI_Isend(sendbuf_Z, 5*sendCount_Z,MPI_DOUBLE,rank_z,sendtag,comm,&req1[5]); + MPI_Irecv(recvbuf_z, 5*recvCount_z,MPI_DOUBLE,rank_Z,recvtag,comm,&req2[5]); + MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_XY,sendtag,comm,&req1[6]); + MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_xy,recvtag,comm,&req2[6]); + MPI_Isend(sendbuf_XY, 
sendCount_XY,MPI_DOUBLE,rank_xy,sendtag,comm,&req1[7]); + MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_XY,recvtag,comm,&req2[7]); + MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_xY,sendtag,comm,&req1[8]); + MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_Xy,recvtag,comm,&req2[8]); + MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_Xy,sendtag,comm,&req1[9]); + MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_xY,recvtag,comm,&req2[9]); + MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_XZ,sendtag,comm,&req1[10]); + MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_xz,recvtag,comm,&req2[10]); + MPI_Isend(sendbuf_XZ, sendCount_XZ,MPI_DOUBLE,rank_xz,sendtag,comm,&req1[11]); + MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_XZ,recvtag,comm,&req2[11]); + MPI_Isend(sendbuf_Xz, sendCount_Xz,MPI_DOUBLE,rank_xZ,sendtag,comm,&req1[12]); + MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_Xz,recvtag,comm,&req2[12]); + MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_Xz,sendtag,comm,&req1[13]); + MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_xZ,recvtag,comm,&req2[13]); + MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_YZ,sendtag,comm,&req1[14]); + MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_yz,recvtag,comm,&req2[14]); + MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_yz,sendtag,comm,&req1[15]); + MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_YZ,recvtag,comm,&req2[15]); + MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_yZ,sendtag,comm,&req1[16]); + MPI_Irecv(recvbuf_yZ, recvCount_yZ,MPI_DOUBLE,rank_Yz,recvtag,comm,&req2[16]); + MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_Yz,sendtag,comm,&req1[17]); + MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_yZ,recvtag,comm,&req2[17]); //................................................................................... //................................................................................... // Wait for completion of D3Q19 communication - comm.waitAll(18,req1); - comm.waitAll(18,req2); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); //................................................................................... // Unpack the distributions on the device //................................................................................... @@ -1734,7 +1758,7 @@ int main(int argc, char **argv) //***************************************************************************** //***************************************************************************** - comm.barrier(); + MPI_Barrier(comm); // Iteration completed! iter++; //................................................................... @@ -1742,8 +1766,8 @@ int main(int argc, char **argv) //************************************************************************/ cudaThreadSynchronize(); - comm.barrier(); - stoptime = Utilities::MPI::time(); + MPI_Barrier(comm); + stoptime = MPI_Wtime(); // cout << "CPU time: " << (stoptime - starttime) << " seconds" << endl; cputime = stoptime - starttime; // cout << "Lattice update rate: "<< double(Nx*Ny*Nz*iter)/cputime/1000000 << " MLUPS" << endl; @@ -1778,7 +1802,7 @@ int main(int argc, char **argv) cudaMemcpy(velocity, vel, 3*dist_mem_size, cudaMemcpyDeviceToHost); //.............................................................................. cudaThreadSynchronize(); - comm.barrier(); + MPI_Barrier(comm); //............................................................ //....Write the z-velocity to test poiseuille flow............ 
double vz,vz_avg; @@ -1807,7 +1831,7 @@ int main(int argc, char **argv) free (velocity); free(id); // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** } diff --git a/gpu/exe/lb2_Color.cu b/gpu/exe/lb2_Color.cu index 1f227d08..1871b23c 100644 --- a/gpu/exe/lb2_Color.cu +++ b/gpu/exe/lb2_Color.cu @@ -1,4 +1,6 @@ -#include "common/MPI.h" +#ifdef useMPI +#include +#endif #include #include @@ -60,10 +62,18 @@ int main(int argc, char *argv[]) { //********** Initialize MPI **************** + int numprocs,rank; +#ifdef useMPI + MPI_Status stat; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int numprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_size(comm,&numprocs); + MPI_Comm_rank(comm,&rank); +#else + MPI_Comm comm = MPI_COMM_WORLD; + numprocs = 1; + rank = 0; +#endif //****************************************** if (rank == 0){ @@ -113,31 +123,32 @@ int main(int argc, char *argv[]) input >> tol; // error tolerance //............................................................. } +#ifdef useMPI // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - comm.barrier(); + MPI_Barrier(comm); //................................................. - comm.bcast(&Nz,1,0); - comm.bcast(&nBlocks,1,0); - comm.bcast(&nthreads,1,0); - comm.bcast(&Fx,1,0); - comm.bcast(&Fy,1,0); - comm.bcast(&Fz,1,0); - comm.bcast(&tau,1,0); - comm.bcast(&alpha,1,0); - comm.bcast(&beta,1,0); - comm.bcast(&das,1,0); - comm.bcast(&dbs,1,0); - comm.bcast(&pBC,1,0); - comm.bcast(&din,1,0); - comm.bcast(&dout,1,0); - - comm.bcast(×tepMax,1,0); - comm.bcast(&interval,1,0); - comm.bcast(&tol,1,0); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nBlocks,1,MPI_INT,0,comm); + MPI_Bcast(&nthreads,1,MPI_INT,0,comm); + MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&alpha,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&beta,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&das,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&dbs,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm); + MPI_Bcast(&din,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm); + MPI_Bcast(×tepMax,1,MPI_INT,0,comm); + MPI_Bcast(&interval,1,MPI_INT,0,comm); + MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); //................................................. - comm.barrier(); + MPI_Barrier(comm); // ************************************************************** +#endif double rlxA = 1.f/tau; double rlxB = 8.f*(2.f-rlxA)/(8.f-rlxA); @@ -232,7 +243,11 @@ int main(int argc, char *argv[]) if (k==4) k=Nz-5; } } - comm.bcast(&id[0],N,0); +#ifdef useMPI //............................................................ + MPI_Barrier(comm); + MPI_Bcast(&id[0],N,MPI_CHAR,0,comm); + MPI_Barrier(comm); +#endif if (rank == 0) printf("Domain set.\n"); //........................................................................... 
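In lb2_Color.cu the MPI calls are additionally wrapped in a useMPI guard so the same source still builds without MPI; with the macro undefined, rank and numprocs simply fall back to a single process. A trimmed sketch of that initialization (not the verbatim hunk; it assumes the same useMPI macro):

    #ifdef useMPI
    #include <mpi.h>
    #endif

    // Inside main(argc, argv): serial defaults, overridden when built with MPI.
    int rank = 0, numprocs = 1;
    #ifdef useMPI
    MPI_Init(&argc, &argv);
    MPI_Comm comm = MPI_COMM_WORLD;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &numprocs);
    #endif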
diff --git a/gpu/exe/lb2_Color_mpi.cpp b/gpu/exe/lb2_Color_mpi.cpp index a2f3d8a9..fe11d32f 100644 --- a/gpu/exe/lb2_Color_mpi.cpp +++ b/gpu/exe/lb2_Color_mpi.cpp @@ -2,7 +2,7 @@ #include #include #include -#include "common/MPI.h" +#include using namespace std; @@ -98,11 +98,15 @@ inline void UnpackID(int *list, int count, char *recvbuf, char *ID){ int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; int iproc,jproc,kproc; @@ -116,6 +120,7 @@ int main(int argc, char **argv) int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; if (rank == 0){ printf("********************************************************\n"); @@ -172,30 +177,31 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - comm.barrier(); + MPI_Barrier(comm); //................................................. - comm.bcast(&Nz,1,0); - comm.bcast(&nBlocks,1,0); - comm.bcast(&nthreads,1,0); - comm.bcast(&Fx,1,0); - comm.bcast(&Fy,1,0); - comm.bcast(&Fz,1,0); - comm.bcast(&tau,1,0); - comm.bcast(&alpha,1,0); - comm.bcast(&beta,1,0); - comm.bcast(&das,1,0); - comm.bcast(&dbs,1,0); - comm.bcast(&pBC,1,0); - comm.bcast(&din,1,0); - comm.bcast(&dout,1,0); - comm.bcast(×tepMax,1,0); - comm.bcast(&interval,1,0); - comm.bcast(&tol,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nBlocks,1,MPI_INT,0,comm); + MPI_Bcast(&nthreads,1,MPI_INT,0,comm); + MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&alpha,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&beta,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&das,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&dbs,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm); + MPI_Bcast(&din,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm); + MPI_Bcast(×tepMax,1,MPI_INT,0,comm); + MPI_Bcast(&interval,1,MPI_INT,0,comm); + MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); + + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); //................................................. - comm.barrier(); + MPI_Barrier(comm); // ************************************************************** // ************************************************************** @@ -225,7 +231,7 @@ int main(int argc, char **argv) } - comm.barrier(); + MPI_Barrier(comm); kproc = rank/(nprocx*nprocy); jproc = (rank-nprocx*nprocy*kproc)/nprocx; iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -507,7 +513,7 @@ int main(int argc, char **argv) PM.close(); // printf("File porosity = %f\n", double(sum)/N); //........................................................................... - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; //........................................................................... 
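lb2_Color_mpi.cpp gets the usual standalone-test boilerplate: rank and size are queried from MPI_COMM_WORLD, and request/status arrays are reserved for the 18 non-rest directions of the D3Q19 halo exchange. A minimal sketch of that skeleton (illustrative, not the full test):

    #include <mpi.h>

    int main(int argc, char **argv)
    {
        int rank, nprocs;
        MPI_Init(&argc, &argv);
        MPI_Comm comm = MPI_COMM_WORLD;
        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &nprocs);

        // One request/status slot per communication direction (6 faces + 12 edges).
        MPI_Request req1[18], req2[18];
        MPI_Status  stat1[18], stat2[18];

        // ... read input on rank 0, broadcast, exchange halos, run timesteps ...

        MPI_Finalize();
        return 0;
    }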
// Write the communcation structure into a file for debugging @@ -644,7 +650,7 @@ int main(int argc, char **argv) } } } - comm.barrier(); + MPI_Barrier(comm); if (rank==0) printf ("SendLists are ready on host\n"); //...................................................................................... // Use MPI to fill in the recvCounts form the associated processes @@ -655,46 +661,46 @@ int main(int argc, char **argv) //********************************************************************************** // Fill in the recieve counts using MPI sendtag = recvtag = 3; - comm.Send(&sendCount_x,1,rank_X,sendtag); - comm.Recv(&recvCount_X,1,rank_x,recvtag); - comm.Send(&sendCount_X,1,rank_x,sendtag); - comm.Recv(&recvCount_x,1,rank_X,recvtag); - comm.Send(&sendCount_y,1,rank_Y,sendtag); - comm.Recv(&recvCount_Y,1,rank_y,recvtag); - comm.Send(&sendCount_Y,1,rank_y,sendtag); - comm.Recv(&recvCount_y,1,rank_Y,recvtag); - comm.Send(&sendCount_z,1,rank_Z,sendtag); - comm.Recv(&recvCount_Z,1,rank_z,recvtag); - comm.Send(&sendCount_Z,1,rank_z,sendtag); - comm.Recv(&recvCount_z,1,rank_Z,recvtag); + MPI_Send(&sendCount_x,1,MPI_INT,rank_X,sendtag,comm); + MPI_Recv(&recvCount_X,1,MPI_INT,rank_x,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_X,1,MPI_INT,rank_x,sendtag,comm); + MPI_Recv(&recvCount_x,1,MPI_INT,rank_X,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_y,1,MPI_INT,rank_Y,sendtag,comm); + MPI_Recv(&recvCount_Y,1,MPI_INT,rank_y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Y,1,MPI_INT,rank_y,sendtag,comm); + MPI_Recv(&recvCount_y,1,MPI_INT,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_z,1,MPI_INT,rank_Z,sendtag,comm); + MPI_Recv(&recvCount_Z,1,MPI_INT,rank_z,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Z,1,MPI_INT,rank_z,sendtag,comm); + MPI_Recv(&recvCount_z,1,MPI_INT,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); - comm.Send(&sendCount_xy,1,rank_XY,sendtag); - comm.Recv(&recvCount_XY,1,rank_xy,recvtag); - comm.Send(&sendCount_XY,1,rank_xy,sendtag); - comm.Recv(&recvCount_xy,1,rank_XY,recvtag); - comm.Send(&sendCount_Xy,1,rank_xY,sendtag); - comm.Recv(&recvCount_xY,1,rank_Xy,recvtag); - comm.Send(&sendCount_xY,1,rank_Xy,sendtag); - comm.Recv(&recvCount_Xy,1,rank_xY,recvtag); + MPI_Send(&sendCount_xy,1,MPI_INT,rank_XY,sendtag,comm); + MPI_Recv(&recvCount_XY,1,MPI_INT,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_XY,1,MPI_INT,rank_xy,sendtag,comm); + MPI_Recv(&recvCount_xy,1,MPI_INT,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Xy,1,MPI_INT,rank_xY,sendtag,comm); + MPI_Recv(&recvCount_xY,1,MPI_INT,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_xY,1,MPI_INT,rank_Xy,sendtag,comm); + MPI_Recv(&recvCount_Xy,1,MPI_INT,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); - comm.Send(&sendCount_xz,1,rank_XZ,sendtag); - comm.Recv(&recvCount_XZ,1,rank_xz,recvtag); - comm.Send(&sendCount_XZ,1,rank_xz,sendtag); - comm.Recv(&recvCount_xz,1,rank_XZ,recvtag); - comm.Send(&sendCount_Xz,1,rank_xZ,sendtag); - comm.Recv(&recvCount_xZ,1,rank_Xz,recvtag); - comm.Send(&sendCount_xZ,1,rank_Xz,sendtag); - comm.Recv(&recvCount_Xz,1,rank_xZ,recvtag); + MPI_Send(&sendCount_xz,1,MPI_INT,rank_XZ,sendtag,comm); + MPI_Recv(&recvCount_XZ,1,MPI_INT,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_XZ,1,MPI_INT,rank_xz,sendtag,comm); + MPI_Recv(&recvCount_xz,1,MPI_INT,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Xz,1,MPI_INT,rank_xZ,sendtag,comm); + MPI_Recv(&recvCount_xZ,1,MPI_INT,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); 
+ MPI_Send(&sendCount_xZ,1,MPI_INT,rank_Xz,sendtag,comm); + MPI_Recv(&recvCount_Xz,1,MPI_INT,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); - comm.Send(&sendCount_yz,1,rank_YZ,sendtag); - comm.Recv(&recvCount_YZ,1,rank_yz,recvtag); - comm.Send(&sendCount_YZ,1,rank_yz,sendtag); - comm.Recv(&recvCount_yz,1,rank_YZ,recvtag); - comm.Send(&sendCount_Yz,1,rank_yZ,sendtag); - comm.Recv(&recvCount_yZ,1,rank_Yz,recvtag); - comm.Send(&sendCount_yZ,1,rank_Yz,sendtag); - comm.Recv(&recvCount_Yz,1,rank_yZ,recvtag); - comm.barrier(); + MPI_Send(&sendCount_yz,1,MPI_INT,rank_YZ,sendtag,comm); + MPI_Recv(&recvCount_YZ,1,MPI_INT,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_YZ,1,MPI_INT,rank_yz,sendtag,comm); + MPI_Recv(&recvCount_yz,1,MPI_INT,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Yz,1,MPI_INT,rank_yZ,sendtag,comm); + MPI_Recv(&recvCount_yZ,1,MPI_INT,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_yZ,1,MPI_INT,rank_Yz,sendtag,comm); + MPI_Recv(&recvCount_Yz,1,MPI_INT,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Barrier(comm); //********************************************************************************** //...................................................................................... int *recvList_x, *recvList_y, *recvList_z, *recvList_X, *recvList_Y, *recvList_Z; @@ -725,48 +731,48 @@ int main(int argc, char **argv) // Use MPI to fill in the appropriate values for recvList // Fill in the recieve lists using MPI sendtag = recvtag = 4; - req1[0] = comm.Isend(sendList_x,sendCount_x,rank_X,sendtag); - req2[0] = comm.Irecv(recvList_X,recvCount_X,rank_x,recvtag); - req1[1] = comm.Isend(sendList_X,sendCount_X,rank_x,sendtag); - req2[1] = comm.Irecv(recvList_x,recvCount_x,rank_X,recvtag); - req1[2] = comm.Isend(sendList_y,sendCount_y,rank_Y,sendtag); - req2[2] = comm.Irecv(recvList_Y,recvCount_Y,rank_y,recvtag); - req1[3] = comm.Isend(sendList_Y,sendCount_Y,rank_y,sendtag); - req2[3] = comm.Irecv(recvList_y,recvCount_y,rank_Y,recvtag); - req1[4] = comm.Isend(sendList_z,sendCount_z,rank_Z,sendtag); - req2[4] = comm.Irecv(recvList_Z,recvCount_Z,rank_z,recvtag); - req1[5] = comm.Isend(sendList_Z,sendCount_Z,rank_z,sendtag); - req2[5] = comm.Irecv(recvList_z,recvCount_z,rank_Z,recvtag); + MPI_Isend(sendList_x, sendCount_x,MPI_INT,rank_X,sendtag,comm,&req1[0]); + MPI_Irecv(recvList_X, recvCount_X,MPI_INT,rank_x,recvtag,comm,&req2[0]); + MPI_Isend(sendList_X, sendCount_X,MPI_INT,rank_x,sendtag,comm,&req1[1]); + MPI_Irecv(recvList_x, recvCount_x,MPI_INT,rank_X,recvtag,comm,&req2[1]); + MPI_Isend(sendList_y, sendCount_y,MPI_INT,rank_Y,sendtag,comm,&req1[2]); + MPI_Irecv(recvList_Y, recvCount_Y,MPI_INT,rank_y,recvtag,comm,&req2[2]); + MPI_Isend(sendList_Y, sendCount_Y,MPI_INT,rank_y,sendtag,comm,&req1[3]); + MPI_Irecv(recvList_y, recvCount_y,MPI_INT,rank_Y,recvtag,comm,&req2[3]); + MPI_Isend(sendList_z, sendCount_z,MPI_INT,rank_Z,sendtag,comm,&req1[4]); + MPI_Irecv(recvList_Z, recvCount_Z,MPI_INT,rank_z,recvtag,comm,&req2[4]); + MPI_Isend(sendList_Z, sendCount_Z,MPI_INT,rank_z,sendtag,comm,&req1[5]); + MPI_Irecv(recvList_z, recvCount_z,MPI_INT,rank_Z,recvtag,comm,&req2[5]); - req1[6] = comm.Isend(sendList_xy,sendCount_xy,rank_XY,sendtag); - req2[6] = comm.Irecv(recvList_XY,recvCount_XY,rank_xy,recvtag); - req1[7] = comm.Isend(sendList_XY,sendCount_XY,rank_xy,sendtag); - req2[7] = comm.Irecv(recvList_xy,recvCount_xy,rank_XY,recvtag); - req1[8] = comm.Isend(sendList_Xy,sendCount_Xy,rank_xY,sendtag); - req2[8] = 
comm.Irecv(recvList_xY,recvCount_xY,rank_Xy,recvtag); - req1[9] = comm.Isend(sendList_xY,sendCount_xY,rank_Xy,sendtag); - req2[9] = comm.Irecv(recvList_Xy,recvCount_Xy,rank_xY,recvtag); + MPI_Isend(sendList_xy, sendCount_xy,MPI_INT,rank_XY,sendtag,comm,&req1[6]); + MPI_Irecv(recvList_XY, recvCount_XY,MPI_INT,rank_xy,recvtag,comm,&req2[6]); + MPI_Isend(sendList_XY, sendCount_XY,MPI_INT,rank_xy,sendtag,comm,&req1[7]); + MPI_Irecv(recvList_xy, recvCount_xy,MPI_INT,rank_XY,recvtag,comm,&req2[7]); + MPI_Isend(sendList_Xy, sendCount_Xy,MPI_INT,rank_xY,sendtag,comm,&req1[8]); + MPI_Irecv(recvList_xY, recvCount_xY,MPI_INT,rank_Xy,recvtag,comm,&req2[8]); + MPI_Isend(sendList_xY, sendCount_xY,MPI_INT,rank_Xy,sendtag,comm,&req1[9]); + MPI_Irecv(recvList_Xy, recvCount_Xy,MPI_INT,rank_xY,recvtag,comm,&req2[9]); - req1[10] = comm.Isend(sendList_xz,sendCount_xz,rank_XZ,sendtag); - req2[10] = comm.Irecv(recvList_XZ,recvCount_XZ,rank_xz,recvtag); - req1[11] = comm.Isend(sendList_XZ,sendCount_XZ,rank_xz,sendtag); - req2[11] = comm.Irecv(recvList_xz,recvCount_xz,rank_XZ,recvtag); - req1[12] = comm.Isend(sendList_Xz,sendCount_Xz,rank_xZ,sendtag); - req2[12] = comm.Irecv(recvList_xZ,recvCount_xZ,rank_Xz,recvtag); - req1[13] = comm.Isend(sendList_xZ,sendCount_xZ,rank_Xz,sendtag); - req2[13] = comm.Irecv(recvList_Xz,recvCount_Xz,rank_xZ,recvtag); + MPI_Isend(sendList_xz, sendCount_xz,MPI_INT,rank_XZ,sendtag,comm,&req1[10]); + MPI_Irecv(recvList_XZ, recvCount_XZ,MPI_INT,rank_xz,recvtag,comm,&req2[10]); + MPI_Isend(sendList_XZ, sendCount_XZ,MPI_INT,rank_xz,sendtag,comm,&req1[11]); + MPI_Irecv(recvList_xz, recvCount_xz,MPI_INT,rank_XZ,recvtag,comm,&req2[11]); + MPI_Isend(sendList_Xz, sendCount_Xz,MPI_INT,rank_xZ,sendtag,comm,&req1[12]); + MPI_Irecv(recvList_xZ, recvCount_xZ,MPI_INT,rank_Xz,recvtag,comm,&req2[12]); + MPI_Isend(sendList_xZ, sendCount_xZ,MPI_INT,rank_Xz,sendtag,comm,&req1[13]); + MPI_Irecv(recvList_Xz, recvCount_Xz,MPI_INT,rank_xZ,recvtag,comm,&req2[13]); - req1[14] = comm.Isend(sendList_yz,sendCount_yz,rank_YZ,sendtag); - req2[14] = comm.Irecv(recvList_YZ,recvCount_YZ,rank_yz,recvtag); - req1[15] = comm.Isend(sendList_YZ,sendCount_YZ,rank_yz,sendtag); - req2[15] = comm.Irecv(recvList_yz,recvCount_yz,rank_YZ,recvtag); - req1[16] = comm.Isend(sendList_Yz,sendCount_Yz,rank_yZ,sendtag); - req2[16] = comm.Irecv(recvList_yZ,recvCount_yZ,rank_Yz,recvtag); - req1[17] = comm.Isend(sendList_yZ,sendCount_yZ,rank_Yz,sendtag); - req2[17] = comm.Irecv(recvList_Yz,recvCount_Yz,rank_yZ,recvtag); - comm.waitAll(18,req1); - comm.waitAll(18,req2); - comm.barrier(); + MPI_Isend(sendList_yz, sendCount_yz,MPI_INT,rank_YZ,sendtag,comm,&req1[14]); + MPI_Irecv(recvList_YZ, recvCount_YZ,MPI_INT,rank_yz,recvtag,comm,&req2[14]); + MPI_Isend(sendList_YZ, sendCount_YZ,MPI_INT,rank_yz,sendtag,comm,&req1[15]); + MPI_Irecv(recvList_yz, recvCount_yz,MPI_INT,rank_YZ,recvtag,comm,&req2[15]); + MPI_Isend(sendList_Yz, sendCount_Yz,MPI_INT,rank_yZ,sendtag,comm,&req1[16]); + MPI_Irecv(recvList_yZ, recvCount_yZ,MPI_INT,rank_Yz,recvtag,comm,&req2[16]); + MPI_Isend(sendList_yZ, sendCount_yZ,MPI_INT,rank_Yz,sendtag,comm,&req1[17]); + MPI_Irecv(recvList_Yz, recvCount_Yz,MPI_INT,rank_yZ,recvtag,comm,&req2[17]); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); + MPI_Barrier(comm); //...................................................................................... 
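Two exchange idioms appear side by side in this file: the single-int neighbor counts go through blocking MPI_Send/MPI_Recv pairs, while the larger index lists above use MPI_Isend/MPI_Irecv plus MPI_Waitall so that all receives are posted up front. For the counts, MPI_Sendrecv would be the equivalent one-call, deadlock-safe form; it is sketched here only for comparison and is not what the patch does:

    #include <mpi.h>

    // Exchange a single int count with opposite neighbors in one call.
    void exchange_count(MPI_Comm comm, int tag,
                        int send_count, int rank_send_to,
                        int *recv_count, int rank_recv_from)
    {
        MPI_Sendrecv(&send_count, 1, MPI_INT, rank_send_to,   tag,
                     recv_count,  1, MPI_INT, rank_recv_from, tag,
                     comm, MPI_STATUS_IGNORE);
    }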
for (int idx=0; idx #include #include -#include "common/MPI.h" +#include #include "pmmc.h" #include "Domain.h" @@ -101,11 +101,15 @@ inline void UnpackID(int *list, int count, char *recvbuf, char *ID){ int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; int iproc,jproc,kproc; @@ -119,6 +123,7 @@ int main(int argc, char **argv) int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; if (rank == 0){ printf("********************************************************\n"); @@ -198,35 +203,35 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - comm.barrier(); + MPI_Barrier(comm); //................................................. - comm.bcast(&tau,1,0); - comm.bcast(&alpha,1,0); - comm.bcast(&beta,1,0); - comm.bcast(&das,1,0); - comm.bcast(&dbs,1,0); - comm.bcast(&pBC,1,0); - comm.bcast(&din,1,0); - comm.bcast(&dout,1,0); - comm.bcast(&Fx,1,0); - comm.bcast(&Fy,1,0); - comm.bcast(&Fz,1,0); - comm.bcast(×tepMax,1,0); - comm.bcast(&interval,1,0); - comm.bcast(&tol,1,0); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&alpha,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&beta,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&das,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&dbs,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm); + MPI_Bcast(&din,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); + MPI_Bcast(×tepMax,1,MPI_INT,0,comm); + MPI_Bcast(&interval,1,MPI_INT,0,comm); + MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); // Computational domain - comm.bcast(&Nz,1,0); - comm.bcast(&nBlocks,1,0); - comm.bcast(&nthreads,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&nspheres,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nBlocks,1,MPI_INT,0,comm); + MPI_Bcast(&nthreads,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&nspheres,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. - comm.barrier(); + MPI_Barrier(comm); // ************************************************************** // ************************************************************** double Ps = -(das-dbs)/(das+dbs); @@ -258,7 +263,7 @@ int main(int argc, char **argv) printf("********************************************************\n"); } - comm.barrier(); + MPI_Barrier(comm); kproc = rank/(nprocx*nprocy); jproc = (rank-nprocx*nprocy*kproc)/nprocx; iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -556,14 +561,14 @@ int main(int argc, char **argv) //....................................................................... 
if (rank == 0) printf("Reading the sphere packing \n"); if (rank == 0) ReadSpherePacking(nspheres,cx,cy,cz,rad); - comm.barrier(); + MPI_Barrier(comm); // Broadcast the sphere packing to all processes - comm.bcast(cx,nspheres,0); - comm.bcast(cy,nspheres,0); - comm.bcast(cz,nspheres,0); - comm.bcast(rad,nspheres,0); + MPI_Bcast(cx,nspheres,MPI_DOUBLE,0,comm); + MPI_Bcast(cy,nspheres,MPI_DOUBLE,0,comm); + MPI_Bcast(cz,nspheres,MPI_DOUBLE,0,comm); + MPI_Bcast(rad,nspheres,MPI_DOUBLE,0,comm); //........................................................................... - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; //....................................................................... // sprintf(LocalRankString,"%05d",rank); @@ -713,7 +718,7 @@ int main(int argc, char **argv) } } } - comm.barrier(); + MPI_Barrier(comm); if (rank==0) printf ("SendLists are ready on host\n"); //...................................................................................... // Use MPI to fill in the recvCounts form the associated processes @@ -724,49 +729,89 @@ int main(int argc, char **argv) //********************************************************************************** // Fill in the recieve counts using MPI sendtag = recvtag = 3; - req1[0] = comm.Isend(&sendCount_x,1,rank_X,sendtag); - req2[0] = comm.Irecv(&recvCount_X,1,rank_x,recvtag); - req1[1] = comm.Isend(&sendCount_X,1,rank_x,sendtag); - req2[1] = comm.Irecv(&recvCount_x,1,rank_X,recvtag); - req1[2] = comm.Isend(&sendCount_y,1,rank_Y,sendtag); - req2[2] = comm.Irecv(&recvCount_Y,1,rank_y,recvtag); - req1[3] = comm.Isend(&sendCount_Y,1,rank_y,sendtag); - req2[3] = comm.Irecv(&recvCount_y,1,rank_Y,recvtag); - req1[4] = comm.Isend(&sendCount_z,1,rank_Z,sendtag); - req2[4] = comm.Irecv(&recvCount_Z,1,rank_z,recvtag); - req1[5] = comm.Isend(&sendCount_Z,1,rank_z,sendtag); - req2[5] = comm.Irecv(&recvCount_z,1,rank_Z,recvtag); + MPI_Isend(&sendCount_x, 1,MPI_INT,rank_X,sendtag,comm,&req1[0]); + MPI_Irecv(&recvCount_X, 1,MPI_INT,rank_x,recvtag,comm,&req2[0]); + MPI_Isend(&sendCount_X, 1,MPI_INT,rank_x,sendtag,comm,&req1[1]); + MPI_Irecv(&recvCount_x, 1,MPI_INT,rank_X,recvtag,comm,&req2[1]); + MPI_Isend(&sendCount_y, 1,MPI_INT,rank_Y,sendtag,comm,&req1[2]); + MPI_Irecv(&recvCount_Y, 1,MPI_INT,rank_y,recvtag,comm,&req2[2]); + MPI_Isend(&sendCount_Y, 1,MPI_INT,rank_y,sendtag,comm,&req1[3]); + MPI_Irecv(&recvCount_y, 1,MPI_INT,rank_Y,recvtag,comm,&req2[3]); + MPI_Isend(&sendCount_z, 1,MPI_INT,rank_Z,sendtag,comm,&req1[4]); + MPI_Irecv(&recvCount_Z, 1,MPI_INT,rank_z,recvtag,comm,&req2[4]); + MPI_Isend(&sendCount_Z, 1,MPI_INT,rank_z,sendtag,comm,&req1[5]); + MPI_Irecv(&recvCount_z, 1,MPI_INT,rank_Z,recvtag,comm,&req2[5]); - req1[6] = comm.Isend(&sendCount_xy,1,rank_XY,sendtag); - req2[6] = comm.Irecv(&recvCount_XY,1,rank_xy,recvtag); - req1[7] = comm.Isend(&sendCount_XY,1,rank_xy,sendtag); - req2[7] = comm.Irecv(&recvCount_xy,1,rank_XY,recvtag); - req1[8] = comm.Isend(&sendCount_Xy,1,rank_xY,sendtag); - req2[8] = comm.Irecv(&recvCount_xY,1,rank_Xy,recvtag); - req1[9] = comm.Isend(&sendCount_xY,1,rank_Xy,sendtag); - req2[9] = comm.Irecv(&recvCount_Xy,1,rank_xY,recvtag); + MPI_Isend(&sendCount_xy, 1,MPI_INT,rank_XY,sendtag,comm,&req1[6]); + MPI_Irecv(&recvCount_XY, 1,MPI_INT,rank_xy,recvtag,comm,&req2[6]); + MPI_Isend(&sendCount_XY, 1,MPI_INT,rank_xy,sendtag,comm,&req1[7]); + MPI_Irecv(&recvCount_xy, 1,MPI_INT,rank_XY,recvtag,comm,&req2[7]); + MPI_Isend(&sendCount_Xy, 1,MPI_INT,rank_xY,sendtag,comm,&req1[8]); + 
MPI_Irecv(&recvCount_xY, 1,MPI_INT,rank_Xy,recvtag,comm,&req2[8]); + MPI_Isend(&sendCount_xY, 1,MPI_INT,rank_Xy,sendtag,comm,&req1[9]); + MPI_Irecv(&recvCount_Xy, 1,MPI_INT,rank_xY,recvtag,comm,&req2[9]); - req1[10] = comm.Isend(&sendCount_xz,1,rank_XZ,sendtag); - req2[10] = comm.Irecv(&recvCount_XZ,1,rank_xz,recvtag); - req1[11] = comm.Isend(&sendCount_XZ,1,rank_xz,sendtag); - req2[11] = comm.Irecv(&recvCount_xz,1,rank_XZ,recvtag); - req1[12] = comm.Isend(&sendCount_Xz,1,rank_xZ,sendtag); - req2[12] = comm.Irecv(&recvCount_xZ,1,rank_Xz,recvtag); - req1[13] = comm.Isend(&sendCount_xZ,1,rank_Xz,sendtag); - req2[13] = comm.Irecv(&recvCount_Xz,1,rank_xZ,recvtag); + MPI_Isend(&sendCount_xz, 1,MPI_INT,rank_XZ,sendtag,comm,&req1[10]); + MPI_Irecv(&recvCount_XZ, 1,MPI_INT,rank_xz,recvtag,comm,&req2[10]); + MPI_Isend(&sendCount_XZ, 1,MPI_INT,rank_xz,sendtag,comm,&req1[11]); + MPI_Irecv(&recvCount_xz, 1,MPI_INT,rank_XZ,recvtag,comm,&req2[11]); + MPI_Isend(&sendCount_Xz, 1,MPI_INT,rank_xZ,sendtag,comm,&req1[12]); + MPI_Irecv(&recvCount_xZ, 1,MPI_INT,rank_Xz,recvtag,comm,&req2[12]); + MPI_Isend(&sendCount_xZ, 1,MPI_INT,rank_Xz,sendtag,comm,&req1[13]); + MPI_Irecv(&recvCount_Xz, 1,MPI_INT,rank_xZ,recvtag,comm,&req2[13]); - req1[14] = comm.Isend(&sendCount_yz,1,rank_YZ,sendtag); - req2[14] = comm.Irecv(&recvCount_YZ,1,rank_yz,recvtag); - req1[15] = comm.Isend(&sendCount_YZ,1,rank_yz,sendtag); - req2[15] = comm.Irecv(&recvCount_yz,1,rank_YZ,recvtag); - req1[16] = comm.Isend(&sendCount_Yz,1,rank_yZ,sendtag); - req2[16] = comm.Irecv(&recvCount_yZ,1,rank_Yz,recvtag); - req1[17] = comm.Isend(&sendCount_yZ,1,rank_Yz,sendtag); - req2[17] = comm.Irecv(&recvCount_Yz,1,rank_yZ,recvtag); - comm.waitAll(18,req1); - comm.waitAll(18,req2); - comm.barrier(); - //********************************************************************************** + MPI_Isend(&sendCount_yz, 1,MPI_INT,rank_YZ,sendtag,comm,&req1[14]); + MPI_Irecv(&recvCount_YZ, 1,MPI_INT,rank_yz,recvtag,comm,&req2[14]); + MPI_Isend(&sendCount_YZ, 1,MPI_INT,rank_yz,sendtag,comm,&req1[15]); + MPI_Irecv(&recvCount_yz, 1,MPI_INT,rank_YZ,recvtag,comm,&req2[15]); + MPI_Isend(&sendCount_Yz, 1,MPI_INT,rank_yZ,sendtag,comm,&req1[16]); + MPI_Irecv(&recvCount_yZ, 1,MPI_INT,rank_Yz,recvtag,comm,&req2[16]); + MPI_Isend(&sendCount_yZ, 1,MPI_INT,rank_Yz,sendtag,comm,&req1[17]); + MPI_Irecv(&recvCount_Yz, 1,MPI_INT,rank_yZ,recvtag,comm,&req2[17]); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); + MPI_Barrier(comm); +/* MPI_Send(&sendCount_x,1,MPI_INT,rank_X,sendtag,comm); + MPI_Recv(&recvCount_X,1,MPI_INT,rank_x,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_X,1,MPI_INT,rank_x,sendtag,comm); + MPI_Recv(&recvCount_x,1,MPI_INT,rank_X,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_y,1,MPI_INT,rank_Y,sendtag,comm); + MPI_Recv(&recvCount_Y,1,MPI_INT,rank_y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Y,1,MPI_INT,rank_y,sendtag,comm); + MPI_Recv(&recvCount_y,1,MPI_INT,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_z,1,MPI_INT,rank_Z,sendtag,comm); + MPI_Recv(&recvCount_Z,1,MPI_INT,rank_z,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Z,1,MPI_INT,rank_z,sendtag,comm); + MPI_Recv(&recvCount_z,1,MPI_INT,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); + + MPI_Send(&sendCount_xy,1,MPI_INT,rank_XY,sendtag,comm); + MPI_Recv(&recvCount_XY,1,MPI_INT,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_XY,1,MPI_INT,rank_xy,sendtag,comm); + MPI_Recv(&recvCount_xy,1,MPI_INT,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); + 
MPI_Send(&sendCount_Xy,1,MPI_INT,rank_xY,sendtag,comm); + MPI_Recv(&recvCount_xY,1,MPI_INT,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_xY,1,MPI_INT,rank_Xy,sendtag,comm); + MPI_Recv(&recvCount_Xy,1,MPI_INT,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); + + MPI_Send(&sendCount_xz,1,MPI_INT,rank_XZ,sendtag,comm); + MPI_Recv(&recvCount_XZ,1,MPI_INT,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_XZ,1,MPI_INT,rank_xz,sendtag,comm); + MPI_Recv(&recvCount_xz,1,MPI_INT,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Xz,1,MPI_INT,rank_xZ,sendtag,comm); + MPI_Recv(&recvCount_xZ,1,MPI_INT,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_xZ,1,MPI_INT,rank_Xz,sendtag,comm); + MPI_Recv(&recvCount_Xz,1,MPI_INT,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); + + MPI_Send(&sendCount_yz,1,MPI_INT,rank_YZ,sendtag,comm); + MPI_Recv(&recvCount_YZ,1,MPI_INT,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_YZ,1,MPI_INT,rank_yz,sendtag,comm); + MPI_Recv(&recvCount_yz,1,MPI_INT,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_Yz,1,MPI_INT,rank_yZ,sendtag,comm); + MPI_Recv(&recvCount_yZ,1,MPI_INT,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Send(&sendCount_yZ,1,MPI_INT,rank_Yz,sendtag,comm); + MPI_Recv(&recvCount_Yz,1,MPI_INT,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Barrier(comm); +*/ //********************************************************************************** //...................................................................................... int *recvList_x, *recvList_y, *recvList_z, *recvList_X, *recvList_Y, *recvList_Z; int *recvList_xy, *recvList_yz, *recvList_xz, *recvList_Xy, *recvList_Yz, *recvList_xZ; @@ -796,48 +841,48 @@ int main(int argc, char **argv) // Use MPI to fill in the appropriate values for recvList // Fill in the recieve lists using MPI sendtag = recvtag = 4; - req1[0] = comm.Isend(sendList_x,sendCount_x,rank_X,sendtag); - req2[0] = comm.Irecv(recvList_X,recvCount_X,rank_x,recvtag); - req1[1] = comm.Isend(sendList_X,sendCount_X,rank_x,sendtag); - req2[1] = comm.Irecv(recvList_x,recvCount_x,rank_X,recvtag); - req1[2] = comm.Isend(sendList_y,sendCount_y,rank_Y,sendtag); - req2[2] = comm.Irecv(recvList_Y,recvCount_Y,rank_y,recvtag); - req1[3] = comm.Isend(sendList_Y,sendCount_Y,rank_y,sendtag); - req2[3] = comm.Irecv(recvList_y,recvCount_y,rank_Y,recvtag); - req1[4] = comm.Isend(sendList_z,sendCount_z,rank_Z,sendtag); - req2[4] = comm.Irecv(recvList_Z,recvCount_Z,rank_z,recvtag); - req1[5] = comm.Isend(sendList_Z,sendCount_Z,rank_z,sendtag); - req2[5] = comm.Irecv(recvList_z,recvCount_z,rank_Z,recvtag); + MPI_Isend(sendList_x, sendCount_x,MPI_INT,rank_X,sendtag,comm,&req1[0]); + MPI_Irecv(recvList_X, recvCount_X,MPI_INT,rank_x,recvtag,comm,&req2[0]); + MPI_Isend(sendList_X, sendCount_X,MPI_INT,rank_x,sendtag,comm,&req1[1]); + MPI_Irecv(recvList_x, recvCount_x,MPI_INT,rank_X,recvtag,comm,&req2[1]); + MPI_Isend(sendList_y, sendCount_y,MPI_INT,rank_Y,sendtag,comm,&req1[2]); + MPI_Irecv(recvList_Y, recvCount_Y,MPI_INT,rank_y,recvtag,comm,&req2[2]); + MPI_Isend(sendList_Y, sendCount_Y,MPI_INT,rank_y,sendtag,comm,&req1[3]); + MPI_Irecv(recvList_y, recvCount_y,MPI_INT,rank_Y,recvtag,comm,&req2[3]); + MPI_Isend(sendList_z, sendCount_z,MPI_INT,rank_Z,sendtag,comm,&req1[4]); + MPI_Irecv(recvList_Z, recvCount_Z,MPI_INT,rank_z,recvtag,comm,&req2[4]); + MPI_Isend(sendList_Z, sendCount_Z,MPI_INT,rank_z,sendtag,comm,&req1[5]); + MPI_Irecv(recvList_z, recvCount_z,MPI_INT,rank_Z,recvtag,comm,&req2[5]); - req1[6] = 
comm.Isend(sendList_xy,sendCount_xy,rank_XY,sendtag); - req2[6] = comm.Irecv(recvList_XY,recvCount_XY,rank_xy,recvtag); - req1[7] = comm.Isend(sendList_XY,sendCount_XY,rank_xy,sendtag); - req2[7] = comm.Irecv(recvList_xy,recvCount_xy,rank_XY,recvtag); - req1[8] = comm.Isend(sendList_Xy,sendCount_Xy,rank_xY,sendtag); - req2[8] = comm.Irecv(recvList_xY,recvCount_xY,rank_Xy,recvtag); - req1[9] = comm.Isend(sendList_xY,sendCount_xY,rank_Xy,sendtag); - req2[9] = comm.Irecv(recvList_Xy,recvCount_Xy,rank_xY,recvtag); + MPI_Isend(sendList_xy, sendCount_xy,MPI_INT,rank_XY,sendtag,comm,&req1[6]); + MPI_Irecv(recvList_XY, recvCount_XY,MPI_INT,rank_xy,recvtag,comm,&req2[6]); + MPI_Isend(sendList_XY, sendCount_XY,MPI_INT,rank_xy,sendtag,comm,&req1[7]); + MPI_Irecv(recvList_xy, recvCount_xy,MPI_INT,rank_XY,recvtag,comm,&req2[7]); + MPI_Isend(sendList_Xy, sendCount_Xy,MPI_INT,rank_xY,sendtag,comm,&req1[8]); + MPI_Irecv(recvList_xY, recvCount_xY,MPI_INT,rank_Xy,recvtag,comm,&req2[8]); + MPI_Isend(sendList_xY, sendCount_xY,MPI_INT,rank_Xy,sendtag,comm,&req1[9]); + MPI_Irecv(recvList_Xy, recvCount_Xy,MPI_INT,rank_xY,recvtag,comm,&req2[9]); - req1[10] = comm.Isend(sendList_xz,sendCount_xz,rank_XZ,sendtag); - req2[10] = comm.Irecv(recvList_XZ,recvCount_XZ,rank_xz,recvtag); - req1[11] = comm.Isend(sendList_XZ,sendCount_XZ,rank_xz,sendtag); - req2[11] = comm.Irecv(recvList_xz,recvCount_xz,rank_XZ,recvtag); - req1[12] = comm.Isend(sendList_Xz,sendCount_Xz,rank_xZ,sendtag); - req2[12] = comm.Irecv(recvList_xZ,recvCount_xZ,rank_Xz,recvtag); - req1[13] = comm.Isend(sendList_xZ,sendCount_xZ,rank_Xz,sendtag); - req2[13] = comm.Irecv(recvList_Xz,recvCount_Xz,rank_xZ,recvtag); + MPI_Isend(sendList_xz, sendCount_xz,MPI_INT,rank_XZ,sendtag,comm,&req1[10]); + MPI_Irecv(recvList_XZ, recvCount_XZ,MPI_INT,rank_xz,recvtag,comm,&req2[10]); + MPI_Isend(sendList_XZ, sendCount_XZ,MPI_INT,rank_xz,sendtag,comm,&req1[11]); + MPI_Irecv(recvList_xz, recvCount_xz,MPI_INT,rank_XZ,recvtag,comm,&req2[11]); + MPI_Isend(sendList_Xz, sendCount_Xz,MPI_INT,rank_xZ,sendtag,comm,&req1[12]); + MPI_Irecv(recvList_xZ, recvCount_xZ,MPI_INT,rank_Xz,recvtag,comm,&req2[12]); + MPI_Isend(sendList_xZ, sendCount_xZ,MPI_INT,rank_Xz,sendtag,comm,&req1[13]); + MPI_Irecv(recvList_Xz, recvCount_Xz,MPI_INT,rank_xZ,recvtag,comm,&req2[13]); - req1[14] = comm.Isend(sendList_yz,sendCount_yz,rank_YZ,sendtag); - req2[14] = comm.Irecv(recvList_YZ,recvCount_YZ,rank_yz,recvtag); - req1[15] = comm.Isend(sendList_YZ,sendCount_YZ,rank_yz,sendtag); - req2[15] = comm.Irecv(recvList_yz,recvCount_yz,rank_YZ,recvtag); - req1[16] = comm.Isend(sendList_Yz,sendCount_Yz,rank_yZ,sendtag); - req2[16] = comm.Irecv(recvList_yZ,recvCount_yZ,rank_Yz,recvtag); - req1[17] = comm.Isend(sendList_yZ,sendCount_yZ,rank_Yz,sendtag); - req2[17] = comm.Irecv(recvList_Yz,recvCount_Yz,rank_yZ,recvtag); - comm.waitAll(18,req1); - comm.waitAll(18,req2); - comm.barrier(); + MPI_Isend(sendList_yz, sendCount_yz,MPI_INT,rank_YZ,sendtag,comm,&req1[14]); + MPI_Irecv(recvList_YZ, recvCount_YZ,MPI_INT,rank_yz,recvtag,comm,&req2[14]); + MPI_Isend(sendList_YZ, sendCount_YZ,MPI_INT,rank_yz,sendtag,comm,&req1[15]); + MPI_Irecv(recvList_yz, recvCount_yz,MPI_INT,rank_YZ,recvtag,comm,&req2[15]); + MPI_Isend(sendList_Yz, sendCount_Yz,MPI_INT,rank_yZ,sendtag,comm,&req1[16]); + MPI_Irecv(recvList_yZ, recvCount_yZ,MPI_INT,rank_Yz,recvtag,comm,&req2[16]); + MPI_Isend(sendList_yZ, sendCount_yZ,MPI_INT,rank_Yz,sendtag,comm,&req1[17]); + MPI_Irecv(recvList_Yz, recvCount_Yz,MPI_INT,rank_yZ,recvtag,comm,&req2[17]); + 
MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); + MPI_Barrier(comm); //...................................................................................... for (int idx=0; idxkeyExists( "GridFile" )){ // Read the local domain data - auto input_id = readMicroCT( *domain_db, comm ); + auto input_id = readMicroCT( *domain_db, MPI_COMM_WORLD ); // Fill the halo (assuming GCW of 1) array size0 = { (int) input_id.size(0), (int) input_id.size(1), (int) input_id.size(2) }; ArraySize size1 = { (size_t) Mask->Nx, (size_t) Mask->Ny, (size_t) Mask->Nz }; ASSERT( (int) size1[0] == size0[0]+2 && (int) size1[1] == size0[1]+2 && (int) size1[2] == size0[2]+2 ); - fillHalo fill( comm, Mask->rank_info, size0, { 1, 1, 1 }, 0, 1 ); + fillHalo fill( MPI_COMM_WORLD, Mask->rank_info, size0, { 1, 1, 1 }, 0, 1 ); Array id_view; id_view.viewRaw( size1, Mask->id ); fill.copy( input_id, id_view ); @@ -652,7 +652,7 @@ void ScaLBL_ColorModel::Run(){ double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); comm.barrier(); - starttime = Utilities::MPI::time(); + starttime = MPI_Wtime(); //......................................... //************ MAIN ITERATION LOOP ***************************************/ @@ -991,7 +991,7 @@ void ScaLBL_ColorModel::Run(){ //************************************************************************ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = Utilities::MPI::time(); + stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/models/DFHModel.cpp b/models/DFHModel.cpp index 9709b107..ced5853f 100644 --- a/models/DFHModel.cpp +++ b/models/DFHModel.cpp @@ -487,7 +487,7 @@ void ScaLBL_DFHModel::Run(){ double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); comm.barrier(); - starttime = Utilities::MPI::time(); + starttime = MPI_Wtime(); //......................................... 
//************ MAIN ITERATION LOOP ***************************************/ @@ -583,7 +583,7 @@ void ScaLBL_DFHModel::Run(){ //************************************************************************ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = Utilities::MPI::time(); + stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/models/MRTModel.cpp b/models/MRTModel.cpp index 60847e54..23925930 100644 --- a/models/MRTModel.cpp +++ b/models/MRTModel.cpp @@ -227,7 +227,7 @@ void ScaLBL_MRTModel::Run(){ double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); comm.barrier(); - starttime = Utilities::MPI::time(); + starttime = MPI_Wtime(); if (rank==0) printf("Beginning AA timesteps, timestepMax = %i \n", timestepMax); if (rank==0) printf("********************************************************\n"); timestep=0; @@ -325,7 +325,7 @@ void ScaLBL_MRTModel::Run(){ } } //************************************************************************/ - stoptime = Utilities::MPI::time(); + stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/tests/BlobAnalyzeParallel.cpp b/tests/BlobAnalyzeParallel.cpp index 773309f9..48e9e230 100644 --- a/tests/BlobAnalyzeParallel.cpp +++ b/tests/BlobAnalyzeParallel.cpp @@ -138,16 +138,16 @@ int main(int argc, char **argv) } comm.barrier(); // Computational domain - comm.bcast(&nx,1,0); - comm.bcast(&ny,1,0); - comm.bcast(&nz,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&nspheres,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&nx,1,MPI_INT,0,comm); + MPI_Bcast(&ny,1,MPI_INT,0,comm); + MPI_Bcast(&nz,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&nspheres,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. comm.barrier(); @@ -291,7 +291,7 @@ int main(int argc, char **argv) } Dm.CommInit(); // Initialize communications for domains - sum_global = comm.sumReduce( sum ); + MPI_Allreduce(&sum,&sum_global,1,MPI_DOUBLE,MPI_SUM,comm); porosity = sum_global/Dm.Volume; if (rank==0) printf("Porosity = %f \n",porosity); diff --git a/tests/GenerateSphereTest.cpp b/tests/GenerateSphereTest.cpp index d4340964..43434092 100644 --- a/tests/GenerateSphereTest.cpp +++ b/tests/GenerateSphereTest.cpp @@ -213,24 +213,42 @@ inline void MorphOpen(DoubleArray SignDist, char *id, Domain &Dm, int nx, int ny PackID(Dm.sendList_yZ, Dm.sendCount_yZ ,sendID_yZ, id); PackID(Dm.sendList_YZ, Dm.sendCount_YZ ,sendID_YZ, id); //...................................................................................... 
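The model and test hunks above also swap the wrapper's reductions and timers for their MPI equivalents: comm.sumReduce(sum) becomes an explicit MPI_Allreduce with MPI_SUM (so every rank holds the global sum used for the porosity), and Utilities::MPI::time() becomes MPI_Wtime() in the model Run() methods. A minimal sketch of the reduction (illustrative wrapper function):

    #include <mpi.h>

    // Global sum over all ranks; the result is available on every rank.
    double global_sum(MPI_Comm comm, double local_sum)
    {
        double total = 0.0;
        MPI_Allreduce(&local_sum, &total, 1, MPI_DOUBLE, MPI_SUM, comm);
        return total;   // e.g. porosity = global_sum(comm, sum) / Dm.Volume
    }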
- Dm.Comm.sendrecv(sendID_x,Dm.sendCount_x,Dm.rank_x(),sendtag,recvID_X,Dm.recvCount_X,Dm.rank_X(),recvtag); - Dm.Comm.sendrecv(sendID_X,Dm.sendCount_X,Dm.rank_X(),sendtag,recvID_x,Dm.recvCount_x,Dm.rank_x(),recvtag); - Dm.Comm.sendrecv(sendID_y,Dm.sendCount_y,Dm.rank_y(),sendtag,recvID_Y,Dm.recvCount_Y,Dm.rank_Y(),recvtag); - Dm.Comm.sendrecv(sendID_Y,Dm.sendCount_Y,Dm.rank_Y(),sendtag,recvID_y,Dm.recvCount_y,Dm.rank_y(),recvtag); - Dm.Comm.sendrecv(sendID_z,Dm.sendCount_z,Dm.rank_z(),sendtag,recvID_Z,Dm.recvCount_Z,Dm.rank_Z(),recvtag); - Dm.Comm.sendrecv(sendID_Z,Dm.sendCount_Z,Dm.rank_Z(),sendtag,recvID_z,Dm.recvCount_z,Dm.rank_z(),recvtag); - Dm.Comm.sendrecv(sendID_xy,Dm.sendCount_xy,Dm.rank_xy(),sendtag,recvID_XY,Dm.recvCount_XY,Dm.rank_XY(),recvtag); - Dm.Comm.sendrecv(sendID_XY,Dm.sendCount_XY,Dm.rank_XY(),sendtag,recvID_xy,Dm.recvCount_xy,Dm.rank_xy(),recvtag); - Dm.Comm.sendrecv(sendID_Xy,Dm.sendCount_Xy,Dm.rank_Xy(),sendtag,recvID_xY,Dm.recvCount_xY,Dm.rank_xY(),recvtag); - Dm.Comm.sendrecv(sendID_xY,Dm.sendCount_xY,Dm.rank_xY(),sendtag,recvID_Xy,Dm.recvCount_Xy,Dm.rank_Xy(),recvtag); - Dm.Comm.sendrecv(sendID_xz,Dm.sendCount_xz,Dm.rank_xz(),sendtag,recvID_XZ,Dm.recvCount_XZ,Dm.rank_XZ(),recvtag); - Dm.Comm.sendrecv(sendID_XZ,Dm.sendCount_XZ,Dm.rank_XZ(),sendtag,recvID_xz,Dm.recvCount_xz,Dm.rank_xz(),recvtag); - Dm.Comm.sendrecv(sendID_Xz,Dm.sendCount_Xz,Dm.rank_Xz(),sendtag,recvID_xZ,Dm.recvCount_xZ,Dm.rank_xZ(),recvtag); - Dm.Comm.sendrecv(sendID_xZ,Dm.sendCount_xZ,Dm.rank_xZ(),sendtag,recvID_Xz,Dm.recvCount_Xz,Dm.rank_Xz(),recvtag); - Dm.Comm.sendrecv(sendID_yz,Dm.sendCount_yz,Dm.rank_yz(),sendtag,recvID_YZ,Dm.recvCount_YZ,Dm.rank_YZ(),recvtag); - Dm.Comm.sendrecv(sendID_YZ,Dm.sendCount_YZ,Dm.rank_YZ(),sendtag,recvID_yz,Dm.recvCount_yz,Dm.rank_yz(),recvtag); - Dm.Comm.sendrecv(sendID_Yz,Dm.sendCount_Yz,Dm.rank_Yz(),sendtag,recvID_yZ,Dm.recvCount_yZ,Dm.rank_yZ(),recvtag); - Dm.Comm.sendrecv(sendID_yZ,Dm.sendCount_yZ,Dm.rank_yZ(),sendtag,recvID_Yz,Dm.recvCount_Yz,Dm.rank_Yz(),recvtag); + MPI_Sendrecv(sendID_x,Dm.sendCount_x,MPI_CHAR,Dm.rank_x(),sendtag, + recvID_X,Dm.recvCount_X,MPI_CHAR,Dm.rank_X(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_X,Dm.sendCount_X,MPI_CHAR,Dm.rank_X(),sendtag, + recvID_x,Dm.recvCount_x,MPI_CHAR,Dm.rank_x(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_y,Dm.sendCount_y,MPI_CHAR,Dm.rank_y(),sendtag, + recvID_Y,Dm.recvCount_Y,MPI_CHAR,Dm.rank_Y(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Y,Dm.sendCount_Y,MPI_CHAR,Dm.rank_Y(),sendtag, + recvID_y,Dm.recvCount_y,MPI_CHAR,Dm.rank_y(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_z,Dm.sendCount_z,MPI_CHAR,Dm.rank_z(),sendtag, + recvID_Z,Dm.recvCount_Z,MPI_CHAR,Dm.rank_Z(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Z,Dm.sendCount_Z,MPI_CHAR,Dm.rank_Z(),sendtag, + recvID_z,Dm.recvCount_z,MPI_CHAR,Dm.rank_z(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xy,Dm.sendCount_xy,MPI_CHAR,Dm.rank_xy(),sendtag, + recvID_XY,Dm.recvCount_XY,MPI_CHAR,Dm.rank_XY(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_XY,Dm.sendCount_XY,MPI_CHAR,Dm.rank_XY(),sendtag, + recvID_xy,Dm.recvCount_xy,MPI_CHAR,Dm.rank_xy(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Xy,Dm.sendCount_Xy,MPI_CHAR,Dm.rank_Xy(),sendtag, + 
recvID_xY,Dm.recvCount_xY,MPI_CHAR,Dm.rank_xY(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xY,Dm.sendCount_xY,MPI_CHAR,Dm.rank_xY(),sendtag, + recvID_Xy,Dm.recvCount_Xy,MPI_CHAR,Dm.rank_Xy(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xz,Dm.sendCount_xz,MPI_CHAR,Dm.rank_xz(),sendtag, + recvID_XZ,Dm.recvCount_XZ,MPI_CHAR,Dm.rank_XZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_XZ,Dm.sendCount_XZ,MPI_CHAR,Dm.rank_XZ(),sendtag, + recvID_xz,Dm.recvCount_xz,MPI_CHAR,Dm.rank_xz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Xz,Dm.sendCount_Xz,MPI_CHAR,Dm.rank_Xz(),sendtag, + recvID_xZ,Dm.recvCount_xZ,MPI_CHAR,Dm.rank_xZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xZ,Dm.sendCount_xZ,MPI_CHAR,Dm.rank_xZ(),sendtag, + recvID_Xz,Dm.recvCount_Xz,MPI_CHAR,Dm.rank_Xz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_yz,Dm.sendCount_yz,MPI_CHAR,Dm.rank_yz(),sendtag, + recvID_YZ,Dm.recvCount_YZ,MPI_CHAR,Dm.rank_YZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_YZ,Dm.sendCount_YZ,MPI_CHAR,Dm.rank_YZ(),sendtag, + recvID_yz,Dm.recvCount_yz,MPI_CHAR,Dm.rank_yz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Yz,Dm.sendCount_Yz,MPI_CHAR,Dm.rank_Yz(),sendtag, + recvID_yZ,Dm.recvCount_yZ,MPI_CHAR,Dm.rank_yZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_yZ,Dm.sendCount_yZ,MPI_CHAR,Dm.rank_yZ(),sendtag, + recvID_Yz,Dm.recvCount_Yz,MPI_CHAR,Dm.rank_Yz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); //...................................................................................... UnpackID(Dm.recvList_x, Dm.recvCount_x ,recvID_x, id); UnpackID(Dm.recvList_X, Dm.recvCount_X ,recvID_X, id); diff --git a/tests/TestBlobAnalyze.cpp b/tests/TestBlobAnalyze.cpp index 19360fe3..63d928c1 100644 --- a/tests/TestBlobAnalyze.cpp +++ b/tests/TestBlobAnalyze.cpp @@ -190,16 +190,16 @@ int main(int argc, char **argv) } comm.barrier(); // Computational domain - comm.bcast(&nx,1,0); - comm.bcast(&ny,1,0); - comm.bcast(&nz,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&nspheres,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&nx,1,MPI_INT,0,comm); + MPI_Bcast(&ny,1,MPI_INT,0,comm); + MPI_Bcast(&nz,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&nspheres,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. comm.barrier(); @@ -255,10 +255,10 @@ int main(int argc, char **argv) comm.barrier(); // Broadcast the sphere packing to all processes - comm.bcast(cx,nspheres,0); - comm.bcast(cy,nspheres,0); - comm.bcast(cz,nspheres,0); - comm.bcast(rad,nspheres,0); + MPI_Bcast(cx,nspheres,MPI_DOUBLE,0,comm); + MPI_Bcast(cy,nspheres,MPI_DOUBLE,0,comm); + MPI_Bcast(cz,nspheres,MPI_DOUBLE,0,comm); + MPI_Bcast(rad,nspheres,MPI_DOUBLE,0,comm); //........................................................................... comm.barrier(); //....................................................................... 
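The wrapper form comm.bcast(ptr, n, root) deduced the MPI datatype from the pointer type; the raw MPI_Bcast calls above have to state it explicitly (MPI_INT for grid and process counts, MPI_DOUBLE for domain lengths and sphere data). A minimal sketch of the pattern, assuming rank 0 has already read the scalars and the sphere-packing arrays from disk (variable names follow the tests, the helper itself is hypothetical):

    #include <mpi.h>

    // Hedged sketch: broadcast setup parameters and packed sphere data from rank 0.
    void BroadcastSetup(MPI_Comm comm, int &nx, int &ny, int &nz,
                        double *cx, double *cy, double *cz, double *rad, int nspheres) {
        // scalars: count 1, explicit datatype, root rank 0
        MPI_Bcast(&nx, 1, MPI_INT, 0, comm);
        MPI_Bcast(&ny, 1, MPI_INT, 0, comm);
        MPI_Bcast(&nz, 1, MPI_INT, 0, comm);
        // arrays: count is the number of elements, not bytes
        MPI_Bcast(cx,  nspheres, MPI_DOUBLE, 0, comm);
        MPI_Bcast(cy,  nspheres, MPI_DOUBLE, 0, comm);
        MPI_Bcast(cz,  nspheres, MPI_DOUBLE, 0, comm);
        MPI_Bcast(rad, nspheres, MPI_DOUBLE, 0, comm);
    }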
diff --git a/tests/TestBubble.cpp b/tests/TestBubble.cpp index 6eb74b37..e7e0ced8 100644 --- a/tests/TestBubble.cpp +++ b/tests/TestBubble.cpp @@ -45,6 +45,7 @@ int main(int argc, char **argv) int nprocx,nprocy,nprocz; MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; if (rank == 0){ printf("********************************************************\n"); @@ -433,7 +434,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; comm.barrier(); - starttime = Utilities::MPI::time(); + starttime = MPI_Wtime(); //......................................... //........................................................................... // MAIN VARIABLES INITIALIZED HERE @@ -808,25 +809,25 @@ int main(int argc, char **argv) } //........................................................................... comm.barrier(); - nwp_volume_global = comm.sumReduce( nwp_volume ); - awn_global = comm.sumReduce( awn ); - ans_global = comm.sumReduce( ans ); - aws_global = comm.sumReduce( aws ); - lwns_global = comm.sumReduce( lwns ); - As_global = comm.sumReduce( As ); - Jwn_global = comm.sumReduce( Jwn ); - efawns_global = comm.sumReduce( efawns ); + MPI_Allreduce(&nwp_volume,&nwp_volume_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&awn,&awn_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&ans,&ans_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&aws,&aws_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&lwns,&lwns_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&As,&As_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&Jwn,&Jwn_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&efawns,&efawns_global,1,MPI_DOUBLE,MPI_SUM,comm); // Phase averages - vol_w_global = comm.sumReduce( vol_w ); - vol_n_global = comm.sumReduce( vol_n ); - paw_global = comm.sumReduce( paw ); - pan_global = comm.sumReduce( pan ); - vaw_global(0) = comm.sumReduce( vaw(0) ); - van_global(0) = comm.sumReduce( van(0) ); - vawn_global(0) = comm.sumReduce( vawn(0) ); - Gwn_global(0) = comm.sumReduce( Gwn(0) ); - Gns_global(0) = comm.sumReduce( Gns(0) ); - Gws_global(0) = comm.sumReduce( Gws(0) ); + MPI_Allreduce(&vol_w,&vol_w_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&vol_n,&vol_n_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&paw,&paw_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&pan,&pan_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&vaw(0),&vaw_global(0),3,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&van(0),&van_global(0),3,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&vawn(0),&vawn_global(0),3,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&Gwn(0),&Gwn_global(0),6,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&Gns(0),&Gns_global(0),6,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&Gws(0),&Gws_global(0),6,MPI_DOUBLE,MPI_SUM,comm); comm.barrier(); //......................................................................... 
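The comm.sumReduce() calls reduced one element at a time; the raw replacements reduce a scalar with count 1 and reduce the vector and tensor accumulators (vaw, van, Gwn, ...) in a single call by passing the address of element 0 with count 3 or 6. A small sketch of the same pattern, written with plain contiguous arrays rather than the DoubleArray operator() used in the tests (that substitution is an assumption of the sketch):

    #include <mpi.h>

    // Hedged sketch of the sum-reduction pattern: every rank contributes its
    // local partial sums and every rank receives the global totals (MPI_Allreduce).
    void ReduceAverages(MPI_Comm comm, double awn, double &awn_global,
                        const double vaw[3], double vaw_global[3],
                        const double Gwn[6], double Gwn_global[6]) {
        MPI_Allreduce(&awn, &awn_global, 1, MPI_DOUBLE, MPI_SUM, comm);   // scalar
        MPI_Allreduce(vaw,  vaw_global,  3, MPI_DOUBLE, MPI_SUM, comm);   // velocity vector
        MPI_Allreduce(Gwn,  Gwn_global,  6, MPI_DOUBLE, MPI_SUM, comm);   // symmetric tensor
    }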
// Compute the change in the total surface energy based on the defined interval @@ -951,7 +952,7 @@ int main(int argc, char **argv) //************************************************************************/ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = Utilities::MPI::time(); + stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/tests/TestBubbleDFH.cpp b/tests/TestBubbleDFH.cpp index 8b4f1a9b..7f5d0047 100644 --- a/tests/TestBubbleDFH.cpp +++ b/tests/TestBubbleDFH.cpp @@ -387,7 +387,7 @@ int main(int argc, char **argv) double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); comm.barrier(); - starttime = Utilities::MPI::time(); + starttime = MPI_Wtime(); //......................................... err = 1.0; @@ -487,7 +487,7 @@ int main(int argc, char **argv) //************************************************************************ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = Utilities::MPI::time(); + stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/tests/TestColorGrad.cpp b/tests/TestColorGrad.cpp index 2566f8c0..df1c1daf 100644 --- a/tests/TestColorGrad.cpp +++ b/tests/TestColorGrad.cpp @@ -114,16 +114,16 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); //................................................. - comm.bcast(&Nx,1,0); - comm.bcast(&Ny,1,0); - comm.bcast(&Nz,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&nspheres,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&Nx,1,MPI_INT,0,comm); + MPI_Bcast(&Ny,1,MPI_INT,0,comm); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&nspheres,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. comm.barrier(); // ************************************************************** diff --git a/tests/TestCommD3Q19.cpp b/tests/TestCommD3Q19.cpp index c4a045ae..d2799355 100644 --- a/tests/TestCommD3Q19.cpp +++ b/tests/TestCommD3Q19.cpp @@ -378,7 +378,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; comm.barrier(); - starttime = Utilities::MPI::time(); + starttime = MPI_Wtime(); //......................................... @@ -403,7 +403,7 @@ int main(int argc, char **argv) //................................................................... 
} //************************************************************************/ - stoptime = Utilities::MPI::time(); + stoptime = MPI_Wtime(); // cout << "CPU time: " << (stoptime - starttime) << " seconds" << endl; cputime = stoptime - starttime; // cout << "Lattice update rate: "<< double(Nx*Ny*Nz*timestep)/cputime/1000000 << " MLUPS" << endl; diff --git a/tests/TestForceD3Q19.cpp b/tests/TestForceD3Q19.cpp index 31151584..f8569624 100644 --- a/tests/TestForceD3Q19.cpp +++ b/tests/TestForceD3Q19.cpp @@ -450,7 +450,7 @@ int main (int argc, char **argv) for (int i=0; iSendD3Q19(dist, &dist[10*Np]); @@ -244,7 +244,7 @@ int main(int argc, char **argv) //************************************************************************/ - stoptime = Utilities::MPI::time(); + stoptime = MPI_Wtime(); // cout << "CPU time: " << (stoptime - starttime) << " seconds" << endl; cputime = stoptime - starttime; // cout << "Lattice update rate: "<< double(Nx*Ny*Nz*timestep)/cputime/1000000 << " MLUPS" << endl; diff --git a/tests/TestMRT.cpp b/tests/TestMRT.cpp index e4acba99..5f2c4449 100644 --- a/tests/TestMRT.cpp +++ b/tests/TestMRT.cpp @@ -580,16 +580,16 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); //................................................. - comm.bcast(&Nx,1,0); - comm.bcast(&Ny,1,0); - comm.bcast(&Nz,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&nspheres,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&Nx,1,MPI_INT,0,comm); + MPI_Bcast(&Ny,1,MPI_INT,0,comm); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&nspheres,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. 
comm.barrier(); // ************************************************************** @@ -668,7 +668,7 @@ int main(int argc, char **argv) } } comm.barrier(); - sum = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,comm); porosity = sum*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); @@ -731,7 +731,7 @@ int main(int argc, char **argv) double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); comm.barrier(); - starttime = Utilities::MPI::time(); + starttime = MPI_Wtime(); while (timestep < timesteps) { @@ -752,7 +752,7 @@ int main(int argc, char **argv) } //************************************************************************/ - stoptime = Utilities::MPI::time(); + stoptime = MPI_Wtime(); // cout << "CPU time: " << (stoptime - starttime) << " seconds" << endl; cputime = stoptime - starttime; // cout << "Lattice update rate: "<< double(Nx*Ny*Nz*timestep)/cputime/1000000 << " MLUPS" << endl; @@ -795,7 +795,7 @@ int main(int argc, char **argv) } } } - sum = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,comm); double PoreVel = sum*iVol_global; if (rank==0) printf("Velocity = %f \n",PoreVel); diff --git a/tests/TestMicroCTReader.cpp b/tests/TestMicroCTReader.cpp index 52a5b9d3..9a54610c 100644 --- a/tests/TestMicroCTReader.cpp +++ b/tests/TestMicroCTReader.cpp @@ -62,6 +62,7 @@ int main(int argc, char **argv) int N_errors = ut.NumFailGlobal(); // Close MPI + MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); return N_errors; } diff --git a/tests/TestMomentsD3Q19.cpp b/tests/TestMomentsD3Q19.cpp index 2660ed26..6bd3e8ff 100644 --- a/tests/TestMomentsD3Q19.cpp +++ b/tests/TestMomentsD3Q19.cpp @@ -539,7 +539,7 @@ int main (int argc, char **argv) error=count; // Finished - comm.barrier(); + MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); return error; } diff --git a/tests/TestNetcdf.cpp b/tests/TestNetcdf.cpp index 3d0498d2..8768c9ea 100644 --- a/tests/TestNetcdf.cpp +++ b/tests/TestNetcdf.cpp @@ -116,7 +116,7 @@ int main(int argc, char **argv) PROFILE_SAVE("TestNetcdf"); // Close MPI - comm.barrier(); + MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); return N_errors; } diff --git a/tests/TestSegDist.cpp b/tests/TestSegDist.cpp index ecb6d6b9..b5e23ec8 100644 --- a/tests/TestSegDist.cpp +++ b/tests/TestSegDist.cpp @@ -100,10 +100,10 @@ int main(int argc, char **argv) comm.barrier(); if (rank==0) printf("Initialized! Converting to Signed Distance function \n"); - double t1 = Utilities::MPI::time(); + double t1 = MPI_Wtime(); DoubleArray Distance(nx,ny,nz); CalcDist(Distance,id,Dm,{false,false,false}); - double t2 = Utilities::MPI::time(); + double t2 = MPI_Wtime(); if (rank==0) printf("Total time: %f seconds \n",t2-t1); diff --git a/tests/lb2_CMT_wia.cpp b/tests/lb2_CMT_wia.cpp index 389bc8a8..820428a3 100644 --- a/tests/lb2_CMT_wia.cpp +++ b/tests/lb2_CMT_wia.cpp @@ -292,18 +292,18 @@ int main(int argc, char **argv) //................................................................................... 
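Several of the test drivers above now place an MPI_Barrier on MPI_COMM_WORLD immediately before MPI_Finalize, so that no rank tears down the MPI library while another rank is still communicating. A minimal sketch of that shutdown sequence (the test body is elided):

    #include <mpi.h>

    int main(int argc, char **argv) {
        MPI_Init(&argc, &argv);
        // ... test body ...
        MPI_Barrier(MPI_COMM_WORLD);   // let every rank reach the end before shutdown
        MPI_Finalize();
        return 0;
    }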
// Send all the distributions - req1[0] = comm.Isend(sendbuf_x,2*sendCount_x,rank_x,sendtag); - req2[0] = comm.Irecv(recvbuf_X,2*recvCount_X,rank_X,recvtag); - req1[1] = comm.Isend(sendbuf_X,2*sendCount_X,rank_X,sendtag); - req2[1] = comm.Irecv(recvbuf_x,2*recvCount_x,rank_x,recvtag); - req1[2] = comm.Isend(sendbuf_y,2*sendCount_y,rank_y,sendtag); - req2[2] = comm.Irecv(recvbuf_Y,2*recvCount_Y,rank_Y,recvtag); - req1[3] = comm.Isend(sendbuf_Y,2*sendCount_Y,rank_Y,sendtag); - req2[3] = comm.Irecv(recvbuf_y,2*recvCount_y,rank_y,recvtag); - req1[4] = comm.Isend(sendbuf_z,2*sendCount_z,rank_z,sendtag); - req2[4] = comm.Irecv(recvbuf_Z,2*recvCount_Z,rank_Z,recvtag); - req1[5] = comm.Isend(sendbuf_Z,2*sendCount_Z,rank_Z,sendtag); - req2[5] = comm.Irecv(recvbuf_z,2*recvCount_z,rank_z,recvtag); + MPI_Isend(sendbuf_x, 2*sendCount_x,MPI_DOUBLE,rank_x,sendtag,comm,&req1[0]); + MPI_Irecv(recvbuf_X, 2*recvCount_X,MPI_DOUBLE,rank_X,recvtag,comm,&req2[0]); + MPI_Isend(sendbuf_X, 2*sendCount_X,MPI_DOUBLE,rank_X,sendtag,comm,&req1[1]); + MPI_Irecv(recvbuf_x, 2*recvCount_x,MPI_DOUBLE,rank_x,recvtag,comm,&req2[1]); + MPI_Isend(sendbuf_y, 2*sendCount_y,MPI_DOUBLE,rank_y,sendtag,comm,&req1[2]); + MPI_Irecv(recvbuf_Y, 2*recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,comm,&req2[2]); + MPI_Isend(sendbuf_Y, 2*sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,comm,&req1[3]); + MPI_Irecv(recvbuf_y, 2*recvCount_y,MPI_DOUBLE,rank_y,recvtag,comm,&req2[3]); + MPI_Isend(sendbuf_z, 2*sendCount_z,MPI_DOUBLE,rank_z,sendtag,comm,&req1[4]); + MPI_Irecv(recvbuf_Z, 2*recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,comm,&req2[4]); + MPI_Isend(sendbuf_Z, 2*sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,comm,&req1[5]); + MPI_Irecv(recvbuf_z, 2*recvCount_z,MPI_DOUBLE,rank_z,recvtag,comm,&req2[5]); */ //................................................................................... ScaLBL_D3Q7_Swap(ID, &packed_even[0], &packed_odd[0], Nx, Ny, Nz); @@ -311,8 +311,8 @@ int main(int argc, char **argv) /* //................................................................................... // Wait for completion of D3Q19 communication - comm.waitAll(6,req1); - comm.waitAll(6,req2); + MPI_Waitall(6,req1,stat1); + MPI_Waitall(6,req2,stat2); //................................................................................... // Unpack the distributions on the device //................................................................................... @@ -358,7 +358,7 @@ int main(int argc, char **argv) fclose(PHASE); // Close MPI - comm.barrier(); + MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); return 0; } diff --git a/tests/lb2_Color_blob_wia_mpi.cpp b/tests/lb2_Color_blob_wia_mpi.cpp index e3323612..70342176 100644 --- a/tests/lb2_Color_blob_wia_mpi.cpp +++ b/tests/lb2_Color_blob_wia_mpi.cpp @@ -114,6 +114,7 @@ int main(int argc, char **argv) int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; if (rank == 0){ printf("********************************************************\n"); @@ -206,36 +207,36 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); //................................................. 
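The wrapper returned a request object from Isend/Irecv; the raw calls instead fill caller-owned MPI_Request arrays, and the matching MPI_Waitall needs either an MPI_Status array of the same length (the stat1/stat2 arrays declared at the top of these tests) or MPI_STATUSES_IGNORE. A minimal sketch for one send/receive pair per face direction, assuming six neighbors and equal send and receive counts per direction (the function and argument names are illustrative only):

    #include <mpi.h>

    // Hedged sketch of the nonblocking halo exchange: post all sends and receives,
    // then wait on both request arrays. nbr_send[i]/nbr_recv[i] are the outgoing
    // and opposite (incoming) neighbor ranks for direction i, as in rank_x/rank_X.
    void ExchangeFaces(MPI_Comm comm, const int nbr_send[6], const int nbr_recv[6],
                       double *sendbuf[6], double *recvbuf[6], const int count[6], int tag)
    {
        MPI_Request req1[6], req2[6];
        MPI_Status  stat1[6], stat2[6];
        for (int i = 0; i < 6; i++) {
            MPI_Isend(sendbuf[i], count[i], MPI_DOUBLE, nbr_send[i], tag, comm, &req1[i]);
            MPI_Irecv(recvbuf[i], count[i], MPI_DOUBLE, nbr_recv[i], tag, comm, &req2[i]);
        }
        MPI_Waitall(6, req1, stat1);   // MPI_STATUSES_IGNORE also works if statuses are unused
        MPI_Waitall(6, req2, stat2);
    }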
- comm.bcast(&tau,1,0); - comm.bcast(&alpha,1,0); - comm.bcast(&beta,1,0); - comm.bcast(&das,1,0); - comm.bcast(&dbs,1,0); - comm.bcast(&phi_s,1,0); - comm.bcast(&wp_saturation,1,0); - comm.bcast(&pBC,1,0); - comm.bcast(&Restart,1,0); - comm.bcast(&din,1,0); - comm.bcast(&dout,1,0); - comm.bcast(&Fx,1,0); - comm.bcast(&Fy,1,0); - comm.bcast(&Fz,1,0); - comm.bcast(×tepMax,1,0); - comm.bcast(&interval,1,0); - comm.bcast(&tol,1,0); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&alpha,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&beta,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&das,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&dbs,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&phi_s,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&wp_saturation,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm); + MPI_Bcast(&Restart,1,MPI_LOGICAL,0,comm); + MPI_Bcast(&din,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); + MPI_Bcast(×tepMax,1,MPI_INT,0,comm); + MPI_Bcast(&interval,1,MPI_INT,0,comm); + MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); // Computational domain - comm.bcast(&Nx,1,0); - comm.bcast(&Ny,1,0); - comm.bcast(&Nz,1,0); -// comm.bcast(&nBlocks,1,0); -// comm.bcast(&nthreads,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&nspheres,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&Nx,1,MPI_INT,0,comm); + MPI_Bcast(&Ny,1,MPI_INT,0,comm); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); +// MPI_Bcast(&nBlocks,1,MPI_INT,0,comm); +// MPI_Bcast(&nthreads,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&nspheres,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. comm.barrier(); @@ -398,10 +399,10 @@ int main(int argc, char **argv) if (rank == 0) ReadSpherePacking(nspheres,cx,cy,cz,rad); comm.barrier(); // Broadcast the sphere packing to all processes - comm.bcast(cx,nspheres,0); - comm.bcast(cy,nspheres,0); - comm.bcast(cz,nspheres,0); - comm.bcast(rad,nspheres,0); + MPI_Bcast(cx,nspheres,MPI_DOUBLE,0,comm); + MPI_Bcast(cy,nspheres,MPI_DOUBLE,0,comm); + MPI_Bcast(cz,nspheres,MPI_DOUBLE,0,comm); + MPI_Bcast(rad,nspheres,MPI_DOUBLE,0,comm); //........................................................................... comm.barrier(); if (rank == 0) cout << "Domain set." << endl; @@ -417,7 +418,7 @@ int main(int argc, char **argv) D = 6.0*(Nx-2)*nprocx*totVol / totArea / Lx; printf("Sauter Mean Diameter (computed from sphere packing) = %f \n ",D); } - comm.bcast(&D,1,0); + MPI_Bcast(&D,1,MPI_DOUBLE,0,comm); //....................................................................... // sprintf(LocalRankString,"%05d",rank); @@ -477,7 +478,7 @@ int main(int argc, char **argv) id[(Nz-1)*Nx*Ny] = id[(Nz-1)*Nx*Ny+Nx-1] = id[(Nz-1)*Nx*Ny+(Ny-1)*Nx] = id[(Nz-1)*Nx*Ny+(Ny-1)*Nx + Nx-1] = 0; //......................................................... 
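The flags pBC and Restart above are broadcast with MPI_LOGICAL, which is the Fortran logical datatype; from C++ a more portable choice is usually MPI_CXX_BOOL (MPI-3) or round-tripping through an int. This is only an aside on the datatype choice, not what the patch does; a hedged sketch of the alternative:

    #include <mpi.h>

    // Sketch only: a C++-native broadcast of a boolean flag from rank 0.
    // MPI_CXX_BOOL requires MPI-3; copying the flag into an int and broadcasting
    // with MPI_INT is the lowest-common-denominator alternative.
    void BroadcastFlag(MPI_Comm comm, bool &pBC) {
        MPI_Bcast(&pBC, 1, MPI_CXX_BOOL, 0, comm);
    }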
sum_local = 1.0*sum; - porosity = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&porosity,1,MPI_DOUBLE,MPI_SUM,comm); porosity = porosity*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); @@ -885,24 +886,42 @@ int main(int argc, char **argv) PackID(sendList_yZ, sendCount_yZ ,sendID_yZ, id); PackID(sendList_YZ, sendCount_YZ ,sendID_YZ, id); //...................................................................................... - comm.sendrecv(sendID_x,sendCount_x,rank_x,sendtag,recvID_X,recvCount_X,rank_X,recvtag); - comm.sendrecv(sendID_X,sendCount_X,rank_X,sendtag,recvID_x,recvCount_x,rank_x,recvtag); - comm.sendrecv(sendID_y,sendCount_y,rank_y,sendtag,recvID_Y,recvCount_Y,rank_Y,recvtag); - comm.sendrecv(sendID_Y,sendCount_Y,rank_Y,sendtag,recvID_y,recvCount_y,rank_y,recvtag); - comm.sendrecv(sendID_z,sendCount_z,rank_z,sendtag,recvID_Z,recvCount_Z,rank_Z,recvtag); - comm.sendrecv(sendID_Z,sendCount_Z,rank_Z,sendtag,recvID_z,recvCount_z,rank_z,recvtag); - comm.sendrecv(sendID_xy,sendCount_xy,rank_xy,sendtag,recvID_XY,recvCount_XY,rank_XY,recvtag); - comm.sendrecv(sendID_XY,sendCount_XY,rank_XY,sendtag,recvID_xy,recvCount_xy,rank_xy,recvtag); - comm.sendrecv(sendID_Xy,sendCount_Xy,rank_Xy,sendtag,recvID_xY,recvCount_xY,rank_xY,recvtag); - comm.sendrecv(sendID_xY,sendCount_xY,rank_xY,sendtag,recvID_Xy,recvCount_Xy,rank_Xy,recvtag); - comm.sendrecv(sendID_xz,sendCount_xz,rank_xz,sendtag,recvID_XZ,recvCount_XZ,rank_XZ,recvtag); - comm.sendrecv(sendID_XZ,sendCount_XZ,rank_XZ,sendtag,recvID_xz,recvCount_xz,rank_xz,recvtag); - comm.sendrecv(sendID_Xz,sendCount_Xz,rank_Xz,sendtag,recvID_xZ,recvCount_xZ,rank_xZ,recvtag); - comm.sendrecv(sendID_xZ,sendCount_xZ,rank_xZ,sendtag,recvID_Xz,recvCount_Xz,rank_Xz,recvtag); - comm.sendrecv(sendID_yz,sendCount_yz,rank_yz,sendtag,recvID_YZ,recvCount_YZ,rank_YZ,recvtag); - comm.sendrecv(sendID_YZ,sendCount_YZ,rank_YZ,sendtag,recvID_yz,recvCount_yz,rank_yz,recvtag); - comm.sendrecv(sendID_Yz,sendCount_Yz,rank_Yz,sendtag,recvID_yZ,recvCount_yZ,rank_yZ,recvtag); - comm.sendrecv(sendID_yZ,sendCount_yZ,rank_yZ,sendtag,recvID_Yz,recvCount_Yz,rank_Yz,recvtag); + MPI_Sendrecv(sendID_x,sendCount_x,MPI_CHAR,rank_x,sendtag, + recvID_X,recvCount_X,MPI_CHAR,rank_X,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_X,sendCount_X,MPI_CHAR,rank_X,sendtag, + recvID_x,recvCount_x,MPI_CHAR,rank_x,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_y,sendCount_y,MPI_CHAR,rank_y,sendtag, + recvID_Y,recvCount_Y,MPI_CHAR,rank_Y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Y,sendCount_Y,MPI_CHAR,rank_Y,sendtag, + recvID_y,recvCount_y,MPI_CHAR,rank_y,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_z,sendCount_z,MPI_CHAR,rank_z,sendtag, + recvID_Z,recvCount_Z,MPI_CHAR,rank_Z,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Z,sendCount_Z,MPI_CHAR,rank_Z,sendtag, + recvID_z,recvCount_z,MPI_CHAR,rank_z,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xy,sendCount_xy,MPI_CHAR,rank_xy,sendtag, + recvID_XY,recvCount_XY,MPI_CHAR,rank_XY,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_XY,sendCount_XY,MPI_CHAR,rank_XY,sendtag, + recvID_xy,recvCount_xy,MPI_CHAR,rank_xy,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Xy,sendCount_Xy,MPI_CHAR,rank_Xy,sendtag, + recvID_xY,recvCount_xY,MPI_CHAR,rank_xY,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xY,sendCount_xY,MPI_CHAR,rank_xY,sendtag, + recvID_Xy,recvCount_Xy,MPI_CHAR,rank_Xy,recvtag,comm,MPI_STATUS_IGNORE); + 
MPI_Sendrecv(sendID_xz,sendCount_xz,MPI_CHAR,rank_xz,sendtag, + recvID_XZ,recvCount_XZ,MPI_CHAR,rank_XZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_XZ,sendCount_XZ,MPI_CHAR,rank_XZ,sendtag, + recvID_xz,recvCount_xz,MPI_CHAR,rank_xz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Xz,sendCount_Xz,MPI_CHAR,rank_Xz,sendtag, + recvID_xZ,recvCount_xZ,MPI_CHAR,rank_xZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xZ,sendCount_xZ,MPI_CHAR,rank_xZ,sendtag, + recvID_Xz,recvCount_Xz,MPI_CHAR,rank_Xz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_yz,sendCount_yz,MPI_CHAR,rank_yz,sendtag, + recvID_YZ,recvCount_YZ,MPI_CHAR,rank_YZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_YZ,sendCount_YZ,MPI_CHAR,rank_YZ,sendtag, + recvID_yz,recvCount_yz,MPI_CHAR,rank_yz,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Yz,sendCount_Yz,MPI_CHAR,rank_Yz,sendtag, + recvID_yZ,recvCount_yZ,MPI_CHAR,rank_yZ,recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_yZ,sendCount_yZ,MPI_CHAR,rank_yZ,sendtag, + recvID_Yz,recvCount_Yz,MPI_CHAR,rank_Yz,recvtag,comm,MPI_STATUS_IGNORE); //...................................................................................... UnpackID(recvList_x, recvCount_x ,recvID_x, id); UnpackID(recvList_X, recvCount_X ,recvID_X, id); @@ -1361,48 +1380,48 @@ int main(int argc, char **argv) //................................................................................... // Send / Recv all the phase indcator field values //................................................................................... - req1[0] = comm.Isend(sendbuf_x, sendCount_x,rank_x,sendtag); - req2[0] = comm.Irecv(recvbuf_X, recvCount_X,rank_X,recvtag); - req1[1] = comm.Isend(sendbuf_X, sendCount_X,rank_X,sendtag); - req2[1] = comm.Irecv(recvbuf_x, recvCount_x,rank_x,recvtag); - req1[2] = comm.Isend(sendbuf_y, sendCount_y,rank_y,sendtag); - req2[2] = comm.Irecv(recvbuf_Y, recvCount_Y,rank_Y,recvtag); - req1[3] = comm.Isend(sendbuf_Y, sendCount_Y,rank_Y,sendtag); - req2[3] = comm.Irecv(recvbuf_y, recvCount_y,rank_y,recvtag); - req1[4] = comm.Isend(sendbuf_z, sendCount_z,rank_z,sendtag); - req2[4] = comm.Irecv(recvbuf_Z, recvCount_Z,rank_Z,recvtag); - req1[5] = comm.Isend(sendbuf_Z, sendCount_Z,rank_Z,sendtag); - req2[5] = comm.Irecv(recvbuf_z, recvCount_z,rank_z,recvtag); - req1[6] = comm.Isend(sendbuf_xy, sendCount_xy,rank_xy,sendtag); - req2[6] = comm.Irecv(recvbuf_XY, recvCount_XY,rank_XY,recvtag); - req1[7] = comm.Isend(sendbuf_XY, sendCount_XY,rank_XY,sendtag); - req2[7] = comm.Irecv(recvbuf_xy, recvCount_xy,rank_xy,recvtag); - req1[8] = comm.Isend(sendbuf_Xy, sendCount_Xy,rank_Xy,sendtag); - req2[8] = comm.Irecv(recvbuf_xY, recvCount_xY,rank_xY,recvtag); - req1[9] = comm.Isend(sendbuf_xY, sendCount_xY,rank_xY,sendtag); - req2[9] = comm.Irecv(recvbuf_Xy, recvCount_Xy,rank_Xy,recvtag); - req1[10] = comm.Isend(sendbuf_xz, sendCount_xz,rank_xz,sendtag); - req2[10] = comm.Irecv(recvbuf_XZ, recvCount_XZ,rank_XZ,recvtag); - req1[11] = comm.Isend(sendbuf_XZ, sendCount_XZ,rank_XZ,sendtag); - req2[11] = comm.Irecv(recvbuf_xz, recvCount_xz,rank_xz,recvtag); - req1[12] = comm.Isend(sendbuf_Xz, sendCount_Xz,rank_Xz,sendtag); - req2[12] = comm.Irecv(recvbuf_xZ, recvCount_xZ,rank_xZ,recvtag); - req1[13] = comm.Isend(sendbuf_xZ, sendCount_xZ,rank_xZ,sendtag); - req2[13] = comm.Irecv(recvbuf_Xz, recvCount_Xz,rank_Xz,recvtag); - req1[14] = comm.Isend(sendbuf_yz, sendCount_yz,rank_yz,sendtag); - req2[14] = comm.Irecv(recvbuf_YZ, recvCount_YZ,rank_YZ,recvtag); - req1[15] = 
comm.Isend(sendbuf_YZ, sendCount_YZ,rank_YZ,sendtag); - req2[15] = comm.Irecv(recvbuf_yz, recvCount_yz,rank_yz,recvtag); - req1[16] = comm.Isend(sendbuf_Yz, sendCount_Yz,rank_Yz,sendtag); - req2[16] = comm.Irecv(recvbuf_yZ, recvCount_yZ,rank_yZ,recvtag); - req1[17] = comm.Isend(sendbuf_yZ, sendCount_yZ,rank_yZ,sendtag); - req2[17] = comm.Irecv(recvbuf_Yz, recvCount_Yz,rank_Yz,recvtag); + MPI_Isend(sendbuf_x, sendCount_x,MPI_DOUBLE,rank_x,sendtag,comm,&req1[0]); + MPI_Irecv(recvbuf_X, recvCount_X,MPI_DOUBLE,rank_X,recvtag,comm,&req2[0]); + MPI_Isend(sendbuf_X, sendCount_X,MPI_DOUBLE,rank_X,sendtag,comm,&req1[1]); + MPI_Irecv(recvbuf_x, recvCount_x,MPI_DOUBLE,rank_x,recvtag,comm,&req2[1]); + MPI_Isend(sendbuf_y, sendCount_y,MPI_DOUBLE,rank_y,sendtag,comm,&req1[2]); + MPI_Irecv(recvbuf_Y, recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,comm,&req2[2]); + MPI_Isend(sendbuf_Y, sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,comm,&req1[3]); + MPI_Irecv(recvbuf_y, recvCount_y,MPI_DOUBLE,rank_y,recvtag,comm,&req2[3]); + MPI_Isend(sendbuf_z, sendCount_z,MPI_DOUBLE,rank_z,sendtag,comm,&req1[4]); + MPI_Irecv(recvbuf_Z, recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,comm,&req2[4]); + MPI_Isend(sendbuf_Z, sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,comm,&req1[5]); + MPI_Irecv(recvbuf_z, recvCount_z,MPI_DOUBLE,rank_z,recvtag,comm,&req2[5]); + MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_xy,sendtag,comm,&req1[6]); + MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,comm,&req2[6]); + MPI_Isend(sendbuf_XY, sendCount_XY,MPI_DOUBLE,rank_XY,sendtag,comm,&req1[7]); + MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,comm,&req2[7]); + MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_Xy,sendtag,comm,&req1[8]); + MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,comm,&req2[8]); + MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_xY,sendtag,comm,&req1[9]); + MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,comm,&req2[9]); + MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_xz,sendtag,comm,&req1[10]); + MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,comm,&req2[10]); + MPI_Isend(sendbuf_XZ, sendCount_XZ,MPI_DOUBLE,rank_XZ,sendtag,comm,&req1[11]); + MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,comm,&req2[11]); + MPI_Isend(sendbuf_Xz, sendCount_Xz,MPI_DOUBLE,rank_Xz,sendtag,comm,&req1[12]); + MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,comm,&req2[12]); + MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_xZ,sendtag,comm,&req1[13]); + MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,comm,&req2[13]); + MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_yz,sendtag,comm,&req1[14]); + MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,comm,&req2[14]); + MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_YZ,sendtag,comm,&req1[15]); + MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,comm,&req2[15]); + MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_Yz,sendtag,comm,&req1[16]); + MPI_Irecv(recvbuf_yZ, recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,comm,&req2[16]); + MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_yZ,sendtag,comm,&req1[17]); + MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,comm,&req2[17]); //................................................................................... //................................................................................... 
// Wait for completion of Indicator Field communication //................................................................................... - comm.waitAll(18,req1); - comm.waitAll(18,req2); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); ScaLBL_DeviceBarrier(); //................................................................................... //................................................................................... @@ -1478,7 +1497,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; comm.barrier(); - starttime = Utilities::MPI::time(); + starttime = MPI_Wtime(); //......................................... sendtag = recvtag = 5; @@ -1574,42 +1593,42 @@ int main(int argc, char **argv) //................................................................................... // Send all the distributions - req1[0] = comm.Isend(sendbuf_x, 5*sendCount_x,rank_x,sendtag); - req2[0] = comm.Irecv(recvbuf_X, 5*recvCount_X,rank_X,recvtag); - req1[1] = comm.Isend(sendbuf_X, 5*sendCount_X,rank_X,sendtag); - req2[1] = comm.Irecv(recvbuf_x, 5*recvCount_x,rank_x,recvtag); - req1[2] = comm.Isend(sendbuf_y, 5*sendCount_y,rank_y,sendtag); - req2[2] = comm.Irecv(recvbuf_Y, 5*recvCount_Y,rank_Y,recvtag); - req1[3] = comm.Isend(sendbuf_Y, 5*sendCount_Y,rank_Y,sendtag); - req2[3] = comm.Irecv(recvbuf_y, 5*recvCount_y,rank_y,recvtag); - req1[4] = comm.Isend(sendbuf_z, 5*sendCount_z,rank_z,sendtag); - req2[4] = comm.Irecv(recvbuf_Z, 5*recvCount_Z,rank_Z,recvtag); - req1[5] = comm.Isend(sendbuf_Z, 5*sendCount_Z,rank_Z,sendtag); - req2[5] = comm.Irecv(recvbuf_z, 5*recvCount_z,rank_z,recvtag); - req1[6] = comm.Isend(sendbuf_xy, sendCount_xy,rank_xy,sendtag); - req2[6] = comm.Irecv(recvbuf_XY, recvCount_XY,rank_XY,recvtag); - req1[7] = comm.Isend(sendbuf_XY, sendCount_XY,rank_XY,sendtag); - req2[7] = comm.Irecv(recvbuf_xy, recvCount_xy,rank_xy,recvtag); - req1[8] = comm.Isend(sendbuf_Xy, sendCount_Xy,rank_Xy,sendtag); - req2[8] = comm.Irecv(recvbuf_xY, recvCount_xY,rank_xY,recvtag); - req1[9] = comm.Isend(sendbuf_xY, sendCount_xY,rank_xY,sendtag); - req2[9] = comm.Irecv(recvbuf_Xy, recvCount_Xy,rank_Xy,recvtag); - req1[10] = comm.Isend(sendbuf_xz, sendCount_xz,rank_xz,sendtag); - req2[10] = comm.Irecv(recvbuf_XZ, recvCount_XZ,rank_XZ,recvtag); - req1[11] = comm.Isend(sendbuf_XZ, sendCount_XZ,rank_XZ,sendtag); - req2[11] = comm.Irecv(recvbuf_xz, recvCount_xz,rank_xz,recvtag); - req1[12] = comm.Isend(sendbuf_Xz, sendCount_Xz,rank_Xz,sendtag); - req2[12] = comm.Irecv(recvbuf_xZ, recvCount_xZ,rank_xZ,recvtag); - req1[13] = comm.Isend(sendbuf_xZ, sendCount_xZ,rank_xZ,sendtag); - req2[13] = comm.Irecv(recvbuf_Xz, recvCount_Xz,rank_Xz,recvtag); - req1[14] = comm.Isend(sendbuf_yz, sendCount_yz,rank_yz,sendtag); - req2[14] = comm.Irecv(recvbuf_YZ, recvCount_YZ,rank_YZ,recvtag); - req1[15] = comm.Isend(sendbuf_YZ, sendCount_YZ,rank_YZ,sendtag); - req2[15] = comm.Irecv(recvbuf_yz, recvCount_yz,rank_yz,recvtag); - req1[16] = comm.Isend(sendbuf_Yz, sendCount_Yz,rank_Yz,sendtag); - req2[16] = comm.Irecv(recvbuf_yZ, recvCount_yZ,rank_yZ,recvtag); - req1[17] = comm.Isend(sendbuf_yZ, sendCount_yZ,rank_yZ,sendtag); - req2[17] = comm.Irecv(recvbuf_Yz, recvCount_Yz,rank_Yz,recvtag); + MPI_Isend(sendbuf_x, 5*sendCount_x,MPI_DOUBLE,rank_x,sendtag,comm,&req1[0]); + MPI_Irecv(recvbuf_X, 5*recvCount_X,MPI_DOUBLE,rank_X,recvtag,comm,&req2[0]); + MPI_Isend(sendbuf_X, 5*sendCount_X,MPI_DOUBLE,rank_X,sendtag,comm,&req1[1]); + MPI_Irecv(recvbuf_x, 
5*recvCount_x,MPI_DOUBLE,rank_x,recvtag,comm,&req2[1]); + MPI_Isend(sendbuf_y, 5*sendCount_y,MPI_DOUBLE,rank_y,sendtag,comm,&req1[2]); + MPI_Irecv(recvbuf_Y, 5*recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,comm,&req2[2]); + MPI_Isend(sendbuf_Y, 5*sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,comm,&req1[3]); + MPI_Irecv(recvbuf_y, 5*recvCount_y,MPI_DOUBLE,rank_y,recvtag,comm,&req2[3]); + MPI_Isend(sendbuf_z, 5*sendCount_z,MPI_DOUBLE,rank_z,sendtag,comm,&req1[4]); + MPI_Irecv(recvbuf_Z, 5*recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,comm,&req2[4]); + MPI_Isend(sendbuf_Z, 5*sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,comm,&req1[5]); + MPI_Irecv(recvbuf_z, 5*recvCount_z,MPI_DOUBLE,rank_z,recvtag,comm,&req2[5]); + MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_xy,sendtag,comm,&req1[6]); + MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,comm,&req2[6]); + MPI_Isend(sendbuf_XY, sendCount_XY,MPI_DOUBLE,rank_XY,sendtag,comm,&req1[7]); + MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,comm,&req2[7]); + MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_Xy,sendtag,comm,&req1[8]); + MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,comm,&req2[8]); + MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_xY,sendtag,comm,&req1[9]); + MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,comm,&req2[9]); + MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_xz,sendtag,comm,&req1[10]); + MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,comm,&req2[10]); + MPI_Isend(sendbuf_XZ, sendCount_XZ,MPI_DOUBLE,rank_XZ,sendtag,comm,&req1[11]); + MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,comm,&req2[11]); + MPI_Isend(sendbuf_Xz, sendCount_Xz,MPI_DOUBLE,rank_Xz,sendtag,comm,&req1[12]); + MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,comm,&req2[12]); + MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_xZ,sendtag,comm,&req1[13]); + MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,comm,&req2[13]); + MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_yz,sendtag,comm,&req1[14]); + MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,comm,&req2[14]); + MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_YZ,sendtag,comm,&req1[15]); + MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,comm,&req2[15]); + MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_Yz,sendtag,comm,&req1[16]); + MPI_Irecv(recvbuf_yZ, recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,comm,&req2[16]); + MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_yZ,sendtag,comm,&req1[17]); + MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,comm,&req2[17]); //................................................................................... //************************************************************************* @@ -1629,8 +1648,8 @@ int main(int argc, char **argv) //................................................................................... // Wait for completion of D3Q19 communication - comm.waitAll(18,req1); - comm.waitAll(18,req2); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); //................................................................................... // Unpack the distributions on the device @@ -1724,18 +1743,18 @@ int main(int argc, char **argv) //................................................................................... 
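In the distribution exchange above, the face buffers are sent with 5*sendCount_* doubles while the edge buffers use sendCount_* directly. That matches the D3Q19 stencil: five of the nineteen discrete velocities leave through each face of the subdomain, and exactly one leaves through each edge, so the message holds five (or one) distribution values per boundary site. A trivial sketch of how those message sizes follow from the boundary-site counts (the helper names are hypothetical):

    // Hedged sketch: message sizes for a D3Q19 halo exchange, where sendCount_*
    // is the number of boundary sites on a given face or edge of the subdomain.
    int faceMessageSize(int sendCount_face) { return 5 * sendCount_face; }  // 5 velocities cross a face
    int edgeMessageSize(int sendCount_edge) { return 1 * sendCount_edge; }  // 1 velocity crosses an edge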
// Send all the distributions - req1[0] = comm.Isend(sendbuf_x, 2*sendCount_x,rank_x,sendtag); - req2[0] = comm.Irecv(recvbuf_X, 2*recvCount_X,rank_X,recvtag); - req1[1] = comm.Isend(sendbuf_X, 2*sendCount_X,rank_X,sendtag); - req2[1] = comm.Irecv(recvbuf_x, 2*recvCount_x,rank_x,recvtag); - req1[2] = comm.Isend(sendbuf_y, 2*sendCount_y,rank_y,sendtag); - req2[2] = comm.Irecv(recvbuf_Y, 2*recvCount_Y,rank_Y,recvtag); - req1[3] = comm.Isend(sendbuf_Y, 2*sendCount_Y,rank_Y,sendtag); - req2[3] = comm.Irecv(recvbuf_y, 2*recvCount_y,rank_y,recvtag); - req1[4] = comm.Isend(sendbuf_z, 2*sendCount_z,rank_z,sendtag); - req2[4] = comm.Irecv(recvbuf_Z, 2*recvCount_Z,rank_Z,recvtag); - req1[5] = comm.Isend(sendbuf_Z, 2*sendCount_Z,rank_Z,sendtag); - req2[5] = comm.Irecv(recvbuf_z, 2*recvCount_z,rank_z,recvtag); + MPI_Isend(sendbuf_x, 2*sendCount_x,MPI_DOUBLE,rank_x,sendtag,comm,&req1[0]); + MPI_Irecv(recvbuf_X, 2*recvCount_X,MPI_DOUBLE,rank_X,recvtag,comm,&req2[0]); + MPI_Isend(sendbuf_X, 2*sendCount_X,MPI_DOUBLE,rank_X,sendtag,comm,&req1[1]); + MPI_Irecv(recvbuf_x, 2*recvCount_x,MPI_DOUBLE,rank_x,recvtag,comm,&req2[1]); + MPI_Isend(sendbuf_y, 2*sendCount_y,MPI_DOUBLE,rank_y,sendtag,comm,&req1[2]); + MPI_Irecv(recvbuf_Y, 2*recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,comm,&req2[2]); + MPI_Isend(sendbuf_Y, 2*sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,comm,&req1[3]); + MPI_Irecv(recvbuf_y, 2*recvCount_y,MPI_DOUBLE,rank_y,recvtag,comm,&req2[3]); + MPI_Isend(sendbuf_z, 2*sendCount_z,MPI_DOUBLE,rank_z,sendtag,comm,&req1[4]); + MPI_Irecv(recvbuf_Z, 2*recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,comm,&req2[4]); + MPI_Isend(sendbuf_Z, 2*sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,comm,&req1[5]); + MPI_Irecv(recvbuf_z, 2*recvCount_z,MPI_DOUBLE,rank_z,recvtag,comm,&req2[5]); //................................................................................... ScaLBL_D3Q7_Swap(ID, A_even, A_odd, Nx, Ny, Nz); @@ -1743,8 +1762,8 @@ int main(int argc, char **argv) //................................................................................... // Wait for completion of D3Q19 communication - comm.waitAll(6,req1); - comm.waitAll(6,req2); + MPI_Waitall(6,req1,stat1); + MPI_Waitall(6,req2,stat2); //................................................................................... // Unpack the distributions on the device //................................................................................... @@ -1805,48 +1824,48 @@ int main(int argc, char **argv) //................................................................................... // Send / Recv all the phase indcator field values //................................................................................... 
- req1[0] = comm.Isend(sendbuf_x, sendCount_x,rank_x,sendtag,comm,&req1[0]); - req2[0] = comm.Irecv(recvbuf_X, recvCount_X,rank_X,recvtag,comm,&req2[0]); - req1[1] = comm.Isend(sendbuf_X, sendCount_X,rank_X,sendtag,comm,&req1[1]); - req2[1] = comm.Irecv(recvbuf_x, recvCount_x,rank_x,recvtag,comm,&req2[1]); - req1[2] = comm.Isend(sendbuf_y, sendCount_y,rank_y,sendtag,comm,&req1[2]); - req2[2] = comm.Irecv(recvbuf_Y, recvCount_Y,rank_Y,recvtag,comm,&req2[2]); - req1[3] = comm.Isend(sendbuf_Y, sendCount_Y,rank_Y,sendtag,comm,&req1[3]); - req2[3] = comm.Irecv(recvbuf_y, recvCount_y,rank_y,recvtag,comm,&req2[3]); - req1[4] = comm.Isend(sendbuf_z, sendCount_z,rank_z,sendtag,comm,&req1[4]); - req2[4] = comm.Irecv(recvbuf_Z, recvCount_Z,rank_Z,recvtag,comm,&req2[4]); - req1[5] = comm.Isend(sendbuf_Z, sendCount_Z,rank_Z,sendtag,comm,&req1[5]); - req2[5] = comm.Irecv(recvbuf_z, recvCount_z,rank_z,recvtag,comm,&req2[5]); - req1[6] = comm.Isend(sendbuf_xy, sendCount_xy,rank_xy,sendtag,comm,&req1[6]); - req2[6] = comm.Irecv(recvbuf_XY, recvCount_XY,rank_XY,recvtag,comm,&req2[6]); - req1[7] = comm.Isend(sendbuf_XY, sendCount_XY,rank_XY,sendtag,comm,&req1[7]); - req2[7] = comm.Irecv(recvbuf_xy, recvCount_xy,rank_xy,recvtag,comm,&req2[7]); - req1[8] = comm.Isend(sendbuf_Xy, sendCount_Xy,rank_Xy,sendtag,comm,&req1[8]); - req2[8] = comm.Irecv(recvbuf_xY, recvCount_xY,rank_xY,recvtag,comm,&req2[8]); - req1[9] = comm.Isend(sendbuf_xY, sendCount_xY,rank_xY,sendtag,comm,&req1[9]); - req2[9] = comm.Irecv(recvbuf_Xy, recvCount_Xy,rank_Xy,recvtag,comm,&req2[9]); - req1[10] = comm.Isend(sendbuf_xz, sendCount_xz,rank_xz,sendtag,comm,&req1[10]); - req2[10] = comm.Irecv(recvbuf_XZ, recvCount_XZ,rank_XZ,recvtag,comm,&req2[10]); - req1[11] = comm.Isend(sendbuf_XZ, sendCount_XZ,rank_XZ,sendtag,comm,&req1[11]); - req2[11] = comm.Irecv(recvbuf_xz, recvCount_xz,rank_xz,recvtag,comm,&req2[11]); - req1[12] = comm.Isend(sendbuf_Xz, sendCount_Xz,rank_Xz,sendtag,comm,&req1[12]); - req2[12] = comm.Irecv(recvbuf_xZ, recvCount_xZ,rank_xZ,recvtag,comm,&req2[12]); - req1[13] = comm.Isend(sendbuf_xZ, sendCount_xZ,rank_xZ,sendtag,comm,&req1[13]); - req2[13] = comm.Irecv(recvbuf_Xz, recvCount_Xz,rank_Xz,recvtag,comm,&req2[13]); - req1[14] = comm.Isend(sendbuf_yz, sendCount_yz,rank_yz,sendtag,comm,&req1[14]); - req2[14] = comm.Irecv(recvbuf_YZ, recvCount_YZ,rank_YZ,recvtag,comm,&req2[14]); - req1[15] = comm.Isend(sendbuf_YZ, sendCount_YZ,rank_YZ,sendtag,comm,&req1[15]); - req2[15] = comm.Irecv(recvbuf_yz, recvCount_yz,rank_yz,recvtag,comm,&req2[15]); - req1[16] = comm.Isend(sendbuf_Yz, sendCount_Yz,rank_Yz,sendtag,comm,&req1[16]); - req2[16] = comm.Irecv(recvbuf_yZ, recvCount_yZ,rank_yZ,recvtag,comm,&req2[16]); - req1[17] = comm.Isend(sendbuf_yZ, sendCount_yZ,rank_yZ,sendtag,comm,&req1[17]); - req2[17] = comm.Irecv(recvbuf_Yz, recvCount_Yz,rank_Yz,recvtag,comm,&req2[17]); + MPI_Isend(sendbuf_x, sendCount_x,MPI_DOUBLE,rank_x,sendtag,comm,&req1[0]); + MPI_Irecv(recvbuf_X, recvCount_X,MPI_DOUBLE,rank_X,recvtag,comm,&req2[0]); + MPI_Isend(sendbuf_X, sendCount_X,MPI_DOUBLE,rank_X,sendtag,comm,&req1[1]); + MPI_Irecv(recvbuf_x, recvCount_x,MPI_DOUBLE,rank_x,recvtag,comm,&req2[1]); + MPI_Isend(sendbuf_y, sendCount_y,MPI_DOUBLE,rank_y,sendtag,comm,&req1[2]); + MPI_Irecv(recvbuf_Y, recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,comm,&req2[2]); + MPI_Isend(sendbuf_Y, sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,comm,&req1[3]); + MPI_Irecv(recvbuf_y, recvCount_y,MPI_DOUBLE,rank_y,recvtag,comm,&req2[3]); + MPI_Isend(sendbuf_z, 
sendCount_z,MPI_DOUBLE,rank_z,sendtag,comm,&req1[4]); + MPI_Irecv(recvbuf_Z, recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,comm,&req2[4]); + MPI_Isend(sendbuf_Z, sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,comm,&req1[5]); + MPI_Irecv(recvbuf_z, recvCount_z,MPI_DOUBLE,rank_z,recvtag,comm,&req2[5]); + MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_xy,sendtag,comm,&req1[6]); + MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,comm,&req2[6]); + MPI_Isend(sendbuf_XY, sendCount_XY,MPI_DOUBLE,rank_XY,sendtag,comm,&req1[7]); + MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,comm,&req2[7]); + MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_Xy,sendtag,comm,&req1[8]); + MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,comm,&req2[8]); + MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_xY,sendtag,comm,&req1[9]); + MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,comm,&req2[9]); + MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_xz,sendtag,comm,&req1[10]); + MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,comm,&req2[10]); + MPI_Isend(sendbuf_XZ, sendCount_XZ,MPI_DOUBLE,rank_XZ,sendtag,comm,&req1[11]); + MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,comm,&req2[11]); + MPI_Isend(sendbuf_Xz, sendCount_Xz,MPI_DOUBLE,rank_Xz,sendtag,comm,&req1[12]); + MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,comm,&req2[12]); + MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_xZ,sendtag,comm,&req1[13]); + MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,comm,&req2[13]); + MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_yz,sendtag,comm,&req1[14]); + MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,comm,&req2[14]); + MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_YZ,sendtag,comm,&req1[15]); + MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,comm,&req2[15]); + MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_Yz,sendtag,comm,&req1[16]); + MPI_Irecv(recvbuf_yZ, recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,comm,&req2[16]); + MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_yZ,sendtag,comm,&req1[17]); + MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,comm,&req2[17]); //................................................................................... //................................................................................... // Wait for completion of Indicator Field communication //................................................................................... - comm.waitAll(18,req1); - comm.waitAll(18,req2); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); ScaLBL_DeviceBarrier(); //................................................................................... //................................................................................... @@ -2423,28 +2442,28 @@ int main(int argc, char **argv) //........................................................................... 
comm.barrier(); - nwp_volume_global = comm.sumReduce( nwp_volume ); - awn_global = comm.sumReduce( awn ); - ans_global = comm.sumReduce( ans ); - aws_global = comm.sumReduce( aws ); - lwns_global = comm.sumReduce( lwns ); - As_global = comm.sumReduce( As ); - Jwn_global = comm.sumReduce( Jwn ); - Kwn_global = comm.sumReduce( Kwn ); - efawns_global = comm.sumReduce( efawns ); + MPI_Allreduce(&nwp_volume,&nwp_volume_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&awn,&awn_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&ans,&ans_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&aws,&aws_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&lwns,&lwns_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&As,&As_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&Jwn,&Jwn_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&Kwn,&Kwn_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&efawns,&efawns_global,1,MPI_DOUBLE,MPI_SUM,comm); // Phase averages - vol_w_global = comm.sumReduce( vol_w ); - vol_n_global = comm.sumReduce( vol_n ); - paw_global = comm.sumReduce( paw ); - pan_global = comm.sumReduce( pan ); - vaw_global(0) = comm.sumReduce( vaw(0) ); - van_global(0) = comm.sumReduce( van(0) ); - vawn_global(0) = comm.sumReduce( vawn(0) ); - Gwn_global(0) = comm.sumReduce( Gwn(0) ); - Gns_global(0) = comm.sumReduce( Gns(0) ); - Gws_global(0) = comm.sumReduce( Gws(0) ); - trawn_global = comm.sumReduce( trawn ); - trJwn_global = comm.sumReduce( trJwn ); + MPI_Allreduce(&vol_w,&vol_w_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&vol_n,&vol_n_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&paw,&paw_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&pan,&pan_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&vaw(0),&vaw_global(0),3,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&van(0),&van_global(0),3,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&vawn(0),&vawn_global(0),3,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&Gwn(0),&Gwn_global(0),6,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&Gns(0),&Gns_global(0),6,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&Gws(0),&Gws_global(0),6,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&trawn,&trawn_global,1,MPI_DOUBLE,MPI_SUM,comm); + MPI_Allreduce(&trJwn,&trJwn_global,1,MPI_DOUBLE,MPI_SUM,comm); comm.barrier(); //......................................................................... // Compute the change in the total surface energy based on the defined interval @@ -2670,7 +2689,7 @@ int main(int argc, char **argv) //************************************************************************/ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = Utilities::MPI::time(); + stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/tests/lbpm_BGK_simulator.cpp b/tests/lbpm_BGK_simulator.cpp index 1ac61853..8b079900 100644 --- a/tests/lbpm_BGK_simulator.cpp +++ b/tests/lbpm_BGK_simulator.cpp @@ -97,28 +97,28 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); //................................................. 
- comm.bcast(&tau,1,0); - //comm.bcast(&pBC,1,0); - //comm.bcast(&Restart,1,0); - comm.bcast(&din,1,0); - comm.bcast(&dout,1,0); - comm.bcast(&Fx,1,0); - comm.bcast(&Fy,1,0); - comm.bcast(&Fz,1,0); - comm.bcast(×tepMax,1,0); - comm.bcast(&interval,1,0); - comm.bcast(&tol,1,0); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); + //MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm); + // MPI_Bcast(&Restart,1,MPI_LOGICAL,0,comm); + MPI_Bcast(&din,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); + MPI_Bcast(×tepMax,1,MPI_INT,0,comm); + MPI_Bcast(&interval,1,MPI_INT,0,comm); + MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); // Computational domain - comm.bcast(&Nx,1,0); - comm.bcast(&Ny,1,0); - comm.bcast(&Nz,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - //comm.bcast(&nspheres,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&Nx,1,MPI_INT,0,comm); + MPI_Bcast(&Ny,1,MPI_INT,0,comm); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + //MPI_Bcast(&nspheres,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. comm.barrier(); @@ -249,7 +249,7 @@ int main(int argc, char **argv) } } } - sum = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,comm); porosity = sum*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); @@ -331,7 +331,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; comm.barrier(); - starttime = Utilities::MPI::time(); + starttime = MPI_Wtime(); //......................................... double D32,Fo,Re,velocity,err1D,mag_force,vel_prev; @@ -410,7 +410,7 @@ int main(int argc, char **argv) //************************************************************************/ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = Utilities::MPI::time(); + stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/tests/lbpm_color_macro_simulator.cpp b/tests/lbpm_color_macro_simulator.cpp index c92b0c45..97df6812 100644 --- a/tests/lbpm_color_macro_simulator.cpp +++ b/tests/lbpm_color_macro_simulator.cpp @@ -39,6 +39,9 @@ int main(int argc, char **argv) int nprocx,nprocy,nprocz; int iproc,jproc,kproc; + MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; + if (rank == 0){ printf("********************************************************\n"); printf("Running Color LBM \n"); @@ -169,32 +172,32 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); //................................................. 
- comm.bcast(&tauA,1,0); - comm.bcast(&tauB,1,0); - comm.bcast(&rhoA,1,0); - comm.bcast(&rhoB,1,0); - comm.bcast(&alpha,1,0); - comm.bcast(&beta,1,0); - comm.bcast(&BoundaryCondition,1,0); - comm.bcast(&InitialCondition,1,0); - comm.bcast(&din,1,0); - comm.bcast(&dout,1,0); - comm.bcast(&Fx,1,0); - comm.bcast(&Fy,1,0); - comm.bcast(&Fz,1,0); - comm.bcast(×tepMax,1,0); - comm.bcast(&RESTART_INTERVAL,1,0); - comm.bcast(&tol,1,0); + MPI_Bcast(&tauA,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&tauB,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&rhoA,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&rhoB,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&alpha,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&beta,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&BoundaryCondition,1,MPI_INT,0,comm); + MPI_Bcast(&InitialCondition,1,MPI_INT,0,comm); + MPI_Bcast(&din,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); + MPI_Bcast(×tepMax,1,MPI_INT,0,comm); + MPI_Bcast(&RESTART_INTERVAL,1,MPI_INT,0,comm); + MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); // Computational domain - comm.bcast(&Nx,1,0); - comm.bcast(&Ny,1,0); - comm.bcast(&Nz,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&Nx,1,MPI_INT,0,comm); + MPI_Bcast(&Ny,1,MPI_INT,0,comm); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. flux = 0.f; @@ -319,7 +322,7 @@ int main(int argc, char **argv) timestep=0; } } - comm.bcast(×tep,1,0); + MPI_Bcast(×tep,1,MPI_INT,0,comm); FILE *RESTART = fopen(LocalRestartFile,"rb"); if (IDFILE==NULL) ERROR("lbpm_color_simulator: Error opening file: Restart.xxxxx"); readID=fread(id,1,N,RESTART); @@ -358,7 +361,7 @@ int main(int argc, char **argv) } } } - sum - comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,comm); porosity = sum*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); //......................................................... @@ -534,7 +537,7 @@ int main(int argc, char **argv) double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); comm.barrier(); - starttime = Utilities::MPI::time(); + starttime = MPI_Wtime(); //......................................... 
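The timer change here repeats in every driver below: the wrapper call Utilities::MPI::time() is replaced by the raw MPI_Wtime(), and the per-step cost is reported as elapsed wall time divided by the number of completed timesteps. A minimal sketch of that convention, assuming a communicator named comm and a loop counter timestep (illustrative only):

    double starttime, stoptime, cputime;
    MPI_Barrier(comm);                    // synchronize before starting the clock
    starttime = MPI_Wtime();              // wall-clock seconds from an arbitrary origin
    // ... run `timestep` lattice Boltzmann iterations ...
    MPI_Barrier(comm);
    stoptime = MPI_Wtime();
    cputime = (stoptime - starttime) / timestep;   // average wall time per timestep
    if (rank == 0) printf("CPU time = %f per timestep \n", cputime);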
err = 1.0; @@ -634,7 +637,7 @@ int main(int argc, char **argv) //************************************************************************ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = Utilities::MPI::time(); + stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/tests/lbpm_disc_pp.cpp b/tests/lbpm_disc_pp.cpp index 41825c7d..20d41884 100644 --- a/tests/lbpm_disc_pp.cpp +++ b/tests/lbpm_disc_pp.cpp @@ -9,7 +9,7 @@ #include "analysis/pmmc.h" #include "common/Domain.h" #include "common/Communication.h" -#include "common/MPI.h" +#include "common/MPI.h" // This includes mpi.h #include "common/SpherePack.h" /* @@ -147,6 +147,8 @@ int main(int argc, char **argv) int rank_xz,rank_XZ,rank_xZ,rank_Xz; int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** + MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; int depth; @@ -187,16 +189,16 @@ int main(int argc, char **argv) comm.barrier(); //................................................. // Computational domain - comm.bcast(&Nx,1,0); - comm.bcast(&Ny,1,0); - comm.bcast(&Nz,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&ndiscs,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&Nx,1,MPI_INT,0,comm); + MPI_Bcast(&Ny,1,MPI_INT,0,comm); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&ndiscs,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. comm.barrier(); @@ -273,9 +275,9 @@ int main(int argc, char **argv) if (rank == 0) ReadDiscPacking(ndiscs,cx,cy,rad); comm.barrier(); // Broadcast the sphere packing to all processes - comm.bcast(cx,ndiscs,0); - comm.bcast(cy,ndiscs,0); - comm.bcast(rad,ndiscs,0); + MPI_Bcast(cx,ndiscs,MPI_DOUBLE,0,comm); + MPI_Bcast(cy,ndiscs,MPI_DOUBLE,0,comm); + MPI_Bcast(rad,ndiscs,MPI_DOUBLE,0,comm); //........................................................................... comm.barrier(); if (rank == 0){ @@ -344,7 +346,7 @@ int main(int argc, char **argv) } } sum_local = 1.0*sum; - porosity = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&porosity,1,MPI_DOUBLE,MPI_SUM,comm); porosity = porosity*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); @@ -360,7 +362,7 @@ int main(int argc, char **argv) } } } - pore_vol = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); //......................................................... 
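The reductions in these hunks all translate the same way: comm.sumReduce(local) becomes an explicit MPI_Allreduce with the count, datatype, and reduction operator spelled out, so every rank receives the global value. Sketched in isolation, assuming a double-valued local contribution and an MPI_Comm named comm:

    double sum_local = 0.0, sum_global = 0.0;
    // ... accumulate the local pore-volume / porosity contribution into sum_local ...
    MPI_Allreduce(&sum_local, &sum_global, 1, MPI_DOUBLE, MPI_SUM, comm);
    // integer counters use MPI_INT in place of MPI_DOUBLE, as in the saturation counts below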
// don't perform computations at the eight corners diff --git a/tests/lbpm_inkbottle_pp.cpp b/tests/lbpm_inkbottle_pp.cpp index ca188633..669ab8c0 100644 --- a/tests/lbpm_inkbottle_pp.cpp +++ b/tests/lbpm_inkbottle_pp.cpp @@ -81,16 +81,16 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); // Computational domain - comm.bcast(&Nx,1,0); - comm.bcast(&Ny,1,0); - comm.bcast(&Nz,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&nspheres,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&Nx,1,MPI_INT,0,comm); + MPI_Bcast(&Ny,1,MPI_INT,0,comm); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&nspheres,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. comm.barrier(); @@ -197,7 +197,7 @@ int main(int argc, char **argv) } } } - pore_vol = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); //......................................................... // don't perform computations at the eight corners diff --git a/tests/lbpm_juanes_bench_disc_pp.cpp b/tests/lbpm_juanes_bench_disc_pp.cpp index a90d43f8..47d8cb84 100644 --- a/tests/lbpm_juanes_bench_disc_pp.cpp +++ b/tests/lbpm_juanes_bench_disc_pp.cpp @@ -9,7 +9,7 @@ #include "analysis/pmmc.h" #include "common/Domain.h" #include "common/Communication.h" -#include "common/MPI.h" +#include "common/MPI.h" // This includes mpi.h #include "common/SpherePack.h" /* @@ -147,6 +147,9 @@ int main(int argc, char **argv) int rank_xz,rank_XZ,rank_xZ,rank_Xz; int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** + MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; + if (rank == 0){ printf("********************************************************\n"); @@ -190,16 +193,16 @@ int main(int argc, char **argv) comm.barrier(); //................................................. // Computational domain - comm.bcast(&Nx,1,0); - comm.bcast(&Ny,1,0); - comm.bcast(&Nz,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&ndiscs,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&Nx,1,MPI_INT,0,comm); + MPI_Bcast(&Ny,1,MPI_INT,0,comm); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&ndiscs,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. comm.barrier(); @@ -289,9 +292,9 @@ int main(int argc, char **argv) if (rank == 0) ReadDiscPacking(ndiscs,cx,cy,rad); comm.barrier(); // Broadcast the sphere packing to all processes - comm.bcast(cx,ndiscs,0); - comm.bcast(cy,ndiscs,0); - comm.bcast(rad,ndiscs,0); + MPI_Bcast(cx,ndiscs,MPI_DOUBLE,0,comm); + MPI_Bcast(cy,ndiscs,MPI_DOUBLE,0,comm); + MPI_Bcast(rad,ndiscs,MPI_DOUBLE,0,comm); //........................................................................... 
comm.barrier(); /* if (rank == 0){ @@ -433,7 +436,7 @@ int main(int argc, char **argv) } } sum_local = 1.0*sum; - porosity = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&porosity,1,MPI_DOUBLE,MPI_SUM,comm); porosity = porosity*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); @@ -449,7 +452,7 @@ int main(int argc, char **argv) } } } - pore_vol = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); //......................................................... // don't perform computations at the eight corners diff --git a/tests/lbpm_nondarcy_simulator.cpp b/tests/lbpm_nondarcy_simulator.cpp index a25fef69..096dc790 100644 --- a/tests/lbpm_nondarcy_simulator.cpp +++ b/tests/lbpm_nondarcy_simulator.cpp @@ -94,6 +94,8 @@ int main(int argc, char **argv) int rank_xz,rank_XZ,rank_xZ,rank_Xz; int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** + MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; double REYNOLDS_NUMBER = 100.f; if (argc > 1){ @@ -156,28 +158,28 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); //................................................. - comm.bcast(&tau,1,0); - //comm.bcast(&pBC,1,0); - //comm.bcast(&Restart,1,0); - comm.bcast(&din,1,0); - comm.bcast(&dout,1,0); - comm.bcast(&Fx,1,0); - comm.bcast(&Fy,1,0); - comm.bcast(&Fz,1,0); - comm.bcast(×tepMax,1,0); - comm.bcast(&interval,1,0); - comm.bcast(&tol,1,0); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); + //MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm); + // MPI_Bcast(&Restart,1,MPI_LOGICAL,0,comm); + MPI_Bcast(&din,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&dout,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fy,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Fz,1,MPI_DOUBLE,0,comm); + MPI_Bcast(×tepMax,1,MPI_INT,0,comm); + MPI_Bcast(&interval,1,MPI_INT,0,comm); + MPI_Bcast(&tol,1,MPI_DOUBLE,0,comm); // Computational domain - comm.bcast(&Nx,1,0); - comm.bcast(&Ny,1,0); - comm.bcast(&Nz,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&nspheres,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&Nx,1,MPI_INT,0,comm); + MPI_Bcast(&Ny,1,MPI_INT,0,comm); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&nspheres,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. comm.barrier(); @@ -306,8 +308,8 @@ int main(int argc, char **argv) } } } - por_vol = comm.sumReduce( sum_local ); - //porosity = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); + // MPI_Allreduce(&sum_local,&porosity,1,MPI_DOUBLE,MPI_SUM,comm); porosity = pore_vol*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); //......................................................... @@ -431,7 +433,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; comm.barrier(); - starttime = Utilities::MPI::time(); + starttime = MPI_Wtime(); //......................................... 
double D32,vawx,vawy,vawz,Fo,Re,velocity,err1D,mag_force,vel_prev; @@ -552,7 +554,7 @@ int main(int argc, char **argv) fclose(NONDARCY); ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = Utilities::MPI::time(); + stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; diff --git a/tests/lbpm_nonnewtonian_simulator.cpp b/tests/lbpm_nonnewtonian_simulator.cpp index bea3a814..ff8792e7 100644 --- a/tests/lbpm_nonnewtonian_simulator.cpp +++ b/tests/lbpm_nonnewtonian_simulator.cpp @@ -124,6 +124,8 @@ int main(int argc, char **argv) // int rank_xz,rank_XZ,rank_xZ,rank_Xz; // int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** + MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; if (rank == 0){ printf("********************************************************\n"); @@ -426,8 +428,8 @@ int main(int argc, char **argv) } } - pore_vol = comm.sumReduce( sum_local ); /* 6 */ - //porosity = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); /* 6 */ + //MPI_Allreduce(&sum_local,&porosity,1,MPI_DOUBLE,MPI_SUM,comm); porosity = pore_vol*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); @@ -572,7 +574,7 @@ int main(int argc, char **argv) timestep=5; } } - comm.bcast(×tep,1,0); + MPI_Bcast(×tep,1,MPI_INT,0,comm); // Read in the restart file to CPU buffers double *cDen = new double[2*N]; @@ -660,7 +662,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; comm.barrier(); - starttime = Utilities::MPI::time(); + starttime = MPI_Wtime(); /* * Create the thread pool @@ -808,7 +810,7 @@ int main(int argc, char **argv) //************************************************************************/ ScaLBL_DeviceBarrier(); comm.barrier(); - stoptime = Utilities::MPI::time(); + stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep cputime = (stoptime - starttime)/timestep; @@ -833,6 +835,20 @@ int main(int argc, char **argv) + + + + + + + + + + + + + + // Scrap // if (rank==0){ diff --git a/tests/lbpm_plates_pp.cpp b/tests/lbpm_plates_pp.cpp index 37191979..acd64f52 100644 --- a/tests/lbpm_plates_pp.cpp +++ b/tests/lbpm_plates_pp.cpp @@ -31,6 +31,8 @@ int main(int argc, char **argv) int rank_xz,rank_XZ,rank_xZ,rank_Xz; int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** + MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; double TubeRadius =15.0; double WIDTH; @@ -75,16 +77,16 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); // Computational domain - comm.bcast(&Nx,1,0); - comm.bcast(&Ny,1,0); - comm.bcast(&Nz,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&nspheres,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&Nx,1,MPI_INT,0,comm); + MPI_Bcast(&Ny,1,MPI_INT,0,comm); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&nspheres,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. 
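The input broadcasts follow the same one-for-one mapping throughout: comm.bcast(&x,1,0) maps to MPI_Bcast(&x,1,<type>,0,comm), with MPI_INT for integer domain parameters, MPI_DOUBLE for physical lengths, and the array length as the count when whole arrays (e.g. the disc or sphere packing) are sent. A reduced sketch, assuming rank 0 has already read the input and comm is the communicator:

    MPI_Bcast(&Nx, 1, MPI_INT,    0, comm);            // scalar integer parameter
    MPI_Bcast(&Lx, 1, MPI_DOUBLE, 0, comm);            // scalar double parameter
    MPI_Bcast(cx,  nspheres, MPI_DOUBLE, 0, comm);     // full array: count = array length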
comm.barrier(); @@ -174,7 +176,7 @@ int main(int argc, char **argv) } } } - pore_vol = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); //......................................................... // don't perform computations at the eight corners diff --git a/tests/lbpm_porenetwork_pp.cpp b/tests/lbpm_porenetwork_pp.cpp index 1715811f..4a6ccda7 100644 --- a/tests/lbpm_porenetwork_pp.cpp +++ b/tests/lbpm_porenetwork_pp.cpp @@ -24,6 +24,9 @@ int main(int argc, char **argv) int iproc,jproc,kproc; int sendtag,recvtag; //***************************************** + MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; + //********************************** int nsph,ncyl, BC; nsph = atoi(argv[1]); @@ -64,16 +67,16 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); // Computational domain - comm.bcast(&Nx,1,0); - comm.bcast(&Ny,1,0); - comm.bcast(&Nz,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&nspheres,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&Nx,1,MPI_INT,0,comm); + MPI_Bcast(&Ny,1,MPI_INT,0,comm); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&nspheres,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. comm.barrier(); @@ -266,7 +269,7 @@ int main(int argc, char **argv) } } } - pore_vol = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); if (rank==0) printf("Pore volume = %f \n",pore_vol/double(Nx*Ny*Nz)); //......................................................... // don't perform computations at the eight corners diff --git a/tests/lbpm_random_pp.cpp b/tests/lbpm_random_pp.cpp index 8318f50f..ad4b83cc 100644 --- a/tests/lbpm_random_pp.cpp +++ b/tests/lbpm_random_pp.cpp @@ -98,16 +98,16 @@ int main(int argc, char **argv) } comm.barrier(); // Computational domain - comm.bcast(&nx,1,0); - comm.bcast(&ny,1,0); - comm.bcast(&nz,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&nspheres,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&nx,1,MPI_INT,0,comm); + MPI_Bcast(&ny,1,MPI_INT,0,comm); + MPI_Bcast(&nz,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&nspheres,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. 
comm.barrier(); @@ -166,7 +166,7 @@ int main(int argc, char **argv) } } // total Global is the number of nodes in the pore-space - totalGlobal = sumReduce( count ); + MPI_Allreduce(&count,&totalGlobal,1,MPI_INT,MPI_SUM,comm); float porosity=float(totalGlobal)/(nprocx*nprocy*nprocz*(nx-2)*(ny-2)*(nz-2)); if (rank==0) printf("Media Porosity: %f \n",porosity); @@ -216,12 +216,12 @@ int main(int argc, char **argv) sizeY = SizeY[bin]; sizeZ = SizeZ[bin]; } - comm.bcast(&x,1,0); - comm.bcast(&y,1,0); - comm.bcast(&z,1,0); - comm.bcast(&sizeX,1,0); - comm.bcast(&sizeY,1,0); - comm.bcast(&sizeZ,1,0); + MPI_Bcast(&x,1,MPI_INT,0,comm); + MPI_Bcast(&y,1,MPI_INT,0,comm); + MPI_Bcast(&z,1,MPI_INT,0,comm); + MPI_Bcast(&sizeX,1,MPI_INT,0,comm); + MPI_Bcast(&sizeY,1,MPI_INT,0,comm); + MPI_Bcast(&sizeZ,1,MPI_INT,0,comm); //if (rank==0) printf("Broadcast block at %i,%i,%i \n",x,y,z); @@ -269,7 +269,7 @@ int main(int argc, char **argv) } } } - countGlobal = sumReduce( count ); + MPI_Allreduce(&count,&countGlobal,1,MPI_INT,MPI_SUM,comm); sat = float(countGlobal)/totalGlobal; //if (rank==0) printf("New count=%i\n",countGlobal); //if (rank==0) printf("New saturation=%f\n",sat); @@ -345,24 +345,42 @@ int main(int argc, char **argv) PackID(Dm.sendList_yZ, Dm.sendCount_yZ ,sendID_yZ, id); PackID(Dm.sendList_YZ, Dm.sendCount_YZ ,sendID_YZ, id); //...................................................................................... - comm.sendrecv(sendID_x,Dm.sendCount_x,Dm.rank_x(),sendtag,recvID_X,Dm.recvCount_X,Dm.rank_X(),recvtag); - comm.sendrecv(sendID_X,Dm.sendCount_X,Dm.rank_X(),sendtag,recvID_x,Dm.recvCount_x,Dm.rank_x(),recvtag); - comm.sendrecv(sendID_y,Dm.sendCount_y,Dm.rank_y(),sendtag,recvID_Y,Dm.recvCount_Y,Dm.rank_Y(),recvtag); - comm.sendrecv(sendID_Y,Dm.sendCount_Y,Dm.rank_Y(),sendtag,recvID_y,Dm.recvCount_y,Dm.rank_y(),recvtag); - comm.sendrecv(sendID_z,Dm.sendCount_z,Dm.rank_z(),sendtag,recvID_Z,Dm.recvCount_Z,Dm.rank_Z(),recvtag); - comm.sendrecv(sendID_Z,Dm.sendCount_Z,Dm.rank_Z(),sendtag,recvID_z,Dm.recvCount_z,Dm.rank_z(),recvtag); - comm.sendrecv(sendID_xy,Dm.sendCount_xy,Dm.rank_xy(),sendtag,recvID_XY,Dm.recvCount_XY,Dm.rank_XY(),recvtag); - comm.sendrecv(sendID_XY,Dm.sendCount_XY,Dm.rank_XY(),sendtag,recvID_xy,Dm.recvCount_xy,Dm.rank_xy(),recvtag); - comm.sendrecv(sendID_Xy,Dm.sendCount_Xy,Dm.rank_Xy(),sendtag,recvID_xY,Dm.recvCount_xY,Dm.rank_xY(),recvtag); - comm.sendrecv(sendID_xY,Dm.sendCount_xY,Dm.rank_xY(),sendtag,recvID_Xy,Dm.recvCount_Xy,Dm.rank_Xy(),recvtag); - comm.sendrecv(sendID_xz,Dm.sendCount_xz,Dm.rank_xz(),sendtag,recvID_XZ,Dm.recvCount_XZ,Dm.rank_XZ(),recvtag); - comm.sendrecv(sendID_XZ,Dm.sendCount_XZ,Dm.rank_XZ(),sendtag,recvID_xz,Dm.recvCount_xz,Dm.rank_xz(),recvtag); - comm.sendrecv(sendID_Xz,Dm.sendCount_Xz,Dm.rank_Xz(),sendtag,recvID_xZ,Dm.recvCount_xZ,Dm.rank_xZ(),recvtag); - comm.sendrecv(sendID_xZ,Dm.sendCount_xZ,Dm.rank_xZ(),sendtag,recvID_Xz,Dm.recvCount_Xz,Dm.rank_Xz(),recvtag); - comm.sendrecv(sendID_yz,Dm.sendCount_yz,Dm.rank_yz(),sendtag,recvID_YZ,Dm.recvCount_YZ,Dm.rank_YZ(),recvtag); - comm.sendrecv(sendID_YZ,Dm.sendCount_YZ,Dm.rank_YZ(),sendtag,recvID_yz,Dm.recvCount_yz,Dm.rank_yz(),recvtag); - comm.sendrecv(sendID_Yz,Dm.sendCount_Yz,Dm.rank_Yz(),sendtag,recvID_yZ,Dm.recvCount_yZ,Dm.rank_yZ(),recvtag); - comm.sendrecv(sendID_yZ,Dm.sendCount_yZ,Dm.rank_yZ(),sendtag,recvID_Yz,Dm.recvCount_Yz,Dm.rank_Yz(),recvtag); + MPI_Sendrecv(sendID_x,Dm.sendCount_x,MPI_CHAR,Dm.rank_x(),sendtag, + 
recvID_X,Dm.recvCount_X,MPI_CHAR,Dm.rank_X(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_X,Dm.sendCount_X,MPI_CHAR,Dm.rank_X(),sendtag, + recvID_x,Dm.recvCount_x,MPI_CHAR,Dm.rank_x(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_y,Dm.sendCount_y,MPI_CHAR,Dm.rank_y(),sendtag, + recvID_Y,Dm.recvCount_Y,MPI_CHAR,Dm.rank_Y(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Y,Dm.sendCount_Y,MPI_CHAR,Dm.rank_Y(),sendtag, + recvID_y,Dm.recvCount_y,MPI_CHAR,Dm.rank_y(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_z,Dm.sendCount_z,MPI_CHAR,Dm.rank_z(),sendtag, + recvID_Z,Dm.recvCount_Z,MPI_CHAR,Dm.rank_Z(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Z,Dm.sendCount_Z,MPI_CHAR,Dm.rank_Z(),sendtag, + recvID_z,Dm.recvCount_z,MPI_CHAR,Dm.rank_z(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xy,Dm.sendCount_xy,MPI_CHAR,Dm.rank_xy(),sendtag, + recvID_XY,Dm.recvCount_XY,MPI_CHAR,Dm.rank_XY(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_XY,Dm.sendCount_XY,MPI_CHAR,Dm.rank_XY(),sendtag, + recvID_xy,Dm.recvCount_xy,MPI_CHAR,Dm.rank_xy(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Xy,Dm.sendCount_Xy,MPI_CHAR,Dm.rank_Xy(),sendtag, + recvID_xY,Dm.recvCount_xY,MPI_CHAR,Dm.rank_xY(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xY,Dm.sendCount_xY,MPI_CHAR,Dm.rank_xY(),sendtag, + recvID_Xy,Dm.recvCount_Xy,MPI_CHAR,Dm.rank_Xy(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xz,Dm.sendCount_xz,MPI_CHAR,Dm.rank_xz(),sendtag, + recvID_XZ,Dm.recvCount_XZ,MPI_CHAR,Dm.rank_XZ(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_XZ,Dm.sendCount_XZ,MPI_CHAR,Dm.rank_XZ(),sendtag, + recvID_xz,Dm.recvCount_xz,MPI_CHAR,Dm.rank_xz(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Xz,Dm.sendCount_Xz,MPI_CHAR,Dm.rank_Xz(),sendtag, + recvID_xZ,Dm.recvCount_xZ,MPI_CHAR,Dm.rank_xZ(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_xZ,Dm.sendCount_xZ,MPI_CHAR,Dm.rank_xZ(),sendtag, + recvID_Xz,Dm.recvCount_Xz,MPI_CHAR,Dm.rank_Xz(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_yz,Dm.sendCount_yz,MPI_CHAR,Dm.rank_yz(),sendtag, + recvID_YZ,Dm.recvCount_YZ,MPI_CHAR,Dm.rank_YZ(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_YZ,Dm.sendCount_YZ,MPI_CHAR,Dm.rank_YZ(),sendtag, + recvID_yz,Dm.recvCount_yz,MPI_CHAR,Dm.rank_yz(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_Yz,Dm.sendCount_Yz,MPI_CHAR,Dm.rank_Yz(),sendtag, + recvID_yZ,Dm.recvCount_yZ,MPI_CHAR,Dm.rank_yZ(),recvtag,comm,MPI_STATUS_IGNORE); + MPI_Sendrecv(sendID_yZ,Dm.sendCount_yZ,MPI_CHAR,Dm.rank_yZ(),sendtag, + recvID_Yz,Dm.recvCount_Yz,MPI_CHAR,Dm.rank_Yz(),recvtag,comm,MPI_STATUS_IGNORE); //...................................................................................... UnpackID(Dm.recvList_x, Dm.recvCount_x ,recvID_x, id); UnpackID(Dm.recvList_X, Dm.recvCount_X ,recvID_X, id); @@ -394,7 +412,7 @@ int main(int argc, char **argv) } } } - countGlobal = comm.sumReduce( count ); + MPI_Allreduce(&count,&countGlobal,1,MPI_INT,MPI_SUM,comm); sat = float(countGlobal)/totalGlobal; if (rank==0) printf("Final saturation=%f\n",sat); diff --git a/tests/lbpm_segmented_decomp.cpp b/tests/lbpm_segmented_decomp.cpp index 65b8576f..1bc89adb 100644 --- a/tests/lbpm_segmented_decomp.cpp +++ b/tests/lbpm_segmented_decomp.cpp @@ -85,23 +85,23 @@ int main(int argc, char **argv) comm.barrier(); // Computational domain //................................................. 
- comm.bcast(&nx,1,0); - comm.bcast(&ny,1,0); - comm.bcast(&nz,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&nspheres,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&nx,1,MPI_INT,0,comm); + MPI_Bcast(&ny,1,MPI_INT,0,comm); + MPI_Bcast(&nz,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&nspheres,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. - comm.bcast(&Nx,1,0); - comm.bcast(&Ny,1,0); - comm.bcast(&Nz,1,0); - comm.bcast(&xStart,1,0); - comm.bcast(&yStart,1,0); - comm.bcast(&zStart,1,0); + MPI_Bcast(&Nx,1,MPI_INT,0,comm); + MPI_Bcast(&Ny,1,MPI_INT,0,comm); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&xStart,1,MPI_INT,0,comm); + MPI_Bcast(&yStart,1,MPI_INT,0,comm); + MPI_Bcast(&zStart,1,MPI_INT,0,comm); //................................................. comm.barrier(); @@ -191,7 +191,7 @@ int main(int argc, char **argv) } else{ printf("Sending data to process %i \n", rnk); - comm.send(tmp,N,rnk,15); + MPI_Send(tmp,N,MPI_CHAR,rnk,15,comm); } } } @@ -200,7 +200,7 @@ int main(int argc, char **argv) else{ // Recieve the subdomain from rank = 0 printf("Ready to recieve data %i at process %i \n", N,rank); - comm.recv(Dm.id,N,0,15); + MPI_Recv(Dm.id,N,MPI_CHAR,0,15,comm,MPI_STATUS_IGNORE); } comm.barrier(); @@ -243,8 +243,8 @@ int main(int argc, char **argv) printf("Original label=%i, New label=%i \n",oldlabel,newlabel); } } - comm.barrier(); - comm.bcast(LabelList,2*NLABELS,0); + MPI_Barrier(MPI_COMM_WORLD); + MPI_Bcast(LabelList,2*NLABELS,MPI_INT,0,MPI_COMM_WORLD); char *newIDs; newIDs= new char [nx*ny*nz]; @@ -278,8 +278,8 @@ int main(int argc, char **argv) } } } - countGlobal = comm.sumReduce( count ); - totalGlobal = comm.sumReduce( total ); + MPI_Allreduce(&count,&countGlobal,1,MPI_INT,MPI_SUM,comm); + MPI_Allreduce(&total,&totalGlobal,1,MPI_INT,MPI_SUM,comm); float porosity = float(totalGlobal-countGlobal)/totalGlobal; @@ -321,8 +321,8 @@ int main(int argc, char **argv) } } } - countGlobal = comm.sumReduce( count ); - totalGlobal = comm.sumReduce( total ); + MPI_Allreduce(&count,&countGlobal,1,MPI_INT,MPI_SUM,comm); + MPI_Allreduce(&total,&totalGlobal,1,MPI_INT,MPI_SUM,comm); float saturation = float(countGlobal)/totalGlobal; if (rank==0) printf("wetting phase saturation=%f\n",saturation); diff --git a/tests/lbpm_segmented_pp.cpp b/tests/lbpm_segmented_pp.cpp index 484a11e2..39cf0bd1 100644 --- a/tests/lbpm_segmented_pp.cpp +++ b/tests/lbpm_segmented_pp.cpp @@ -180,7 +180,7 @@ int main(int argc, char **argv) fflush(stdout); porosity = ReadFromBlock(Dm->id,Dm->iproc(),Dm->jproc(),Dm->kproc(),nx,ny,nz); - comm.barrier(); + MPI_Barrier(MPI_COMM_WORLD); if (rank==0) printf("Writing local ID files (poros=%f) \n",porosity); fflush(stdout); FILE *ID = fopen(LocalRankFilename,"wb"); diff --git a/tests/lbpm_sphere_pp.cpp b/tests/lbpm_sphere_pp.cpp index 0df11b96..2e053eed 100644 --- a/tests/lbpm_sphere_pp.cpp +++ b/tests/lbpm_sphere_pp.cpp @@ -38,6 +38,8 @@ int main(int argc, char **argv) int rank_xz,rank_XZ,rank_xZ,rank_Xz; int rank_yz,rank_YZ,rank_yZ,rank_Yz; //********************************** + MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; if (rank == 0){ printf("********************************************************\n"); @@ -123,10 +125,10 
@@ int main(int argc, char **argv) if (rank == 0) ReadSpherePacking(nspheres,cx,cy,cz,rad); comm.barrier(); // Broadcast the sphere packing to all processes - comm.bcast(cx,nspheres,0); - comm.bcast(cy,nspheres,0); - comm.bcast(cz,nspheres,0); - comm.bcast(rad,nspheres,0); + MPI_Bcast(cx,nspheres,MPI_DOUBLE,0,comm); + MPI_Bcast(cy,nspheres,MPI_DOUBLE,0,comm); + MPI_Bcast(cz,nspheres,MPI_DOUBLE,0,comm); + MPI_Bcast(rad,nspheres,MPI_DOUBLE,0,comm); //........................................................................... comm.barrier(); if (rank == 0) cout << "Domain set." << endl; @@ -142,7 +144,7 @@ int main(int argc, char **argv) D = 6.0*(Nx-2)*nprocx*totVol / totArea / Lx; printf("Sauter Mean Diameter (computed from sphere packing) = %f \n",D); } - comm.bcast(&D,1,0); + MPI_Bcast(&D,1,MPI_DOUBLE,0,comm); //....................................................................... SignedDistance(SignDist.data(),nspheres,cx,cy,cz,rad,Lx,Ly,Lz,Nx,Ny,Nz, @@ -175,7 +177,7 @@ int main(int argc, char **argv) } } sum_local = 1.0*sum; - porosity = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&porosity,1,MPI_DOUBLE,MPI_SUM,comm); porosity = porosity*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); @@ -191,7 +193,7 @@ int main(int argc, char **argv) } } } - pore_vol = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); //......................................................... // don't perform computations at the eight corners diff --git a/tests/lbpm_squaretube_pp.cpp b/tests/lbpm_squaretube_pp.cpp index a4ee5f60..c1f05aee 100644 --- a/tests/lbpm_squaretube_pp.cpp +++ b/tests/lbpm_squaretube_pp.cpp @@ -30,6 +30,9 @@ int main(int argc, char **argv) int rank_xy,rank_XY,rank_xY,rank_Xy; int rank_xz,rank_XZ,rank_xZ,rank_Xz; int rank_yz,rank_YZ,rank_yZ,rank_Yz; + //********************************** + MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; int ORIENTATION=2; //default: the tube is aligned with Z axis //ORIENTATION = 0: tube is aligned with X axis @@ -80,16 +83,16 @@ int main(int argc, char **argv) // Broadcast simulation parameters from rank 0 to all other procs comm.barrier(); // Computational domain - comm.bcast(&Nx,1,0); - comm.bcast(&Ny,1,0); - comm.bcast(&Nz,1,0); - comm.bcast(&nprocx,1,0); - comm.bcast(&nprocy,1,0); - comm.bcast(&nprocz,1,0); - comm.bcast(&nspheres,1,0); - comm.bcast(&Lx,1,0); - comm.bcast(&Ly,1,0); - comm.bcast(&Lz,1,0); + MPI_Bcast(&Nx,1,MPI_INT,0,comm); + MPI_Bcast(&Ny,1,MPI_INT,0,comm); + MPI_Bcast(&Nz,1,MPI_INT,0,comm); + MPI_Bcast(&nprocx,1,MPI_INT,0,comm); + MPI_Bcast(&nprocy,1,MPI_INT,0,comm); + MPI_Bcast(&nprocz,1,MPI_INT,0,comm); + MPI_Bcast(&nspheres,1,MPI_INT,0,comm); + MPI_Bcast(&Lx,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); + MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. comm.barrier(); @@ -232,7 +235,7 @@ int main(int argc, char **argv) } } } - pore_vol = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&pore_vol,1,MPI_DOUBLE,MPI_SUM,comm); //......................................................... 
// don't perform computations at the eight corners From 05cafcb525c1a82a922df455eb6e207b430560c9 Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Tue, 17 Mar 2020 21:44:45 -0400 Subject: [PATCH 058/121] fix failed merge --- CMakeLists.txt | 344 +- IO/MeshDatabase.cpp | 145 +- IO/MeshDatabase.h | 4 +- IO/PIO.cpp | 12 +- IO/PackData.cpp | 105 - IO/PackData.h | 78 - IO/Writer.cpp | 20 +- IO/Writer.h | 4 +- IO/netcdf.cpp | 12 +- IO/netcdf.h | 4 +- IO/silo.cpp | 2 +- IO/silo.h | 2 +- IO/silo.hpp | 2 +- analysis/Minkowski.cpp | 16 +- analysis/Minkowski.h | 2 +- analysis/SubPhase.cpp | 154 +- analysis/SubPhase.h | 2 +- analysis/TwoPhase.cpp | 89 +- analysis/TwoPhase.h | 2 +- analysis/analysis.cpp | 83 +- analysis/analysis.h | 8 +- analysis/distance.cpp | 2 +- analysis/morphology.cpp | 106 +- analysis/runAnalysis.cpp | 49 +- analysis/runAnalysis.h | 8 +- analysis/uCT.cpp | 11 +- cmake/FindHIP.cmake | 579 ---- common/Communication.h | 198 +- common/Communication.hpp | 53 +- common/Domain.cpp | 230 +- common/Domain.h | 6 +- common/MPI.I | 1143 ------- common/MPI.cpp | 3758 --------------------- common/MPI.h | 1152 ------- common/MPI_Helpers.cpp | 266 ++ common/MPI_Helpers.h | 239 ++ IO/PackData.hpp => common/MPI_Helpers.hpp | 9 +- common/ReadMicroCT.cpp | 4 +- common/ReadMicroCT.h | 3 +- common/ScaLBL.cpp | 226 +- common/ScaLBL.h | 3 +- common/SpherePack.cpp | 1 + common/SpherePack.h | 1 + common/UnitTest.cpp | 211 +- common/UnitTest.h | 71 +- common/UtilityMacros.h | 28 +- cpu/BGK.cpp | 5 +- cpu/Color.cpp | 51 +- cpu/exe/lb2_Color_mpi.cpp | 2 +- cpu/exe/lb2_Color_wia_mpi_bubble.cpp | 2 +- models/ColorModel.cpp | 57 +- models/ColorModel.h | 6 +- models/DFHModel.cpp | 35 +- models/DFHModel.h | 6 +- models/MRTModel.cpp | 49 +- models/MRTModel.h | 6 +- tests/BlobAnalyzeParallel.cpp | 21 +- tests/BlobIdentifyParallel.cpp | 9 +- tests/ColorToBinary.cpp | 9 +- tests/ComponentLabel.cpp | 9 +- tests/GenerateSphereTest.cpp | 75 +- tests/TestBlobAnalyze.cpp | 17 +- tests/TestBlobIdentify.cpp | 37 +- tests/TestBlobIdentifyCorners.cpp | 5 +- tests/TestBubble.cpp | 46 +- tests/TestBubbleDFH.cpp | 32 +- tests/TestColorBubble.cpp | 14 +- tests/TestColorGrad.cpp | 24 +- tests/TestColorGradDFH.cpp | 18 +- tests/TestColorMassBounceback.cpp | 32 +- tests/TestColorSquareTube.cpp | 14 +- tests/TestCommD3Q19.cpp | 23 +- tests/TestDatabase.cpp | 9 +- tests/TestFluxBC.cpp | 18 +- tests/TestForceD3Q19.cpp | 7 +- tests/TestForceMoments.cpp | 30 +- tests/TestInterfaceSpeed.cpp | 32 +- tests/TestMRT.cpp | 38 +- tests/TestMap.cpp | 17 +- tests/TestMassConservationD3Q7.cpp | 11 +- tests/TestMicroCTReader.cpp | 10 +- tests/TestMomentsD3Q19.cpp | 9 +- tests/TestNetcdf.cpp | 10 +- tests/TestPoiseuille.cpp | 18 +- tests/TestPressVel.cpp | 23 +- tests/TestSegDist.cpp | 13 +- tests/TestSubphase.cpp | 9 +- tests/TestTopo3D.cpp | 9 +- tests/TestTorus.cpp | 9 +- tests/TestTorusEvolve.cpp | 9 +- tests/TestTwoPhase.cpp | 11 +- tests/TestWriter.cpp | 21 +- tests/convertIO.cpp | 15 +- tests/hello_world.cpp | 11 +- tests/lb2_CMT_wia.cpp | 2 +- tests/lb2_Color_blob_wia_mpi.cpp | 48 +- tests/lbpm_BGK_simulator.cpp | 33 +- tests/lbpm_captube_pp.cpp | 16 +- tests/lbpm_color_macro_simulator.cpp | 36 +- tests/lbpm_color_simulator.cpp | 14 +- tests/lbpm_dfh_simulator.cpp | 12 +- tests/lbpm_disc_pp.cpp | 24 +- tests/lbpm_inkbottle_pp.cpp | 20 +- tests/lbpm_juanes_bench_disc_pp.cpp | 26 +- tests/lbpm_minkowski_scalar.cpp | 23 +- tests/lbpm_morph_pp.cpp | 22 +- tests/lbpm_morphdrain_pp.cpp | 12 +- tests/lbpm_morphopen_pp.cpp | 12 +- 
tests/lbpm_nondarcy_simulator.cpp | 28 +- tests/lbpm_nonnewtonian_simulator.cpp | 83 +- tests/lbpm_nonnewtonian_simulator.h | 40 +- tests/lbpm_permeability_simulator.cpp | 13 +- tests/lbpm_plates_pp.cpp | 20 +- tests/lbpm_porenetwork_pp.cpp | 20 +- tests/lbpm_random_pp.cpp | 13 +- tests/lbpm_refine_pp.cpp | 9 +- tests/lbpm_segmented_decomp.cpp | 20 +- tests/lbpm_segmented_pp.cpp | 9 +- tests/lbpm_sphere_pp.cpp | 18 +- tests/lbpm_squaretube_pp.cpp | 20 +- tests/lbpm_uCT_maskfilter.cpp | 16 +- tests/lbpm_uCT_pp.cpp | 37 +- tests/testCommunication.cpp | 34 +- tests/test_dcel_minkowski.cpp | 8 +- tests/test_dcel_tri_normal.cpp | 4 +- 125 files changed, 2544 insertions(+), 8538 deletions(-) delete mode 100644 IO/PackData.cpp delete mode 100644 IO/PackData.h delete mode 100644 cmake/FindHIP.cmake delete mode 100644 common/MPI.I delete mode 100644 common/MPI.cpp delete mode 100644 common/MPI.h create mode 100644 common/MPI_Helpers.cpp create mode 100644 common/MPI_Helpers.h rename IO/PackData.hpp => common/MPI_Helpers.hpp (95%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1e7eeaea..acc2c2dc 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,174 +1,170 @@ -# Set some CMake properties -CMAKE_MINIMUM_REQUIRED( VERSION 3.9 ) - - -MESSAGE("====================") -MESSAGE("Configuring LBPM-WIA") -MESSAGE("====================") - - -# Set the project name -SET( PROJ LBPM ) # Set the project name for CMake -SET( LBPM_LIB lbpm-wia ) # Set the final library name -SET( LBPM_INC ) # Set an optional subfolder for includes (e.g. include/name/...) -SET( TEST_MAX_PROCS 16 ) - - -# Initialize the project -PROJECT( ${PROJ} LANGUAGES CXX ) - - -# Prevent users from building in place -IF ("${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_BINARY_DIR}" ) - MESSAGE( FATAL_ERROR "Building code in place is a bad idea" ) -ENDIF() - - -# Set the default C++ standard -SET( CMAKE_CXX_EXTENSIONS OFF ) -IF ( NOT CMAKE_CXX_STANDARD ) - IF ( CXX_STD ) - MESSAGE( FATAL_ERROR "CXX_STD is obsolete, please set CMAKE_CXX_STANDARD" ) - ENDIF() - SET( CMAKE_CXX_STANDARD 14 ) -ENDIF() -IF ( ( "${CMAKE_CXX_STANDARD}" GREATER "90" ) OR ( "${CMAKE_CXX_STANDARD}" LESS "14" ) ) - MESSAGE( FATAL_ERROR "C++14 or newer required" ) -ENDIF() - - -# Set source/install paths -SET( ${PROJ}_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ) -SET( ${PROJ}_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}" ) -IF( ${PROJ}_INSTALL_DIR ) - SET( ${PROJ}_INSTALL_DIR "${${PROJ}_INSTALL_DIR}" ) -ELSEIF( PREFIX ) - SET( ${PROJ}_INSTALL_DIR "${PREFIX}" ) -ELSEIF( NOT ${PROJ}_INSTALL_DIR ) - SET( ${PROJ}_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}" ) -ENDIF() -INCLUDE_DIRECTORIES( "${${PROJ}_INSTALL_DIR}/include" ) -SET( CMAKE_MODULE_PATH ${${PROJ}_SOURCE_DIR} ${${PROJ}_SOURCE_DIR}/cmake ) - - -# Include macros -INCLUDE( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/macros.cmake" ) -INCLUDE( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/libraries.cmake" ) -INCLUDE( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/LBPM-macros.cmake" ) - - -# Check if we are only compiling docs -CHECK_ENABLE_FLAG( ONLY_BUILD_DOCS 0 ) - - -# Set testing paramaters -SET( DROP_METHOD "http" ) -SET( DROP_SITE "" ) -SET( DROP_LOCATION "/CDash/submit.php?project=LBPM-WIA" ) -SET( TRIGGER_SITE "" ) -SET( DROP_SITE_CDASH TRUE ) -ENABLE_TESTING() -INCLUDE( CTest ) - - -# Check the compile mode and compile flags -IF ( NOT ONLY_BUILD_DOCS ) - CONFIGURE_SYSTEM() -ENDIF() - - -# Add some directories to include -INCLUDE_DIRECTORIES( "${${PROJ}_INSTALL_DIR}/include" ) - - -# Create the target for documentation -ADD_CUSTOM_TARGET( doc ) 
-ADD_CUSTOM_TARGET( latex_docs ) -CHECK_ENABLE_FLAG( USE_DOXYGEN 1 ) -CHECK_ENABLE_FLAG( USE_LATEX 1 ) -FILE( MAKE_DIRECTORY "${${PROJ}_INSTALL_DIR}/doc" ) -IF ( USE_DOXYGEN ) - SET( DOXYFILE_LATEX YES ) - SET( DOXYFILE_IN "${${PROJ}_SOURCE_DIR}/doxygen/Doxyfile.in" ) - SET( DOXY_HEADER_FILE "${${PROJ}_SOURCE_DIR}/doxygen/html/header.html" ) - SET( DOXY_FOOTER_FILE "${${PROJ}_SOURCE_DIR}/doxygen/html/footer.html" ) - SET( DOXYFILE_OUTPUT_DIR "${${PROJ}_INSTALL_DIR}/doc" ) - SET( DOXYFILE_SRC_HTML_DIR "${${PROJ}_SOURCE_DIR}/doxygen/html" ) - SET( DOXYFILE_SOURCE_DIR "${${PROJ}_SOURCE_DIR}" ) - SET( REL_PACKAGE_HTML "" ) - SET( DOXYGEN_MACROS "" ) - MESSAGE("DOXYGEN_MACROS = ${DOXYGEN_MACROS}") - INCLUDE( "${${PROJ}_SOURCE_DIR}/cmake/UseDoxygen.cmake" ) - IF ( DOXYGEN_FOUND ) - ADD_DEPENDENCIES( doxygen latex_docs ) - ADD_DEPENDENCIES( doc latex_docs doxygen ) - ELSE() - SET( USE_DOXYGEN 0 ) - ENDIF() -ENDIF() - - -# Create custom targets for build-test, check, and distclean -ADD_CUSTOM_TARGET( build-test ) -ADD_CUSTOM_TARGET( build-examples ) -ADD_CUSTOM_TARGET( check COMMAND make test ) -ADD_DISTCLEAN( analysis null_timer tests liblbpm-wia.* cpu gpu example common IO threadpool StackTrace ) - - -# Check for CUDA -CHECK_ENABLE_FLAG( USE_CUDA 0 ) -CHECK_ENABLE_FLAG( USE_HIP 0 ) -NULL_USE( CMAKE_CUDA_FLAGS ) -IF ( USE_CUDA ) - ADD_DEFINITIONS( -DUSE_CUDA ) - ENABLE_LANGUAGE( CUDA ) -ELSEIF ( USE_HIP ) - FIND_PACKAGE( HIP ) - MESSAGE( FATAL_ERROR "STOP" ) -ENDIF() - - -# Configure external packages -IF ( NOT ONLY_BUILD_DOCS ) - CONFIGURE_MPI() # MPI must be before other libraries - CONFIGURE_MIC() - CONFIGURE_NETCDF() - CONFIGURE_SILO() - CONFIGURE_LBPM() - CONFIGURE_TIMER( 0 "${${PROJ}_INSTALL_DIR}/null_timer" ) - CONFIGURE_LINE_COVERAGE() - # Set the external library link list - SET( EXTERNAL_LIBS ${EXTERNAL_LIBS} ${TIMER_LIBS} ) -ENDIF() - - - -# Macro to create 1,2,4 processor tests -MACRO( ADD_LBPM_TEST_1_2_4 EXENAME ${ARGN} ) - ADD_LBPM_TEST( ${EXENAME} ${ARGN} ) - ADD_LBPM_TEST_PARALLEL( ${EXENAME} 2 ${ARGN} ) - ADD_LBPM_TEST_PARALLEL( ${EXENAME} 4 ${ARGN} ) -ENDMACRO() - - -# Add the src directories -IF ( NOT ONLY_BUILD_DOCS ) - BEGIN_PACKAGE_CONFIG( lbpm-wia-library ) - ADD_PACKAGE_SUBDIRECTORY( common ) - ADD_PACKAGE_SUBDIRECTORY( analysis ) - ADD_PACKAGE_SUBDIRECTORY( IO ) - ADD_PACKAGE_SUBDIRECTORY( threadpool ) - ADD_PACKAGE_SUBDIRECTORY( StackTrace ) - ADD_PACKAGE_SUBDIRECTORY( models ) - IF ( USE_CUDA ) - ADD_PACKAGE_SUBDIRECTORY( gpu ) - ELSE() - ADD_PACKAGE_SUBDIRECTORY( cpu ) - ENDIF() - INSTALL_LBPM_TARGET( lbpm-wia-library ) - ADD_SUBDIRECTORY( tests ) - ADD_SUBDIRECTORY( example ) - #ADD_SUBDIRECTORY( workflows ) - INSTALL_PROJ_LIB() -ENDIF() - +# Set some CMake properties +CMAKE_MINIMUM_REQUIRED( VERSION 3.9 ) + + +MESSAGE("====================") +MESSAGE("Configuring LBPM-WIA") +MESSAGE("====================") + + +# Set the project name +SET( PROJ LBPM ) # Set the project name for CMake +SET( LBPM_LIB lbpm-wia ) # Set the final library name +SET( LBPM_INC ) # Set an optional subfolder for includes (e.g. include/name/...) 
+SET( TEST_MAX_PROCS 16 ) + + +# Initialize the project +PROJECT( ${PROJ} LANGUAGES CXX ) + + +# Prevent users from building in place +IF ("${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_BINARY_DIR}" ) + MESSAGE( FATAL_ERROR "Building code in place is a bad idea" ) +ENDIF() + + +# Set the default C++ standard +SET( CMAKE_CXX_EXTENSIONS OFF ) +IF ( NOT CMAKE_CXX_STANDARD ) + IF ( CXX_STD ) + MESSAGE( FATAL_ERROR "CXX_STD is obsolete, please set CMAKE_CXX_STANDARD" ) + ENDIF() + SET( CMAKE_CXX_STANDARD 14 ) +ENDIF() +IF ( ( "${CMAKE_CXX_STANDARD}" GREATER "90" ) OR ( "${CMAKE_CXX_STANDARD}" LESS "14" ) ) + MESSAGE( FATAL_ERROR "C++14 or newer required" ) +ENDIF() + + +# Set source/install paths +SET( ${PROJ}_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ) +SET( ${PROJ}_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}" ) +IF( ${PROJ}_INSTALL_DIR ) + SET( ${PROJ}_INSTALL_DIR "${${PROJ}_INSTALL_DIR}" ) +ELSEIF( PREFIX ) + SET( ${PROJ}_INSTALL_DIR "${PREFIX}" ) +ELSEIF( NOT ${PROJ}_INSTALL_DIR ) + SET( ${PROJ}_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}" ) +ENDIF() +INCLUDE_DIRECTORIES( "${${PROJ}_INSTALL_DIR}/include" ) +SET( CMAKE_MODULE_PATH ${${PROJ}_SOURCE_DIR} ${${PROJ}_SOURCE_DIR}/cmake ) + + +# Include macros +INCLUDE( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/macros.cmake" ) +INCLUDE( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/libraries.cmake" ) +INCLUDE( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/LBPM-macros.cmake" ) + + +# Check if we are only compiling docs +CHECK_ENABLE_FLAG( ONLY_BUILD_DOCS 0 ) + + +# Set testing paramaters +SET( DROP_METHOD "http" ) +SET( DROP_SITE "" ) +SET( DROP_LOCATION "/CDash/submit.php?project=LBPM-WIA" ) +SET( TRIGGER_SITE "" ) +SET( DROP_SITE_CDASH TRUE ) +ENABLE_TESTING() +INCLUDE( CTest ) + + +# Check the compile mode and compile flags +IF ( NOT ONLY_BUILD_DOCS ) + CONFIGURE_SYSTEM() +ENDIF() + + +# Add some directories to include +INCLUDE_DIRECTORIES( "${${PROJ}_INSTALL_DIR}/include" ) + + +# Create the target for documentation +ADD_CUSTOM_TARGET( doc ) +ADD_CUSTOM_TARGET( latex_docs ) +CHECK_ENABLE_FLAG( USE_DOXYGEN 1 ) +CHECK_ENABLE_FLAG( USE_LATEX 1 ) +FILE( MAKE_DIRECTORY "${${PROJ}_INSTALL_DIR}/doc" ) +IF ( USE_DOXYGEN ) + SET( DOXYFILE_LATEX YES ) + SET( DOXYFILE_IN "${${PROJ}_SOURCE_DIR}/doxygen/Doxyfile.in" ) + SET( DOXY_HEADER_FILE "${${PROJ}_SOURCE_DIR}/doxygen/html/header.html" ) + SET( DOXY_FOOTER_FILE "${${PROJ}_SOURCE_DIR}/doxygen/html/footer.html" ) + SET( DOXYFILE_OUTPUT_DIR "${${PROJ}_INSTALL_DIR}/doc" ) + SET( DOXYFILE_SRC_HTML_DIR "${${PROJ}_SOURCE_DIR}/doxygen/html" ) + SET( DOXYFILE_SOURCE_DIR "${${PROJ}_SOURCE_DIR}" ) + SET( REL_PACKAGE_HTML "" ) + SET( DOXYGEN_MACROS "" ) + MESSAGE("DOXYGEN_MACROS = ${DOXYGEN_MACROS}") + INCLUDE( "${${PROJ}_SOURCE_DIR}/cmake/UseDoxygen.cmake" ) + IF ( DOXYGEN_FOUND ) + ADD_DEPENDENCIES( doxygen latex_docs ) + ADD_DEPENDENCIES( doc latex_docs doxygen ) + ELSE() + SET( USE_DOXYGEN 0 ) + ENDIF() +ENDIF() + + +# Create custom targets for build-test, check, and distclean +ADD_CUSTOM_TARGET( build-test ) +ADD_CUSTOM_TARGET( build-examples ) +ADD_CUSTOM_TARGET( check COMMAND make test ) +ADD_DISTCLEAN( analysis null_timer tests liblbpm-wia.* cpu gpu example common IO threadpool StackTrace ) + + +# Check for CUDA +CHECK_ENABLE_FLAG( USE_CUDA 0 ) +NULL_USE( CMAKE_CUDA_FLAGS ) +IF ( USE_CUDA ) + ADD_DEFINITIONS( -DUSE_CUDA ) + ENABLE_LANGUAGE( CUDA ) +ENDIF() + + +# Configure external packages +IF ( NOT ONLY_BUILD_DOCS ) + CONFIGURE_MPI() # MPI must be before other libraries + CONFIGURE_MIC() + CONFIGURE_NETCDF() + CONFIGURE_SILO() + 
CONFIGURE_LBPM() + CONFIGURE_TIMER( 0 "${${PROJ}_INSTALL_DIR}/null_timer" ) + CONFIGURE_LINE_COVERAGE() + # Set the external library link list + SET( EXTERNAL_LIBS ${EXTERNAL_LIBS} ${TIMER_LIBS} ) +ENDIF() + + + +# Macro to create 1,2,4 processor tests +MACRO( ADD_LBPM_TEST_1_2_4 EXENAME ${ARGN} ) + ADD_LBPM_TEST( ${EXENAME} ${ARGN} ) + ADD_LBPM_TEST_PARALLEL( ${EXENAME} 2 ${ARGN} ) + ADD_LBPM_TEST_PARALLEL( ${EXENAME} 4 ${ARGN} ) +ENDMACRO() + + +# Add the src directories +IF ( NOT ONLY_BUILD_DOCS ) + BEGIN_PACKAGE_CONFIG( lbpm-wia-library ) + ADD_PACKAGE_SUBDIRECTORY( common ) + ADD_PACKAGE_SUBDIRECTORY( analysis ) + ADD_PACKAGE_SUBDIRECTORY( IO ) + ADD_PACKAGE_SUBDIRECTORY( threadpool ) + ADD_PACKAGE_SUBDIRECTORY( StackTrace ) + ADD_PACKAGE_SUBDIRECTORY( models ) + IF ( USE_CUDA ) + ADD_PACKAGE_SUBDIRECTORY( gpu ) + ELSE() + ADD_PACKAGE_SUBDIRECTORY( cpu ) + ENDIF() + INSTALL_LBPM_TARGET( lbpm-wia-library ) + ADD_SUBDIRECTORY( tests ) + ADD_SUBDIRECTORY( example ) + #ADD_SUBDIRECTORY( workflows ) + INSTALL_PROJ_LIB() +ENDIF() + diff --git a/IO/MeshDatabase.cpp b/IO/MeshDatabase.cpp index 2c03ddde..1fad9231 100644 --- a/IO/MeshDatabase.cpp +++ b/IO/MeshDatabase.cpp @@ -1,8 +1,7 @@ #include "IO/MeshDatabase.h" #include "IO/Mesh.h" -#include "IO/PackData.h" #include "IO/IOHelpers.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Utilities.h" #include @@ -14,6 +13,8 @@ +/**************************************************** +****************************************************/ // MeshType template<> size_t packsize( const IO::MeshType& rhs ) @@ -246,76 +247,80 @@ void DatabaseEntry::read( const std::string& line ) // Gather the mesh databases from all processors inline int tod( int N ) { return (N+7)/sizeof(double); } -std::vector gatherAll( const std::vector& meshes, const Utilities::MPI& comm ) +std::vector gatherAll( const std::vector& meshes, MPI_Comm comm ) { - if ( comm.getSize() == 1 ) - return meshes; - PROFILE_START("gatherAll"); - PROFILE_START("gatherAll-pack",2); - int size = comm.getSize(); - // First pack the mesh data to local buffers - int localsize = 0; - for (size_t i=0; i data; - pos = 0; - while ( pos < globalsize ) { - MeshDatabase tmp; - unpack(tmp,(char*)&globalbuf[pos]); - pos += tod(packsize(tmp)); - std::map::iterator it = data.find(tmp.name); - if ( it==data.end() ) { - data[tmp.name] = tmp; - } else { - for (size_t i=0; isecond.domains.push_back(tmp.domains[i]); - for (size_t i=0; isecond.variables.push_back(tmp.variables[i]); - it->second.variable_data.insert(tmp.variable_data.begin(),tmp.variable_data.end()); + #ifdef USE_MPI + PROFILE_START("gatherAll"); + PROFILE_START("gatherAll-pack",2); + int size = MPI_WORLD_SIZE(); + // First pack the mesh data to local buffers + int localsize = 0; + for (size_t i=0; i data2(it->second.variables.begin(),it->second.variables.end()); - it->second.variables = std::vector(data2.begin(),data2.end()); - } - // Free temporary memory - delete [] localbuf; - delete [] disp; - delete [] globalbuf; - // Return the results - std::vector data2(data.size()); - size_t i=0; - for (std::map::iterator it=data.begin(); it!=data.end(); ++it, ++i) - data2[i] = it->second; - PROFILE_STOP("gatherAll-unpack",2); - PROFILE_STOP("gatherAll"); - return data2; + PROFILE_STOP("gatherAll-pack",2); + // Get the number of bytes each processor will be sending/recieving + PROFILE_START("gatherAll-send1",2); + auto recvsize = new int[size]; + MPI_Allgather(&localsize,1,MPI_INT,recvsize,1,MPI_INT,comm); + int globalsize = 
recvsize[0]; + auto disp = new int[size]; + disp[0] = 0; + for (int i=1; i data; + pos = 0; + while ( pos < globalsize ) { + MeshDatabase tmp; + unpack(tmp,(char*)&globalbuf[pos]); + pos += tod(packsize(tmp)); + std::map::iterator it = data.find(tmp.name); + if ( it==data.end() ) { + data[tmp.name] = tmp; + } else { + for (size_t i=0; isecond.domains.push_back(tmp.domains[i]); + for (size_t i=0; isecond.variables.push_back(tmp.variables[i]); + it->second.variable_data.insert(tmp.variable_data.begin(),tmp.variable_data.end()); + } + } + for (std::map::iterator it=data.begin(); it!=data.end(); ++it) { + // Get the unique variables + std::set data2(it->second.variables.begin(),it->second.variables.end()); + it->second.variables = std::vector(data2.begin(),data2.end()); + } + // Free temporary memory + delete [] localbuf; + delete [] recvsize; + delete [] disp; + delete [] globalbuf; + // Return the results + std::vector data2(data.size()); + size_t i=0; + for (std::map::iterator it=data.begin(); it!=data.end(); ++it, ++i) + data2[i] = it->second; + PROFILE_STOP("gatherAll-unpack",2); + PROFILE_STOP("gatherAll"); + return data2; + #else + return meshes; + #endif } diff --git a/IO/MeshDatabase.h b/IO/MeshDatabase.h index 8e501624..9f544925 100644 --- a/IO/MeshDatabase.h +++ b/IO/MeshDatabase.h @@ -2,7 +2,7 @@ #define MeshDatabase_INC #include "IO/Mesh.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include #include @@ -70,7 +70,7 @@ public: //! Gather the mesh databases from all processors -std::vector gatherAll( const std::vector& meshes, const Utilities::MPI& comm ); +std::vector gatherAll( const std::vector& meshes, MPI_Comm comm ); //! Write the mesh databases to a file diff --git a/IO/PIO.cpp b/IO/PIO.cpp index 3c2f3934..6c6ece2d 100644 --- a/IO/PIO.cpp +++ b/IO/PIO.cpp @@ -1,6 +1,6 @@ #include "IO/PIO.h" #include "common/Utilities.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include #include @@ -36,7 +36,10 @@ static void shutdownFilestream( ) } void Utilities::logOnlyNodeZero( const std::string &filename ) { - int rank = ::Utilities::MPI( MPI_COMM_WORLD ).getRank(); + int rank = 0; + #ifdef USE_MPI + MPI_Comm_rank( MPI_COMM_WORLD, &rank ); + #endif if ( rank == 0 ) logAllNodes(filename,true); } @@ -51,7 +54,10 @@ void Utilities::logAllNodes( const std::string &filename, bool singleStream ) // Open the log stream and redirect output std::string full_filename = filename; if ( !singleStream ) { - int rank = ::Utilities::MPI( MPI_COMM_WORLD ).getRank(); + int rank = 0; + #ifdef USE_MPI + MPI_Comm_rank( MPI_COMM_WORLD, &rank ); + #endif char tmp[100]; sprintf(tmp,".%04i",rank); full_filename += std::string(tmp); diff --git a/IO/PackData.cpp b/IO/PackData.cpp deleted file mode 100644 index f10d9ca7..00000000 --- a/IO/PackData.cpp +++ /dev/null @@ -1,105 +0,0 @@ -#include "IO/PackData.h" - -#include - - -/******************************************************** -* Concrete implimentations for packing/unpacking * -********************************************************/ -// unsigned char -template<> -size_t packsize( const unsigned char& rhs ) -{ - return sizeof(unsigned char); -} -template<> -void pack( const unsigned char& rhs, char *buffer ) -{ - memcpy(buffer,&rhs,sizeof(unsigned char)); -} -template<> -void unpack( unsigned char& data, const char *buffer ) -{ - memcpy(&data,buffer,sizeof(unsigned char)); -} -// char -template<> -size_t packsize( const char& rhs ) -{ - return sizeof(char); -} -template<> -void pack( const char& rhs, char *buffer ) -{ - 
memcpy(buffer,&rhs,sizeof(char)); -} -template<> -void unpack( char& data, const char *buffer ) -{ - memcpy(&data,buffer,sizeof(char)); -} -// int -template<> -size_t packsize( const int& rhs ) -{ - return sizeof(int); -} -template<> -void pack( const int& rhs, char *buffer ) -{ - memcpy(buffer,&rhs,sizeof(int)); -} -template<> -void unpack( int& data, const char *buffer ) -{ - memcpy(&data,buffer,sizeof(int)); -} -// unsigned int -template<> -size_t packsize( const unsigned int& rhs ) -{ - return sizeof(unsigned int); -} -template<> -void pack( const unsigned int& rhs, char *buffer ) -{ - memcpy(buffer,&rhs,sizeof(int)); -} -template<> -void unpack( unsigned int& data, const char *buffer ) -{ - memcpy(&data,buffer,sizeof(int)); -} -// size_t -template<> -size_t packsize( const size_t& rhs ) -{ - return sizeof(size_t); -} -template<> -void pack( const size_t& rhs, char *buffer ) -{ - memcpy(buffer,&rhs,sizeof(size_t)); -} -template<> -void unpack( size_t& data, const char *buffer ) -{ - memcpy(&data,buffer,sizeof(size_t)); -} -// std::string -template<> -size_t packsize( const std::string& rhs ) -{ - return rhs.size()+1; -} -template<> -void pack( const std::string& rhs, char *buffer ) -{ - memcpy(buffer,rhs.c_str(),rhs.size()+1); -} -template<> -void unpack( std::string& data, const char *buffer ) -{ - data = std::string(buffer); -} - diff --git a/IO/PackData.h b/IO/PackData.h deleted file mode 100644 index 85326c0b..00000000 --- a/IO/PackData.h +++ /dev/null @@ -1,78 +0,0 @@ -// This file contains unctions to pack/unpack data structures -#ifndef included_PackData -#define included_PackData - -#include -#include -#include - - -//! Template function to return the buffer size required to pack a class -template -size_t packsize( const TYPE& rhs ); - -//! Template function to pack a class to a buffer -template -void pack( const TYPE& rhs, char *buffer ); - -//! Template function to unpack a class from a buffer -template -void unpack( TYPE& data, const char *buffer ); - - -//! Template function to return the buffer size required to pack a std::vector -template -size_t packsize( const std::vector& rhs ); - -//! Template function to pack a class to a buffer -template -void pack( const std::vector& rhs, char *buffer ); - -//! Template function to pack a class to a buffer -template -void unpack( std::vector& data, const char *buffer ); - - -//! Template function to return the buffer size required to pack a std::pair -template -size_t packsize( const std::pair& rhs ); - -//! Template function to pack a class to a buffer -template -void pack( const std::pair& rhs, char *buffer ); - -//! Template function to pack a class to a buffer -template -void unpack( std::pair& data, const char *buffer ); - - -//! Template function to return the buffer size required to pack a std::map -template -size_t packsize( const std::map& rhs ); - -//! Template function to pack a class to a buffer -template -void pack( const std::map& rhs, char *buffer ); - -//! Template function to pack a class to a buffer -template -void unpack( std::map& data, const char *buffer ); - - -//! Template function to return the buffer size required to pack a std::set -template -size_t packsize( const std::set& rhs ); - -//! Template function to pack a class to a buffer -template -void pack( const std::set& rhs, char *buffer ); - -//! 
Template function to pack a class to a buffer -template -void unpack( std::set& data, const char *buffer ); - - -#include "IO/PackData.hpp" - -#endif - diff --git a/IO/Writer.cpp b/IO/Writer.cpp index 61c333af..6581ad42 100644 --- a/IO/Writer.cpp +++ b/IO/Writer.cpp @@ -2,7 +2,7 @@ #include "IO/MeshDatabase.h" #include "IO/IOHelpers.h" #include "IO/silo.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Utilities.h" #include @@ -36,7 +36,7 @@ void IO::initialize( const std::string& path, const std::string& format, bool ap global_IO_format = Format::SILO; else ERROR("Unknown format"); - int rank = Utilities::MPI(MPI_COMM_WORLD).getRank(); + int rank = comm_rank(MPI_COMM_WORLD); if ( !append && rank==0 ) { mkdir(path.c_str(),S_IRWXU|S_IRGRP); std::string filename; @@ -55,7 +55,7 @@ void IO::initialize( const std::string& path, const std::string& format, bool ap // Write the mesh data in the original format static std::vector writeMeshesOrigFormat( const std::vector& meshData, const std::string& path ) { - int rank = Utilities::MPI(MPI_COMM_WORLD).getRank(); + int rank = MPI_WORLD_RANK(); std::vector meshes_written; for (size_t i=0; i writeMeshesOrigFormat( const std::vector& meshes_written, cons static std::vector writeMeshesNewFormat( const std::vector& meshData, const std::string& path, int format ) { - int rank = Utilities::MPI(MPI_COMM_WORLD).getRank(); + int rank = MPI_WORLD_RANK(); std::vector meshes_written; char filename[100], fullpath[200]; sprintf(filename,"%05i",rank); @@ -419,7 +419,7 @@ static std::vector writeMeshesSilo( const std::vector& meshData, const std::string& path, int format ) { #ifdef USE_SILO - int rank = Utilities::MPI(MPI_COMM_WORLD).getRank(); + int rank = MPI_WORLD_RANK(); std::vector meshes_written; char filename[100], fullpath[200]; sprintf(filename,"%05i.silo",rank); @@ -441,12 +441,12 @@ static std::vector writeMeshesSilo( /**************************************************** * Write the mesh data * ****************************************************/ -void IO::writeData( const std::string& subdir, const std::vector& meshData, const Utilities::MPI& comm ) +void IO::writeData( const std::string& subdir, const std::vector& meshData, MPI_Comm comm ) { if ( global_IO_path.empty() ) IO::initialize( ); PROFILE_START("writeData"); - int rank = Utilities::MPI(MPI_COMM_WORLD).getRank(); + int rank = comm_rank(comm); // Check the meshData before writing for ( const auto& data : meshData ) { if ( !data.check() ) @@ -457,7 +457,7 @@ void IO::writeData( const std::string& subdir, const std::vector meshes_written; if ( global_IO_format == Format::OLD ) { diff --git a/IO/Writer.h b/IO/Writer.h index dfc22db8..710fa0d8 100644 --- a/IO/Writer.h +++ b/IO/Writer.h @@ -34,7 +34,7 @@ void initialize( const std::string& path="", const std::string& format="silo", b * @param[in] meshData The data to write * @param[in] comm The comm to use for writing (usually MPI_COMM_WORLD or a dup thereof) */ -void writeData( const std::string& subdir, const std::vector& meshData, const Utilities::MPI& comm ); +void writeData( const std::string& subdir, const std::vector& meshData, MPI_Comm comm ); /*! 
@@ -44,7 +44,7 @@ void writeData( const std::string& subdir, const std::vector * @param[in] meshData The data to write * @param[in] comm The comm to use for writing (usually MPI_COMM_WORLD or a dup thereof) */ -inline void writeData( int timestep, const std::vector& meshData, const Utilities::MPI& comm ) +inline void writeData( int timestep, const std::vector& meshData, MPI_Comm comm ) { char subdir[100]; sprintf(subdir,"vis%03i",timestep); diff --git a/IO/netcdf.cpp b/IO/netcdf.cpp index e061579a..b36bb6d6 100644 --- a/IO/netcdf.cpp +++ b/IO/netcdf.cpp @@ -1,6 +1,6 @@ #include "IO/netcdf.h" #include "common/Utilities.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "ProfilerApp.h" @@ -116,7 +116,7 @@ std::string VariableTypeName( VariableType type ) /**************************************************** * Open/close a file * ****************************************************/ -int open( const std::string& filename, FileMode mode, const Utilities::MPI& comm ) +int open( const std::string& filename, FileMode mode, MPI_Comm comm ) { int fid = 0; if ( comm == MPI_COMM_NULL ) { @@ -134,13 +134,13 @@ int open( const std::string& filename, FileMode mode, const Utilities::MPI& comm } } else { if ( mode == READ ) { - int err = nc_open_par( filename.c_str(), NC_MPIPOSIX, comm.getCommunicator(), MPI_INFO_NULL, &fid ); + int err = nc_open_par( filename.c_str(), NC_MPIPOSIX, comm, MPI_INFO_NULL, &fid ); CHECK_NC_ERR( err ); } else if ( mode == WRITE ) { - int err = nc_open_par( filename.c_str(), NC_WRITE|NC_MPIPOSIX, comm.getCommunicator(), MPI_INFO_NULL, &fid ); + int err = nc_open_par( filename.c_str(), NC_WRITE|NC_MPIPOSIX, comm, MPI_INFO_NULL, &fid ); CHECK_NC_ERR( err ); } else if ( mode == CREATE ) { - int err = nc_create_par( filename.c_str(), NC_NETCDF4|NC_MPIIO, comm.getCommunicator(), MPI_INFO_NULL, &fid ); + int err = nc_create_par( filename.c_str(), NC_NETCDF4|NC_MPIIO, comm, MPI_INFO_NULL, &fid ); CHECK_NC_ERR( err ); } else { ERROR("Unknown file mode"); @@ -375,7 +375,7 @@ Array getVar( int fid, const std::string& var, const std::vector& sta std::vector var_size = getVarDim( fid, var ); for (int d=0; d<(int)var_size.size(); d++) { if ( start[d]<0 || start[d]+stride[d]*(count[d]-1)>(int)var_size[d] ) { - int rank = Utilities::MPI(MPI_COMM_WORLD).getRank(); + int rank = comm_rank(MPI_COMM_WORLD); char tmp[1000]; sprintf(tmp,"%i: Range exceeded array dimension:\n" " start[%i]=%i, count[%i]=%i, stride[%i]=%i, var_size[%i]=%i", diff --git a/IO/netcdf.h b/IO/netcdf.h index b4559e51..657747bf 100644 --- a/IO/netcdf.h +++ b/IO/netcdf.h @@ -5,7 +5,7 @@ #include #include "common/Array.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Communication.h" @@ -32,7 +32,7 @@ std::string VariableTypeName( VariableType type ); * @param mode Open the file for reading or writing * @param comm MPI communicator to use (MPI_COMM_WORLD: don't use parallel netcdf) */ -int open( const std::string& filename, FileMode mode, const Utilities::MPI& comm=MPI_COMM_NULL ); +int open( const std::string& filename, FileMode mode, MPI_Comm comm=MPI_COMM_NULL ); /*! 
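The Writer and netcdf changes above replace the Utilities::MPI wrapper with comm_rank()/MPI_WORLD_RANK() from common/MPI_Helpers.h. That header is not part of this hunk; a minimal sketch of what such helpers are assumed to look like:

    // Thin MPI helpers in the spirit of common/MPI_Helpers.h (assumed; the real
    // header is not shown in this patch).
    #include <mpi.h>

    inline int comm_rank(MPI_Comm comm) {
        int rank = 0;
        MPI_Comm_rank(comm, &rank);
        return rank;
    }
    inline int comm_size(MPI_Comm comm) {
        int size = 1;
        MPI_Comm_size(comm, &size);
        return size;
    }
    // Convenience used by the writer and netcdf code paths above.
    #define MPI_WORLD_RANK() comm_rank(MPI_COMM_WORLD)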
diff --git a/IO/silo.cpp b/IO/silo.cpp index ddf3646a..eece8583 100644 --- a/IO/silo.cpp +++ b/IO/silo.cpp @@ -1,6 +1,6 @@ #include "IO/silo.h" #include "common/Utilities.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "ProfilerApp.h" diff --git a/IO/silo.h b/IO/silo.h index 339a5c34..4c7081e5 100644 --- a/IO/silo.h +++ b/IO/silo.h @@ -6,7 +6,7 @@ #include #include "common/Array.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Communication.h" diff --git a/IO/silo.hpp b/IO/silo.hpp index 35852004..312f32d8 100644 --- a/IO/silo.hpp +++ b/IO/silo.hpp @@ -3,7 +3,7 @@ #include "IO/silo.h" #include "common/Utilities.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "ProfilerApp.h" diff --git a/analysis/Minkowski.cpp b/analysis/Minkowski.cpp index 3e3fb35e..faac6142 100644 --- a/analysis/Minkowski.cpp +++ b/analysis/Minkowski.cpp @@ -4,7 +4,7 @@ #include "common/Domain.h" #include "common/Communication.h" #include "common/Utilities.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "IO/MeshDatabase.h" #include "IO/Reader.h" #include "IO/Writer.h" @@ -109,13 +109,13 @@ void Minkowski::ComputeScalar(const DoubleArray& Field, const double isovalue) // convert X for 2D manifold to 3D object Xi *= 0.5; - Dm->Comm.barrier(); + MPI_Barrier(Dm->Comm); // Phase averages - Vi_global = Dm->Comm.sumReduce( Vi ); - Xi_global = Dm->Comm.sumReduce( Xi ); - Ai_global = Dm->Comm.sumReduce( Ai ); - Ji_global = Dm->Comm.sumReduce( Ji ); - Dm->Comm.barrier(); + MPI_Allreduce(&Vi,&Vi_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&Xi,&Xi_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&Ai,&Ai_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&Ji,&Ji_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Barrier(Dm->Comm); PROFILE_STOP("ComputeScalar"); } @@ -168,7 +168,7 @@ int Minkowski::MeasureConnectedPathway(){ double vF=0.0; n_connected_components = ComputeGlobalBlobIDs(Nx-2,Ny-2,Nz-2,Dm->rank_info,distance,distance,vF,vF,label,Dm->Comm); // int n_connected_components = ComputeGlobalPhaseComponent(Nx-2,Ny-2,Nz-2,Dm->rank_info,const IntArray &PhaseID, int &VALUE, BlobIDArray &GlobalBlobID, Dm->Comm ) - Dm->Comm.barrier(); + MPI_Barrier(Dm->Comm); for (int k=0; kComm.sumReduce( wb.V); - gnb.V = Dm->Comm.sumReduce( nb.V); - gwb.M = Dm->Comm.sumReduce( wb.M); - gnb.M = Dm->Comm.sumReduce( nb.M); - gwb.Px = Dm->Comm.sumReduce( wb.Px); - gwb.Py = Dm->Comm.sumReduce( wb.Py); - gwb.Pz = Dm->Comm.sumReduce( wb.Pz); - gnb.Px = Dm->Comm.sumReduce( nb.Px); - gnb.Py = Dm->Comm.sumReduce( nb.Py); - gnb.Pz = Dm->Comm.sumReduce( nb.Pz); + gwb.V=sumReduce( Dm->Comm, wb.V); + gnb.V=sumReduce( Dm->Comm, nb.V); + gwb.M=sumReduce( Dm->Comm, wb.M); + gnb.M=sumReduce( Dm->Comm, nb.M); + gwb.Px=sumReduce( Dm->Comm, wb.Px); + gwb.Py=sumReduce( Dm->Comm, wb.Py); + gwb.Pz=sumReduce( Dm->Comm, wb.Pz); + gnb.Px=sumReduce( Dm->Comm, nb.Px); + gnb.Py=sumReduce( Dm->Comm, nb.Py); + gnb.Pz=sumReduce( Dm->Comm, nb.Pz); - count_w = Dm->Comm.sumReduce( count_w); - count_n = Dm->Comm.sumReduce( count_n); + count_w=sumReduce( Dm->Comm, count_w); + count_n=sumReduce( Dm->Comm, count_n); if (count_w > 0.0) - gwb.p = Dm->Comm.sumReduce(wb.p) / count_w; + gwb.p=sumReduce( Dm->Comm, wb.p) / count_w; else gwb.p = 0.0; if (count_n > 0.0) - gnb.p = Dm->Comm.sumReduce( nb.p) / count_n; + gnb.p=sumReduce( Dm->Comm, nb.p) / count_n; else gnb.p = 0.0; @@ -444,14 +444,14 @@ void SubPhase::Full(){ nd.X -= nc.X; // compute global entities - gnc.V = 
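The Minkowski and SubPhase hunks here interchange two equivalent spellings of the same reduction: the Dm->Comm.sumReduce()/maxReduce() member calls and explicit MPI_Allreduce calls (or the free-function form sumReduce(Dm->Comm, x)). Both amount to thin wrappers like the following sketch:

    // Equivalence illustrated by the hunks above: a sumReduce-style helper is
    // just an MPI_Allreduce over a single double (sketch, not part of the patch).
    #include <mpi.h>

    inline double sumReduce(MPI_Comm comm, double x) {
        double result = 0.0;
        MPI_Allreduce(&x, &result, 1, MPI_DOUBLE, MPI_SUM, comm);
        return result;
    }
    inline double maxReduce(MPI_Comm comm, double x) {
        double result = 0.0;
        MPI_Allreduce(&x, &result, 1, MPI_DOUBLE, MPI_MAX, comm);
        return result;
    }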
Dm->Comm.sumReduce( nc.V ); - gnc.A = Dm->Comm.sumReduce( nc.A ); - gnc.H = Dm->Comm.sumReduce( nc.H ); - gnc.X = Dm->Comm.sumReduce( nc.X ); - gnd.V = Dm->Comm.sumReduce( nd.V ); - gnd.A = Dm->Comm.sumReduce( nd.A ); - gnd.H = Dm->Comm.sumReduce( nd.H ); - gnd.X = Dm->Comm.sumReduce( nd.X ); + gnc.V=sumReduce( Dm->Comm, nc.V); + gnc.A=sumReduce( Dm->Comm, nc.A); + gnc.H=sumReduce( Dm->Comm, nc.H); + gnc.X=sumReduce( Dm->Comm, nc.X); + gnd.V=sumReduce( Dm->Comm, nd.V); + gnd.A=sumReduce( Dm->Comm, nd.A); + gnd.H=sumReduce( Dm->Comm, nd.H); + gnd.X=sumReduce( Dm->Comm, nd.X); gnd.Nc = nd.Nc; // wetting for (k=0; kComm.sumReduce( wc.V ); - gwc.A = Dm->Comm.sumReduce( wc.A ); - gwc.H = Dm->Comm.sumReduce( wc.H ); - gwc.X = Dm->Comm.sumReduce( wc.X ); - gwd.V = Dm->Comm.sumReduce( wd.V ); - gwd.A = Dm->Comm.sumReduce( wd.A ); - gwd.H = Dm->Comm.sumReduce( wd.H ); - gwd.X = Dm->Comm.sumReduce( wd.X ); + gwc.V=sumReduce( Dm->Comm, wc.V); + gwc.A=sumReduce( Dm->Comm, wc.A); + gwc.H=sumReduce( Dm->Comm, wc.H); + gwc.X=sumReduce( Dm->Comm, wc.X); + gwd.V=sumReduce( Dm->Comm, wd.V); + gwd.A=sumReduce( Dm->Comm, wd.A); + gwd.H=sumReduce( Dm->Comm, wd.H); + gwd.X=sumReduce( Dm->Comm, wd.X); gwd.Nc = wd.Nc; /* Set up geometric analysis of interface region */ @@ -526,20 +526,20 @@ void SubPhase::Full(){ iwn.A = morph_i->A(); iwn.H = morph_i->H(); iwn.X = morph_i->X(); - giwn.V = Dm->Comm.sumReduce( iwn.V ); - giwn.A = Dm->Comm.sumReduce( iwn.A ); - giwn.H = Dm->Comm.sumReduce( iwn.H ); - giwn.X = Dm->Comm.sumReduce( iwn.X ); + giwn.V=sumReduce( Dm->Comm, iwn.V); + giwn.A=sumReduce( Dm->Comm, iwn.A); + giwn.H=sumReduce( Dm->Comm, iwn.H); + giwn.X=sumReduce( Dm->Comm, iwn.X); // measure only the connected part iwnc.Nc = morph_i->MeasureConnectedPathway(); iwnc.V = morph_i->V(); iwnc.A = morph_i->A(); iwnc.H = morph_i->H(); iwnc.X = morph_i->X(); - giwnc.V = Dm->Comm.sumReduce( iwnc.V ); - giwnc.A = Dm->Comm.sumReduce( iwnc.A ); - giwnc.H = Dm->Comm.sumReduce( iwnc.H ); - giwnc.X = Dm->Comm.sumReduce( iwnc.X ); + giwnc.V=sumReduce( Dm->Comm, iwnc.V); + giwnc.A=sumReduce( Dm->Comm, iwnc.A); + giwnc.H=sumReduce( Dm->Comm, iwnc.H); + giwnc.X=sumReduce( Dm->Comm, iwnc.X); giwnc.Nc = iwnc.Nc; double vol_nc_bulk = 0.0; @@ -630,46 +630,46 @@ void SubPhase::Full(){ } } - gnd.M = Dm->Comm.sumReduce( nd.M ); - gnd.Px = Dm->Comm.sumReduce( nd.Px ); - gnd.Py = Dm->Comm.sumReduce( nd.Py ); - gnd.Pz = Dm->Comm.sumReduce( nd.Pz ); - gnd.K = Dm->Comm.sumReduce( nd.K ); + gnd.M=sumReduce( Dm->Comm, nd.M); + gnd.Px=sumReduce( Dm->Comm, nd.Px); + gnd.Py=sumReduce( Dm->Comm, nd.Py); + gnd.Pz=sumReduce( Dm->Comm, nd.Pz); + gnd.K=sumReduce( Dm->Comm, nd.K); - gwd.M = Dm->Comm.sumReduce( wd.M ); - gwd.Px = Dm->Comm.sumReduce( wd.Px ); - gwd.Py = Dm->Comm.sumReduce( wd.Py ); - gwd.Pz = Dm->Comm.sumReduce( wd.Pz ); - gwd.K = Dm->Comm.sumReduce( wd.K ); + gwd.M=sumReduce( Dm->Comm, wd.M); + gwd.Px=sumReduce( Dm->Comm, wd.Px); + gwd.Py=sumReduce( Dm->Comm, wd.Py); + gwd.Pz=sumReduce( Dm->Comm, wd.Pz); + gwd.K=sumReduce( Dm->Comm, wd.K); - gnc.M = Dm->Comm.sumReduce( nc.M ); - gnc.Px = Dm->Comm.sumReduce( nc.Px ); - gnc.Py = Dm->Comm.sumReduce( nc.Py ); - gnc.Pz = Dm->Comm.sumReduce( nc.Pz ); - gnc.K = Dm->Comm.sumReduce( nc.K ); + gnc.M=sumReduce( Dm->Comm, nc.M); + gnc.Px=sumReduce( Dm->Comm, nc.Px); + gnc.Py=sumReduce( Dm->Comm, nc.Py); + gnc.Pz=sumReduce( Dm->Comm, nc.Pz); + gnc.K=sumReduce( Dm->Comm, nc.K); - gwc.M = Dm->Comm.sumReduce( wc.M ); - gwc.Px = Dm->Comm.sumReduce( wc.Px ); - gwc.Py = Dm->Comm.sumReduce( wc.Py ); - 
gwc.Pz = Dm->Comm.sumReduce( wc.Pz ); - gwc.K = Dm->Comm.sumReduce( wc.K ); + gwc.M=sumReduce( Dm->Comm, wc.M); + gwc.Px=sumReduce( Dm->Comm, wc.Px); + gwc.Py=sumReduce( Dm->Comm, wc.Py); + gwc.Pz=sumReduce( Dm->Comm, wc.Pz); + gwc.K=sumReduce( Dm->Comm, wc.K); - giwn.Mn = Dm->Comm.sumReduce( iwn.Mn ); - giwn.Pnx = Dm->Comm.sumReduce( iwn.Pnx ); - giwn.Pny = Dm->Comm.sumReduce( iwn.Pny ); - giwn.Pnz = Dm->Comm.sumReduce( iwn.Pnz ); - giwn.Kn = Dm->Comm.sumReduce( iwn.Kn ); - giwn.Mw = Dm->Comm.sumReduce( iwn.Mw ); - giwn.Pwx = Dm->Comm.sumReduce( iwn.Pwx ); - giwn.Pwy = Dm->Comm.sumReduce( iwn.Pwy ); - giwn.Pwz = Dm->Comm.sumReduce( iwn.Pwz ); - giwn.Kw = Dm->Comm.sumReduce( iwn.Kw ); + giwn.Mn=sumReduce( Dm->Comm, iwn.Mn); + giwn.Pnx=sumReduce( Dm->Comm, iwn.Pnx); + giwn.Pny=sumReduce( Dm->Comm, iwn.Pny); + giwn.Pnz=sumReduce( Dm->Comm, iwn.Pnz); + giwn.Kn=sumReduce( Dm->Comm, iwn.Kn); + giwn.Mw=sumReduce( Dm->Comm, iwn.Mw); + giwn.Pwx=sumReduce( Dm->Comm, iwn.Pwx); + giwn.Pwy=sumReduce( Dm->Comm, iwn.Pwy); + giwn.Pwz=sumReduce( Dm->Comm, iwn.Pwz); + giwn.Kw=sumReduce( Dm->Comm, iwn.Kw); // pressure averaging - gnc.p = Dm->Comm.sumReduce( nc.p ); - gnd.p = Dm->Comm.sumReduce( nd.p ); - gwc.p = Dm->Comm.sumReduce( wc.p ); - gwd.p = Dm->Comm.sumReduce( wd.p ); + gnc.p=sumReduce( Dm->Comm, nc.p); + gnd.p=sumReduce( Dm->Comm, nd.p); + gwc.p=sumReduce( Dm->Comm, wc.p); + gwd.p=sumReduce( Dm->Comm, wd.p); if (vol_wc_bulk > 0.0) wc.p = wc.p /vol_wc_bulk; @@ -680,10 +680,10 @@ void SubPhase::Full(){ if (vol_nd_bulk > 0.0) nd.p = nd.p /vol_nd_bulk; - vol_wc_bulk = Dm->Comm.sumReduce( vol_wc_bulk ); - vol_wd_bulk = Dm->Comm.sumReduce( vol_wd_bulk ); - vol_nc_bulk = Dm->Comm.sumReduce( vol_nc_bulk ); - vol_nd_bulk = Dm->Comm.sumReduce( vol_nd_bulk ); + vol_wc_bulk=sumReduce( Dm->Comm, vol_wc_bulk); + vol_wd_bulk=sumReduce( Dm->Comm, vol_wd_bulk); + vol_nc_bulk=sumReduce( Dm->Comm, vol_nc_bulk); + vol_nd_bulk=sumReduce( Dm->Comm, vol_nd_bulk); if (vol_wc_bulk > 0.0) gwc.p = gwc.p /vol_wc_bulk; @@ -719,7 +719,7 @@ void SubPhase::AggregateLabels( const std::string& filename ) } } } - Dm->Comm.barrier(); + MPI_Barrier(Dm->Comm); Dm->AggregateLabels( filename ); diff --git a/analysis/SubPhase.h b/analysis/SubPhase.h index 691c654f..71b87ef0 100644 --- a/analysis/SubPhase.h +++ b/analysis/SubPhase.h @@ -12,7 +12,7 @@ #include "analysis/distance.h" #include "analysis/Minkowski.h" #include "common/Utilities.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "IO/MeshDatabase.h" #include "IO/Reader.h" #include "IO/Writer.h" diff --git a/analysis/TwoPhase.cpp b/analysis/TwoPhase.cpp index 812490e7..9b2e5fd8 100644 --- a/analysis/TwoPhase.cpp +++ b/analysis/TwoPhase.cpp @@ -5,7 +5,7 @@ #include "common/Domain.h" #include "common/Communication.h" #include "common/Utilities.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "IO/MeshDatabase.h" #include "IO/Reader.h" #include "IO/Writer.h" @@ -882,7 +882,7 @@ void TwoPhase::ComponentAverages() } } - Dm->Comm.barrier(); + MPI_Barrier(Dm->Comm); if (Dm->rank()==0){ printf("Component averages computed locally -- reducing result... 
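SubPhase::Full() above issues one reduction per scalar. An alternative pattern, not used by this patch, is to pack related scalars into a small array and reduce them in a single call, trading a few copies for fewer latency-bound collectives (the variable names in the usage comment are hypothetical):

    // Alternative pattern (not what the patch does): batch several scalars into
    // one MPI_Allreduce instead of issuing one collective per value.
    #include <mpi.h>

    inline void sumReduceBatch(MPI_Comm comm, const double *local, double *global, int n) {
        MPI_Allreduce(local, global, n, MPI_DOUBLE, MPI_SUM, comm);
    }

    // Usage sketch: pack {V, A, H, X} for one phase and reduce them once.
    //   double local[4]  = {nc_V, nc_A, nc_H, nc_X};   // hypothetical names
    //   double global[4] = {0.0, 0.0, 0.0, 0.0};
    //   sumReduceBatch(comm, local, global, 4);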
\n"); } @@ -895,8 +895,8 @@ void TwoPhase::ComponentAverages() for (int idx=0; idxComm.barrier(); - Dm->Comm.sumReduce(ComponentAverages_NWP.data(),RecvBuffer.data(),BLOB_AVG_COUNT*NumberComponents_NWP); + MPI_Barrier(Dm->Comm); + MPI_Allreduce(ComponentAverages_NWP.data(),RecvBuffer.data(),BLOB_AVG_COUNT*NumberComponents_NWP, MPI_DOUBLE,MPI_SUM,Dm->Comm); // MPI_Reduce(ComponentAverages_NWP.data(),RecvBuffer.data(),BLOB_AVG_COUNT,MPI_DOUBLE,MPI_SUM,0,Dm->Comm); if (Dm->rank()==0){ @@ -993,9 +993,9 @@ void TwoPhase::ComponentAverages() // reduce the wetting phase averages for (int b=0; bComm.barrier(); + MPI_Barrier(Dm->Comm); // MPI_Allreduce(&ComponentAverages_WP(0,b),RecvBuffer.data(),BLOB_AVG_COUNT,MPI_DOUBLE,MPI_SUM,Dm->Comm); - Dm->Comm.sumReduce(&ComponentAverages_WP(0,b),RecvBuffer.data(),BLOB_AVG_COUNT); + MPI_Reduce(&ComponentAverages_WP(0,b),RecvBuffer.data(),BLOB_AVG_COUNT,MPI_DOUBLE,MPI_SUM,0,Dm->Comm); for (int idx=0; idxComm.barrier(); - nwp_volume_global = Dm->Comm.sumReduce( nwp_volume ); - wp_volume_global = Dm->Comm.sumReduce( wp_volume ); - awn_global = Dm->Comm.sumReduce( awn ); - ans_global = Dm->Comm.sumReduce( ans ); - aws_global = Dm->Comm.sumReduce( aws ); - lwns_global = Dm->Comm.sumReduce( lwns ); - As_global = Dm->Comm.sumReduce( As ); - Jwn_global = Dm->Comm.sumReduce( Jwn ); - Kwn_global = Dm->Comm.sumReduce( Kwn ); - KGwns_global = Dm->Comm.sumReduce( KGwns ); - KNwns_global = Dm->Comm.sumReduce( KNwns ); - efawns_global = Dm->Comm.sumReduce( efawns ); - wwndnw_global = Dm->Comm.sumReduce( wwndnw ); - wwnsdnwn_global = Dm->Comm.sumReduce( wwnsdnwn ); - Jwnwwndnw_global = Dm->Comm.sumReduce( Jwnwwndnw ); + MPI_Barrier(Dm->Comm); + MPI_Allreduce(&nwp_volume,&nwp_volume_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&wp_volume,&wp_volume_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&awn,&awn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&ans,&ans_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&aws,&aws_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&lwns,&lwns_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&As,&As_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&Jwn,&Jwn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&Kwn,&Kwn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&KGwns,&KGwns_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&KNwns,&KNwns_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&efawns,&efawns_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&wwndnw,&wwndnw_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&wwnsdnwn,&wwnsdnwn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&Jwnwwndnw,&Jwnwwndnw_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); // Phase averages - vol_w_global = Dm->Comm.sumReduce( vol_w ); - vol_n_global = Dm->Comm.sumReduce( vol_n ); - paw_global = Dm->Comm.sumReduce( paw ); - pan_global = Dm->Comm.sumReduce( pan ); - for (int idx=0; idx<3; idx++) - vaw_global(idx) = Dm->Comm.sumReduce( vaw(idx) ); - for (int idx=0; idx<3; idx++) - van_global(idx) = Dm->Comm.sumReduce( van(idx)); - for (int idx=0; idx<3; idx++) - vawn_global(idx) = Dm->Comm.sumReduce( vawn(idx) ); - for (int idx=0; idx<3; idx++) - vawns_global(idx) = Dm->Comm.sumReduce( vawns(idx) ); - for (int idx=0; idx<6; idx++){ - Gwn_global(idx) = Dm->Comm.sumReduce( Gwn(idx) ); - Gns_global(idx) = Dm->Comm.sumReduce( Gns(idx) ); - Gws_global(idx) = Dm->Comm.sumReduce( Gws(idx) ); - } - trawn_global = Dm->Comm.sumReduce( trawn ); - trJwn_global = Dm->Comm.sumReduce( trJwn ); - 
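In ComponentAverages() above, the non-wetting-phase table is combined with MPI_Allreduce (every rank receives the sum) while the wetting-phase table now uses MPI_Reduce to rank 0 only. A standalone sketch of that distinction:

    // Allreduce vs Reduce, as used for the component-average tables above.
    #include <mpi.h>
    #include <vector>

    void reduceTables(MPI_Comm comm, std::vector<double> &nwp, std::vector<double> &wp) {
        // NWP table: the summed result is valid on every rank.
        std::vector<double> nwp_sum(nwp.size(), 0.0);
        MPI_Allreduce(nwp.data(), nwp_sum.data(), (int)nwp.size(), MPI_DOUBLE, MPI_SUM, comm);
        nwp = nwp_sum;
        // WP table: the summed result lands on rank 0 only.
        std::vector<double> wp_sum(wp.size(), 0.0);
        MPI_Reduce(wp.data(), wp_sum.data(), (int)wp.size(), MPI_DOUBLE, MPI_SUM, 0, comm);
        int rank = 0;
        MPI_Comm_rank(comm, &rank);
        if (rank == 0) wp = wp_sum;
    }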
trRwn_global = Dm->Comm.sumReduce( trRwn ); - euler_global = Dm->Comm.sumReduce( euler ); - An_global = Dm->Comm.sumReduce( An ); - Jn_global = Dm->Comm.sumReduce( Jn ); - Kn_global = Dm->Comm.sumReduce( Kn ); - Dm->Comm.barrier(); + MPI_Allreduce(&vol_w,&vol_w_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&vol_n,&vol_n_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&paw,&paw_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&pan,&pan_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&vaw(0),&vaw_global(0),3,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&van(0),&van_global(0),3,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&vawn(0),&vawn_global(0),3,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&vawns(0),&vawns_global(0),3,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&Gwn(0),&Gwn_global(0),6,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&Gns(0),&Gns_global(0),6,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&Gws(0),&Gws_global(0),6,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&trawn,&trawn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&trJwn,&trJwn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&trRwn,&trRwn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&euler,&euler_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&An,&An_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&Jn,&Jn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&Kn,&Kn_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + + MPI_Barrier(Dm->Comm); // Normalize the phase averages // (density of both components = 1.0) diff --git a/analysis/TwoPhase.h b/analysis/TwoPhase.h index 4d500a89..fddd04e8 100644 --- a/analysis/TwoPhase.h +++ b/analysis/TwoPhase.h @@ -12,7 +12,7 @@ #include "common/Domain.h" #include "common/Communication.h" #include "common/Utilities.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "IO/MeshDatabase.h" #include "IO/Reader.h" #include "IO/Writer.h" diff --git a/analysis/analysis.cpp b/analysis/analysis.cpp index 4298750e..7587f3c5 100644 --- a/analysis/analysis.cpp +++ b/analysis/analysis.cpp @@ -188,7 +188,7 @@ int ComputeLocalPhaseComponent(const IntArray &PhaseID, int &VALUE, BlobIDArray /****************************************************************** * Reorder the global blob ids * ******************************************************************/ -static int ReorderBlobIDs2( BlobIDArray& ID, int N_blobs, int ngx, int ngy, int ngz, const Utilities::MPI& comm ) +static int ReorderBlobIDs2( BlobIDArray& ID, int N_blobs, int ngx, int ngy, int ngz, MPI_Comm comm ) { if ( N_blobs==0 ) return 0; @@ -212,7 +212,7 @@ static int ReorderBlobIDs2( BlobIDArray& ID, int N_blobs, int ngx, int ngy, int } } ASSERT(max_id > map1(N_blobs); int N_blobs2 = 0; for (int i=0; i& N_recv, int64_t *send_buf, std::vector& recv_buf, std::map& remote_map, - const Utilities::MPI& comm ) + MPI_Comm comm ) { std::vector send_req(neighbors.size()); std::vector recv_req(neighbors.size()); - auto it = map.begin(); + std::vector status(neighbors.size()); + std::map::const_iterator it = map.begin(); ASSERT(N_send==(int)map.size()); for (size_t i=0; ifirst; send_buf[2*i+1] = it->second.new_id; } for (size_t i=0; ifirst] = it->second.new_id; } for (size_t i=0; i& remote_map, @@ -303,18 +304,18 @@ static bool updateLocalIds( const std::map& remote_map, return changed; } static int LocalToGlobalIDs( int nx, int ny, int nz, const RankInfoStruct& rank_info, - int nblobs, BlobIDArray& IDs, const Utilities::MPI& comm ) + int nblobs, BlobIDArray& IDs, MPI_Comm 
comm ) { PROFILE_START("LocalToGlobalIDs",1); const int rank = rank_info.rank[1][1][1]; - int nprocs = comm.getSize(); + int nprocs = comm_size(comm); const int ngx = (IDs.size(0)-nx)/2; const int ngy = (IDs.size(1)-ny)/2; const int ngz = (IDs.size(2)-nz)/2; // Get the number of blobs for each rank std::vector N_blobs(nprocs,0); PROFILE_START("LocalToGlobalIDs-Allgather",1); - comm.allGather(nblobs,getPtr(N_blobs)); + MPI_Allgather(&nblobs,1,MPI_INT,getPtr(N_blobs),1,MPI_INT,comm); PROFILE_STOP("LocalToGlobalIDs-Allgather",1); int64_t N_blobs_tot = 0; int offset = 0; @@ -362,12 +363,13 @@ static int LocalToGlobalIDs( int nx, int ny, int nz, const RankInfoStruct& rank_ std::vector N_recv(neighbors.size(),0); std::vector send_req(neighbors.size()); std::vector recv_req(neighbors.size()); + std::vector status(neighbors.size()); for (size_t i=0; i recv_buf(neighbors.size()); @@ -396,7 +398,8 @@ static int LocalToGlobalIDs( int nx, int ny, int nz, const RankInfoStruct& rank_ bool changed = updateLocalIds( remote_map, map ); // Check if we are finished int test = changed ? 1:0; - int result = comm.sumReduce( test ); + int result = 0; + MPI_Allreduce(&test,&result,1,MPI_INT,MPI_SUM,comm); if ( result==0 ) break; } @@ -432,7 +435,7 @@ static int LocalToGlobalIDs( int nx, int ny, int nz, const RankInfoStruct& rank_ } int ComputeGlobalBlobIDs( int nx, int ny, int nz, const RankInfoStruct& rank_info, const DoubleArray& Phase, const DoubleArray& SignDist, double vF, double vS, - BlobIDArray& GlobalBlobID, const Utilities::MPI& comm ) + BlobIDArray& GlobalBlobID, MPI_Comm comm ) { PROFILE_START("ComputeGlobalBlobIDs"); // First compute the local ids @@ -443,7 +446,7 @@ int ComputeGlobalBlobIDs( int nx, int ny, int nz, const RankInfoStruct& rank_inf return nglobal; } int ComputeGlobalPhaseComponent( int nx, int ny, int nz, const RankInfoStruct& rank_info, - const IntArray &PhaseID, int &VALUE, BlobIDArray &GlobalBlobID, const Utilities::MPI& comm ) + const IntArray &PhaseID, int &VALUE, BlobIDArray &GlobalBlobID, MPI_Comm comm ) { PROFILE_START("ComputeGlobalPhaseComponent"); // First compute the local ids @@ -459,27 +462,37 @@ int ComputeGlobalPhaseComponent( int nx, int ny, int nz, const RankInfoStruct& r * Compute the mapping of blob ids between timesteps * ******************************************************************/ typedef std::map > map_type; +template inline MPI_Datatype getMPIType(); +template<> inline MPI_Datatype getMPIType() { return MPI_INT; } +template<> inline MPI_Datatype getMPIType() { + if ( sizeof(int64_t)==sizeof(long int) ) + return MPI_LONG; + else if ( sizeof(int64_t)==sizeof(double) ) + return MPI_DOUBLE; +} template -void gatherSet( std::set& set, const Utilities::MPI& comm ) +void gatherSet( std::set& set, MPI_Comm comm ) { - int nprocs = comm.getSize(); + int nprocs = comm_size(comm); + MPI_Datatype type = getMPIType(); std::vector send_data(set.begin(),set.end()); int send_count = send_data.size(); std::vector recv_count(nprocs,0), recv_disp(nprocs,0); - comm.allGather( send_count, getPtr(recv_count) ); + MPI_Allgather(&send_count,1,MPI_INT,getPtr(recv_count),1,MPI_INT,comm); for (int i=1; i recv_data(recv_disp[nprocs-1]+recv_count[nprocs-1]); - comm.allGather( getPtr(send_data), send_count, getPtr(recv_data), - getPtr(recv_count), getPtr(recv_disp), true ); + MPI_Allgatherv(getPtr(send_data),send_count,type, + getPtr(recv_data),getPtr(recv_count),getPtr(recv_disp),type,comm); for (size_t i=0; i(); std::vector send_data; - for (auto it=src_map.begin(); 
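gatherSet() above collects each rank's set with an MPI_Allgather of the counts followed by an MPI_Allgatherv of the payloads. The same pattern, written out for int as a standalone sketch:

    // Allgather/Allgatherv pattern used by gatherSet: gather per-rank counts,
    // then gather the variable-length payloads and take the union.
    #include <mpi.h>
    #include <set>
    #include <vector>

    void gatherSetInt(std::set<int> &s, MPI_Comm comm) {
        int nprocs = 1;
        MPI_Comm_size(comm, &nprocs);
        std::vector<int> send(s.begin(), s.end());
        int send_count = (int)send.size();
        std::vector<int> recv_count(nprocs, 0), recv_disp(nprocs, 0);
        MPI_Allgather(&send_count, 1, MPI_INT, recv_count.data(), 1, MPI_INT, comm);
        for (int i = 1; i < nprocs; i++)
            recv_disp[i] = recv_disp[i - 1] + recv_count[i - 1];
        std::vector<int> recv(recv_disp[nprocs - 1] + recv_count[nprocs - 1]);
        MPI_Allgatherv(send.data(), send_count, MPI_INT,
                       recv.data(), recv_count.data(), recv_disp.data(), MPI_INT, comm);
        for (int x : recv)
            s.insert(x);   // union of all ranks' sets
    }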
it!=src_map.end(); ++it) { + for (map_type::const_iterator it=src_map.begin(); it!=src_map.end(); ++it) { int id = it->first; const std::map& src_ids = it->second; send_data.push_back(id); @@ -492,21 +505,21 @@ void gatherSrcIDMap( map_type& src_map, const Utilities::MPI& comm ) } int send_count = send_data.size(); std::vector recv_count(nprocs,0), recv_disp(nprocs,0); - comm.allGather(send_count,getPtr(recv_count)); + MPI_Allgather(&send_count,1,MPI_INT,getPtr(recv_count),1,MPI_INT,comm); for (int i=1; i recv_data(recv_disp[nprocs-1]+recv_count[nprocs-1]); - comm.allGather(getPtr(send_data),send_count, - getPtr(recv_data),getPtr(recv_count),getPtr(recv_disp),true); + MPI_Allgatherv(getPtr(send_data),send_count,type, + getPtr(recv_data),getPtr(recv_count),getPtr(recv_disp),type,comm); size_t i=0; src_map.clear(); while ( i < recv_data.size() ) { BlobIDType id = recv_data[i]; size_t count = recv_data[i+1]; i += 2; - auto& src_ids = src_map[id]; + std::map& src_ids = src_map[id]; for (size_t j=0; j::iterator it = src_ids.find(recv_data[i]); if ( it == src_ids.end() ) src_ids.insert(std::pair(recv_data[i],recv_data[i+1])); else @@ -525,7 +538,7 @@ void addSrcDstIDs( BlobIDType src_id, map_type& src_map, map_type& dst_map, } } ID_map_struct computeIDMap( int nx, int ny, int nz, - const BlobIDArray& ID1, const BlobIDArray& ID2, const Utilities::MPI& comm ) + const BlobIDArray& ID1, const BlobIDArray& ID2, MPI_Comm comm ) { ASSERT(ID1.size()==ID2.size()); PROFILE_START("computeIDMap"); @@ -767,7 +780,7 @@ void renumberIDs( const std::vector& new_ids, BlobIDArray& IDs ) ******************************************************************/ void writeIDMap( const ID_map_struct& map, long long int timestep, const std::string& filename ) { - int rank = Utilities::MPI( MPI_COMM_WORLD ).getRank(); + int rank = MPI_WORLD_RANK(); if ( rank!=0 ) return; bool empty = map.created.empty() && map.destroyed.empty() && diff --git a/analysis/analysis.h b/analysis/analysis.h index ec377995..2ce531b1 100644 --- a/analysis/analysis.h +++ b/analysis/analysis.h @@ -58,7 +58,7 @@ int ComputeLocalPhaseComponent( const IntArray &PhaseID, int &VALUE, IntArray &C */ int ComputeGlobalBlobIDs( int nx, int ny, int nz, const RankInfoStruct& rank_info, const DoubleArray& Phase, const DoubleArray& SignDist, double vF, double vS, - BlobIDArray& GlobalBlobID, const Utilities::MPI& comm ); + BlobIDArray& GlobalBlobID, MPI_Comm comm ); /*! @@ -75,7 +75,7 @@ int ComputeGlobalBlobIDs( int nx, int ny, int nz, const RankInfoStruct& rank_inf * @return Return the number of components in the specified phase */ int ComputeGlobalPhaseComponent( int nx, int ny, int nz, const RankInfoStruct& rank_info, - const IntArray &PhaseID, int &VALUE, BlobIDArray &GlobalBlobID, const Utilities::MPI& comm ); + const IntArray &PhaseID, int &VALUE, BlobIDArray &GlobalBlobID, MPI_Comm comm ); /*! 
@@ -87,7 +87,7 @@ int ComputeGlobalPhaseComponent( int nx, int ny, int nz, const RankInfoStruct& r * @param[in] nz Number of elements in the z-direction * @param[in/out] ID The ids of the blobs */ -void ReorderBlobIDs( BlobIDArray& ID, const Utilities::MPI& comm ); +void ReorderBlobIDs( BlobIDArray& ID, MPI_Comm comm ); typedef std::pair > BlobIDSplitStruct; @@ -120,7 +120,7 @@ struct ID_map_struct { * @param[in] ID1 The blob ids at the first timestep * @param[in] ID2 The blob ids at the second timestep */ -ID_map_struct computeIDMap( int nx, int ny, int nz, const BlobIDArray& ID1, const BlobIDArray& ID2, const Utilities::MPI& comm ); +ID_map_struct computeIDMap( int nx, int ny, int nz, const BlobIDArray& ID1, const BlobIDArray& ID2, MPI_Comm comm ); /*! diff --git a/analysis/distance.cpp b/analysis/distance.cpp index 9c605e1e..e297b435 100644 --- a/analysis/distance.cpp +++ b/analysis/distance.cpp @@ -176,7 +176,7 @@ void CalcVecDist( Array &d, const Array &ID0, const Domain &Dm, // Update distance double err = calcVecUpdateInterior( d, dx[0], dx[1], dx[2] ); // Check if we are finished - err = Dm.Comm.maxReduce( err ); + err = maxReduce( Dm.Comm, err ); if ( err < tol ) break; } diff --git a/analysis/morphology.cpp b/analysis/morphology.cpp index ab4312f8..72a17892 100644 --- a/analysis/morphology.cpp +++ b/analysis/morphology.cpp @@ -58,11 +58,11 @@ double MorphOpen(DoubleArray &SignDist, signed char *id, std::shared_ptr } } } - Dm->Comm.barrier(); + MPI_Barrier(Dm->Comm); // total Global is the number of nodes in the pore-space - totalGlobal = Dm->Comm.sumReduce( count ); - maxdistGlobal = Dm->Comm.sumReduce( maxdist ); + MPI_Allreduce(&count,&totalGlobal,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&maxdist,&maxdistGlobal,1,MPI_DOUBLE,MPI_MAX,Dm->Comm); double volume=double(nprocx*nprocy*nprocz)*double(nx-2)*double(ny-2)*double(nz-2); double volume_fraction=totalGlobal/volume; if (rank==0) printf("Volume fraction for morphological opening: %f \n",volume_fraction); @@ -133,6 +133,7 @@ double MorphOpen(DoubleArray &SignDist, signed char *id, std::shared_ptr double deltaR=0.05; // amount to change the radius in voxel units double Rcrit_old=0.0; + double GlobalNumber = 1.f; int imin,jmin,kmin,imax,jmax,kmax; if (ErodeLabel == 1){ @@ -202,41 +203,41 @@ double MorphOpen(DoubleArray &SignDist, signed char *id, std::shared_ptr PackID(Dm->sendList_YZ, Dm->sendCount_YZ ,sendID_YZ, id); //...................................................................................... 
MPI_Sendrecv(sendID_x,Dm->sendCount_x,MPI_CHAR,Dm->rank_x(),sendtag, - recvID_X,Dm->recvCount_X,MPI_CHAR,Dm->rank_X(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_X,Dm->recvCount_X,MPI_CHAR,Dm->rank_X(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_X,Dm->sendCount_X,MPI_CHAR,Dm->rank_X(),sendtag, - recvID_x,Dm->recvCount_x,MPI_CHAR,Dm->rank_x(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_x,Dm->recvCount_x,MPI_CHAR,Dm->rank_x(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_y,Dm->sendCount_y,MPI_CHAR,Dm->rank_y(),sendtag, - recvID_Y,Dm->recvCount_Y,MPI_CHAR,Dm->rank_Y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_Y,Dm->recvCount_Y,MPI_CHAR,Dm->rank_Y(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Y,Dm->sendCount_Y,MPI_CHAR,Dm->rank_Y(),sendtag, - recvID_y,Dm->recvCount_y,MPI_CHAR,Dm->rank_y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_y,Dm->recvCount_y,MPI_CHAR,Dm->rank_y(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_z,Dm->sendCount_z,MPI_CHAR,Dm->rank_z(),sendtag, - recvID_Z,Dm->recvCount_Z,MPI_CHAR,Dm->rank_Z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_Z,Dm->recvCount_Z,MPI_CHAR,Dm->rank_Z(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Z,Dm->sendCount_Z,MPI_CHAR,Dm->rank_Z(),sendtag, - recvID_z,Dm->recvCount_z,MPI_CHAR,Dm->rank_z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_z,Dm->recvCount_z,MPI_CHAR,Dm->rank_z(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xy,Dm->sendCount_xy,MPI_CHAR,Dm->rank_xy(),sendtag, - recvID_XY,Dm->recvCount_XY,MPI_CHAR,Dm->rank_XY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_XY,Dm->recvCount_XY,MPI_CHAR,Dm->rank_XY(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_XY,Dm->sendCount_XY,MPI_CHAR,Dm->rank_XY(),sendtag, - recvID_xy,Dm->recvCount_xy,MPI_CHAR,Dm->rank_xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_xy,Dm->recvCount_xy,MPI_CHAR,Dm->rank_xy(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Xy,Dm->sendCount_Xy,MPI_CHAR,Dm->rank_Xy(),sendtag, - recvID_xY,Dm->recvCount_xY,MPI_CHAR,Dm->rank_xY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_xY,Dm->recvCount_xY,MPI_CHAR,Dm->rank_xY(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xY,Dm->sendCount_xY,MPI_CHAR,Dm->rank_xY(),sendtag, - recvID_Xy,Dm->recvCount_Xy,MPI_CHAR,Dm->rank_Xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_Xy,Dm->recvCount_Xy,MPI_CHAR,Dm->rank_Xy(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xz,Dm->sendCount_xz,MPI_CHAR,Dm->rank_xz(),sendtag, - recvID_XZ,Dm->recvCount_XZ,MPI_CHAR,Dm->rank_XZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_XZ,Dm->recvCount_XZ,MPI_CHAR,Dm->rank_XZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_XZ,Dm->sendCount_XZ,MPI_CHAR,Dm->rank_XZ(),sendtag, - recvID_xz,Dm->recvCount_xz,MPI_CHAR,Dm->rank_xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_xz,Dm->recvCount_xz,MPI_CHAR,Dm->rank_xz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Xz,Dm->sendCount_Xz,MPI_CHAR,Dm->rank_Xz(),sendtag, - recvID_xZ,Dm->recvCount_xZ,MPI_CHAR,Dm->rank_xZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_xZ,Dm->recvCount_xZ,MPI_CHAR,Dm->rank_xZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xZ,Dm->sendCount_xZ,MPI_CHAR,Dm->rank_xZ(),sendtag, - 
recvID_Xz,Dm->recvCount_Xz,MPI_CHAR,Dm->rank_Xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_Xz,Dm->recvCount_Xz,MPI_CHAR,Dm->rank_Xz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_yz,Dm->sendCount_yz,MPI_CHAR,Dm->rank_yz(),sendtag, - recvID_YZ,Dm->recvCount_YZ,MPI_CHAR,Dm->rank_YZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_YZ,Dm->recvCount_YZ,MPI_CHAR,Dm->rank_YZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_YZ,Dm->sendCount_YZ,MPI_CHAR,Dm->rank_YZ(),sendtag, - recvID_yz,Dm->recvCount_yz,MPI_CHAR,Dm->rank_yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_yz,Dm->recvCount_yz,MPI_CHAR,Dm->rank_yz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Yz,Dm->sendCount_Yz,MPI_CHAR,Dm->rank_Yz(),sendtag, - recvID_yZ,Dm->recvCount_yZ,MPI_CHAR,Dm->rank_yZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_yZ,Dm->recvCount_yZ,MPI_CHAR,Dm->rank_yZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_yZ,Dm->sendCount_yZ,MPI_CHAR,Dm->rank_yZ(),sendtag, - recvID_Yz,Dm->recvCount_Yz,MPI_CHAR,Dm->rank_Yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_Yz,Dm->recvCount_Yz,MPI_CHAR,Dm->rank_Yz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); //...................................................................................... UnpackID(Dm->recvList_x, Dm->recvCount_x ,recvID_x, id); UnpackID(Dm->recvList_X, Dm->recvCount_X ,recvID_X, id); @@ -258,7 +259,7 @@ double MorphOpen(DoubleArray &SignDist, signed char *id, std::shared_ptr UnpackID(Dm->recvList_YZ, Dm->recvCount_YZ ,recvID_YZ, id); //...................................................................................... - //double GlobalNumber = Dm->Comm.sumReduce( LocalNumber ); + MPI_Allreduce(&LocalNumber,&GlobalNumber,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); count = 0.f; for (int k=1; k } } } - countGlobal = Dm->Comm.sumReduce( count ); + MPI_Allreduce(&count,&countGlobal,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); void_fraction_new = countGlobal/totalGlobal; void_fraction_diff_new = abs(void_fraction_new-VoidFraction); /* if (rank==0){ @@ -359,11 +360,11 @@ double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptrComm.barrier(); + MPI_Barrier(Dm->Comm); // total Global is the number of nodes in the pore-space - totalGlobal = Dm->Comm.sumReduce( count ); - maxdistGlobal = Dm->Comm.sumReduce( maxdist ); + MPI_Allreduce(&count,&totalGlobal,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); + MPI_Allreduce(&maxdist,&maxdistGlobal,1,MPI_DOUBLE,MPI_MAX,Dm->Comm); double volume=double(nprocx*nprocy*nprocz)*double(nx-2)*double(ny-2)*double(nz-2); double volume_fraction=totalGlobal/volume; if (rank==0) printf("Volume fraction for morphological opening: %f \n",volume_fraction); @@ -433,6 +434,7 @@ double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptrComm.barrier(); + MPI_Barrier(Dm->Comm); FILE *DRAIN = fopen("morphdrain.csv","w"); @@ -507,41 +509,41 @@ double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptrsendList_YZ, Dm->sendCount_YZ ,sendID_YZ, id); //...................................................................................... 
MPI_Sendrecv(sendID_x,Dm->sendCount_x,MPI_CHAR,Dm->rank_x(),sendtag, - recvID_X,Dm->recvCount_X,MPI_CHAR,Dm->rank_X(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_X,Dm->recvCount_X,MPI_CHAR,Dm->rank_X(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_X,Dm->sendCount_X,MPI_CHAR,Dm->rank_X(),sendtag, - recvID_x,Dm->recvCount_x,MPI_CHAR,Dm->rank_x(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_x,Dm->recvCount_x,MPI_CHAR,Dm->rank_x(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_y,Dm->sendCount_y,MPI_CHAR,Dm->rank_y(),sendtag, - recvID_Y,Dm->recvCount_Y,MPI_CHAR,Dm->rank_Y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_Y,Dm->recvCount_Y,MPI_CHAR,Dm->rank_Y(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Y,Dm->sendCount_Y,MPI_CHAR,Dm->rank_Y(),sendtag, - recvID_y,Dm->recvCount_y,MPI_CHAR,Dm->rank_y(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_y,Dm->recvCount_y,MPI_CHAR,Dm->rank_y(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_z,Dm->sendCount_z,MPI_CHAR,Dm->rank_z(),sendtag, - recvID_Z,Dm->recvCount_Z,MPI_CHAR,Dm->rank_Z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_Z,Dm->recvCount_Z,MPI_CHAR,Dm->rank_Z(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Z,Dm->sendCount_Z,MPI_CHAR,Dm->rank_Z(),sendtag, - recvID_z,Dm->recvCount_z,MPI_CHAR,Dm->rank_z(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_z,Dm->recvCount_z,MPI_CHAR,Dm->rank_z(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xy,Dm->sendCount_xy,MPI_CHAR,Dm->rank_xy(),sendtag, - recvID_XY,Dm->recvCount_XY,MPI_CHAR,Dm->rank_XY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_XY,Dm->recvCount_XY,MPI_CHAR,Dm->rank_XY(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_XY,Dm->sendCount_XY,MPI_CHAR,Dm->rank_XY(),sendtag, - recvID_xy,Dm->recvCount_xy,MPI_CHAR,Dm->rank_xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_xy,Dm->recvCount_xy,MPI_CHAR,Dm->rank_xy(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Xy,Dm->sendCount_Xy,MPI_CHAR,Dm->rank_Xy(),sendtag, - recvID_xY,Dm->recvCount_xY,MPI_CHAR,Dm->rank_xY(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_xY,Dm->recvCount_xY,MPI_CHAR,Dm->rank_xY(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xY,Dm->sendCount_xY,MPI_CHAR,Dm->rank_xY(),sendtag, - recvID_Xy,Dm->recvCount_Xy,MPI_CHAR,Dm->rank_Xy(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_Xy,Dm->recvCount_Xy,MPI_CHAR,Dm->rank_Xy(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xz,Dm->sendCount_xz,MPI_CHAR,Dm->rank_xz(),sendtag, - recvID_XZ,Dm->recvCount_XZ,MPI_CHAR,Dm->rank_XZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_XZ,Dm->recvCount_XZ,MPI_CHAR,Dm->rank_XZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_XZ,Dm->sendCount_XZ,MPI_CHAR,Dm->rank_XZ(),sendtag, - recvID_xz,Dm->recvCount_xz,MPI_CHAR,Dm->rank_xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_xz,Dm->recvCount_xz,MPI_CHAR,Dm->rank_xz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Xz,Dm->sendCount_Xz,MPI_CHAR,Dm->rank_Xz(),sendtag, - recvID_xZ,Dm->recvCount_xZ,MPI_CHAR,Dm->rank_xZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_xZ,Dm->recvCount_xZ,MPI_CHAR,Dm->rank_xZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xZ,Dm->sendCount_xZ,MPI_CHAR,Dm->rank_xZ(),sendtag, - 
recvID_Xz,Dm->recvCount_Xz,MPI_CHAR,Dm->rank_Xz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_Xz,Dm->recvCount_Xz,MPI_CHAR,Dm->rank_Xz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_yz,Dm->sendCount_yz,MPI_CHAR,Dm->rank_yz(),sendtag, - recvID_YZ,Dm->recvCount_YZ,MPI_CHAR,Dm->rank_YZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_YZ,Dm->recvCount_YZ,MPI_CHAR,Dm->rank_YZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_YZ,Dm->sendCount_YZ,MPI_CHAR,Dm->rank_YZ(),sendtag, - recvID_yz,Dm->recvCount_yz,MPI_CHAR,Dm->rank_yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_yz,Dm->recvCount_yz,MPI_CHAR,Dm->rank_yz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Yz,Dm->sendCount_Yz,MPI_CHAR,Dm->rank_Yz(),sendtag, - recvID_yZ,Dm->recvCount_yZ,MPI_CHAR,Dm->rank_yZ(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_yZ,Dm->recvCount_yZ,MPI_CHAR,Dm->rank_yZ(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_yZ,Dm->sendCount_yZ,MPI_CHAR,Dm->rank_yZ(),sendtag, - recvID_Yz,Dm->recvCount_Yz,MPI_CHAR,Dm->rank_Yz(),recvtag,Dm->Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_Yz,Dm->recvCount_Yz,MPI_CHAR,Dm->rank_Yz(),recvtag,Dm->Comm,MPI_STATUS_IGNORE); //...................................................................................... UnpackID(Dm->recvList_x, Dm->recvCount_x ,recvID_x, id); UnpackID(Dm->recvList_X, Dm->recvCount_X ,recvID_X, id); @@ -562,7 +564,7 @@ double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptrrecvList_yZ, Dm->recvCount_yZ ,recvID_yZ, id); UnpackID(Dm->recvList_YZ, Dm->recvCount_YZ ,recvID_YZ, id); //...................................................................................... - // double GlobalNumber = Dm->Comm.sumReduce( LocalNumber ); + MPI_Allreduce(&LocalNumber,&GlobalNumber,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); for (int k=0; krank_info,phase,SignDist,vF,vS,phase_label,Dm->Comm); - Dm->Comm.barrier(); + MPI_Barrier(Dm->Comm); for (int k=0; kComm.sumReduce( count ); + MPI_Allreduce(&count,&countGlobal,1,MPI_DOUBLE,MPI_SUM,Dm->Comm); void_fraction_new = countGlobal/totalGlobal; void_fraction_diff_new = abs(void_fraction_new-VoidFraction); if (rank==0){ @@ -700,7 +702,7 @@ double MorphGrow(DoubleArray &BoundaryDist, DoubleArray &Dist, Array &id, } } } - double count_original = Dm->Comm.sumReduce( count); + double count_original=sumReduce( Dm->Comm, count); // Estimate morph_delta double morph_delta = 0.0; @@ -730,8 +732,8 @@ double MorphGrow(DoubleArray &BoundaryDist, DoubleArray &Dist, Array &id, } } } - count = Dm->Comm.sumReduce( count ); - MAX_DISPLACEMENT = Dm->Comm.maxReduce( MAX_DISPLACEMENT ); + count=sumReduce( Dm->Comm, count); + MAX_DISPLACEMENT = maxReduce( Dm->Comm, MAX_DISPLACEMENT); GrowthEstimate = count - count_original; ERROR = fabs((GrowthEstimate-TargetGrowth) /TargetGrowth); @@ -774,7 +776,7 @@ double MorphGrow(DoubleArray &BoundaryDist, DoubleArray &Dist, Array &id, } } } - count = Dm->Comm.sumReduce( count ); + count=sumReduce( Dm->Comm, count); return count; } diff --git a/analysis/runAnalysis.cpp b/analysis/runAnalysis.cpp index 89451c7b..6c76f58b 100644 --- a/analysis/runAnalysis.cpp +++ b/analysis/runAnalysis.cpp @@ -3,7 +3,7 @@ #include "analysis/analysis.h" #include "common/Array.h" #include "common/Communication.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/ScaLBL.h" #include "models/ColorModel.h" @@ -462,7 +462,7 @@ private: 
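Each MPI_Sendrecv pair above follows one pattern: pack the IDs on a face or edge send list, swap buffers with the neighbor in that direction, and unpack into the halo through the matching receive list. A sketch of a single direction pair, with simple gather/scatter helpers standing in for the real PackID/UnpackID:

    // One face of the halo exchange shown above (the x / X pair), sketched with
    // stand-in gather/scatter helpers; not the actual PackID/UnpackID signatures.
    #include <mpi.h>
    #include <vector>

    static void packIDs(const std::vector<int> &list, const signed char *id, std::vector<char> &buf) {
        buf.resize(list.size());
        for (size_t i = 0; i < list.size(); i++) buf[i] = id[list[i]];
    }
    static void unpackIDs(const std::vector<int> &list, const std::vector<char> &buf, signed char *id) {
        for (size_t i = 0; i < list.size(); i++) id[list[i]] = buf[i];
    }

    void exchangeX(signed char *id, const std::vector<int> &send_x, const std::vector<int> &recv_X,
                   int rank_x, int rank_X, MPI_Comm comm) {
        std::vector<char> send_buf, recv_buf(recv_X.size());
        packIDs(send_x, id, send_buf);
        MPI_Sendrecv(send_buf.data(), (int)send_buf.size(), MPI_CHAR, rank_x, 0,
                     recv_buf.data(), (int)recv_buf.size(), MPI_CHAR, rank_X, 0,
                     comm, MPI_STATUS_IGNORE);
        unpackIDs(recv_X, recv_buf, id);
    }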
/****************************************************************** * MPI comm wrapper for use with analysis * ******************************************************************/ -runAnalysis::commWrapper::commWrapper( int tag_, const Utilities::MPI& comm_, runAnalysis* analysis_ ): +runAnalysis::commWrapper::commWrapper( int tag_, MPI_Comm comm_, runAnalysis* analysis_ ): comm(comm_), tag(tag_), analysis(analysis_) @@ -479,7 +479,7 @@ runAnalysis::commWrapper::~commWrapper() { if ( tag == -1 ) return; - comm.barrier(); + MPI_Barrier( comm ); analysis->d_comm_used[tag] = false; } runAnalysis::commWrapper runAnalysis::getComm( ) @@ -496,10 +496,10 @@ runAnalysis::commWrapper runAnalysis::getComm( ) if ( tag == -1 ) ERROR("Unable to get comm"); } - tag = d_comm.bcast( tag, 0 ); + MPI_Bcast( &tag, 1, MPI_INT, 0, d_comm ); d_comm_used[tag] = true; - if ( d_comms[tag].isNull() ) - d_comms[tag] = d_comm.dup(); + if ( d_comms[tag] == MPI_COMM_NULL ) + MPI_Comm_dup( MPI_COMM_WORLD, &d_comms[tag] ); return commWrapper(tag,d_comms[tag],this); } @@ -507,20 +507,14 @@ runAnalysis::commWrapper runAnalysis::getComm( ) /****************************************************************** * Constructor/Destructors * ******************************************************************/ -runAnalysis::runAnalysis( std::shared_ptr input_db, - const RankInfoStruct& rank_info, - std::shared_ptr ScaLBL_Comm, - std::shared_ptr Dm, - int Np, - bool Regular, - IntArray Map ): - d_Np( Np ), - d_regular ( Regular), - d_rank_info( rank_info ), - d_Map( Map ), - d_fillData(Dm->Comm,Dm->rank_info,{Dm->Nx-2,Dm->Ny-2,Dm->Nz-2},{1,1,1},0,1), - d_comm( Utilities::MPI( MPI_COMM_WORLD ).dup() ), - d_ScaLBL_Comm( ScaLBL_Comm) +runAnalysis::runAnalysis(std::shared_ptr input_db, const RankInfoStruct& rank_info, std::shared_ptr ScaLBL_Comm, std::shared_ptr Dm, + int Np, bool Regular, IntArray Map ): + d_Np( Np ), + d_regular ( Regular), + d_rank_info( rank_info ), + d_Map( Map ), + d_fillData(Dm->Comm,Dm->rank_info,{Dm->Nx-2,Dm->Ny-2,Dm->Nz-2},{1,1,1},0,1), + d_ScaLBL_Comm( ScaLBL_Comm) { auto db = input_db->getDatabase( "Analysis" ); @@ -558,7 +552,7 @@ runAnalysis::runAnalysis( std::shared_ptr input_db, d_restartFile = restart_file + "." 
+ rankString; - d_rank = d_comm.getRank(); + d_rank = MPI_WORLD_RANK(); writeIDMap(ID_map_struct(),0,id_map_filename); // Initialize IO for silo IO::initialize("","silo","false"); @@ -627,8 +621,11 @@ runAnalysis::runAnalysis( std::shared_ptr input_db, // Initialize the comms - for (int i=0; i<1024; i++) + MPI_Comm_dup(MPI_COMM_WORLD,&d_comm); + for (int i=0; i<1024; i++) { + d_comms[i] = MPI_COMM_NULL; d_comm_used[i] = false; + } // Initialize the threads int N_threads = db->getWithDefault( "N_threads", 4 ); auto method = db->getWithDefault( "load_balance", "default" ); @@ -638,6 +635,12 @@ runAnalysis::~runAnalysis( ) { // Finish processing analysis finish(); + // Clear internal data + MPI_Comm_free( &d_comm ); + for (int i=0; i<1024; i++) { + if ( d_comms[i] != MPI_COMM_NULL ) + MPI_Comm_free(&d_comms[i]); + } } void runAnalysis::finish( ) { @@ -651,7 +654,7 @@ void runAnalysis::finish( ) d_wait_subphase.reset(); d_wait_restart.reset(); // Syncronize - d_comm.barrier(); + MPI_Barrier( d_comm ); PROFILE_STOP("finish"); } diff --git a/analysis/runAnalysis.h b/analysis/runAnalysis.h index 3c5bc7f0..0bf2f676 100644 --- a/analysis/runAnalysis.h +++ b/analysis/runAnalysis.h @@ -68,10 +68,10 @@ public: class commWrapper { public: - Utilities::MPI comm; + MPI_Comm comm; int tag; runAnalysis *analysis; - commWrapper( int tag, const Utilities::MPI& comm, runAnalysis *analysis ); + commWrapper( int tag, MPI_Comm comm, runAnalysis *analysis ); commWrapper( ) = delete; commWrapper( const commWrapper &rhs ) = delete; commWrapper& operator=( const commWrapper &rhs ) = delete; @@ -100,8 +100,8 @@ private: std::vector d_meshData; fillHalo d_fillData; std::string d_restartFile; - Utilities::MPI d_comm; - Utilities::MPI d_comms[1024]; + MPI_Comm d_comm; + MPI_Comm d_comms[1024]; volatile bool d_comm_used[1024]; std::shared_ptr d_ScaLBL_Comm; diff --git a/analysis/uCT.cpp b/analysis/uCT.cpp index 28d677c1..912f8e85 100644 --- a/analysis/uCT.cpp +++ b/analysis/uCT.cpp @@ -228,7 +228,8 @@ void filter_final( Array& ID, Array& Dist, Array& Mean, Array& Dist1, Array& Dist2 ) { PROFILE_SCOPED(timer,"filter_final"); - int rank = Dm.Comm.getRank(); + int rank; + MPI_Comm_rank(Dm.Comm,&rank); int Nx = Dm.Nx-2; int Ny = Dm.Ny-2; int Nz = Dm.Nz-2; @@ -241,7 +242,7 @@ void filter_final( Array& ID, Array& Dist, float tmp = 0; for (size_t i=0; i(Dist0.length()) ); + tmp = sqrt( sumReduce(Dm.Comm,tmp) / sumReduce(Dm.Comm,(float)Dist0.length()) ); const float dx1 = 0.3*tmp; const float dx2 = 1.05*dx1; if (rank==0) @@ -284,7 +285,7 @@ void filter_final( Array& ID, Array& Dist, Phase.fill(1); ComputeGlobalBlobIDs( Nx, Ny, Nz, Dm.rank_info, Phase, SignDist, 0, 0, GlobalBlobID, Dm.Comm ); fillInt.fill(GlobalBlobID); - int N_blobs = Dm.Comm.maxReduce(GlobalBlobID.max()+1); + int N_blobs = maxReduce(Dm.Comm,GlobalBlobID.max()+1); std::vector mean(N_blobs,0); std::vector count(N_blobs,0); for (int k=1; k<=Nz; k++) { @@ -320,8 +321,8 @@ void filter_final( Array& ID, Array& Dist, } } } - mean = Dm.Comm.sumReduce(mean); - count = Dm.Comm.sumReduce(count); + mean = sumReduce(Dm.Comm,mean); + count = sumReduce(Dm.Comm,count); for (size_t i=0; i -o ") -set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} -o -shared" ) -set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} -o ") - -############################################################################### -# FIND: HIP and associated helper binaries -############################################################################### -# 
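The runAnalysis changes above manage a fixed pool of duplicated communicators: d_comms[] entries are duplicated lazily in getComm(), marked busy through d_comm_used, handed out via commWrapper, and freed in the destructor. A condensed sketch of that lifecycle (pool size reduced for brevity; the patch uses 1024 slots):

    // Lazy communicator pool: duplicate on first use, mark busy, free at teardown.
    #include <mpi.h>

    struct CommPool {
        static constexpr int N = 8;
        MPI_Comm comms[N];
        bool used[N];

        CommPool() {
            for (int i = 0; i < N; i++) { comms[i] = MPI_COMM_NULL; used[i] = false; }
        }
        ~CommPool() {
            for (int i = 0; i < N; i++)
                if (comms[i] != MPI_COMM_NULL) MPI_Comm_free(&comms[i]);
        }
        // Returns a free duplicated communicator, or MPI_COMM_NULL if exhausted.
        MPI_Comm acquire(int &tag) {
            for (tag = 0; tag < N; tag++) {
                if (!used[tag]) {
                    if (comms[tag] == MPI_COMM_NULL)
                        MPI_Comm_dup(MPI_COMM_WORLD, &comms[tag]);
                    used[tag] = true;
                    return comms[tag];
                }
            }
            tag = -1;
            return MPI_COMM_NULL;
        }
        void release(int tag) { if (tag >= 0) used[tag] = false; }
    };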
HIP is supported on Linux only -if(UNIX AND NOT APPLE AND NOT CYGWIN) - # Search for HIP installation - if(NOT HIP_ROOT_DIR) - # Search in user specified path first - find_path( - HIP_ROOT_DIR - NAMES hipconfig - PATHS - ENV ROCM_PATH - ENV HIP_PATH - PATH_SUFFIXES bin - DOC "HIP installed location" - NO_DEFAULT_PATH - ) - # Now search in default path - find_path( - HIP_ROOT_DIR - NAMES hipconfig - PATHS - /opt/rocm - /opt/rocm/hip - PATH_SUFFIXES bin - DOC "HIP installed location" - ) - - # Check if we found HIP installation - if(HIP_ROOT_DIR) - # If so, fix the path - string(REGEX REPLACE "[/\\\\]?bin[64]*[/\\\\]?$" "" HIP_ROOT_DIR ${HIP_ROOT_DIR}) - # And push it back to the cache - set(HIP_ROOT_DIR ${HIP_ROOT_DIR} CACHE PATH "HIP installed location" FORCE) - endif() - if(NOT EXISTS ${HIP_ROOT_DIR}) - if(HIP_FIND_REQUIRED) - message(FATAL_ERROR "Specify HIP_ROOT_DIR") - elseif(NOT HIP_FIND_QUIETLY) - message("HIP_ROOT_DIR not found or specified") - endif() - endif() - endif() - - # Find HIPCC executable - find_program( - HIP_HIPCC_EXECUTABLE - NAMES hipcc - PATHS - "${HIP_ROOT_DIR}" - ENV ROCM_PATH - ENV HIP_PATH - /opt/rocm - /opt/rocm/hip - PATH_SUFFIXES bin - NO_DEFAULT_PATH - ) - if(NOT HIP_HIPCC_EXECUTABLE) - # Now search in default paths - find_program(HIP_HIPCC_EXECUTABLE hipcc) - endif() - mark_as_advanced(HIP_HIPCC_EXECUTABLE) - - # Find HIPCONFIG executable - find_program( - HIP_HIPCONFIG_EXECUTABLE - NAMES hipconfig - PATHS - "${HIP_ROOT_DIR}" - ENV ROCM_PATH - ENV HIP_PATH - /opt/rocm - /opt/rocm/hip - PATH_SUFFIXES bin - NO_DEFAULT_PATH - ) - if(NOT HIP_HIPCONFIG_EXECUTABLE) - # Now search in default paths - find_program(HIP_HIPCONFIG_EXECUTABLE hipconfig) - endif() - mark_as_advanced(HIP_HIPCONFIG_EXECUTABLE) - - # Find HIPCC_CMAKE_LINKER_HELPER executable - find_program( - HIP_HIPCC_CMAKE_LINKER_HELPER - NAMES hipcc_cmake_linker_helper - PATHS - "${HIP_ROOT_DIR}" - ENV ROCM_PATH - ENV HIP_PATH - /opt/rocm - /opt/rocm/hip - PATH_SUFFIXES bin - NO_DEFAULT_PATH - ) - if(NOT HIP_HIPCC_CMAKE_LINKER_HELPER) - # Now search in default paths - find_program(HIP_HIPCC_CMAKE_LINKER_HELPER hipcc_cmake_linker_helper) - endif() - mark_as_advanced(HIP_HIPCC_CMAKE_LINKER_HELPER) - - if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_VERSION) - # Compute the version - execute_process( - COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version - OUTPUT_VARIABLE _hip_version - ERROR_VARIABLE _hip_error - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_STRIP_TRAILING_WHITESPACE - ) - if(NOT _hip_error) - set(HIP_VERSION ${_hip_version} CACHE STRING "Version of HIP as computed from hipcc") - else() - set(HIP_VERSION "0.0.0" CACHE STRING "Version of HIP as computed by FindHIP()") - endif() - mark_as_advanced(HIP_VERSION) - endif() - if(HIP_VERSION) - string(REPLACE "." 
";" _hip_version_list "${HIP_VERSION}") - list(GET _hip_version_list 0 HIP_VERSION_MAJOR) - list(GET _hip_version_list 1 HIP_VERSION_MINOR) - list(GET _hip_version_list 2 HIP_VERSION_PATCH) - set(HIP_VERSION_STRING "${HIP_VERSION}") - endif() - - if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_PLATFORM) - # Compute the platform - execute_process( - COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --platform - OUTPUT_VARIABLE _hip_platform - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - set(HIP_PLATFORM ${_hip_platform} CACHE STRING "HIP platform as computed by hipconfig") - mark_as_advanced(HIP_PLATFORM) - endif() -endif() - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args( - HIP - REQUIRED_VARS - HIP_ROOT_DIR - HIP_HIPCC_EXECUTABLE - HIP_HIPCONFIG_EXECUTABLE - HIP_PLATFORM - VERSION_VAR HIP_VERSION - ) - -############################################################################### -# MACRO: Locate helper files -############################################################################### -macro(HIP_FIND_HELPER_FILE _name _extension) - set(_hip_full_name "${_name}.${_extension}") - get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) - set(HIP_${_name} "${CMAKE_CURRENT_LIST_DIR}/FindHIP/${_hip_full_name}") - if(NOT EXISTS "${HIP_${_name}}") - set(error_message "${_hip_full_name} not found in ${CMAKE_CURRENT_LIST_DIR}/FindHIP") - if(HIP_FIND_REQUIRED) - message(FATAL_ERROR "${error_message}") - else() - if(NOT HIP_FIND_QUIETLY) - message(STATUS "${error_message}") - endif() - endif() - endif() - # Set this variable as internal, so the user isn't bugged with it. - set(HIP_${_name} ${HIP_${_name}} CACHE INTERNAL "Location of ${_full_name}" FORCE) -endmacro() - -############################################################################### -hip_find_helper_file(run_make2cmake cmake) -hip_find_helper_file(run_hipcc cmake) -############################################################################### - -############################################################################### -# MACRO: Reset compiler flags -############################################################################### -macro(HIP_RESET_FLAGS) - unset(HIP_HIPCC_FLAGS) - unset(HIP_HCC_FLAGS) - unset(HIP_NVCC_FLAGS) - foreach(config ${_hip_configuration_types}) - string(TOUPPER ${config} config_upper) - unset(HIP_HIPCC_FLAGS_${config_upper}) - unset(HIP_HCC_FLAGS_${config_upper}) - unset(HIP_NVCC_FLAGS_${config_upper}) - endforeach() -endmacro() - -############################################################################### -# MACRO: Separate the options from the sources -############################################################################### -macro(HIP_GET_SOURCES_AND_OPTIONS _sources _cmake_options _hipcc_options _hcc_options _nvcc_options) - set(${_sources}) - set(${_cmake_options}) - set(${_hipcc_options}) - set(${_hcc_options}) - set(${_nvcc_options}) - set(_hipcc_found_options FALSE) - set(_hcc_found_options FALSE) - set(_nvcc_found_options FALSE) - foreach(arg ${ARGN}) - if("x${arg}" STREQUAL "xHIPCC_OPTIONS") - set(_hipcc_found_options TRUE) - set(_hcc_found_options FALSE) - set(_nvcc_found_options FALSE) - elseif("x${arg}" STREQUAL "xHCC_OPTIONS") - set(_hipcc_found_options FALSE) - set(_hcc_found_options TRUE) - set(_nvcc_found_options FALSE) - elseif("x${arg}" STREQUAL "xNVCC_OPTIONS") - set(_hipcc_found_options FALSE) - set(_hcc_found_options FALSE) - set(_nvcc_found_options TRUE) - elseif( - "x${arg}" STREQUAL "xEXCLUDE_FROM_ALL" OR - "x${arg}" STREQUAL "xSTATIC" OR - 
"x${arg}" STREQUAL "xSHARED" OR - "x${arg}" STREQUAL "xMODULE" - ) - list(APPEND ${_cmake_options} ${arg}) - else() - if(_hipcc_found_options) - list(APPEND ${_hipcc_options} ${arg}) - elseif(_hcc_found_options) - list(APPEND ${_hcc_options} ${arg}) - elseif(_nvcc_found_options) - list(APPEND ${_nvcc_options} ${arg}) - else() - # Assume this is a file - list(APPEND ${_sources} ${arg}) - endif() - endif() - endforeach() -endmacro() - -############################################################################### -# MACRO: Add include directories to pass to the hipcc command -############################################################################### -set(HIP_HIPCC_INCLUDE_ARGS_USER "") -macro(HIP_INCLUDE_DIRECTORIES) - foreach(dir ${ARGN}) - list(APPEND HIP_HIPCC_INCLUDE_ARGS_USER $<$:-I${dir}>) - endforeach() -endmacro() - -############################################################################### -# FUNCTION: Helper to avoid clashes of files with the same basename but different paths -############################################################################### -function(HIP_COMPUTE_BUILD_PATH path build_path) - # Convert to cmake style paths - file(TO_CMAKE_PATH "${path}" bpath) - if(IS_ABSOLUTE "${bpath}") - string(FIND "${bpath}" "${CMAKE_CURRENT_BINARY_DIR}" _binary_dir_pos) - if(_binary_dir_pos EQUAL 0) - file(RELATIVE_PATH bpath "${CMAKE_CURRENT_BINARY_DIR}" "${bpath}") - else() - file(RELATIVE_PATH bpath "${CMAKE_CURRENT_SOURCE_DIR}" "${bpath}") - endif() - endif() - - # Remove leading / - string(REGEX REPLACE "^[/]+" "" bpath "${bpath}") - # Avoid absolute paths by removing ':' - string(REPLACE ":" "_" bpath "${bpath}") - # Avoid relative paths that go up the tree - string(REPLACE "../" "__/" bpath "${bpath}") - # Avoid spaces - string(REPLACE " " "_" bpath "${bpath}") - # Strip off the filename - get_filename_component(bpath "${bpath}" PATH) - - set(${build_path} "${bpath}" PARENT_SCOPE) -endfunction() - -############################################################################### -# MACRO: Parse OPTIONS from ARGN & set variables prefixed by _option_prefix -############################################################################### -macro(HIP_PARSE_HIPCC_OPTIONS _option_prefix) - set(_hip_found_config) - foreach(arg ${ARGN}) - # Determine if we are dealing with a per-configuration flag - foreach(config ${_hip_configuration_types}) - string(TOUPPER ${config} config_upper) - if(arg STREQUAL "${config_upper}") - set(_hip_found_config _${arg}) - # Clear arg to prevent it from being processed anymore - set(arg) - endif() - endforeach() - if(arg) - list(APPEND ${_option_prefix}${_hip_found_config} "${arg}") - endif() - endforeach() -endmacro() - -############################################################################### -# MACRO: Try and include dependency file if it exists -############################################################################### -macro(HIP_INCLUDE_HIPCC_DEPENDENCIES dependency_file) - set(HIP_HIPCC_DEPEND) - set(HIP_HIPCC_DEPEND_REGENERATE FALSE) - - # Create the dependency file if it doesn't exist - if(NOT EXISTS ${dependency_file}) - file(WRITE ${dependency_file} "# Generated by: FindHIP.cmake. 
Do not edit.\n") - endif() - # Include the dependency file - include(${dependency_file}) - - # Verify the existence of all the included files - if(HIP_HIPCC_DEPEND) - foreach(f ${HIP_HIPCC_DEPEND}) - if(NOT EXISTS ${f}) - # If they aren't there, regenerate the file again - set(HIP_HIPCC_DEPEND_REGENERATE TRUE) - endif() - endforeach() - else() - # No dependencies, so regenerate the file - set(HIP_HIPCC_DEPEND_REGENERATE TRUE) - endif() - - # Regenerate the dependency file if needed - if(HIP_HIPCC_DEPEND_REGENERATE) - set(HIP_HIPCC_DEPEND ${dependency_file}) - file(WRITE ${dependency_file} "# Generated by: FindHIP.cmake. Do not edit.\n") - endif() -endmacro() - -############################################################################### -# MACRO: Prepare cmake commands for the target -############################################################################### -macro(HIP_PREPARE_TARGET_COMMANDS _target _format _generated_files _source_files) - set(_hip_flags "") - string(TOUPPER "${CMAKE_BUILD_TYPE}" _hip_build_configuration) - if(HIP_HOST_COMPILATION_CPP) - set(HIP_C_OR_CXX CXX) - else() - set(HIP_C_OR_CXX C) - endif() - set(generated_extension ${CMAKE_${HIP_C_OR_CXX}_OUTPUT_EXTENSION}) - - # Initialize list of includes with those specified by the user. Append with - # ones specified to cmake directly. - set(HIP_HIPCC_INCLUDE_ARGS ${HIP_HIPCC_INCLUDE_ARGS_USER}) - - # Add the include directories - set(include_directories_generator "$") - list(APPEND HIP_HIPCC_INCLUDE_ARGS "$<$:-I$>") - - get_directory_property(_hip_include_directories INCLUDE_DIRECTORIES) - list(REMOVE_DUPLICATES _hip_include_directories) - if(_hip_include_directories) - foreach(dir ${_hip_include_directories}) - list(APPEND HIP_HIPCC_INCLUDE_ARGS $<$:-I${dir}>) - endforeach() - endif() - - HIP_GET_SOURCES_AND_OPTIONS(_hip_sources _hip_cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) - HIP_PARSE_HIPCC_OPTIONS(HIP_HIPCC_FLAGS ${_hipcc_options}) - HIP_PARSE_HIPCC_OPTIONS(HIP_HCC_FLAGS ${_hcc_options}) - HIP_PARSE_HIPCC_OPTIONS(HIP_NVCC_FLAGS ${_nvcc_options}) - - # Add the compile definitions - set(compile_definition_generator "$") - list(APPEND HIP_HIPCC_FLAGS "$<$:-D$>") - - # Check if we are building shared library. 
- set(_hip_build_shared_libs FALSE) - list(FIND _hip_cmake_options SHARED _hip_found_SHARED) - list(FIND _hip_cmake_options MODULE _hip_found_MODULE) - if(_hip_found_SHARED GREATER -1 OR _hip_found_MODULE GREATER -1) - set(_hip_build_shared_libs TRUE) - endif() - list(FIND _hip_cmake_options STATIC _hip_found_STATIC) - if(_hip_found_STATIC GREATER -1) - set(_hip_build_shared_libs FALSE) - endif() - - # If we are building a shared library, add extra flags to HIP_HIPCC_FLAGS - if(_hip_build_shared_libs) - list(APPEND HIP_HCC_FLAGS "-fPIC") - list(APPEND HIP_NVCC_FLAGS "--shared -Xcompiler '-fPIC'") - endif() - - # Set host compiler - set(HIP_HOST_COMPILER "${CMAKE_${HIP_C_OR_CXX}_COMPILER}") - - # Set compiler flags - set(_HIP_HOST_FLAGS "set(CMAKE_HOST_FLAGS ${CMAKE_${HIP_C_OR_CXX}_FLAGS})") - set(_HIP_HIPCC_FLAGS "set(HIP_HIPCC_FLAGS ${HIP_HIPCC_FLAGS})") - set(_HIP_HCC_FLAGS "set(HIP_HCC_FLAGS ${HIP_HCC_FLAGS})") - set(_HIP_NVCC_FLAGS "set(HIP_NVCC_FLAGS ${HIP_NVCC_FLAGS})") - foreach(config ${_hip_configuration_types}) - string(TOUPPER ${config} config_upper) - set(_HIP_HOST_FLAGS "${_HIP_HOST_FLAGS}\nset(CMAKE_HOST_FLAGS_${config_upper} ${CMAKE_${HIP_C_OR_CXX}_FLAGS_${config_upper}})") - set(_HIP_HIPCC_FLAGS "${_HIP_HIPCC_FLAGS}\nset(HIP_HIPCC_FLAGS_${config_upper} ${HIP_HIPCC_FLAGS_${config_upper}})") - set(_HIP_HCC_FLAGS "${_HIP_HCC_FLAGS}\nset(HIP_HCC_FLAGS_${config_upper} ${HIP_HCC_FLAGS_${config_upper}})") - set(_HIP_NVCC_FLAGS "${_HIP_NVCC_FLAGS}\nset(HIP_NVCC_FLAGS_${config_upper} ${HIP_NVCC_FLAGS_${config_upper}})") - endforeach() - - # Reset the output variable - set(_hip_generated_files "") - set(_hip_source_files "") - - # Iterate over all arguments and create custom commands for all source files - foreach(file ${ARGN}) - # Ignore any file marked as a HEADER_FILE_ONLY - get_source_file_property(_is_header ${file} HEADER_FILE_ONLY) - # Allow per source file overrides of the format. Also allows compiling non .cu files. 
- get_source_file_property(_hip_source_format ${file} HIP_SOURCE_PROPERTY_FORMAT) - if((${file} MATCHES "\\.cu$" OR _hip_source_format) AND NOT _is_header) - set(host_flag FALSE) - else() - set(host_flag TRUE) - endif() - - if(NOT host_flag) - # Determine output directory - HIP_COMPUTE_BUILD_PATH("${file}" hip_build_path) - set(hip_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_target}.dir/${hip_build_path}") - - get_filename_component(basename ${file} NAME) - set(generated_file_path "${hip_compile_output_dir}/${CMAKE_CFG_INTDIR}") - set(generated_file_basename "${_target}_generated_${basename}${generated_extension}") - - # Set file names - set(generated_file "${generated_file_path}/${generated_file_basename}") - set(cmake_dependency_file "${hip_compile_output_dir}/${generated_file_basename}.depend") - set(custom_target_script_pregen "${hip_compile_output_dir}/${generated_file_basename}.cmake.pre-gen") - set(custom_target_script "${hip_compile_output_dir}/${generated_file_basename}.cmake") - - # Set properties for object files - set_source_files_properties("${generated_file}" - PROPERTIES - EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked - ) - - # Don't add CMAKE_CURRENT_SOURCE_DIR if the path is already an absolute path - get_filename_component(file_path "${file}" PATH) - if(IS_ABSOLUTE "${file_path}") - set(source_file "${file}") - else() - set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}") - endif() - - # Bring in the dependencies - HIP_INCLUDE_HIPCC_DEPENDENCIES(${cmake_dependency_file}) - - # Configure the build script - configure_file("${HIP_run_hipcc}" "${custom_target_script_pregen}" @ONLY) - file(GENERATE - OUTPUT "${custom_target_script}" - INPUT "${custom_target_script_pregen}" - ) - set(main_dep DEPENDS ${source_file}) - if(CMAKE_GENERATOR MATCHES "Makefiles") - set(verbose_output "$(VERBOSE)") - elseif(HIP_VERBOSE_BUILD) - set(verbose_output ON) - else() - set(verbose_output OFF) - endif() - - # Create up the comment string - file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}") - set(hip_build_comment_string "Building HIPCC object ${generated_file_relative_path}") - - # Build the generated file and dependency file - add_custom_command( - OUTPUT ${generated_file} - # These output files depend on the source_file and the contents of cmake_dependency_file - ${main_dep} - DEPENDS ${HIP_HIPCC_DEPEND} - DEPENDS ${custom_target_script} - # Make sure the output directory exists before trying to write to it. 
- COMMAND ${CMAKE_COMMAND} -E make_directory "${generated_file_path}" - COMMAND ${CMAKE_COMMAND} ARGS - -D verbose:BOOL=${verbose_output} - -D build_configuration:STRING=${_hip_build_configuration} - -D "generated_file:STRING=${generated_file}" - -P "${custom_target_script}" - WORKING_DIRECTORY "${hip_compile_output_dir}" - COMMENT "${hip_build_comment_string}" - ) - - # Make sure the build system knows the file is generated - set_source_files_properties(${generated_file} PROPERTIES GENERATED TRUE) - list(APPEND _hip_generated_files ${generated_file}) - list(APPEND _hip_source_files ${file}) - endif() - endforeach() - - # Set the return parameter - set(${_generated_files} ${_hip_generated_files}) - set(${_source_files} ${_hip_source_files}) -endmacro() - -############################################################################### -# HIP_ADD_EXECUTABLE -############################################################################### -macro(HIP_ADD_EXECUTABLE hip_target) - # Separate the sources from the options - HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) - HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) - if(_source_files) - list(REMOVE_ITEM _sources ${_source_files}) - endif() - if("x${HCC_HOME}" STREQUAL "x") - set(HCC_HOME "/opt/rocm/hcc") - endif() - set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o ") - add_executable(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) - set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE HIP) -endmacro() - -############################################################################### -# HIP_ADD_LIBRARY -############################################################################### -macro(HIP_ADD_LIBRARY hip_target) - # Separate the sources from the options - HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) - HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} ${_cmake_options} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) - if(_source_files) - list(REMOVE_ITEM _sources ${_source_files}) - endif() - add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) - set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE ${HIP_C_OR_CXX}) -endmacro() - -# vim: ts=4:sw=4:expandtab:smartindent diff --git a/common/Communication.h b/common/Communication.h index 7c2f8d08..7819a0bb 100644 --- a/common/Communication.h +++ b/common/Communication.h @@ -1,7 +1,7 @@ #ifndef COMMUNICATION_H_INC #define COMMUNICATION_H_INC -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Utilities.h" #include "common/Array.h" @@ -38,7 +38,7 @@ struct RankInfoStruct { //! Redistribute domain data (dst may be smaller than the src) template Array redistribute( const RankInfoStruct& src_rank, const Array& src_data, - const RankInfoStruct& dst_rank, std::array dst_size, const Utilities::MPI& comm ); + const RankInfoStruct& dst_rank, std::array dst_size, MPI_Comm comm ); /*! 
@@ -59,7 +59,7 @@ public: * @param[in] fill Fill {faces,edges,corners} * @param[in] periodic Periodic dimensions */ - fillHalo( const Utilities::MPI& comm, const RankInfoStruct& info, + fillHalo( MPI_Comm comm, const RankInfoStruct& info, std::array n, std::array ng, int tag, int depth, std::array fill = {true,true,true}, std::array periodic = {true,true,true} ); @@ -83,7 +83,7 @@ public: private: - Utilities::MPI comm; + MPI_Comm comm; RankInfoStruct info; std::array n, ng; int depth; @@ -93,6 +93,8 @@ private: TYPE *mem; TYPE *send[3][3][3], *recv[3][3][3]; MPI_Request send_req[3][3][3], recv_req[3][3][3]; + size_t N_type; + MPI_Datatype datatype; fillHalo(); // Private empty constructor fillHalo(const fillHalo&); // Private copy constructor fillHalo& operator=(const fillHalo&); // Private assignment operator @@ -134,7 +136,7 @@ void InitializeRanks( const int rank, const int nprocx, const int nprocy, const //*************************************************************************************** -inline void CommunicateSendRecvCounts( const Utilities::MPI& Communicator, int sendtag, int recvtag, +inline void CommunicateSendRecvCounts( MPI_Comm Communicator, int sendtag, int recvtag, int rank_x, int rank_y, int rank_z, int rank_X, int rank_Y, int rank_Z, int rank_xy, int rank_XY, int rank_xY, int rank_Xy, @@ -153,53 +155,53 @@ inline void CommunicateSendRecvCounts( const Utilities::MPI& Communicator, int s { MPI_Request req1[18], req2[18]; MPI_Status stat1[18],stat2[18]; - MPI_Isend(&sendCount_x, 1,MPI_INT,rank_x,sendtag+0,Communicator.getCommunicator(),&req1[0]); - MPI_Irecv(&recvCount_X, 1,MPI_INT,rank_X,recvtag+0,Communicator.getCommunicator(),&req2[0]); - MPI_Isend(&sendCount_X, 1,MPI_INT,rank_X,sendtag+1,Communicator.getCommunicator(),&req1[1]); - MPI_Irecv(&recvCount_x, 1,MPI_INT,rank_x,recvtag+1,Communicator.getCommunicator(),&req2[1]); - MPI_Isend(&sendCount_y, 1,MPI_INT,rank_y,sendtag+2,Communicator.getCommunicator(),&req1[2]); - MPI_Irecv(&recvCount_Y, 1,MPI_INT,rank_Y,recvtag+2,Communicator.getCommunicator(),&req2[2]); - MPI_Isend(&sendCount_Y, 1,MPI_INT,rank_Y,sendtag+3,Communicator.getCommunicator(),&req1[3]); - MPI_Irecv(&recvCount_y, 1,MPI_INT,rank_y,recvtag+3,Communicator.getCommunicator(),&req2[3]); - MPI_Isend(&sendCount_z, 1,MPI_INT,rank_z,sendtag+4,Communicator.getCommunicator(),&req1[4]); - MPI_Irecv(&recvCount_Z, 1,MPI_INT,rank_Z,recvtag+4,Communicator.getCommunicator(),&req2[4]); - MPI_Isend(&sendCount_Z, 1,MPI_INT,rank_Z,sendtag+5,Communicator.getCommunicator(),&req1[5]); - MPI_Irecv(&recvCount_z, 1,MPI_INT,rank_z,recvtag+5,Communicator.getCommunicator(),&req2[5]); + MPI_Isend(&sendCount_x, 1,MPI_INT,rank_x,sendtag+0,Communicator,&req1[0]); + MPI_Irecv(&recvCount_X, 1,MPI_INT,rank_X,recvtag+0,Communicator,&req2[0]); + MPI_Isend(&sendCount_X, 1,MPI_INT,rank_X,sendtag+1,Communicator,&req1[1]); + MPI_Irecv(&recvCount_x, 1,MPI_INT,rank_x,recvtag+1,Communicator,&req2[1]); + MPI_Isend(&sendCount_y, 1,MPI_INT,rank_y,sendtag+2,Communicator,&req1[2]); + MPI_Irecv(&recvCount_Y, 1,MPI_INT,rank_Y,recvtag+2,Communicator,&req2[2]); + MPI_Isend(&sendCount_Y, 1,MPI_INT,rank_Y,sendtag+3,Communicator,&req1[3]); + MPI_Irecv(&recvCount_y, 1,MPI_INT,rank_y,recvtag+3,Communicator,&req2[3]); + MPI_Isend(&sendCount_z, 1,MPI_INT,rank_z,sendtag+4,Communicator,&req1[4]); + MPI_Irecv(&recvCount_Z, 1,MPI_INT,rank_Z,recvtag+4,Communicator,&req2[4]); + MPI_Isend(&sendCount_Z, 1,MPI_INT,rank_Z,sendtag+5,Communicator,&req1[5]); + MPI_Irecv(&recvCount_z, 
1,MPI_INT,rank_z,recvtag+5,Communicator,&req2[5]); - MPI_Isend(&sendCount_xy, 1,MPI_INT,rank_xy,sendtag+6,Communicator.getCommunicator(),&req1[6]); - MPI_Irecv(&recvCount_XY, 1,MPI_INT,rank_XY,recvtag+6,Communicator.getCommunicator(),&req2[6]); - MPI_Isend(&sendCount_XY, 1,MPI_INT,rank_XY,sendtag+7,Communicator.getCommunicator(),&req1[7]); - MPI_Irecv(&recvCount_xy, 1,MPI_INT,rank_xy,recvtag+7,Communicator.getCommunicator(),&req2[7]); - MPI_Isend(&sendCount_Xy, 1,MPI_INT,rank_Xy,sendtag+8,Communicator.getCommunicator(),&req1[8]); - MPI_Irecv(&recvCount_xY, 1,MPI_INT,rank_xY,recvtag+8,Communicator.getCommunicator(),&req2[8]); - MPI_Isend(&sendCount_xY, 1,MPI_INT,rank_xY,sendtag+9,Communicator.getCommunicator(),&req1[9]); - MPI_Irecv(&recvCount_Xy, 1,MPI_INT,rank_Xy,recvtag+9,Communicator.getCommunicator(),&req2[9]); + MPI_Isend(&sendCount_xy, 1,MPI_INT,rank_xy,sendtag+6,Communicator,&req1[6]); + MPI_Irecv(&recvCount_XY, 1,MPI_INT,rank_XY,recvtag+6,Communicator,&req2[6]); + MPI_Isend(&sendCount_XY, 1,MPI_INT,rank_XY,sendtag+7,Communicator,&req1[7]); + MPI_Irecv(&recvCount_xy, 1,MPI_INT,rank_xy,recvtag+7,Communicator,&req2[7]); + MPI_Isend(&sendCount_Xy, 1,MPI_INT,rank_Xy,sendtag+8,Communicator,&req1[8]); + MPI_Irecv(&recvCount_xY, 1,MPI_INT,rank_xY,recvtag+8,Communicator,&req2[8]); + MPI_Isend(&sendCount_xY, 1,MPI_INT,rank_xY,sendtag+9,Communicator,&req1[9]); + MPI_Irecv(&recvCount_Xy, 1,MPI_INT,rank_Xy,recvtag+9,Communicator,&req2[9]); - MPI_Isend(&sendCount_xz, 1,MPI_INT,rank_xz,sendtag+10,Communicator.getCommunicator(),&req1[10]); - MPI_Irecv(&recvCount_XZ, 1,MPI_INT,rank_XZ,recvtag+10,Communicator.getCommunicator(),&req2[10]); - MPI_Isend(&sendCount_XZ, 1,MPI_INT,rank_XZ,sendtag+11,Communicator.getCommunicator(),&req1[11]); - MPI_Irecv(&recvCount_xz, 1,MPI_INT,rank_xz,recvtag+11,Communicator.getCommunicator(),&req2[11]); - MPI_Isend(&sendCount_Xz, 1,MPI_INT,rank_Xz,sendtag+12,Communicator.getCommunicator(),&req1[12]); - MPI_Irecv(&recvCount_xZ, 1,MPI_INT,rank_xZ,recvtag+12,Communicator.getCommunicator(),&req2[12]); - MPI_Isend(&sendCount_xZ, 1,MPI_INT,rank_xZ,sendtag+13,Communicator.getCommunicator(),&req1[13]); - MPI_Irecv(&recvCount_Xz, 1,MPI_INT,rank_Xz,recvtag+13,Communicator.getCommunicator(),&req2[13]); + MPI_Isend(&sendCount_xz, 1,MPI_INT,rank_xz,sendtag+10,Communicator,&req1[10]); + MPI_Irecv(&recvCount_XZ, 1,MPI_INT,rank_XZ,recvtag+10,Communicator,&req2[10]); + MPI_Isend(&sendCount_XZ, 1,MPI_INT,rank_XZ,sendtag+11,Communicator,&req1[11]); + MPI_Irecv(&recvCount_xz, 1,MPI_INT,rank_xz,recvtag+11,Communicator,&req2[11]); + MPI_Isend(&sendCount_Xz, 1,MPI_INT,rank_Xz,sendtag+12,Communicator,&req1[12]); + MPI_Irecv(&recvCount_xZ, 1,MPI_INT,rank_xZ,recvtag+12,Communicator,&req2[12]); + MPI_Isend(&sendCount_xZ, 1,MPI_INT,rank_xZ,sendtag+13,Communicator,&req1[13]); + MPI_Irecv(&recvCount_Xz, 1,MPI_INT,rank_Xz,recvtag+13,Communicator,&req2[13]); - MPI_Isend(&sendCount_yz, 1,MPI_INT,rank_yz,sendtag+14,Communicator.getCommunicator(),&req1[14]); - MPI_Irecv(&recvCount_YZ, 1,MPI_INT,rank_YZ,recvtag+14,Communicator.getCommunicator(),&req2[14]); - MPI_Isend(&sendCount_YZ, 1,MPI_INT,rank_YZ,sendtag+15,Communicator.getCommunicator(),&req1[15]); - MPI_Irecv(&recvCount_yz, 1,MPI_INT,rank_yz,recvtag+15,Communicator.getCommunicator(),&req2[15]); - MPI_Isend(&sendCount_Yz, 1,MPI_INT,rank_Yz,sendtag+16,Communicator.getCommunicator(),&req1[16]); - MPI_Irecv(&recvCount_yZ, 1,MPI_INT,rank_yZ,recvtag+16,Communicator.getCommunicator(),&req2[16]); - MPI_Isend(&sendCount_yZ, 
1,MPI_INT,rank_yZ,sendtag+17,Communicator.getCommunicator(),&req1[17]); - MPI_Irecv(&recvCount_Yz, 1,MPI_INT,rank_Yz,recvtag+17,Communicator.getCommunicator(),&req2[17]); + MPI_Isend(&sendCount_yz, 1,MPI_INT,rank_yz,sendtag+14,Communicator,&req1[14]); + MPI_Irecv(&recvCount_YZ, 1,MPI_INT,rank_YZ,recvtag+14,Communicator,&req2[14]); + MPI_Isend(&sendCount_YZ, 1,MPI_INT,rank_YZ,sendtag+15,Communicator,&req1[15]); + MPI_Irecv(&recvCount_yz, 1,MPI_INT,rank_yz,recvtag+15,Communicator,&req2[15]); + MPI_Isend(&sendCount_Yz, 1,MPI_INT,rank_Yz,sendtag+16,Communicator,&req1[16]); + MPI_Irecv(&recvCount_yZ, 1,MPI_INT,rank_yZ,recvtag+16,Communicator,&req2[16]); + MPI_Isend(&sendCount_yZ, 1,MPI_INT,rank_yZ,sendtag+17,Communicator,&req1[17]); + MPI_Irecv(&recvCount_Yz, 1,MPI_INT,rank_Yz,recvtag+17,Communicator,&req2[17]); MPI_Waitall(18,req1,stat1); MPI_Waitall(18,req2,stat2); - Communicator.barrier(); + MPI_Barrier(Communicator); } //*************************************************************************************** -inline void CommunicateRecvLists( const Utilities::MPI& Communicator, int sendtag, int recvtag, +inline void CommunicateRecvLists( MPI_Comm Communicator, int sendtag, int recvtag, int *sendList_x, int *sendList_y, int *sendList_z, int *sendList_X, int *sendList_Y, int *sendList_Z, int *sendList_xy, int *sendList_XY, int *sendList_xY, int *sendList_Xy, int *sendList_xz, int *sendList_XZ, int *sendList_xZ, int *sendList_Xz, @@ -221,52 +223,52 @@ inline void CommunicateRecvLists( const Utilities::MPI& Communicator, int sendta { MPI_Request req1[18], req2[18]; MPI_Status stat1[18],stat2[18]; - MPI_Isend(sendList_x, sendCount_x,MPI_INT,rank_x,sendtag,Communicator.getCommunicator(),&req1[0]); - MPI_Irecv(recvList_X, recvCount_X,MPI_INT,rank_X,recvtag,Communicator.getCommunicator(),&req2[0]); - MPI_Isend(sendList_X, sendCount_X,MPI_INT,rank_X,sendtag,Communicator.getCommunicator(),&req1[1]); - MPI_Irecv(recvList_x, recvCount_x,MPI_INT,rank_x,recvtag,Communicator.getCommunicator(),&req2[1]); - MPI_Isend(sendList_y, sendCount_y,MPI_INT,rank_y,sendtag,Communicator.getCommunicator(),&req1[2]); - MPI_Irecv(recvList_Y, recvCount_Y,MPI_INT,rank_Y,recvtag,Communicator.getCommunicator(),&req2[2]); - MPI_Isend(sendList_Y, sendCount_Y,MPI_INT,rank_Y,sendtag,Communicator.getCommunicator(),&req1[3]); - MPI_Irecv(recvList_y, recvCount_y,MPI_INT,rank_y,recvtag,Communicator.getCommunicator(),&req2[3]); - MPI_Isend(sendList_z, sendCount_z,MPI_INT,rank_z,sendtag,Communicator.getCommunicator(),&req1[4]); - MPI_Irecv(recvList_Z, recvCount_Z,MPI_INT,rank_Z,recvtag,Communicator.getCommunicator(),&req2[4]); - MPI_Isend(sendList_Z, sendCount_Z,MPI_INT,rank_Z,sendtag,Communicator.getCommunicator(),&req1[5]); - MPI_Irecv(recvList_z, recvCount_z,MPI_INT,rank_z,recvtag,Communicator.getCommunicator(),&req2[5]); + MPI_Isend(sendList_x, sendCount_x,MPI_INT,rank_x,sendtag,Communicator,&req1[0]); + MPI_Irecv(recvList_X, recvCount_X,MPI_INT,rank_X,recvtag,Communicator,&req2[0]); + MPI_Isend(sendList_X, sendCount_X,MPI_INT,rank_X,sendtag,Communicator,&req1[1]); + MPI_Irecv(recvList_x, recvCount_x,MPI_INT,rank_x,recvtag,Communicator,&req2[1]); + MPI_Isend(sendList_y, sendCount_y,MPI_INT,rank_y,sendtag,Communicator,&req1[2]); + MPI_Irecv(recvList_Y, recvCount_Y,MPI_INT,rank_Y,recvtag,Communicator,&req2[2]); + MPI_Isend(sendList_Y, sendCount_Y,MPI_INT,rank_Y,sendtag,Communicator,&req1[3]); + MPI_Irecv(recvList_y, recvCount_y,MPI_INT,rank_y,recvtag,Communicator,&req2[3]); + MPI_Isend(sendList_z, 
sendCount_z,MPI_INT,rank_z,sendtag,Communicator,&req1[4]); + MPI_Irecv(recvList_Z, recvCount_Z,MPI_INT,rank_Z,recvtag,Communicator,&req2[4]); + MPI_Isend(sendList_Z, sendCount_Z,MPI_INT,rank_Z,sendtag,Communicator,&req1[5]); + MPI_Irecv(recvList_z, recvCount_z,MPI_INT,rank_z,recvtag,Communicator,&req2[5]); - MPI_Isend(sendList_xy, sendCount_xy,MPI_INT,rank_xy,sendtag,Communicator.getCommunicator(),&req1[6]); - MPI_Irecv(recvList_XY, recvCount_XY,MPI_INT,rank_XY,recvtag,Communicator.getCommunicator(),&req2[6]); - MPI_Isend(sendList_XY, sendCount_XY,MPI_INT,rank_XY,sendtag,Communicator.getCommunicator(),&req1[7]); - MPI_Irecv(recvList_xy, recvCount_xy,MPI_INT,rank_xy,recvtag,Communicator.getCommunicator(),&req2[7]); - MPI_Isend(sendList_Xy, sendCount_Xy,MPI_INT,rank_Xy,sendtag,Communicator.getCommunicator(),&req1[8]); - MPI_Irecv(recvList_xY, recvCount_xY,MPI_INT,rank_xY,recvtag,Communicator.getCommunicator(),&req2[8]); - MPI_Isend(sendList_xY, sendCount_xY,MPI_INT,rank_xY,sendtag,Communicator.getCommunicator(),&req1[9]); - MPI_Irecv(recvList_Xy, recvCount_Xy,MPI_INT,rank_Xy,recvtag,Communicator.getCommunicator(),&req2[9]); + MPI_Isend(sendList_xy, sendCount_xy,MPI_INT,rank_xy,sendtag,Communicator,&req1[6]); + MPI_Irecv(recvList_XY, recvCount_XY,MPI_INT,rank_XY,recvtag,Communicator,&req2[6]); + MPI_Isend(sendList_XY, sendCount_XY,MPI_INT,rank_XY,sendtag,Communicator,&req1[7]); + MPI_Irecv(recvList_xy, recvCount_xy,MPI_INT,rank_xy,recvtag,Communicator,&req2[7]); + MPI_Isend(sendList_Xy, sendCount_Xy,MPI_INT,rank_Xy,sendtag,Communicator,&req1[8]); + MPI_Irecv(recvList_xY, recvCount_xY,MPI_INT,rank_xY,recvtag,Communicator,&req2[8]); + MPI_Isend(sendList_xY, sendCount_xY,MPI_INT,rank_xY,sendtag,Communicator,&req1[9]); + MPI_Irecv(recvList_Xy, recvCount_Xy,MPI_INT,rank_Xy,recvtag,Communicator,&req2[9]); - MPI_Isend(sendList_xz, sendCount_xz,MPI_INT,rank_xz,sendtag,Communicator.getCommunicator(),&req1[10]); - MPI_Irecv(recvList_XZ, recvCount_XZ,MPI_INT,rank_XZ,recvtag,Communicator.getCommunicator(),&req2[10]); - MPI_Isend(sendList_XZ, sendCount_XZ,MPI_INT,rank_XZ,sendtag,Communicator.getCommunicator(),&req1[11]); - MPI_Irecv(recvList_xz, recvCount_xz,MPI_INT,rank_xz,recvtag,Communicator.getCommunicator(),&req2[11]); - MPI_Isend(sendList_Xz, sendCount_Xz,MPI_INT,rank_Xz,sendtag,Communicator.getCommunicator(),&req1[12]); - MPI_Irecv(recvList_xZ, recvCount_xZ,MPI_INT,rank_xZ,recvtag,Communicator.getCommunicator(),&req2[12]); - MPI_Isend(sendList_xZ, sendCount_xZ,MPI_INT,rank_xZ,sendtag,Communicator.getCommunicator(),&req1[13]); - MPI_Irecv(recvList_Xz, recvCount_Xz,MPI_INT,rank_Xz,recvtag,Communicator.getCommunicator(),&req2[13]); + MPI_Isend(sendList_xz, sendCount_xz,MPI_INT,rank_xz,sendtag,Communicator,&req1[10]); + MPI_Irecv(recvList_XZ, recvCount_XZ,MPI_INT,rank_XZ,recvtag,Communicator,&req2[10]); + MPI_Isend(sendList_XZ, sendCount_XZ,MPI_INT,rank_XZ,sendtag,Communicator,&req1[11]); + MPI_Irecv(recvList_xz, recvCount_xz,MPI_INT,rank_xz,recvtag,Communicator,&req2[11]); + MPI_Isend(sendList_Xz, sendCount_Xz,MPI_INT,rank_Xz,sendtag,Communicator,&req1[12]); + MPI_Irecv(recvList_xZ, recvCount_xZ,MPI_INT,rank_xZ,recvtag,Communicator,&req2[12]); + MPI_Isend(sendList_xZ, sendCount_xZ,MPI_INT,rank_xZ,sendtag,Communicator,&req1[13]); + MPI_Irecv(recvList_Xz, recvCount_Xz,MPI_INT,rank_Xz,recvtag,Communicator,&req2[13]); - MPI_Isend(sendList_yz, sendCount_yz,MPI_INT,rank_yz,sendtag,Communicator.getCommunicator(),&req1[14]); - MPI_Irecv(recvList_YZ, 
recvCount_YZ,MPI_INT,rank_YZ,recvtag,Communicator.getCommunicator(),&req2[14]); - MPI_Isend(sendList_YZ, sendCount_YZ,MPI_INT,rank_YZ,sendtag,Communicator.getCommunicator(),&req1[15]); - MPI_Irecv(recvList_yz, recvCount_yz,MPI_INT,rank_yz,recvtag,Communicator.getCommunicator(),&req2[15]); - MPI_Isend(sendList_Yz, sendCount_Yz,MPI_INT,rank_Yz,sendtag,Communicator.getCommunicator(),&req1[16]); - MPI_Irecv(recvList_yZ, recvCount_yZ,MPI_INT,rank_yZ,recvtag,Communicator.getCommunicator(),&req2[16]); - MPI_Isend(sendList_yZ, sendCount_yZ,MPI_INT,rank_yZ,sendtag,Communicator.getCommunicator(),&req1[17]); - MPI_Irecv(recvList_Yz, recvCount_Yz,MPI_INT,rank_Yz,recvtag,Communicator.getCommunicator(),&req2[17]); + MPI_Isend(sendList_yz, sendCount_yz,MPI_INT,rank_yz,sendtag,Communicator,&req1[14]); + MPI_Irecv(recvList_YZ, recvCount_YZ,MPI_INT,rank_YZ,recvtag,Communicator,&req2[14]); + MPI_Isend(sendList_YZ, sendCount_YZ,MPI_INT,rank_YZ,sendtag,Communicator,&req1[15]); + MPI_Irecv(recvList_yz, recvCount_yz,MPI_INT,rank_yz,recvtag,Communicator,&req2[15]); + MPI_Isend(sendList_Yz, sendCount_Yz,MPI_INT,rank_Yz,sendtag,Communicator,&req1[16]); + MPI_Irecv(recvList_yZ, recvCount_yZ,MPI_INT,rank_yZ,recvtag,Communicator,&req2[16]); + MPI_Isend(sendList_yZ, sendCount_yZ,MPI_INT,rank_yZ,sendtag,Communicator,&req1[17]); + MPI_Irecv(recvList_Yz, recvCount_Yz,MPI_INT,rank_Yz,recvtag,Communicator,&req2[17]); MPI_Waitall(18,req1,stat1); MPI_Waitall(18,req2,stat2); } //*************************************************************************************** -inline void CommunicateMeshHalo(DoubleArray &Mesh, const Utilities::MPI& Communicator, +inline void CommunicateMeshHalo(DoubleArray &Mesh, MPI_Comm Communicator, double *sendbuf_x,double *sendbuf_y,double *sendbuf_z,double *sendbuf_X,double *sendbuf_Y,double *sendbuf_Z, double *sendbuf_xy,double *sendbuf_XY,double *sendbuf_xY,double *sendbuf_Xy, double *sendbuf_xz,double *sendbuf_XZ,double *sendbuf_xZ,double *sendbuf_Xz, @@ -317,41 +319,41 @@ inline void CommunicateMeshHalo(DoubleArray &Mesh, const Utilities::MPI& Communi PackMeshData(sendList_YZ, sendCount_YZ ,sendbuf_YZ, MeshData); //...................................................................................... 
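// Illustrative aside, not part of the original patch: the exchanges that follow convert
// CommunicateMeshHalo from the Utilities::MPI wrapper to the raw MPI C API on an MPI_Comm.
// The per-direction pattern they preserve, sketched here for the x/X face pair using the
// buffer and count names of the surrounding code, is:
//
//   PackMeshData(sendList_x, sendCount_x, sendbuf_x, MeshData);          // gather boundary values
//   MPI_Sendrecv(sendbuf_x, sendCount_x, MPI_DOUBLE, rank_x, sendtag,    // blocking pairwise exchange
//                recvbuf_X, recvCount_X, MPI_DOUBLE, rank_X, recvtag,
//                Communicator, MPI_STATUS_IGNORE);
//   UnpackMeshData(recvList_X, recvCount_X, recvbuf_X, MeshData);        // scatter into the halo region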
MPI_Sendrecv(sendbuf_x,sendCount_x,MPI_DOUBLE,rank_x,sendtag, - recvbuf_X,recvCount_X,MPI_DOUBLE,rank_X,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_X,recvCount_X,MPI_DOUBLE,rank_X,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_X,sendCount_X,MPI_DOUBLE,rank_X,sendtag, - recvbuf_x,recvCount_x,MPI_DOUBLE,rank_x,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_x,recvCount_x,MPI_DOUBLE,rank_x,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_y,sendCount_y,MPI_DOUBLE,rank_y,sendtag, - recvbuf_Y,recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_Y,recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_Y,sendCount_Y,MPI_DOUBLE,rank_Y,sendtag, - recvbuf_y,recvCount_y,MPI_DOUBLE,rank_y,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_y,recvCount_y,MPI_DOUBLE,rank_y,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_z,sendCount_z,MPI_DOUBLE,rank_z,sendtag, - recvbuf_Z,recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_Z,recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_Z,sendCount_Z,MPI_DOUBLE,rank_Z,sendtag, - recvbuf_z,recvCount_z,MPI_DOUBLE,rank_z,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_z,recvCount_z,MPI_DOUBLE,rank_z,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_xy,sendCount_xy,MPI_DOUBLE,rank_xy,sendtag, - recvbuf_XY,recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_XY,recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_XY,sendCount_XY,MPI_DOUBLE,rank_XY,sendtag, - recvbuf_xy,recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_xy,recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_Xy,sendCount_Xy,MPI_DOUBLE,rank_Xy,sendtag, - recvbuf_xY,recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_xY,recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_xY,sendCount_xY,MPI_DOUBLE,rank_xY,sendtag, - recvbuf_Xy,recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_Xy,recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_xz,sendCount_xz,MPI_DOUBLE,rank_xz,sendtag, - recvbuf_XZ,recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_XZ,recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_XZ,sendCount_XZ,MPI_DOUBLE,rank_XZ,sendtag, - recvbuf_xz,recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_xz,recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_Xz,sendCount_Xz,MPI_DOUBLE,rank_Xz,sendtag, - recvbuf_xZ,recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_xZ,recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_xZ,sendCount_xZ,MPI_DOUBLE,rank_xZ,sendtag, - recvbuf_Xz,recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_Xz,recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,Communicator,MPI_STATUS_IGNORE); 
MPI_Sendrecv(sendbuf_yz,sendCount_yz,MPI_DOUBLE,rank_yz,sendtag, - recvbuf_YZ,recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_YZ,recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_YZ,sendCount_YZ,MPI_DOUBLE,rank_YZ,sendtag, - recvbuf_yz,recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_yz,recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_Yz,sendCount_Yz,MPI_DOUBLE,rank_Yz,sendtag, - recvbuf_yZ,recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_yZ,recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,Communicator,MPI_STATUS_IGNORE); MPI_Sendrecv(sendbuf_yZ,sendCount_yZ,MPI_DOUBLE,rank_yZ,sendtag, - recvbuf_Yz,recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,Communicator.getCommunicator(),MPI_STATUS_IGNORE); + recvbuf_Yz,recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,Communicator,MPI_STATUS_IGNORE); //........................................................................................ UnpackMeshData(recvList_x, recvCount_x ,recvbuf_x, MeshData); UnpackMeshData(recvList_X, recvCount_X ,recvbuf_X, MeshData); diff --git a/common/Communication.hpp b/common/Communication.hpp index ca310ea5..33fed3a7 100644 --- a/common/Communication.hpp +++ b/common/Communication.hpp @@ -2,8 +2,9 @@ #define COMMUNICATION_HPP_INC #include "common/Communication.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Utilities.h" +//#include "ProfilerApp.h" /******************************************************** @@ -11,19 +12,17 @@ ********************************************************/ template Array redistribute( const RankInfoStruct& src_rank, const Array& src_data, - const RankInfoStruct& dst_rank, std::array dst_size, const Utilities::MPI& comm ) + const RankInfoStruct& dst_rank, std::array dst_size, MPI_Comm comm ) { - if ( comm.getSize() == 1 ) { - return src_data.subset( { 0, (size_t) dst_size[0]-1, 0, (size_t) dst_size[1]-1, 0, (size_t) dst_size[2]-1 } ); - } +#ifdef USE_MPI // Get the src size std::array src_size; int size0[3] = { (int) src_data.size(0), (int) src_data.size(1), (int) src_data.size(2) }; - comm.maxReduce( size0, src_size.data(), 3 ); + MPI_Allreduce( size0, src_size.data(), 3, MPI_INT, MPI_MAX, comm ); if ( !src_data.empty() ) ASSERT( src_size[0] == size0[0] && src_size[1] == size0[1] && src_size[2] == size0[2] ); // Check that dst_size matches on all ranks - comm.maxReduce( dst_size.data(), size0, 3 ); + MPI_Allreduce( dst_size.data(), size0, 3, MPI_INT, MPI_MAX, comm ); ASSERT( dst_size[0] == size0[0] && dst_size[1] == size0[1] && dst_size[2] == size0[2] ); // Function to get overlap range auto calcOverlap = []( int i1[3], int i2[3], int j1[3], int j2[3] ) { @@ -61,7 +60,7 @@ Array redistribute( const RankInfoStruct& src_rank, const Array& src } std::vector send_request( send_rank.size() ); for (size_t i=0; i dst_data( dst_size[0], dst_size[1], dst_size[2] ); int i1[3] = { dst_size[0] * dst_rank.ix, dst_size[1] * dst_rank.jy, dst_size[2] * dst_rank.kz }; @@ -76,14 +75,17 @@ Array redistribute( const RankInfoStruct& src_rank, const Array& src continue; int rank = src_rank.getRankForBlock(i,j,k); Array data( index[1] - index[0] + 1, index[3] - index[2] + 1, index[5] - index[4] + 1 ); - comm.recv( data.data(), data.length(), rank, 5462 ); + MPI_Recv( data.data(), sizeof(TYPE)*data.length(), MPI_BYTE, rank, 5462, comm, MPI_STATUS_IGNORE ); 
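// Illustrative sketch, not part of the original patch: the MPI_Recv above receives the
// redistributed block as raw bytes with the fixed tag 5462; the sending side (not shown
// in this hunk) is assumed to post a matching byte-count MPI_Isend. A minimal
// self-contained version of that pairing, assuming a hypothetical trivially copyable
// element type and a pre-sized receive buffer:
#include <mpi.h>
#include <vector>

template<class TYPE>
void exchange_bytes(std::vector<TYPE> &out, std::vector<TYPE> &in,
                    int dst, int src, MPI_Comm comm)
{
    const int tag = 5462;   // same fixed tag on both sides, as in redistribute()
    MPI_Request req;
    // Post the non-blocking send first so the pairwise exchange cannot deadlock
    MPI_Isend(out.data(), (int) (sizeof(TYPE) * out.size()), MPI_BYTE, dst, tag, comm, &req);
    // Receive the incoming block directly into its typed buffer, counted in bytes
    MPI_Recv(in.data(), (int) (sizeof(TYPE) * in.size()), MPI_BYTE, src, tag, comm,
             MPI_STATUS_IGNORE);
    // Complete the send before the output buffer can be reused
    MPI_Wait(&req, MPI_STATUS_IGNORE);
}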
dst_data.copySubset( index, data ); } } } // Free data - comm.waitAll( send_request.size(), send_request.data() ); + MPI_Waitall( send_request.size(), send_request.data(), MPI_STATUSES_IGNORE ); return dst_data; +#else + return src_data.subset( { 0, dst_size[0]-1, 0, dst_size[1]-1, 0, dst_size[2]-1 ); +#endif } @@ -92,11 +94,27 @@ Array redistribute( const RankInfoStruct& src_rank, const Array& src * Structure to fill halo cells * ********************************************************/ template -fillHalo::fillHalo( const Utilities::MPI& comm_, const RankInfoStruct& info_, +fillHalo::fillHalo( MPI_Comm comm_, const RankInfoStruct& info_, std::array n_, std::array ng_, int tag0, int depth_, std::array fill, std::array periodic ): comm(comm_), info(info_), n(n_), ng(ng_), depth(depth_) { + if ( std::is_same() ) { + N_type = 1; + datatype = MPI_DOUBLE; + } else if ( std::is_same() ) { + N_type = 1; + datatype = MPI_FLOAT; + } else if ( sizeof(TYPE)%sizeof(double)==0 ) { + N_type = sizeof(TYPE) / sizeof(double); + datatype = MPI_DOUBLE; + } else if ( sizeof(TYPE)%sizeof(float)==0 ) { + N_type = sizeof(TYPE) / sizeof(float); + datatype = MPI_FLOAT; + } else { + N_type = sizeof(TYPE); + datatype = MPI_BYTE; + } // Set the fill pattern memset(fill_pattern,0,sizeof(fill_pattern)); if ( fill[0] ) { @@ -233,8 +251,8 @@ void fillHalo::fill( Array& data ) for (int k=0; k<3; k++) { if ( !fill_pattern[i][j][k] ) continue; - recv_req[i][j][k] = comm.Irecv( recv[i][j][k], depth2*N_send_recv[i][j][k], - info.rank[i][j][k], tag[2-i][2-j][2-k] ); + MPI_Irecv( recv[i][j][k], N_type*depth2*N_send_recv[i][j][k], datatype, + info.rank[i][j][k], tag[2-i][2-j][2-k], comm, &recv_req[i][j][k] ); } } } @@ -245,18 +263,19 @@ void fillHalo::fill( Array& data ) if ( !fill_pattern[i][j][k] ) continue; pack( data, i-1, j-1, k-1, send[i][j][k] ); - send_req[i][j][k] = comm.Isend( send[i][j][k], depth2*N_send_recv[i][j][k], - info.rank[i][j][k], tag[i][j][k] ); + MPI_Isend( send[i][j][k], N_type*depth2*N_send_recv[i][j][k], datatype, + info.rank[i][j][k], tag[i][j][k], comm, &send_req[i][j][k] ); } } } // Recv the dst data and unpack (we recive in reverse order to match the sends) + MPI_Status status; for (int i=2; i>=0; i--) { for (int j=2; j>=0; j--) { for (int k=2; k>=0; k--) { if ( !fill_pattern[i][j][k] ) continue; - comm.wait( recv_req[i][j][k] ); + MPI_Wait(&recv_req[i][j][k],&status); unpack( data, i-1, j-1, k-1, recv[i][j][k] ); } } @@ -267,7 +286,7 @@ void fillHalo::fill( Array& data ) for (int k=0; k<3; k++) { if ( !fill_pattern[i][j][k] ) continue; - comm.wait( send_req[i][j][k] ); + MPI_Wait(&send_req[i][j][k],&status); } } } diff --git a/common/Domain.cpp b/common/Domain.cpp index ff6c6b68..7da902c6 100644 --- a/common/Domain.cpp +++ b/common/Domain.cpp @@ -12,7 +12,7 @@ #include "common/Domain.h" #include "common/Array.h" #include "common/Utilities.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Communication.h" // Inline function to read line without a return argument @@ -62,10 +62,11 @@ Domain::Domain( int nx, int ny, int nz, int rnk, int npx, int npy, int npz, NULL_USE( npy ); NULL_USE( npz ); // set up the neighbor ranks - int myrank = Comm.getRank(); + int myrank; + MPI_Comm_rank( Comm, &myrank ); rank_info = RankInfoStruct( myrank, rank_info.nx, rank_info.ny, rank_info.nz ); - Comm.barrier(); + MPI_Barrier(Comm); auto db = std::make_shared( ); db->putScalar( "BC", BC ); @@ -75,9 +76,10 @@ Domain::Domain( int nx, int ny, int nz, int rnk, int npx, int npy, int npz, 
db->putVector( "L", { lx, ly, lz } ); initialize( db ); } -Domain::Domain( std::shared_ptr db, const Utilities::MPI& Communicator): +Domain::Domain( std::shared_ptr db, MPI_Comm Communicator): database(db), Nx(0), Ny(0), Nz(0), Lx(0), Ly(0), Lz(0), Volume(0), BoundaryCondition(0), + Comm(MPI_COMM_NULL), inlet_layers_x(0), inlet_layers_y(0), inlet_layers_z(0), outlet_layers_x(0), outlet_layers_y(0), outlet_layers_z(0), inlet_layers_phase(1),outlet_layers_phase(2), @@ -107,13 +109,14 @@ Domain::Domain( std::shared_ptr db, const Utilities::MPI& Communicator recvData_xY(NULL), recvData_yZ(NULL), recvData_Xz(NULL), recvData_XY(NULL), recvData_YZ(NULL), recvData_XZ(NULL), id(NULL) { - Comm = Communicator.dup(); + MPI_Comm_dup(Communicator,&Comm); // set up the neighbor ranks - int myrank = Comm.getRank(); + int myrank; + MPI_Comm_rank( Comm, &myrank ); initialize( db ); rank_info = RankInfoStruct( myrank, rank_info.nx, rank_info.ny, rank_info.nz ); - Comm.barrier(); + MPI_Barrier(Comm); } Domain::~Domain() @@ -162,6 +165,10 @@ Domain::~Domain() delete [] recvData_yZ; delete [] recvData_Yz; delete [] recvData_YZ; // Free id delete [] id; + // Free the communicator + if ( Comm != MPI_COMM_WORLD && Comm != MPI_COMM_NULL ) { + MPI_Comm_free(&Comm); + } } void Domain::initialize( std::shared_ptr db ) @@ -212,7 +219,8 @@ void Domain::initialize( std::shared_ptr db ) Ny = ny+2; Nz = nz+2; // Initialize ranks - int myrank = Comm.getRank(); + int myrank; + MPI_Comm_rank( Comm, &myrank ); rank_info = RankInfoStruct(myrank,nproc[0],nproc[1],nproc[2]); // inlet layers only apply to lower part of domain if (rank_info.ix > 0) inlet_layers_x = 0; @@ -231,7 +239,8 @@ void Domain::initialize( std::shared_ptr db ) id = new signed char[N]; memset(id,0,N); BoundaryCondition = d_db->getScalar("BC"); - int nprocs = Comm.getSize(); + int nprocs; + MPI_Comm_size( Comm, &nprocs ); INSIST(nprocs == nproc[0]*nproc[1]*nproc[2],"Fatal error in processor count!"); } @@ -560,7 +569,7 @@ void Domain::Decomp( const std::string& Filename ) } else{ //printf("Sending data to process %i \n", rnk); - Comm.send(loc_id,N,rnk,15); + MPI_Send(loc_id,N,MPI_CHAR,rnk,15,Comm); } // Write the data for this rank data sprintf(LocalRankFilename,"ID.%05i",rnk+rank_offset); @@ -575,10 +584,10 @@ void Domain::Decomp( const std::string& Filename ) else{ // Recieve the subdomain from rank = 0 //printf("Ready to recieve data %i at process %i \n", N,rank); - Comm.recv(id,N,0,15); + MPI_Recv(id,N,MPI_CHAR,0,15,Comm,MPI_STATUS_IGNORE); } - Comm.barrier(); - + //Comm.barrier(); + MPI_Barrier(Comm); // Compute the porosity double sum; double sum_local=0.0; @@ -618,7 +627,8 @@ void Domain::Decomp( const std::string& Filename ) } } } - sum = Comm.sumReduce(sum_local); + MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,Comm); + //sum = Comm.sumReduce(sum_local); porosity = sum*iVol_global; if (rank()==0) printf("Media porosity = %f \n",porosity); //......................................................... 
@@ -661,7 +671,7 @@ void Domain::AggregateLabels( const std::string& filename ){ } } } - Comm.barrier(); + MPI_Barrier(Comm); // populate the FullID if (rank() == 0){ @@ -687,7 +697,7 @@ void Domain::AggregateLabels( const std::string& filename ){ ipx = (rnk - ipz*npx*npy - ipy*npx); //printf("ipx=%i ipy=%i ipz=%i\n", ipx, ipy, ipz); int tag = 15+rnk; - Comm.recv(LocalID,local_size,rnk,tag); + MPI_Recv(LocalID,local_size,MPI_CHAR,rnk,tag,Comm,MPI_STATUS_IGNORE); for (int k=1; k db, const Utilities::MPI& Communicator); + Domain( std::shared_ptr db, MPI_Comm Communicator); //! Obsolete constructor Domain( int nx, int ny, int nz, int rnk, int npx, int npy, int npz, @@ -116,7 +116,7 @@ public: // Public variables (need to create accessors instead) double porosity; RankInfoStruct rank_info; - Utilities::MPI Comm; // MPI Communicator for this domain + MPI_Comm Comm; // MPI Communicator for this domain int BoundaryCondition; diff --git a/common/MPI.I b/common/MPI.I deleted file mode 100644 index 8cbc9c09..00000000 --- a/common/MPI.I +++ /dev/null @@ -1,1143 +0,0 @@ -// This file contains the default instantiations for templated operations -// Note: Intel compilers need definitions before all default instantions to compile correctly -#ifndef included_MPI_I -#define included_MPI_I - -#include "common/Utilities.h" - -#include - - -#define MPI_CLASS MPI -#define MPI_CLASS_ERROR ERROR -#define MPI_CLASS_ASSERT ASSERT - -#undef NULL_USE -#define NULL_USE( variable ) \ - do { \ - if ( 0 ) { \ - auto static t = (char *) &variable; \ - t++; \ - } \ - } while ( 0 ) - - -namespace Utilities { - - -// Function to test if a type is a std::pair -template -struct is_pair : std::false_type { -}; -template -struct is_pair> : std::true_type { -}; - - -// Function to test if a type can be passed by MPI -template -constexpr typename std::enable_if::value,bool>::type - is_mpi_copyable() -{ - return true; -} -template -constexpr typename std::enable_if::value&&is_pair::value,bool>::type - is_mpi_copyable() -{ - return is_mpi_copyable() && is_mpi_copyable(); -} -template -constexpr typename std::enable_if::value&&!is_pair::value,bool>::type - is_mpi_copyable() -{ - return false; -} - - -/************************************************************************ - * sumReduce * - ************************************************************************/ -template -inline TYPE MPI_CLASS::sumReduce( const TYPE value ) const -{ - if ( comm_size > 1 ) { - TYPE tmp = value; - call_sumReduce( &tmp, 1 ); - return tmp; - } else { - return value; - } -} -template -inline void MPI_CLASS::sumReduce( TYPE *x, const int n ) const -{ - if ( comm_size > 1 ) - call_sumReduce( x, n ); -} -template -inline void MPI_CLASS::sumReduce( const TYPE *x, TYPE *y, const int n ) const -{ - if ( comm_size > 1 ) { - call_sumReduce( x, y, n ); - } else { - for ( int i = 0; i < n; i++ ) - y[i] = x[i]; - } -} -// Define specializations of call_sumReduce(TYPE*, const int) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::call_sumReduce( unsigned char *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce( char *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce( unsigned int *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce( int *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce( unsigned long int *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce( long int *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce( size_t *, const int ) const; 
-template<> -void MPI_CLASS::call_sumReduce( float *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce( double *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce>( std::complex *, const int ) const; -#endif -// Default instantiations of call_sumReduce(TYPE*, const int) -template -void MPI_CLASS::call_sumReduce( TYPE *, const int ) const -{ - char message[200]; - sprintf( message, "Default instantion of sumReduce in parallel is not supported (%s)", - typeid( TYPE ).name() ); - MPI_CLASS_ERROR( message ); -} -// Define specializations of call_sumReduce(const TYPE*, TYPE*, const int) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::call_sumReduce( - const unsigned char *, unsigned char *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce( const char *, char *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce( - const unsigned int *, unsigned int *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce( const int *, int *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce( - const unsigned long int *, unsigned long int *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce( const long int *, long int *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce( const size_t *, size_t *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce( const float *, float *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce( const double *, double *, const int ) const; -template<> -void MPI_CLASS::call_sumReduce>( - const std::complex *, std::complex *, const int ) const; -#endif -// Default instantiations of call_sumReduce(const TYPE*, TYPE*, const int) -template -void MPI_CLASS::call_sumReduce( const TYPE *x, TYPE *y, const int n ) const -{ - NULL_USE( x ); - NULL_USE( y ); - NULL_USE( n ); - char message[200]; - sprintf( message, "Default instantion of sumReduce in parallel is not supported (%s)", - typeid( TYPE ).name() ); - MPI_CLASS_ERROR( message ); -} - - -/************************************************************************ - * minReduce * - ************************************************************************/ -template -inline TYPE MPI_CLASS::minReduce( const TYPE value ) const -{ - if ( comm_size > 1 ) { - TYPE tmp = value; - call_minReduce( &tmp, 1, nullptr ); - return tmp; - } else { - return value; - } -} -template -inline void MPI_CLASS::minReduce( TYPE *x, const int n, int *rank_of_min ) const -{ - if ( comm_size > 1 ) { - call_minReduce( x, n, rank_of_min ); - } else { - if ( rank_of_min != nullptr ) { - for ( int i = 0; i < n; i++ ) - rank_of_min[i] = 0; - } - } -} -template -inline void MPI_CLASS::minReduce( const TYPE *x, TYPE *y, const int n, int *rank_of_min ) const -{ - if ( comm_size > 1 ) { - call_minReduce( x, y, n, rank_of_min ); - } else { - for ( int i = 0; i < n; i++ ) { - y[i] = x[i]; - if ( rank_of_min != nullptr ) - rank_of_min[i] = 0; - } - } -} -// Define specializations of call_minReduce(TYPE*, const int, int*) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::call_minReduce( unsigned char *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( char *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( unsigned int *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( int *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( unsigned long int *, const int, int * ) const; -template<> -void 
MPI_CLASS::call_minReduce( long int *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( - unsigned long long int *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( long long int *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( size_t *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( float *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( double *, const int, int * ) const; -#endif -// Default instantiations of call_minReduce(TYPE*, const int, int*) -template -void MPI_CLASS::call_minReduce( TYPE *, const int, int * ) const -{ - char message[200]; - sprintf( message, "Default instantion of minReduce in parallel is not supported (%s)", - typeid( TYPE ).name() ); - MPI_CLASS_ERROR( message ); -} -// Define specializations of call_minReduce(const TYPE*, TYPE*, const int, int*) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::call_minReduce( - const unsigned char *, unsigned char *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( const char *, char *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( - const unsigned int *, unsigned int *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( const int *, int *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( - const unsigned long int *, unsigned long int *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( const long int *, long int *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( - const unsigned long long int *, unsigned long long int *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( - const long long int *, long long int *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( const size_t *, size_t *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( const float *, float *, const int, int * ) const; -template<> -void MPI_CLASS::call_minReduce( const double *, double *, const int, int * ) const; -#endif -// Default instantiations of call_minReduce(const TYPE*, TYPE*, const int, int*) -template -void MPI_CLASS::call_minReduce( const TYPE *, TYPE *, const int, int * ) const -{ - char message[200]; - sprintf( message, "Default instantion of minReduce in parallel is not supported (%s)", - typeid( TYPE ).name() ); - MPI_CLASS_ERROR( message ); -} - - -/************************************************************************ - * maxReduce * - ************************************************************************/ -template -inline TYPE MPI_CLASS::maxReduce( const TYPE value ) const -{ - if ( comm_size > 1 ) { - TYPE tmp = value; - call_maxReduce( &tmp, 1, nullptr ); - return tmp; - } else { - return value; - } -} -template -inline void MPI_CLASS::maxReduce( TYPE *x, const int n, int *rank_of_max ) const -{ - if ( comm_size > 1 ) { - call_maxReduce( x, n, rank_of_max ); - } else { - if ( rank_of_max != nullptr ) { - for ( int i = 0; i < n; i++ ) - rank_of_max[i] = 0; - } - } -} -template -inline void MPI_CLASS::maxReduce( const TYPE *x, TYPE *y, const int n, int *rank_of_max ) const -{ - if ( comm_size > 1 ) { - call_maxReduce( x, y, n, rank_of_max ); - } else { - for ( int i = 0; i < n; i++ ) { - y[i] = x[i]; - if ( rank_of_max != nullptr ) - rank_of_max[i] = 0; - } - } -} -// Define specializations of call_maxReduce(TYPE*, const int, int*) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void 
MPI_CLASS::call_maxReduce( unsigned char *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( char *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( unsigned int *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( int *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( unsigned long int *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( long int *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( - unsigned long long int *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( long long int *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( size_t *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( float *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( double *, const int, int * ) const; -#endif -// Default instantiations of call_maxReduce(TYPE*, const int, int*) -template -void MPI_CLASS::call_maxReduce( TYPE *, const int, int * ) const -{ - char message[200]; - sprintf( message, "Default instantion of maxReduce in parallel is not supported (%s)", - typeid( TYPE ).name() ); - MPI_CLASS_ERROR( message ); -} -// Define specializations of call_maxReduce(const TYPE*, TYPE*, const int, int*) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::call_maxReduce( - const unsigned char *, unsigned char *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( const char *, char *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( - const unsigned int *, unsigned int *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( const int *, int *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( - const unsigned long int *, unsigned long int *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( const long int *, long int *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( - const unsigned long long int *, unsigned long long int *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( - const long long int *, long long int *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( const size_t *, size_t *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( const float *, float *, const int, int * ) const; -template<> -void MPI_CLASS::call_maxReduce( const double *, double *, const int, int * ) const; -#endif -// Default instantiations of call_maxReduce(const TYPE*, TYPE*, const int, int*) -template -void MPI_CLASS::call_maxReduce( const TYPE *, TYPE *, const int, int * ) const -{ - char message[200]; - sprintf( message, "Default instantion of maxReduce in parallel is not supported (%s)", - typeid( TYPE ).name() ); - MPI_CLASS_ERROR( message ); -} - - -/************************************************************************ - * bcast * - ************************************************************************/ -// Define specializations of bcast(TYPE*, const int, const int) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::call_bcast( unsigned char *, const int, const int ) const; -template<> -void MPI_CLASS::call_bcast( char *, const int, const int ) const; -template<> -void MPI_CLASS::call_bcast( unsigned int *, const int, const int ) const; -template<> -void MPI_CLASS::call_bcast( int *, const int, const int ) const; -template<> -void MPI_CLASS::call_bcast( float 
*, const int, const int ) const; -template<> -void MPI_CLASS::call_bcast( double *, const int, const int ) const; -#else -template<> -void MPI_CLASS::call_bcast( char *, const int, const int ) const; -#endif -// Default instantiations of bcast(TYPE*, const int, const int) -template -void MPI_CLASS::call_bcast( TYPE *x, const int n, const int root ) const -{ - static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); - call_bcast( (char *) x, (int) n * sizeof( TYPE ), root ); -} -// Specialization of bcast for std::string -template<> -inline std::string MPI_CLASS::bcast( const std::string &value, const int root ) const -{ - if ( comm_size == 1 ) - return value; - int length = static_cast( value.size() ); - call_bcast( &length, 1, root ); - if ( length == 0 ) - return std::string(); - char *str = new char[length + 1]; - if ( root == comm_rank ) { - for ( int i = 0; i < length; i++ ) - str[i] = value[i]; - } - call_bcast( str, length, root ); - str[length] = 0; - std::string result( str ); - delete[] str; - return result; -} -template<> -inline void MPI_CLASS::bcast( std::string *, const int, const int ) const -{ - MPI_CLASS_ERROR( "Cannot bcast an array of strings" ); -} -// Default implimentation of bcast -template -inline TYPE MPI_CLASS::bcast( const TYPE &value, const int root ) const -{ - if ( root >= comm_size ) - MPI_CLASS_ERROR( "root cannot be >= size in bcast" ); - if ( comm_size > 1 ) { - TYPE tmp = value; - call_bcast( &tmp, 1, root ); - return tmp; - } else { - return value; - } -} -template -inline void MPI_CLASS::bcast( TYPE *x, const int n, const int root ) const -{ - if ( root >= comm_size ) - MPI_CLASS_ERROR( "root cannot be >= size in bcast" ); - if ( comm_size > 1 ) - call_bcast( x, n, root ); -} - - -/************************************************************************ - * send * - ************************************************************************/ -// Define specializations of send(const TYPE*, const int, const int, int) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::send( const char *, const int, const int, int ) const; -template<> -void MPI_CLASS::send( const int *, int, const int, int ) const; -template<> -void MPI_CLASS::send( const float *, const int, const int, int ) const; -template<> -void MPI_CLASS::send( const double *, const int, const int, int ) const; -#else -template<> -void MPI_CLASS::send( const char *, const int, const int, int ) const; -#endif -// Default instantiations of send(const TYPE*, const int, const int, int) -template -inline void MPI_CLASS::send( - const TYPE *buf, const int length, const int recv_proc_number, int tag ) const -{ - static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); - send( (const char *) buf, length * sizeof( TYPE ), recv_proc_number, tag ); -} - - -/************************************************************************ - * Isend * - ************************************************************************/ -// Define specializations of Isend(const TYPE*, const int, const int, const int) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -MPI_Request MPI_CLASS::Isend( const char *, const int, const int, const int ) const; -template<> -MPI_Request MPI_CLASS::Isend( const int *, int, const int, const int ) const; -template<> -MPI_Request MPI_CLASS::Isend( const float *, const int, const int, const int ) const; -template<> -MPI_Request MPI_CLASS::Isend( const double *, const int, const int, const int ) const; -#else -template<> -MPI_Request 
MPI_CLASS::Isend( const char *, const int, const int, const int ) const; -#endif -// Default instantiations of Isend(const TYPE*, const int, const int, const int) -template -inline MPI_Request MPI_CLASS::Isend( - const TYPE *buf, const int length, const int recv_proc_number, const int tag ) const -{ - static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); - return Isend( (const char *) buf, length * sizeof( TYPE ), recv_proc_number, tag ); -} - - -/************************************************************************ - * recv * - ************************************************************************/ -// Define specializations of recv(TYPE*, int&, const int, const bool, int) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::recv( char *, int &, const int, const bool, int ) const; -template<> -void MPI_CLASS::recv( int *, int &, const int, const bool, int ) const; -template<> -void MPI_CLASS::recv( float *, int &, const int, const bool, int ) const; -template<> -void MPI_CLASS::recv( double *, int &, const int, const bool, int ) const; -#else -template<> -void MPI_CLASS::recv( char *, int &, const int, const bool, int ) const; -#endif -// Default instantiations of recv(TYPE*, int&, const int, const bool, int) -template -inline void MPI_CLASS::recv( - TYPE *buf, int &length, const int send_proc_number, const bool get_length, int tag ) const -{ - static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); - int size = length * sizeof( TYPE ); - recv( (char *) buf, size, send_proc_number, get_length, tag ); - if ( get_length ) { - MPI_CLASS_ASSERT( size % sizeof( TYPE ) == 0 ); - length = size / sizeof( TYPE ); - } -} - - -/************************************************************************ - * Irecv * - ************************************************************************/ -// Define specializations of recv(TYPE*, int&, const int, const bool, int) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -MPI_Request MPI_CLASS::Irecv( char *, const int, const int, const int ) const; -template<> -MPI_Request MPI_CLASS::Irecv( int *, const int, const int, const int ) const; -template<> -MPI_Request MPI_CLASS::Irecv( float *, const int, const int, const int ) const; -template<> -MPI_Request MPI_CLASS::Irecv( double *, const int, const int, const int ) const; -#else -template<> -MPI_Request MPI_CLASS::Irecv( char *, const int, const int, const int ) const; -#endif -// Default instantiations of recv(TYPE*, int&, const int, const bool, int) -template -inline MPI_Request MPI_CLASS::Irecv( - TYPE *buf, const int length, const int send_proc, const int tag ) const -{ - static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); - return Irecv( (char *) buf, length * sizeof( TYPE ), send_proc, tag ); -} - - -/************************************************************************ - * allGather * - ************************************************************************/ -template -std::vector MPI_CLASS::allGather( const TYPE &x ) const -{ - static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); - if ( getSize() <= 1 ) - return std::vector( 1, x ); - std::vector data( getSize() ); - allGather( x, data.data() ); - return data; -} -template -std::vector MPI_CLASS::allGather( const std::vector &x ) const -{ - static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); - if ( getSize() <= 1 ) - return x; - std::vector count = allGather( x.size() ); - std::vector disp( getSize(), 0 ); - size_t 
N = count[0]; - for ( size_t i = 1; i < count.size(); i++ ) { - disp[i] = disp[i - 1] + count[i - 1]; - N += count[i]; - } - std::vector data( N ); - allGather( x.data(), x.size(), data.data(), count.data(), disp.data(), true ); - return data; -} -// Specialization of MPI_CLASS::allGather for std::string -template<> -inline void MPI_CLASS::allGather( const std::string &x_in, std::string *x_out ) const -{ - // Get the bytes recvied per processor - std::vector recv_cnt( comm_size, 0 ); - allGather( (int) x_in.size() + 1, &recv_cnt[0] ); - std::vector recv_disp( comm_size, 0 ); - for ( int i = 1; i < comm_size; i++ ) - recv_disp[i] = recv_disp[i - 1] + recv_cnt[i - 1]; - // Call the vector form of allGather for the char arrays - char *recv_data = new char[recv_disp[comm_size - 1] + recv_cnt[comm_size - 1]]; - allGather( - x_in.c_str(), (int) x_in.size() + 1, recv_data, &recv_cnt[0], &recv_disp[0], true ); - for ( int i = 0; i < comm_size; i++ ) - x_out[i] = std::string( &recv_data[recv_disp[i]] ); - delete[] recv_data; -} -// Default instantiation of MPI_CLASS::allGather -template -inline void MPI_CLASS::allGather( const TYPE &x_in, TYPE *x_out ) const -{ - static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); - if ( comm_size > 1 ) { - // We can use the vector form of allGather with a char array to ge the data we want - call_allGather( x_in, x_out ); - } else { - // Single processor case - x_out[0] = x_in; - } -} -// Specialization of MPI_CLASS::allGather for std::string -template<> -inline int MPI_CLASS::allGather( - const std::string *, const int, std::string *, int *, int *, bool ) const -{ - MPI_CLASS_ERROR( "Cannot allGather an array of strings" ); - return 0; -} -// Define specializations of call_allGather(const TYPE, TYPE*) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::call_allGather( const unsigned char &, unsigned char * ) const; -template<> -void MPI_CLASS::call_allGather( const char &, char * ) const; -template<> -void MPI_CLASS::call_allGather( const unsigned int &, unsigned int * ) const; -template<> -void MPI_CLASS::call_allGather( const int &, int * ) const; -template<> -void MPI_CLASS::call_allGather( - const unsigned long int &, unsigned long int * ) const; -template<> -void MPI_CLASS::call_allGather( const long int &, long int * ) const; -template<> -void MPI_CLASS::call_allGather( const float &, float * ) const; -template<> -void MPI_CLASS::call_allGather( const double &, double * ) const; -#endif -// Default instantiation of MPI_CLASS::allGather -template -int MPI_CLASS::allGather( const TYPE *send_data, const int send_cnt, TYPE *recv_data, int *recv_cnt, - int *recv_disp, bool known_recv ) const -{ - // Check the inputs - if ( known_recv && ( recv_cnt == nullptr || recv_disp == nullptr ) ) - MPI_CLASS_ERROR( "Error calling allGather" ); - // Check if we are dealing with a single processor - if ( comm_size == 1 ) { - if ( send_data == nullptr && send_cnt > 0 ) { - MPI_CLASS_ERROR( "send_data is null" ); - } else if ( !known_recv ) { - // We do not know the recieved sizes - for ( int i = 0; i < send_cnt; i++ ) - recv_data[i] = send_data[i]; - if ( recv_cnt != nullptr ) - recv_cnt[0] = send_cnt; - if ( recv_disp != nullptr ) - recv_disp[0] = 0; - } else { - // We know the recieved sizes - for ( int i = 0; i < send_cnt; i++ ) - recv_data[i + recv_disp[0]] = send_data[i]; - } - return send_cnt; - } - // Get the sizes of the recieved data (if necessary) - int *recv_cnt2 = recv_cnt; - int *recv_disp2 = recv_disp; - if ( 
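The variable-length allGather above follows the usual count/displacement recipe: exchange the per-rank counts, build displacements as an exclusive prefix sum, then gather the payload. A minimal standalone sketch of that recipe with plain MPI calls (allGatherInts is an illustrative name, not part of the patch):

#include <mpi.h>
#include <vector>

std::vector<int> allGatherInts( const std::vector<int> &local, MPI_Comm comm )
{
    int size = 1;
    MPI_Comm_size( comm, &size );
    // 1) exchange how many elements each rank will contribute
    int n_local = static_cast<int>( local.size() );
    std::vector<int> counts( size );
    MPI_Allgather( &n_local, 1, MPI_INT, counts.data(), 1, MPI_INT, comm );
    // 2) turn the counts into displacements (exclusive prefix sum)
    std::vector<int> disp( size, 0 );
    for ( int i = 1; i < size; i++ )
        disp[i] = disp[i - 1] + counts[i - 1];
    // 3) gather the actual payload
    std::vector<int> global( disp.back() + counts.back() );
    MPI_Allgatherv( local.data(), n_local, MPI_INT, global.data(),
        counts.data(), disp.data(), MPI_INT, comm );
    return global;
}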
!known_recv ) { - if ( recv_cnt == nullptr ) - recv_cnt2 = new int[comm_size]; - if ( recv_disp == nullptr ) - recv_disp2 = new int[comm_size]; - call_allGather( send_cnt, recv_cnt2 ); - recv_disp2[0] = 0; - for ( int i = 1; i < comm_size; i++ ) - recv_disp2[i] = recv_disp2[i - 1] + recv_cnt2[i - 1]; - } - int N_recv = 0; - for ( int i = 0; i < comm_size; i++ ) - N_recv += recv_cnt2[i]; - // Send/recv the data - call_allGather( send_data, send_cnt, recv_data, recv_cnt2, recv_disp2 ); - // Delete any temporary memory - if ( recv_cnt == nullptr ) - delete[] recv_cnt2; - if ( recv_disp == nullptr ) - delete[] recv_disp2; - return N_recv; -} -// Default instantiations of call_allGather(const TYPE, TYPE*) -template -void MPI_CLASS::call_allGather( const TYPE &x_in, TYPE *x_out ) const -{ - static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); - allGather( (const char *) &x_in, (int) sizeof( TYPE ), (char *) x_out ); -} -// Define specializations of call_allGather(const TYPE*, int, TYPE*, int*, int*) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::call_allGather( - const unsigned char *, int, unsigned char *, int *, int * ) const; -template<> -void MPI_CLASS::call_allGather( const char *, int, char *, int *, int * ) const; -template<> -void MPI_CLASS::call_allGather( - const unsigned int *, int, unsigned int *, int *, int * ) const; -template<> -void MPI_CLASS::call_allGather( const int *, int, int *, int *, int * ) const; -template<> -void MPI_CLASS::call_allGather( - const unsigned long int *, int, unsigned long int *, int *, int * ) const; -template<> -void MPI_CLASS::call_allGather( const long int *, int, long int *, int *, int * ) const; -template<> -void MPI_CLASS::call_allGather( const float *, int, float *, int *, int * ) const; -template<> -void MPI_CLASS::call_allGather( const double *, int, double *, int *, int * ) const; -#else -template<> -void MPI_CLASS::call_allGather( const char *, int, char *, int *, int * ) const; -#endif -// Default instantiations of int call_allGather(const TYPE*, int, TYPE*, int*) -template -void MPI_CLASS::call_allGather( - const TYPE *x_in, int size_in, TYPE *x_out, int *size_out, int *disp_out ) const -{ - int *size2 = new int[comm_size]; - int *disp2 = new int[comm_size]; - for ( int i = 0; i < comm_size; i++ ) { - size2[i] = size_out[i] * sizeof( TYPE ); - disp2[i] = disp_out[i] * sizeof( TYPE ); - } - static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); - call_allGather( - (const char *) x_in, (int) size_in * sizeof( TYPE ), (char *) x_out, size2, disp2 ); - delete[] size2; - delete[] disp2; -} - - -/************************************************************************ - * setGather * - ************************************************************************/ -template -inline void MPI_CLASS::setGather( std::set &set ) const -{ - std::vector send_buf( set.begin(), set.end() ); - std::vector recv_cnt( this->comm_size, 0 ); - this->allGather( (int) send_buf.size(), &recv_cnt[0] ); - std::vector recv_disp( this->comm_size, 0 ); - for ( int i = 1; i < this->comm_size; i++ ) - recv_disp[i] = recv_disp[i - 1] + recv_cnt[i - 1]; - size_t N_recv_tot = 0; - for ( int i = 0; i < this->comm_size; i++ ) - N_recv_tot += recv_cnt[i]; - if ( N_recv_tot == 0 ) - return; - std::vector recv_buf( N_recv_tot ); - TYPE *send_data = nullptr; - if ( send_buf.size() > 0 ) { - send_data = &send_buf[0]; - } - TYPE *recv_data = &recv_buf[0]; - static_assert( is_mpi_copyable(), "Object is not trivially 
copyable" ); - this->allGather( - send_data, (int) send_buf.size(), recv_data, &recv_cnt[0], &recv_disp[0], true ); - for ( size_t i = 0; i < recv_buf.size(); i++ ) - set.insert( recv_buf[i] ); -} - - -/************************************************************************ - * mapGather * - ************************************************************************/ -template -inline void MPI_CLASS::mapGather( std::map &map ) const -{ - std::vector send_id; - std::vector send_data; - send_id.reserve( map.size() ); - send_data.reserve( map.size() ); - for ( auto it = map.begin(); it != map.end(); ++it ) { - send_id.push_back( it->first ); - send_data.push_back( it->second ); - } - int send_size = (int) send_id.size(); - std::vector recv_cnt( this->comm_size, 0 ); - this->allGather( send_size, &recv_cnt[0] ); - std::vector recv_disp( this->comm_size, 0 ); - for ( int i = 1; i < this->comm_size; i++ ) - recv_disp[i] = recv_disp[i - 1] + recv_cnt[i - 1]; - size_t N_recv_tot = 0; - for ( int i = 0; i < this->comm_size; i++ ) - N_recv_tot += recv_cnt[i]; - if ( N_recv_tot == 0 ) - return; - std::vector recv_id( N_recv_tot ); - std::vector recv_data( N_recv_tot ); - KEY *send_data1 = nullptr; - DATA *send_data2 = nullptr; - if ( send_id.size() > 0 ) { - send_data1 = &send_id[0]; - send_data2 = &send_data[0]; - } - static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); - this->allGather( send_data1, send_size, &recv_id[0], &recv_cnt[0], &recv_disp[0], true ); - this->allGather( - send_data2, send_size, &recv_data[0], &recv_cnt[0], &recv_disp[0], true ); - map = std::map(); - for ( size_t i = 0; i < N_recv_tot; i++ ) - map.insert( std::pair( recv_id[i], recv_data[i] ) ); -} - - -/************************************************************************ - * sumScan * - ************************************************************************/ -template -inline void MPI_CLASS::sumScan( const TYPE *x, TYPE *y, const int n ) const -{ - if ( comm_size > 1 ) { - call_sumScan( x, y, n ); - } else { - for ( int i = 0; i < n; i++ ) - y[i] = x[i]; - } -} -// Define specializations of call_sumScan(const TYPE*, TYPE*, int) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::call_sumScan( const unsigned char *, unsigned char *, int ) const; -template<> -void MPI_CLASS::call_sumScan( const char *, char *, int ) const; -template<> -void MPI_CLASS::call_sumScan( const unsigned int *, unsigned int *, int ) const; -template<> -void MPI_CLASS::call_sumScan( const int *, int *, int ) const; -template<> -void MPI_CLASS::call_sumScan( - const unsigned long int *, unsigned long int *, int ) const; -template<> -void MPI_CLASS::call_sumScan( const long int *, long int *, int ) const; -template<> -void MPI_CLASS::call_sumScan( const size_t *, size_t *, int ) const; -template<> -void MPI_CLASS::call_sumScan( const float *, float *, int ) const; -template<> -void MPI_CLASS::call_sumScan( const double *, double *, int ) const; -template<> -void MPI_CLASS::call_sumScan>( - const std::complex *, std::complex *, int ) const; -#endif -// Default instantiations of call_sumScan(const TYPE*, TYPE*, int) -template -void MPI_CLASS::call_sumScan( const TYPE *, TYPE *, int ) const -{ - char message[200]; - sprintf( message, "Default instantion of sumScan in parallel is not supported (%s)", - typeid( TYPE ).name() ); - MPI_CLASS_ERROR( message ); -} - - -/************************************************************************ - * minScan * - 
************************************************************************/ -template -inline void MPI_CLASS::minScan( const TYPE *x, TYPE *y, const int n ) const -{ - if ( comm_size > 1 ) { - call_minScan( x, y, n ); - } else { - for ( int i = 0; i < n; i++ ) - y[i] = x[i]; - } -} -// Define specializations of call_minScan(const TYPE*, TYPE*, int) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::call_minScan( const unsigned char *, unsigned char *, int ) const; -template<> -void MPI_CLASS::call_minScan( const char *, char *, int ) const; -template<> -void MPI_CLASS::call_minScan( const unsigned int *, unsigned int *, int ) const; -template<> -void MPI_CLASS::call_minScan( const int *, int *, int ) const; -template<> -void MPI_CLASS::call_minScan( - const unsigned long int *, unsigned long int *, int ) const; -template<> -void MPI_CLASS::call_minScan( const long int *, long int *, int ) const; -template<> -void MPI_CLASS::call_minScan( const size_t *, size_t *, int ) const; -template<> -void MPI_CLASS::call_minScan( const float *, float *, int ) const; -template<> -void MPI_CLASS::call_minScan( const double *, double *, int ) const; -#endif -// Default instantiations of call_minScan(const TYPE*, TYPE*, int) -template -void MPI_CLASS::call_minScan( const TYPE *, TYPE *, int ) const -{ - char message[200]; - sprintf( message, "Default instantion of minScan in parallel is not supported (%s)", - typeid( TYPE ).name() ); - MPI_CLASS_ERROR( message ); -} - - -/************************************************************************ - * maxScan * - ************************************************************************/ -template -inline void MPI_CLASS::maxScan( const TYPE *x, TYPE *y, const int n ) const -{ - if ( comm_size > 1 ) { - call_maxScan( x, y, n ); - } else { - for ( int i = 0; i < n; i++ ) - y[i] = x[i]; - } -} -// Define specializations of call_maxScan(const TYPE*, TYPE*, int) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::call_maxScan( const unsigned char *, unsigned char *, int ) const; -template<> -void MPI_CLASS::call_maxScan( const char *, char *, int ) const; -template<> -void MPI_CLASS::call_maxScan( const unsigned int *, unsigned int *, int ) const; -template<> -void MPI_CLASS::call_maxScan( const int *, int *, int ) const; -template<> -void MPI_CLASS::call_maxScan( - const unsigned long int *, unsigned long int *, int ) const; -template<> -void MPI_CLASS::call_maxScan( const long int *, long int *, int ) const; -template<> -void MPI_CLASS::call_maxScan( const size_t *, size_t *, int ) const; -template<> -void MPI_CLASS::call_maxScan( const float *, float *, int ) const; -template<> -void MPI_CLASS::call_maxScan( const double *, double *, int ) const; -#endif -// Default instantiations of call_maxScan(const TYPE*, TYPE*, int) -template -void MPI_CLASS::call_maxScan( const TYPE *, TYPE *, int ) const -{ - char message[200]; - sprintf( message, "Default instantion of maxReduce in parallel is not supported (%s)", - typeid( TYPE ).name() ); - MPI_CLASS_ERROR( message ); -} - - -/************************************************************************ - * allToAll * - ************************************************************************/ -// Define specializations of allToAll(const int n, const char*, char* ) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::allToAll( - const int n, const unsigned char *, unsigned char * ) const; -template<> -void MPI_CLASS::allToAll( const int n, const 
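The scan helpers above (sumScan, minScan, maxScan) forward in parallel to MPI_Scan with the matching operation. A minimal sketch of the sum case, assuming double data (sumScanDouble is an illustrative name only):

#include <mpi.h>

// On rank r, y[i] becomes the sum of x[i] over ranks 0..r (inclusive scan).
// MPI_MIN or MPI_MAX in place of MPI_SUM gives the min/max scan variants.
void sumScanDouble( const double *x, double *y, int n, MPI_Comm comm )
{
    MPI_Scan( x, y, n, MPI_DOUBLE, MPI_SUM, comm );
}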
char *, char * ) const; -template<> -void MPI_CLASS::allToAll( const int n, const unsigned int *, unsigned int * ) const; -template<> -void MPI_CLASS::allToAll( const int n, const int *, int * ) const; -template<> -void MPI_CLASS::allToAll( - const int n, const unsigned long int *, unsigned long int * ) const; -template<> -void MPI_CLASS::allToAll( const int n, const long int *, long int * ) const; -template<> -void MPI_CLASS::allToAll( const int n, const float *, float * ) const; -template<> -void MPI_CLASS::allToAll( const int n, const double *, double * ) const; -#endif -// Default instantiations of allToAll(const int n, const char*, char* ) -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template -void MPI_CLASS::allToAll( const int n, const TYPE *send_data, TYPE *recv_data ) const -{ - static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); - allToAll( n * sizeof( TYPE ), (char *) send_data, (char *) recv_data ); -} -#else -template -void MPI_CLASS::allToAll( const int n, const TYPE *send_data, TYPE *recv_data ) const -{ - if ( comm_size != 1 ) - MPI_CLASS_ERROR( "Invalid size for allToAll" ); - for ( int i = 0; i < n; i++ ) - recv_data[i] = send_data[i]; -} -#endif - - -/************************************************************************ - * allToAll * - ************************************************************************/ -template -int MPI_CLASS::allToAll( const TYPE *send_data, const int send_cnt[], const int send_disp[], - TYPE *recv_data, int *recv_cnt, int *recv_disp, bool known_recv ) const -{ - int N_recieved = 0; - if ( comm_size == 1 ) { - // Special case for single-processor communicators - if ( known_recv ) { - if ( recv_cnt[0] != send_cnt[0] && send_cnt[0] > 0 ) - MPI_CLASS_ERROR( "Single processor send/recv are different sizes" ); - } else { - if ( recv_cnt != nullptr ) - recv_cnt[0] = send_cnt[0]; - if ( recv_disp != nullptr ) - recv_disp[0] = send_disp[0]; - } - for ( int i = 0; i < send_cnt[0]; i++ ) - recv_data[i + recv_disp[0]] = send_data[i + send_disp[0]]; - N_recieved = send_cnt[0]; - } else if ( known_recv ) { - // The recieve sizes are known - MPI_CLASS_ASSERT( recv_cnt != nullptr && recv_disp != nullptr ); - call_allToAll( send_data, send_cnt, send_disp, recv_data, recv_cnt, recv_disp ); - for ( int i = 0; i < comm_size; i++ ) - N_recieved += recv_cnt[i]; - } else { - // The recieve sizes are not known, we need to communicate that information first - int *recv_cnt2 = recv_cnt; - int *recv_disp2 = recv_disp; - if ( recv_cnt == nullptr ) - recv_cnt2 = new int[comm_size]; - if ( recv_disp == nullptr ) - recv_disp2 = new int[comm_size]; - // Communicate the size we will be recieving from each processor - allToAll( 1, send_cnt, recv_cnt2 ); - recv_disp2[0] = 0; - for ( int i = 1; i < comm_size; i++ ) - recv_disp2[i] = recv_disp2[i - 1] + recv_cnt2[i - 1]; - // Send the data - call_allToAll( send_data, send_cnt, send_disp, recv_data, recv_cnt2, recv_disp2 ); - for ( int i = 0; i < comm_size; i++ ) - N_recieved += recv_cnt2[i]; - if ( recv_cnt == nullptr ) - delete[] recv_cnt2; - if ( recv_disp == nullptr ) - delete[] recv_disp2; - } - return N_recieved; -} -// Define specializations of call_allToAll -#if defined( USE_MPI ) || defined( USE_EXT_MPI ) -template<> -void MPI_CLASS::call_allToAll( const unsigned char *, const int *, const int *, - unsigned char *, const int *, const int * ) const; -template<> -void MPI_CLASS::call_allToAll( - const char *, const int *, const int *, char *, const int *, const int * ) const; -template<> -void 
MPI_CLASS::call_allToAll( const unsigned int *, const int *, const int *, - unsigned int *, const int *, const int * ) const; -template<> -void MPI_CLASS::call_allToAll( - const int *, const int *, const int *, int *, const int *, const int * ) const; -template<> -void MPI_CLASS::call_allToAll( const unsigned long int *, const int *, - const int *, unsigned long int *, const int *, const int * ) const; -template<> -void MPI_CLASS::call_allToAll( - const long int *, const int *, const int *, long int *, const int *, const int * ) const; -template<> -void MPI_CLASS::call_allToAll( - const float *, const int *, const int *, float *, const int *, const int * ) const; -template<> -void MPI_CLASS::call_allToAll( - const double *, const int *, const int *, double *, const int *, const int * ) const; -#else -template<> -void MPI_CLASS::call_allToAll( - const char *, const int *, const int *, char *, const int *, const int * ) const; -#endif -// Default instantiations of call_allToAll -template -void MPI_CLASS::call_allToAll( const TYPE *send_data, const int send_cnt[], const int send_disp[], - TYPE *recv_data, const int *recv_cnt, const int *recv_disp ) const -{ - int *send_cnt2 = new int[comm_size]; - int *recv_cnt2 = new int[comm_size]; - int *send_disp2 = new int[comm_size]; - int *recv_disp2 = new int[comm_size]; - for ( int i = 0; i < comm_size; i++ ) { - send_cnt2[i] = send_cnt[i] * sizeof( TYPE ); - send_disp2[i] = send_disp[i] * sizeof( TYPE ); - recv_cnt2[i] = recv_cnt[i] * sizeof( TYPE ); - recv_disp2[i] = recv_disp[i] * sizeof( TYPE ); - } - static_assert( is_mpi_copyable(), "Object is not trivially copyable" ); - call_allToAll( - (char *) send_data, send_cnt2, send_disp2, (char *) recv_data, recv_cnt2, recv_disp2 ); - delete[] send_cnt2; - delete[] recv_cnt2; - delete[] send_disp2; - delete[] recv_disp2; -} - - -} // namespace Utilities - -#endif diff --git a/common/MPI.cpp b/common/MPI.cpp deleted file mode 100644 index 73932d03..00000000 --- a/common/MPI.cpp +++ /dev/null @@ -1,3758 +0,0 @@ -// This file impliments a wrapper class for MPI functions - -#include "common/MPI.h" -#include "common/Utilities.h" - -#include "ProfilerApp.h" -#include "StackTrace/ErrorHandlers.h" -#include "StackTrace/StackTrace.h" - -// Include all other headers -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -// Include OS specific headers -#undef USE_WINDOWS -#undef USE_LINUX -#undef USE_MAC -#if defined( WIN32 ) || defined( _WIN32 ) || defined( WIN64 ) || defined( _WIN64 ) -// We are using windows -#define USE_WINDOWS -#include -#include -#define sched_yield() Sleep( 0 ) -#elif defined( __APPLE__ ) -// Using MAC -#define USE_MAC -#include -#elif defined( __linux ) || defined( __linux__ ) || defined( __unix ) || defined( __posix ) -// We are using linux -#define USE_LINUX -#include -#include -#else -#error Unknown OS -#endif - - -// Convience defines -#define MPI_ERROR ERROR -#define MPI_ASSERT ASSERT -#define MPI_INSIST INSIST -#define MPI_WARNING WARNING -#define MPI_CLASS_COMM_NULL MPI_COMM_NULL -#define MPI_CLASS_COMM_SELF MPI_COMM_SELF -#define MPI_CLASS_COMM_WORLD MPI_COMM_WORLD - - -// Global variable to track create new unique comms (dup and split) -#ifndef USE_MPI -MPI_Comm uniqueGlobalComm = 11; -#endif - - -#if defined( USE_SAMRAI ) && defined( USE_PETSC ) && !defined( USE_MPI ) -int MPI_REQUEST_NULL = 3; -int MPI_ERR_IN_STATUS = 4; -#endif - - -namespace Utilities { - - -// Some special structs to work with MPI -#ifdef USE_MPI 
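The default template instantiations in this header all fall back to the same idiom: a trivially copyable type is shipped through the char path by scaling counts and displacements by sizeof(TYPE). A minimal standalone sketch of that idiom for a broadcast (byteBcast is a hypothetical helper, not part of the patch):

#include <mpi.h>
#include <type_traits>

template<class TYPE>
void byteBcast( TYPE *x, int n, int root, MPI_Comm comm )
{
    // Only bit-copyable types may be sent as raw bytes
    static_assert( std::is_trivially_copyable<TYPE>::value,
        "Object is not trivially copyable" );
    MPI_Bcast( x, n * static_cast<int>( sizeof( TYPE ) ), MPI_BYTE, root, comm );
}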
-struct IntIntStruct { - int j; - int i; -}; -struct LongIntStruct { - long int j; - int i; -}; -struct FloatIntStruct { - float f; - int i; -}; -struct DoubleIntStruct { - double d; - int i; -}; -#endif - - -// Initialized the static member variables -volatile unsigned int MPI_CLASS::N_MPI_Comm_created = 0; -volatile unsigned int MPI_CLASS::N_MPI_Comm_destroyed = 0; -short MPI_CLASS::profile_level = 127; - - -// Define a type for use with size_t -#ifdef USE_MPI -static MPI_Datatype MPI_SIZE_T = 0x0; -static MPI_Datatype getSizeTDataType() -{ - int size_int, size_long, size_longlong, size_longlong2; - MPI_Type_size( MPI_UNSIGNED, &size_int ); - MPI_Type_size( MPI_UNSIGNED_LONG, &size_long ); - MPI_Type_size( MPI_UNSIGNED_LONG_LONG, &size_longlong ); - MPI_Type_size( MPI_LONG_LONG_INT, &size_longlong2 ); - if ( sizeof( size_t ) == size_int ) { - return MPI_UNSIGNED; - } else if ( sizeof( size_t ) == size_long ) { - return MPI_UNSIGNED_LONG; - } else if ( sizeof( size_t ) == size_longlong ) { - return MPI_UNSIGNED_LONG_LONG; - } else if ( sizeof( size_t ) == size_longlong2 ) { - MPI_WARNING( "Using signed long long datatype for size_t in MPI" ); - return MPI_LONG_LONG_INT; // Note: this is not unsigned - } else { - MPI_ERROR( "No suitable datatype found" ); - } - return 0; -} -#endif - - -// Static data for asyncronous communication without MPI -// Note: these routines may not be thread-safe yet -#ifndef USE_MPI -static const int mpi_max_tag = 0x003FFFFF; -struct Isendrecv_struct { - const char *data; // Pointer to data - int status; // Status: 1-sending, 2-recieving -}; -std::map global_isendrecv_list; -static MPI_Request getRequest( MPI_Comm comm, int tag ) -{ - MPI_ASSERT( tag >= 0 && tag <= mpi_max_tag ); - // Use hashing function: 2^64*0.5*(sqrt(5)-1) - uint64_t a = static_cast( comm ) * 0x9E3779B97F4A7C15; - uint64_t b = static_cast( tag ) * 0x9E3779B97F4A7C15; - uint64_t hash = a ^ b; - MPI_Request request; - memcpy( &request, &hash, sizeof( MPI_Request ) ); - return request; -} -#endif - - -// Check the mpi error code -#ifdef USE_MPI -inline void check_MPI( int error ) -{ - if ( error != MPI_SUCCESS ) - MPI_ERROR( "Error calling MPI routine" ); -} -#endif - - -/****************************************************************** - * Some helper functions to convert between signed/unsigned types * - ******************************************************************/ -DISABLE_WARNINGS -static inline constexpr unsigned int offset_int() -{ - return ~static_cast( std::numeric_limits::min() ) + 1; -} -static inline constexpr unsigned long int offset_long() -{ - return ~static_cast( std::numeric_limits::min() ) + 1; -} -static inline constexpr unsigned long long int offset_long_long() -{ - return ~static_cast( std::numeric_limits::min() ) + 1; -} -ENABLE_WARNINGS -static inline unsigned int signed_to_unsigned( int x ) -{ - const auto offset = offset_int(); - return ( x >= 0 ) ? static_cast( x ) + offset : - offset - static_cast( -x ); -} -static inline unsigned long int signed_to_unsigned( long int x ) -{ - const auto offset = offset_long(); - return ( x >= 0 ) ? static_cast( x ) + offset : - offset - static_cast( -x ); -} -static inline unsigned long long int signed_to_unsigned( long long int x ) -{ - const auto offset = offset_long_long(); - return ( x >= 0 ) ? static_cast( x ) + offset : - offset - static_cast( -x ); -} -static inline int unsigned_to_signed( unsigned int x ) -{ - const auto offset = offset_int(); - return ( x >= offset ) ? 
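getSizeTDataType() above selects an MPI datatype whose size matches size_t at runtime. A minimal standalone sketch of that selection (pickSizeTType is an illustrative name; the fallback to a signed type used by the original is omitted here):

#include <mpi.h>
#include <cstddef>

MPI_Datatype pickSizeTType()
{
    int s_uint = 0, s_ulong = 0, s_ull = 0;
    MPI_Type_size( MPI_UNSIGNED, &s_uint );
    MPI_Type_size( MPI_UNSIGNED_LONG, &s_ulong );
    MPI_Type_size( MPI_UNSIGNED_LONG_LONG, &s_ull );
    if ( sizeof( size_t ) == (size_t) s_uint )
        return MPI_UNSIGNED;
    if ( sizeof( size_t ) == (size_t) s_ulong )
        return MPI_UNSIGNED_LONG;
    if ( sizeof( size_t ) == (size_t) s_ull )
        return MPI_UNSIGNED_LONG_LONG;
    return MPI_DATATYPE_NULL; // no exact match found
}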
static_cast( x - offset ) : -static_cast( offset - x ); -} -static inline long int unsigned_to_signed( unsigned long int x ) -{ - const auto offset = offset_long(); - return ( x >= offset ) ? static_cast( x - offset ) : - -static_cast( offset - x ); -} -static inline long long int unsigned_to_signed( unsigned long long int x ) -{ - const auto offset = offset_long_long(); - return ( x >= offset ) ? static_cast( x - offset ) : - -static_cast( offset - x ); -} - - -/************************************************************************ - * Get the MPI version * - ************************************************************************/ -std::array MPI_CLASS::version() -{ -#ifdef USE_MPI - int MPI_version; - int MPI_subversion; - MPI_Get_version( &MPI_version, &MPI_subversion ); - return { MPI_version, MPI_subversion }; -#else - return { 0, 0 }; -#endif -} -std::string MPI_CLASS::info() -{ -#ifdef USE_MPI -#if MPI_VERSION >= 3 - int MPI_version_length = 0; - char MPI_version_string[MPI_MAX_LIBRARY_VERSION_STRING]; - MPI_Get_library_version( MPI_version_string, &MPI_version_length ); - if ( MPI_version_length > 0 ) { - std::string MPI_info( MPI_version_string, MPI_version_length ); - size_t pos = MPI_info.find( '\n' ); - while ( pos != std::string::npos ) { - MPI_info.insert( pos + 1, " " ); - pos = MPI_info.find( '\n', pos + 1 ); - } - return MPI_info; - } -#endif - auto tmp = version(); - return std::to_string( tmp[0] ) + "." + std::to_string( tmp[0] ); -#else - return std::string(); -#endif -} - - -/************************************************************************ - * Functions to get/set the process affinities * - ************************************************************************/ -int MPI_CLASS::getNumberOfProcessors() { return std::thread::hardware_concurrency(); } -std::vector MPI_CLASS::getProcessAffinity() -{ - std::vector procs; -#ifdef USE_LINUX - cpu_set_t mask; - int error = sched_getaffinity( getpid(), sizeof( cpu_set_t ), &mask ); - if ( error != 0 ) - MPI_ERROR( "Error getting process affinity" ); - for ( int i = 0; i < (int) sizeof( cpu_set_t ) * CHAR_BIT; i++ ) { - if ( CPU_ISSET( i, &mask ) ) - procs.push_back( i ); - } -#elif defined( USE_MAC ) - // MAC does not support getting or setting the affinity - printf( "Warning: MAC does not support getting the process affinity\n" ); - procs.clear(); -#elif defined( USE_WINDOWS ) - HANDLE hProc = GetCurrentProcess(); - size_t procMask; - size_t sysMask; - PDWORD_PTR procMaskPtr = reinterpret_cast( &procMask ); - PDWORD_PTR sysMaskPtr = reinterpret_cast( &sysMask ); - GetProcessAffinityMask( hProc, procMaskPtr, sysMaskPtr ); - for ( int i = 0; i < (int) sizeof( size_t ) * CHAR_BIT; i++ ) { - if ( ( procMask & 0x1 ) != 0 ) - procs.push_back( i ); - procMask >>= 1; - } -#else -#error Unknown OS -#endif - return procs; -} -void MPI_CLASS::setProcessAffinity( const std::vector &procs ) -{ -#ifdef USE_LINUX - cpu_set_t mask; - CPU_ZERO( &mask ); - for ( auto cpu : procs ) - CPU_SET( cpu, &mask ); - int error = sched_setaffinity( getpid(), sizeof( cpu_set_t ), &mask ); - if ( error != 0 ) - MPI_ERROR( "Error setting process affinity" ); -#elif defined( USE_MAC ) - // MAC does not support getting or setting the affinity - NULL_USE( procs ); -#elif defined( USE_WINDOWS ) - DWORD mask = 0; - for ( size_t i = 0; i < procs.size(); i++ ) - mask |= ( (DWORD) 1 ) << procs[i]; - HANDLE hProc = GetCurrentProcess(); - SetProcessAffinityMask( hProc, mask ); -#else -#error Unknown OS -#endif -} - - 
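getProcessAffinity() above reads the CPU mask with sched_getaffinity on Linux. A minimal Linux-only sketch of that query (readAffinity is an illustrative name; error handling is reduced to returning an empty list):

#include <sched.h>
#include <unistd.h>
#include <vector>

std::vector<int> readAffinity()
{
    std::vector<int> cpus;
    cpu_set_t mask;
    CPU_ZERO( &mask );
    if ( sched_getaffinity( getpid(), sizeof( cpu_set_t ), &mask ) != 0 )
        return cpus; // query failed; return empty
    for ( int i = 0; i < CPU_SETSIZE; i++ )
        if ( CPU_ISSET( i, &mask ) )
            cpus.push_back( i );
    return cpus;
}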
-/************************************************************************ - * Function to check if MPI is active * - ************************************************************************/ -bool MPI_CLASS::MPI_active() -{ -#ifdef USE_MPI - int initialized = 0, finalized = 0; - MPI_Initialized( &initialized ); - MPI_Finalized( &finalized ); - return initialized != 0 && finalized == 0; -#else - return true; -#endif -} -MPI_CLASS::ThreadSupport MPI_CLASS::queryThreadSupport() -{ -#ifdef USE_MPI - int provided = 0; - MPI_Query_thread( &provided ); - if ( provided == MPI_THREAD_SINGLE ) - return ThreadSupport::SINGLE; - if ( provided == MPI_THREAD_FUNNELED ) - return ThreadSupport::FUNNELED; - if ( provided == MPI_THREAD_SERIALIZED ) - return ThreadSupport::SERIALIZED; - if ( provided == MPI_THREAD_MULTIPLE ) - return ThreadSupport::MULTIPLE; - return ThreadSupport::SINGLE; -#else - return ThreadSupport::MULTIPLE; -#endif -} - - -/************************************************************************ - * Function to perform a load balance of the given processes * - ************************************************************************/ -void MPI_CLASS::balanceProcesses( const MPI_CLASS &globalComm, const int method, - const std::vector &procs, const int N_min_in, const int N_max_in ) -{ - // Build the list of processors to use - std::vector cpus = procs; - if ( cpus.empty() ) { - for ( int i = 0; i < getNumberOfProcessors(); i++ ) - cpus.push_back( i ); - } - // Handle the "easy cases" - if ( method == 1 ) { - // Trivial case where we do not need any communication - setProcessAffinity( cpus ); - return; - } - // Get the sub-communicator for the current node - MPI_CLASS nodeComm = globalComm.splitByNode(); - int N_min = std::min( std::max( N_min_in, 1 ), cpus.size() ); - int N_max = N_max_in; - if ( N_max == -1 ) - N_max = cpus.size(); - N_max = std::min( N_max, cpus.size() ); - MPI_ASSERT( N_max >= N_min ); - // Perform the load balance within the node - if ( method == 2 ) { - int N_proc = cpus.size() / nodeComm.getSize(); - N_proc = std::max( N_proc, N_min ); - N_proc = std::min( N_proc, N_max ); - std::vector cpus2( N_proc, -1 ); - for ( int i = 0; i < N_proc; i++ ) - cpus2[i] = cpus[( nodeComm.getRank() * N_proc + i ) % cpus.size()]; - setProcessAffinity( cpus2 ); - } else { - MPI_ERROR( "Unknown method for load balance" ); - } -} - - -/************************************************************************ - * Empty constructor * - ************************************************************************/ -MPI_CLASS::MPI_CLASS() -{ -// Initialize the data members to a defaul communicator of self -#ifdef USE_MPI - communicator = MPI_COMM_NULL; - d_maxTag = 0x7FFFFFFF; -#else - communicator = MPI_CLASS_COMM_NULL; - d_maxTag = mpi_max_tag; -#endif - d_ranks = nullptr; - d_count = nullptr; - d_manage = false; - comm_rank = 0; - comm_size = 1; - d_isNull = true; - d_currentTag = nullptr; - d_call_abort = true; - tmp_alignment = -1; -} - - -/************************************************************************ - * Empty deconstructor * - ************************************************************************/ -MPI_CLASS::~MPI_CLASS() { reset(); } -void MPI_CLASS::reset() -{ - // Decrement the count if used - int count = -1; - if ( d_count != nullptr ) - count = --( *d_count ); - if ( count == 0 ) { - // We are holding that last reference to the MPI_Comm object, we need to free it - if ( d_manage ) { -#ifdef USE_MPI - MPI_Comm_set_errhandler( communicator, MPI_ERRORS_ARE_FATAL ); - int 
err = MPI_Comm_free( &communicator ); - if ( err != MPI_SUCCESS ) - MPI_ERROR( "Problem free'ing MPI_Comm object" ); - communicator = MPI_CLASS_COMM_NULL; - ++N_MPI_Comm_destroyed; -#endif - } - if ( d_ranks != nullptr ) - delete[] d_ranks; - delete d_count; - } - if ( d_currentTag == nullptr ) { - // No tag index - } else if ( d_currentTag[1] > 1 ) { - --( d_currentTag[1] ); - } else { - delete[] d_currentTag; - } - d_manage = false; - d_count = nullptr; - d_ranks = nullptr; - comm_rank = 0; - comm_size = 1; - d_maxTag = 0; - d_isNull = true; - d_currentTag = nullptr; - d_call_abort = true; -} - - -/************************************************************************ - * Copy constructors * - ************************************************************************/ -MPI_CLASS::MPI_CLASS( const MPI_CLASS &comm ) - : communicator( comm.communicator ), - d_isNull( comm.d_isNull ), - d_manage( comm.d_manage ), - comm_rank( comm.comm_rank ), - comm_size( comm.comm_size ), - d_ranks( comm.d_ranks ), - d_maxTag( comm.d_maxTag ), - d_currentTag( comm.d_currentTag ) -{ - // Initialize the data members to the existing comm object - if ( d_currentTag != nullptr ) - ++d_currentTag[1]; - d_call_abort = comm.d_call_abort; - // Set and increment the count - d_count = comm.d_count; - if ( d_count != nullptr ) - ++( *d_count ); - tmp_alignment = -1; -} -MPI_CLASS::MPI_CLASS( MPI_CLASS &&rhs ) : MPI_CLASS() -{ - std::swap( communicator, rhs.communicator ); - std::swap( d_isNull, rhs.d_isNull ); - std::swap( d_manage, rhs.d_manage ); - std::swap( d_call_abort, rhs.d_call_abort ); - std::swap( profile_level, rhs.profile_level ); - std::swap( comm_rank, rhs.comm_rank ); - std::swap( comm_size, rhs.comm_size ); - std::swap( d_ranks, rhs.d_ranks ); - std::swap( d_maxTag, rhs.d_maxTag ); - std::swap( d_currentTag, rhs.d_currentTag ); - std::swap( d_count, rhs.d_count ); - std::swap( tmp_alignment, rhs.tmp_alignment ); -} - - -/************************************************************************ - * Assignment operators * - ************************************************************************/ -MPI_CLASS &MPI_CLASS::operator=( const MPI_CLASS &comm ) -{ - if ( this == &comm ) // protect against invalid self-assignment - return *this; - // Destroy the previous object - this->reset(); - // Initialize the data members to the existing object - this->communicator = comm.communicator; - this->comm_rank = comm.comm_rank; - this->comm_size = comm.comm_size; - this->d_ranks = comm.d_ranks; - this->d_isNull = comm.d_isNull; - this->d_manage = comm.d_manage; - this->d_maxTag = comm.d_maxTag; - this->d_call_abort = comm.d_call_abort; - this->d_currentTag = comm.d_currentTag; - if ( this->d_currentTag != nullptr ) - ++( this->d_currentTag[1] ); - // Set and increment the count - this->d_count = comm.d_count; - if ( this->d_count != nullptr ) - ++( *d_count ); - this->tmp_alignment = -1; - return *this; -} -MPI_CLASS &MPI_CLASS::operator=( MPI_CLASS &&rhs ) -{ - if ( this == &rhs ) // protect against invalid self-assignment - return *this; - std::swap( communicator, rhs.communicator ); - std::swap( d_isNull, rhs.d_isNull ); - std::swap( d_manage, rhs.d_manage ); - std::swap( d_call_abort, rhs.d_call_abort ); - std::swap( profile_level, rhs.profile_level ); - std::swap( comm_rank, rhs.comm_rank ); - std::swap( comm_size, rhs.comm_size ); - std::swap( d_ranks, rhs.d_ranks ); - std::swap( d_maxTag, rhs.d_maxTag ); - std::swap( d_currentTag, rhs.d_currentTag ); - std::swap( d_count, rhs.d_count ); - std::swap( 
tmp_alignment, rhs.tmp_alignment ); - return *this; -} - - -/************************************************************************ - * Constructor from existing MPI communicator * - ************************************************************************/ -int d_global_currentTag_world1[2] = { 1, 1 }; -int d_global_currentTag_world2[2] = { 1, 1 }; -int d_global_currentTag_self[2] = { 1, 1 }; -#ifdef USE_MPI -std::atomic_int d_global_count_world1 = { 1 }; -std::atomic_int d_global_count_world2 = { 1 }; -std::atomic_int d_global_count_self = { 1 }; -#endif -MPI_CLASS::MPI_CLASS( MPI_Comm comm, bool manage ) -{ - d_count = nullptr; - d_ranks = nullptr; - d_manage = false; - tmp_alignment = -1; - // Check if we are using our version of comm_world - if ( comm == MPI_CLASS_COMM_WORLD ) { - communicator = MPI_COMM_WORLD; - } else if ( comm == MPI_CLASS_COMM_SELF ) { - communicator = MPI_COMM_SELF; - } else if ( comm == MPI_CLASS_COMM_NULL ) { - communicator = MPI_COMM_NULL; - } else { - communicator = comm; - } -#ifdef USE_MPI - // We are using MPI, use the MPI communicator to initialize the data - if ( communicator != MPI_COMM_NULL ) { - // Set the MPI_SIZE_T datatype if it has not been set - if ( MPI_SIZE_T == 0x0 ) - MPI_SIZE_T = getSizeTDataType(); - // Attach the error handler - StackTrace::setMPIErrorHandler( communicator ); - // Get the communicator properties - MPI_Comm_rank( communicator, &comm_rank ); - MPI_Comm_size( communicator, &comm_size ); - int flag, *val; - int ierr = MPI_Comm_get_attr( communicator, MPI_TAG_UB, &val, &flag ); - MPI_ASSERT( ierr == MPI_SUCCESS ); - if ( flag == 0 ) { - d_maxTag = 0x7FFFFFFF; // The tag is not a valid attribute (set to 2^31-1) - } else { - d_maxTag = *val; - if ( d_maxTag < 0 ) { - d_maxTag = 0x7FFFFFFF; - } // The maximum tag is > a signed int (set to 2^31-1) - MPI_INSIST( d_maxTag >= 0x7FFF, "maximum tag size is < MPI standard" ); - } - } else { - comm_rank = 1; - comm_size = 0; - d_maxTag = 0x7FFFFFFF; - } - d_isNull = communicator == MPI_COMM_NULL; - if ( manage && communicator != MPI_COMM_NULL && communicator != MPI_COMM_SELF && - communicator != MPI_COMM_WORLD ) - d_manage = true; - // Create the count (Note: we do not need to worry about thread safety) - if ( communicator == MPI_CLASS_COMM_WORLD ) { - d_count = &d_global_count_world1; - ++( *d_count ); - } else if ( communicator == MPI_COMM_WORLD ) { - d_count = &d_global_count_world2; - ++( *d_count ); - } else if ( communicator == MPI_COMM_SELF ) { - d_count = &d_global_count_self; - ++( *d_count ); - } else if ( communicator == MPI_COMM_NULL ) { - d_count = nullptr; - } else { - d_count = new std::atomic_int; - *d_count = 1; - } - if ( d_manage ) - ++N_MPI_Comm_created; - // Create d_ranks - if ( comm_size > 1 ) { - d_ranks = new int[comm_size]; - d_ranks[0] = -1; - } -#else - // We are not using MPI, intialize based on the communicator - NULL_USE( manage ); - comm_rank = 0; - comm_size = 1; - d_maxTag = mpi_max_tag; - d_isNull = communicator == MPI_COMM_NULL; - if ( d_isNull ) - comm_size = 0; -#endif - if ( communicator == MPI_CLASS_COMM_WORLD ) { - d_currentTag = d_global_currentTag_world1; - ++( this->d_currentTag[1] ); - } else if ( communicator == MPI_COMM_WORLD ) { - d_currentTag = d_global_currentTag_world2; - ++( this->d_currentTag[1] ); - } else if ( communicator == MPI_COMM_SELF ) { - d_currentTag = d_global_currentTag_self; - ++( this->d_currentTag[1] ); - } else if ( communicator == MPI_COMM_NULL ) { - d_currentTag = nullptr; - } else { - d_currentTag = new int[2]; - 
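// Standalone sketch (not part of the patch): the constructor above reads the
// communicator's maximum tag through the MPI_TAG_UB attribute; shown here in
// isolation with an illustrative helper name.
//
// #include <mpi.h>
// int queryMaxTag( MPI_Comm comm )
// {
//     int flag = 0;
//     int *val = nullptr;
//     MPI_Comm_get_attr( comm, MPI_TAG_UB, &val, &flag );
//     if ( flag == 0 || val == nullptr )
//         return 0x7FFFFFFF; // attribute not set: fall back to 2^31-1
//     return *val;
// }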
d_currentTag[0] = ( d_maxTag <= 0x10000 ) ? 1 : 0x1FFF; - d_currentTag[1] = 1; - } - d_call_abort = true; -} - - -/************************************************************************ - * Return the ranks of the communicator in the global comm * - ************************************************************************/ -std::vector MPI_CLASS::globalRanks() const -{ - // Get my global rank if it has not been set - static int myGlobalRank = -1; - if ( myGlobalRank == -1 ) { -#ifdef USE_MPI - if ( MPI_active() ) - MPI_Comm_rank( MPI_CLASS_COMM_WORLD, &myGlobalRank ); -#else - myGlobalRank = 0; -#endif - } - // Check if we are dealing with a serial or null communicator - if ( comm_size == 1 ) - return std::vector( 1, myGlobalRank ); - if ( d_ranks == nullptr || communicator == MPI_COMM_NULL ) - return std::vector(); - // Fill d_ranks if necessary - if ( d_ranks[0] == -1 ) { - if ( communicator == MPI_CLASS_COMM_WORLD ) { - for ( int i = 0; i < comm_size; i++ ) - d_ranks[i] = i; - } else { - - MPI_ASSERT( myGlobalRank != -1 ); - this->allGather( myGlobalRank, d_ranks ); - } - } - // Return d_ranks - return std::vector( d_ranks, d_ranks + comm_size ); -} - - -/************************************************************************ - * Generate a random number * - ************************************************************************/ -size_t MPI_CLASS::rand() const -{ - size_t val = 0; - if ( getRank() == 0 ) { - static std::random_device rd; - static std::mt19937 gen( rd() ); - static std::uniform_int_distribution dist; - val = dist( gen ); - } - val = bcast( val, 0 ); - return val; -} - - -/************************************************************************ - * Intersect two communicators * - ************************************************************************/ -#ifdef USE_MPI -static inline void MPI_Group_free2( MPI_Group *group ) -{ - if ( *group != MPI_GROUP_EMPTY ) { - // MPICH is fine with free'ing an empty group, OpenMPI crashes - MPI_Group_free( group ); - } -} -MPI_CLASS MPI_CLASS::intersect( const MPI_CLASS &comm1, const MPI_CLASS &comm2 ) -{ - MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY; - if ( !comm1.isNull() ) { - MPI_Group_free2( &group1 ); - MPI_Comm_group( comm1.communicator, &group1 ); - } - if ( !comm2.isNull() ) { - MPI_Group_free2( &group2 ); - MPI_Comm_group( comm2.communicator, &group2 ); - } - MPI_Group group12; - MPI_Group_intersection( group1, group2, &group12 ); - int compare1, compare2; - MPI_Group_compare( group1, group12, &compare1 ); - MPI_Group_compare( group2, group12, &compare2 ); - MPI_CLASS new_comm( MPI_CLASS_COMM_NULL ); - int size; - MPI_Group_size( group12, &size ); - if ( compare1 != MPI_UNEQUAL && size != 0 ) { - // The intersection matches comm1 - new_comm = comm1; - } else if ( compare2 != MPI_UNEQUAL && size != 0 ) { - // The intersection matches comm2 - new_comm = comm2; - } else if ( comm1.isNull() ) { - // comm1 is null, we can return safely (comm1 is needed for communication) - } else { - // The intersection is smaller than comm1 or comm2 - // Check if the new comm is nullptr for all processors - int max_size = 0; - MPI_Allreduce( &size, &max_size, 1, MPI_INT, MPI_MAX, comm1.communicator ); - if ( max_size == 0 ) { - // We are dealing with completely disjoint sets - new_comm = MPI_CLASS( MPI_CLASS_COMM_NULL, false ); - } else { - // Create the new comm - // Note: OpenMPI crashes if the intersection group is EMPTY for any processors - // We will set it to SELF for the EMPTY processors, then create a nullptr comm 
later - if ( group12 == MPI_GROUP_EMPTY ) { - MPI_Group_free2( &group12 ); - MPI_Comm_group( MPI_COMM_SELF, &group12 ); - } - MPI_Comm new_MPI_comm; - MPI_Comm_create( comm1.communicator, group12, &new_MPI_comm ); - if ( size > 0 ) { - // This is the valid case where we create a new intersection comm - new_comm = MPI_CLASS( new_MPI_comm, true ); - } else { - // We actually want a null comm for this communicator - new_comm = MPI_CLASS( MPI_CLASS_COMM_NULL, false ); - MPI_Comm_free( &new_MPI_comm ); - } - } - } - MPI_Group_free2( &group1 ); - MPI_Group_free2( &group2 ); - MPI_Group_free2( &group12 ); - return new_comm; -} -#else -MPI_CLASS MPI_CLASS::intersect( const MPI_CLASS &comm1, const MPI_CLASS &comm2 ) -{ - if ( comm1.isNull() || comm2.isNull() ) - return MPI_CLASS( MPI_CLASS_COMM_NULL, false ); - MPI_ASSERT( comm1.comm_size == 1 && comm2.comm_size == 1 ); - return comm1; -} -#endif - - -/************************************************************************ - * Split a comm * - ************************************************************************/ -MPI_CLASS MPI_CLASS::split( int color, int key ) const -{ - if ( d_isNull ) { - return MPI_CLASS( MPI_CLASS_COMM_NULL ); - } else if ( comm_size == 1 ) { - if ( color == -1 ) - return MPI_CLASS( MPI_CLASS_COMM_NULL ); - return dup(); - } - MPI_Comm new_MPI_comm = MPI_CLASS_COMM_NULL; -#ifdef USE_MPI - // USE MPI to split the communicator - if ( color == -1 ) { - check_MPI( MPI_Comm_split( communicator, MPI_UNDEFINED, key, &new_MPI_comm ) ); - } else { - check_MPI( MPI_Comm_split( communicator, color, key, &new_MPI_comm ) ); - } -#endif - // Create the new object - NULL_USE( key ); - MPI_CLASS new_comm( new_MPI_comm, true ); - new_comm.d_call_abort = d_call_abort; - return new_comm; -} -MPI_CLASS MPI_CLASS::splitByNode( int key ) const -{ - // Check if we are dealing with a single processor (trivial case) - if ( comm_size == 1 ) - return this->split( 0, 0 ); - // Get the node name - std::string name = MPI_CLASS::getNodeName(); - // Gather the names from all ranks - std::vector list( comm_size ); - allGather( name, &list[0] ); - // Create the colors - std::vector color( comm_size, -1 ); - color[0] = 0; - for ( int i = 1; i < comm_size; i++ ) { - const std::string tmp1 = list[i]; - for ( int j = 0; j < i; j++ ) { - const std::string tmp2 = list[j]; - if ( tmp1 == tmp2 ) { - color[i] = color[j]; - break; - } - color[i] = color[i - 1] + 1; - } - } - MPI_CLASS new_comm = this->split( color[comm_rank], key ); - return new_comm; -} - - -/************************************************************************ - * Duplicate an exisiting comm object * - ************************************************************************/ -MPI_CLASS MPI_CLASS::dup() const -{ - if ( d_isNull ) - return MPI_CLASS( MPI_CLASS_COMM_NULL ); - MPI_Comm new_MPI_comm = communicator; -#if defined( USE_MPI ) || defined( USE_PETSC ) - // USE MPI to duplicate the communicator - MPI_Comm_dup( communicator, &new_MPI_comm ); -#else - new_MPI_comm = uniqueGlobalComm; - uniqueGlobalComm++; -#endif - // Create the new comm object - MPI_CLASS new_comm( new_MPI_comm, true ); - new_comm.d_isNull = d_isNull; - new_comm.d_call_abort = d_call_abort; - return new_comm; -} - - -/************************************************************************ - * Get the node name * - ************************************************************************/ -std::string MPI_CLASS::getNodeName() -{ -#ifdef USE_MPI - int length; - char name[MPI_MAX_PROCESSOR_NAME + 1]; - memset( name, 0, 
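splitByNode() above assigns colors by gathering and comparing node names. On MPI-3 and later the same grouping of ranks that share a node can also be obtained with MPI_Comm_split_type; the sketch below shows that alternative and is not the patch's implementation (splitSharedNode is an illustrative name):

#include <mpi.h>

MPI_Comm splitSharedNode( MPI_Comm comm )
{
    MPI_Comm node_comm = MPI_COMM_NULL;
    // Group ranks that can share memory, i.e. ranks on the same node
    MPI_Comm_split_type( comm, MPI_COMM_TYPE_SHARED, 0 /*key*/,
        MPI_INFO_NULL, &node_comm );
    return node_comm; // caller owns the result and should MPI_Comm_free it
}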
MPI_MAX_PROCESSOR_NAME + 1 ); - MPI_Get_processor_name( name, &length ); - return std::string( name ); -#else - return "Node0"; -#endif -} - - -/************************************************************************ - * Overload operator == * - ************************************************************************/ -bool MPI_CLASS::operator==( const MPI_CLASS &comm ) const -{ - return communicator == comm.communicator; -} - - -/************************************************************************ - * Overload operator != * - ************************************************************************/ -bool MPI_CLASS::operator!=( const MPI_CLASS &comm ) const -{ - return communicator != comm.communicator; -} - - -/************************************************************************ - * Overload operator < * - ************************************************************************/ -bool MPI_CLASS::operator<( const MPI_CLASS &comm ) const -{ - MPI_ASSERT( !this->d_isNull && !comm.d_isNull ); - bool flag = true; - // First check if either communicator is NULL - if ( this->d_isNull ) - return false; - if ( comm.d_isNull ) - flag = false; - // Use compare to check if the comms are equal - if ( compare( comm ) != 0 ) - return false; - // Check that the size of the other communicator is > the current communicator size - if ( comm_size >= comm.comm_size ) - flag = false; -// Check the union of the communicator groups -// this is < comm iff this group is a subgroup of comm's group -#ifdef USE_MPI - MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY, group12 = MPI_GROUP_EMPTY; - if ( !d_isNull ) - MPI_Comm_group( communicator, &group1 ); - if ( !comm.d_isNull ) - MPI_Comm_group( comm.communicator, &group2 ); - MPI_Group_union( group1, group2, &group12 ); - int compare; - MPI_Group_compare( group2, group12, &compare ); - if ( compare == MPI_UNEQUAL ) - flag = false; - MPI_Group_free( &group1 ); - MPI_Group_free( &group2 ); - MPI_Group_free( &group12 ); -#endif - // Perform a global reduce of the flag (equivalent to all operation) - return allReduce( flag ); -} - - -/************************************************************************ - * Overload operator <= * - ************************************************************************/ -bool MPI_CLASS::operator<=( const MPI_CLASS &comm ) const -{ - MPI_ASSERT( !this->d_isNull && !comm.d_isNull ); - bool flag = true; - // First check if either communicator is NULL - if ( this->d_isNull ) - return false; - if ( comm.d_isNull ) - flag = false; -#ifdef USE_MPI - int world_size = 0; - MPI_Comm_size( MPI_COMM_WORLD, &world_size ); - if ( comm.getSize() == world_size ) - return true; - if ( getSize() == 1 && !comm.d_isNull ) - return true; -#endif - // Use compare to check if the comms are equal - if ( compare( comm ) != 0 ) - return true; - // Check that the size of the other communicator is > the current communicator size - // this is <= comm iff this group is a subgroup of comm's group - if ( comm_size > comm.comm_size ) - flag = false; -// Check the unnion of the communicator groups -#ifdef USE_MPI - MPI_Group group1, group2, group12; - MPI_Comm_group( communicator, &group1 ); - MPI_Comm_group( comm.communicator, &group2 ); - MPI_Group_union( group1, group2, &group12 ); - int compare; - MPI_Group_compare( group2, group12, &compare ); - if ( compare == MPI_UNEQUAL ) - flag = false; - MPI_Group_free( &group1 ); - MPI_Group_free( &group2 ); - MPI_Group_free( &group12 ); -#endif - // Perform a global reduce of the flag (equivalent to 
all operation) - return allReduce( flag ); -} - - -/************************************************************************ - * Overload operator > * - ************************************************************************/ -bool MPI_CLASS::operator>( const MPI_CLASS &comm ) const -{ - bool flag = true; - // First check if either communicator is NULL - if ( this->d_isNull ) - return false; - if ( comm.d_isNull ) - flag = false; - // Use compare to check if the comms are equal - if ( compare( comm ) != 0 ) - return false; - // Check that the size of the other communicator is > the current communicator size - if ( comm_size <= comm.comm_size ) - flag = false; -// Check the unnion of the communicator groups -// this is > comm iff comm's group is a subgroup of this group -#ifdef USE_MPI - MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY, group12 = MPI_GROUP_EMPTY; - if ( !d_isNull ) - MPI_Comm_group( communicator, &group1 ); - if ( !comm.d_isNull ) - MPI_Comm_group( comm.communicator, &group2 ); - MPI_Group_union( group1, group2, &group12 ); - int compare; - MPI_Group_compare( group1, group12, &compare ); - if ( compare == MPI_UNEQUAL ) - flag = false; - MPI_Group_free( &group1 ); - MPI_Group_free( &group2 ); - MPI_Group_free( &group12 ); -#endif - // Perform a global reduce of the flag (equivalent to all operation) - return allReduce( flag ); -} - - -/************************************************************************ - * Overload operator >= * - ************************************************************************/ -bool MPI_CLASS::operator>=( const MPI_CLASS &comm ) const -{ - bool flag = true; - // First check if either communicator is NULL - if ( this->d_isNull ) - return false; - if ( comm.d_isNull ) - flag = false; -#ifdef USE_MPI - int world_size = 0; - MPI_Comm_size( MPI_COMM_WORLD, &world_size ); - if ( getSize() == world_size ) - return true; - if ( comm.getSize() == 1 && !comm.d_isNull ) - return true; -#endif - // Use compare to check if the comms are equal - if ( compare( comm ) != 0 ) - return true; - // Check that the size of the other communicator is > the current communicator size - if ( comm_size < comm.comm_size ) - flag = false; -// Check the unnion of the communicator groups -// this is >= comm iff comm's group is a subgroup of this group -#ifdef USE_MPI - MPI_Group group1 = MPI_GROUP_EMPTY, group2 = MPI_GROUP_EMPTY, group12 = MPI_GROUP_EMPTY; - if ( !d_isNull ) - MPI_Comm_group( communicator, &group1 ); - if ( !comm.d_isNull ) - MPI_Comm_group( comm.communicator, &group2 ); - MPI_Group_union( group1, group2, &group12 ); - int compare; - MPI_Group_compare( group1, group12, &compare ); - if ( compare == MPI_UNEQUAL ) - flag = false; - MPI_Group_free( &group1 ); - MPI_Group_free( &group2 ); - MPI_Group_free( &group12 ); -#endif - // Perform a global reduce of the flag (equivalent to all operation) - return allReduce( flag ); -} - - -/************************************************************************ - * Compare two comm objects * - ************************************************************************/ -int MPI_CLASS::compare( const MPI_CLASS &comm ) const -{ - if ( communicator == comm.communicator ) - return 1; -#ifdef USE_MPI - if ( d_isNull || comm.d_isNull ) - return 0; - int result; - check_MPI( MPI_Comm_compare( communicator, comm.communicator, &result ) ); - if ( result == MPI_IDENT ) - return 2; - else if ( result == MPI_CONGRUENT ) - return 3; - else if ( result == MPI_SIMILAR ) - return 4; - else if ( result == MPI_UNEQUAL ) - 
return 0; - MPI_ERROR( "Unknown results from comm compare" ); -#else - if ( comm.communicator == MPI_COMM_NULL || communicator == MPI_COMM_NULL ) - return 0; - else - return 3; -#endif - return 0; -} - - -/************************************************************************ - * Abort the program. * - ************************************************************************/ -void MPI_CLASS::setCallAbortInSerialInsteadOfExit( bool flag ) { d_call_abort = flag; } -void MPI_CLASS::abort() const -{ -#ifdef USE_MPI - MPI_Comm comm = communicator; - if ( comm == MPI_COMM_NULL ) - comm = MPI_COMM_WORLD; - if ( !MPI_active() ) { - // MPI is not availible - exit( -1 ); - } else if ( comm_size > 1 ) { - MPI_Abort( comm, -1 ); - } else if ( d_call_abort ) { - MPI_Abort( comm, -1 ); - } else { - exit( -1 ); - } -#else - exit( -1 ); -#endif -} - - -/************************************************************************ - * newTag * - ************************************************************************/ -int MPI_CLASS::newTag() -{ -#ifdef USE_MPI - // Syncronize the processes to ensure all ranks enter this call - // Needed so the count will match - barrier(); - // Return and increment the tag - int tag = ( *d_currentTag )++; - MPI_INSIST( tag <= d_maxTag, "Maximum number of tags exceeded\n" ); - return tag; -#else - static int globalCurrentTag = 1; - return globalCurrentTag++; -#endif -} - - -/************************************************************************ - * allReduce * - ************************************************************************/ -bool MPI_CLASS::allReduce( const bool value ) const -{ - bool ret = value; - if ( comm_size > 1 ) { -#ifdef USE_MPI - MPI_Allreduce( - (void *) &value, (void *) &ret, 1, MPI_UNSIGNED_CHAR, MPI_MIN, communicator ); -#else - MPI_ERROR( "This shouldn't be possible" ); -#endif - } - return ret; -} - - -/************************************************************************ - * anyReduce * - ************************************************************************/ -bool MPI_CLASS::anyReduce( const bool value ) const -{ - bool ret = value; - if ( comm_size > 1 ) { -#ifdef USE_MPI - MPI_Allreduce( - (void *) &value, (void *) &ret, 1, MPI_UNSIGNED_CHAR, MPI_MAX, communicator ); -#else - MPI_ERROR( "This shouldn't be possible" ); -#endif - } - return ret; -} - - -/************************************************************************ - * call_sumReduce * - * Note: these specializations are only called when using MPI. 
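allReduce(bool) above implements a logical AND by taking MPI_MIN over a single unsigned char, and anyReduce(bool) a logical OR via MPI_MAX. A minimal standalone sketch of both (allTrue/anyTrue are illustrative names; MPI_LAND/MPI_LOR on MPI_C_BOOL would be an equivalent choice):

#include <mpi.h>

bool allTrue( bool value, MPI_Comm comm )
{
    unsigned char in = value ? 1 : 0, out = 0;
    MPI_Allreduce( &in, &out, 1, MPI_UNSIGNED_CHAR, MPI_MIN, comm );
    return out != 0; // 1 only if every rank contributed true
}

bool anyTrue( bool value, MPI_Comm comm )
{
    unsigned char in = value ? 1 : 0, out = 0;
    MPI_Allreduce( &in, &out, 1, MPI_UNSIGNED_CHAR, MPI_MAX, comm );
    return out != 0; // 1 if at least one rank contributed true
}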
* - ************************************************************************/ -#ifdef USE_MPI -// unsigned char -template<> -void MPI_CLASS::call_sumReduce( - const unsigned char *send, unsigned char *recv, const int n ) const -{ - PROFILE_START( "sumReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_SUM, communicator ); - PROFILE_STOP( "sumReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_sumReduce( unsigned char *x, const int n ) const -{ - PROFILE_START( "sumReduce2", profile_level ); - auto send = x; - auto recv = new unsigned char[n]; - MPI_Allreduce( send, recv, n, MPI_UNSIGNED_CHAR, MPI_SUM, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "sumReduce2", profile_level ); -} -// char -template<> -void MPI_CLASS::call_sumReduce( const char *send, char *recv, const int n ) const -{ - PROFILE_START( "sumReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_SUM, communicator ); - PROFILE_STOP( "sumReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_sumReduce( char *x, const int n ) const -{ - PROFILE_START( "sumReduce2", profile_level ); - auto send = x; - auto recv = new char[n]; - MPI_Allreduce( send, recv, n, MPI_SIGNED_CHAR, MPI_SUM, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "sumReduce2", profile_level ); -} -// unsigned int -template<> -void MPI_CLASS::call_sumReduce( - const unsigned int *send, unsigned int *recv, const int n ) const -{ - PROFILE_START( "sumReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_SUM, communicator ); - PROFILE_STOP( "sumReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_sumReduce( unsigned int *x, const int n ) const -{ - PROFILE_START( "sumReduce2", profile_level ); - auto send = x; - auto recv = new unsigned int[n]; - MPI_Allreduce( send, recv, n, MPI_UNSIGNED, MPI_SUM, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "sumReduce2", profile_level ); -} -// int -template<> -void MPI_CLASS::call_sumReduce( const int *send, int *recv, const int n ) const -{ - PROFILE_START( "sumReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_INT, MPI_SUM, communicator ); - PROFILE_STOP( "sumReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_sumReduce( int *x, const int n ) const -{ - PROFILE_START( "sumReduce2", profile_level ); - auto send = x; - auto recv = new int[n]; - MPI_Allreduce( send, recv, n, MPI_INT, MPI_SUM, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "sumReduce2", profile_level ); -} -// long int -template<> -void MPI_CLASS::call_sumReduce( const long int *send, long int *recv, const int n ) const -{ - PROFILE_START( "sumReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_LONG, MPI_SUM, communicator ); - PROFILE_STOP( "sumReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_sumReduce( long int *x, const int n ) const -{ - PROFILE_START( "sumReduce2", profile_level ); - auto send = x; - auto recv = new long int[n]; - MPI_Allreduce( send, recv, n, MPI_LONG, MPI_SUM, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "sumReduce2", profile_level ); -} -// unsigned long int -template<> -void MPI_CLASS::call_sumReduce( - const unsigned long *send, 
unsigned long *recv, const int n ) const -{ - PROFILE_START( "sumReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_SUM, communicator ); - PROFILE_STOP( "sumReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_sumReduce( unsigned long *x, const int n ) const -{ - PROFILE_START( "sumReduce2", profile_level ); - auto send = x; - auto recv = new unsigned long int[n]; - MPI_Allreduce( send, recv, n, MPI_UNSIGNED_LONG, MPI_SUM, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "sumReduce2", profile_level ); -} -// size_t -#ifdef USE_WINDOWS -template<> -void MPI_CLASS::call_sumReduce( const size_t *send, size_t *recv, const int n ) const -{ - MPI_ASSERT( MPI_SIZE_T != 0 ); - PROFILE_START( "sumReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_SIZE_T, MPI_SUM, communicator ); - PROFILE_STOP( "sumReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_sumReduce( size_t *x, const int n ) const -{ - MPI_ASSERT( MPI_SIZE_T != 0 ); - PROFILE_START( "sumReduce2", profile_level ); - auto send = x; - auto recv = new size_t[n]; - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_SIZE_T, MPI_SUM, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "sumReduce2", profile_level ); -} -#endif -// float -template<> -void MPI_CLASS::call_sumReduce( const float *send, float *recv, const int n ) const -{ - PROFILE_START( "sumReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_FLOAT, MPI_SUM, communicator ); - PROFILE_STOP( "sumReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_sumReduce( float *x, const int n ) const -{ - PROFILE_START( "sumReduce2", profile_level ); - auto send = x; - auto recv = new float[n]; - MPI_Allreduce( send, recv, n, MPI_FLOAT, MPI_SUM, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "sumReduce2", profile_level ); -} -// double -template<> -void MPI_CLASS::call_sumReduce( const double *send, double *recv, const int n ) const -{ - PROFILE_START( "sumReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_DOUBLE, MPI_SUM, communicator ); - PROFILE_STOP( "sumReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_sumReduce( double *x, const int n ) const -{ - PROFILE_START( "sumReduce2", profile_level ); - auto send = x; - auto recv = new double[n]; - MPI_Allreduce( send, recv, n, MPI_DOUBLE, MPI_SUM, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "sumReduce2", profile_level ); -} -// std::complex -template<> -void MPI_CLASS::call_sumReduce>( - const std::complex *x, std::complex *y, const int n ) const -{ - PROFILE_START( "sumReduce1", profile_level ); - auto send = new double[2 * n]; - auto recv = new double[2 * n]; - for ( int i = 0; i < n; i++ ) { - send[2 * i + 0] = real( x[i] ); - send[2 * i + 1] = imag( x[i] ); - } - MPI_Allreduce( (void *) send, (void *) recv, 2 * n, MPI_DOUBLE, MPI_SUM, communicator ); - for ( int i = 0; i < n; i++ ) - y[i] = std::complex( recv[2 * i + 0], recv[2 * i + 1] ); - delete[] send; - delete[] recv; - PROFILE_STOP( "sumReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_sumReduce>( std::complex *x, const int n ) const -{ - PROFILE_START( "sumReduce2", profile_level ); - auto send = new double[2 * n]; - auto recv = new double[2 * n]; - for ( int i = 0; i < n; i++ ) { - send[2 * 
i + 0] = real( x[i] ); - send[2 * i + 1] = imag( x[i] ); - } - MPI_Allreduce( send, recv, 2 * n, MPI_DOUBLE, MPI_SUM, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = std::complex( recv[2 * i + 0], recv[2 * i + 1] ); - delete[] send; - delete[] recv; - PROFILE_STOP( "sumReduce2", profile_level ); -} -#endif - - -/************************************************************************ - * call_minReduce * - * Note: these specializations are only called when using MPI. * - ************************************************************************/ -#ifdef USE_MPI -// unsigned char -template<> -void MPI_CLASS::call_minReduce( - const unsigned char *send, unsigned char *recv, const int n, int *comm_rank_of_min ) const -{ - if ( comm_rank_of_min == nullptr ) { - PROFILE_START( "minReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_MIN, communicator ); - PROFILE_STOP( "minReduce1", profile_level ); - } else { - auto tmp = new int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = send[i]; - call_minReduce( tmp, n, comm_rank_of_min ); - for ( int i = 0; i < n; i++ ) - recv[i] = static_cast( tmp[i] ); - delete[] tmp; - } -} -template<> -void MPI_CLASS::call_minReduce( - unsigned char *x, const int n, int *comm_rank_of_min ) const -{ - if ( comm_rank_of_min == nullptr ) { - PROFILE_START( "minReduce2", profile_level ); - auto send = x; - auto recv = new unsigned char[n]; - MPI_Allreduce( send, recv, n, MPI_UNSIGNED_CHAR, MPI_MIN, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "minReduce2", profile_level ); - } else { - auto tmp = new int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = x[i]; - call_minReduce( tmp, n, comm_rank_of_min ); - for ( int i = 0; i < n; i++ ) - x[i] = static_cast( tmp[i] ); - delete[] tmp; - } -} -// char -template<> -void MPI_CLASS::call_minReduce( - const char *send, char *recv, const int n, int *comm_rank_of_min ) const -{ - if ( comm_rank_of_min == nullptr ) { - PROFILE_START( "minReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_MIN, communicator ); - PROFILE_STOP( "minReduce1", profile_level ); - } else { - auto tmp = new int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = send[i]; - call_minReduce( tmp, n, comm_rank_of_min ); - for ( int i = 0; i < n; i++ ) - recv[i] = static_cast( tmp[i] ); - delete[] tmp; - } -} -template<> -void MPI_CLASS::call_minReduce( char *x, const int n, int *comm_rank_of_min ) const -{ - if ( comm_rank_of_min == nullptr ) { - PROFILE_START( "minReduce2", profile_level ); - auto send = x; - auto recv = new char[n]; - MPI_Allreduce( send, recv, n, MPI_SIGNED_CHAR, MPI_MIN, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "minReduce2", profile_level ); - } else { - auto tmp = new int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = x[i]; - call_minReduce( tmp, n, comm_rank_of_min ); - for ( int i = 0; i < n; i++ ) - x[i] = static_cast( tmp[i] ); - delete[] tmp; - } -} -// unsigned int -template<> -void MPI_CLASS::call_minReduce( - const unsigned int *send, unsigned int *recv, const int n, int *comm_rank_of_min ) const -{ - if ( comm_rank_of_min == nullptr ) { - PROFILE_START( "minReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_MIN, communicator ); - PROFILE_STOP( "minReduce1", profile_level ); - } else { - auto tmp = new int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = unsigned_to_signed( send[i] ); - 
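// Note on the call_sumReduce specializations above: the in-place variants allocate a
// scratch receive buffer and copy the result back, and the std::complex<double>
// variant packs each value into two doubles before the reduction.  A minimal raw-MPI
// equivalent of the in-place double sum, using MPI_IN_PLACE to avoid the temporary
// buffer, would be (assumes <mpi.h> is included):
static void example_inplace_sum( MPI_Comm comm, double *x, int n )
{
    // The global sums overwrite x on every rank; no extra buffer is required.
    MPI_Allreduce( MPI_IN_PLACE, x, n, MPI_DOUBLE, MPI_SUM, comm );
}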
call_minReduce( tmp, n, comm_rank_of_min ); - for ( int i = 0; i < n; i++ ) - recv[i] = signed_to_unsigned( tmp[i] ); - delete[] tmp; - } -} -template<> -void MPI_CLASS::call_minReduce( - unsigned int *x, const int n, int *comm_rank_of_min ) const -{ - if ( comm_rank_of_min == nullptr ) { - PROFILE_START( "minReduce2", profile_level ); - auto send = x; - auto recv = new unsigned int[n]; - MPI_Allreduce( send, recv, n, MPI_UNSIGNED, MPI_MIN, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "minReduce2", profile_level ); - } else { - auto tmp = new int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = unsigned_to_signed( x[i] ); - call_minReduce( tmp, n, comm_rank_of_min ); - for ( int i = 0; i < n; i++ ) - x[i] = signed_to_unsigned( tmp[i] ); - delete[] tmp; - } -} -// int -template<> -void MPI_CLASS::call_minReduce( - const int *x, int *y, const int n, int *comm_rank_of_min ) const -{ - PROFILE_START( "minReduce1", profile_level ); - if ( comm_rank_of_min == nullptr ) { - MPI_Allreduce( (void *) x, (void *) y, n, MPI_INT, MPI_MIN, communicator ); - } else { - auto recv = new IntIntStruct[n]; - auto send = new IntIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].j = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_2INT, MPI_MINLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - y[i] = recv[i].j; - comm_rank_of_min[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "minReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_minReduce( int *x, const int n, int *comm_rank_of_min ) const -{ - PROFILE_START( "minReduce2", profile_level ); - if ( comm_rank_of_min == nullptr ) { - auto send = x; - auto recv = new int[n]; - MPI_Allreduce( send, recv, n, MPI_INT, MPI_MIN, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - } else { - auto recv = new IntIntStruct[n]; - auto send = new IntIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].j = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_2INT, MPI_MINLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - x[i] = recv[i].j; - comm_rank_of_min[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "minReduce2", profile_level ); -} -// unsigned long int -template<> -void MPI_CLASS::call_minReduce( const unsigned long int *send, - unsigned long int *recv, const int n, int *comm_rank_of_min ) const -{ - if ( comm_rank_of_min == nullptr ) { - PROFILE_START( "minReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_MIN, communicator ); - PROFILE_STOP( "minReduce1", profile_level ); - } else { - auto tmp = new long int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = unsigned_to_signed( send[i] ); - call_minReduce( tmp, n, comm_rank_of_min ); - for ( int i = 0; i < n; i++ ) - recv[i] = signed_to_unsigned( tmp[i] ); - delete[] tmp; - } -} -template<> -void MPI_CLASS::call_minReduce( - unsigned long int *x, const int n, int *comm_rank_of_min ) const -{ - if ( comm_rank_of_min == nullptr ) { - PROFILE_START( "minReduce2", profile_level ); - auto send = x; - auto recv = new unsigned long int[n]; - MPI_Allreduce( send, recv, n, MPI_UNSIGNED_LONG, MPI_MIN, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "minReduce2", profile_level ); - } else { - auto tmp = new long int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = unsigned_to_signed( x[i] ); - call_minReduce( tmp, n, 
comm_rank_of_min ); - for ( int i = 0; i < n; i++ ) - x[i] = signed_to_unsigned( tmp[i] ); - delete[] tmp; - } -} -// long int -template<> -void MPI_CLASS::call_minReduce( - const long int *x, long int *y, const int n, int *comm_rank_of_min ) const -{ - PROFILE_START( "minReduce1", profile_level ); - if ( comm_rank_of_min == nullptr ) { - MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG, MPI_MIN, communicator ); - } else { - auto recv = new LongIntStruct[n]; - auto send = new LongIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].j = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_LONG_INT, MPI_MINLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - y[i] = recv[i].j; - comm_rank_of_min[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "minReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_minReduce( long int *x, const int n, int *comm_rank_of_min ) const -{ - PROFILE_START( "minReduce2", profile_level ); - if ( comm_rank_of_min == nullptr ) { - auto send = x; - auto recv = new long int[n]; - MPI_Allreduce( send, recv, n, MPI_LONG, MPI_MIN, communicator ); - for ( long int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - } else { - auto recv = new LongIntStruct[n]; - auto send = new LongIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].j = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_LONG_INT, MPI_MINLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - x[i] = recv[i].j; - comm_rank_of_min[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "minReduce2", profile_level ); -} -// unsigned long long int -template<> -void MPI_CLASS::call_minReduce( const unsigned long long int *send, - unsigned long long int *recv, const int n, int *comm_rank_of_min ) const -{ - PROFILE_START( "minReduce1", profile_level ); - if ( comm_rank_of_min == nullptr ) { - auto x = new long long int[n]; - auto y = new long long int[n]; - for ( int i = 0; i < n; i++ ) - x[i] = unsigned_to_signed( send[i] ); - MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG_LONG_INT, MPI_MIN, communicator ); - for ( int i = 0; i < n; i++ ) - recv[i] = signed_to_unsigned( y[i] ); - delete[] x; - delete[] y; - } else { - printf( "minReduce will use double\n" ); - auto tmp = new double[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = static_cast( send[i] ); - call_minReduce( tmp, n, comm_rank_of_min ); - for ( int i = 0; i < n; i++ ) - recv[i] = static_cast( tmp[i] ); - delete[] tmp; - } - PROFILE_STOP( "minReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_minReduce( - unsigned long long int *x, const int n, int *comm_rank_of_min ) const -{ - auto recv = new unsigned long long int[n]; - call_minReduce( x, recv, n, comm_rank_of_min ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; -} -// long long int -template<> -void MPI_CLASS::call_minReduce( - const long long int *x, long long int *y, const int n, int *comm_rank_of_min ) const -{ - PROFILE_START( "minReduce1", profile_level ); - if ( comm_rank_of_min == nullptr ) { - MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG_LONG_INT, MPI_MIN, communicator ); - } else { - printf( "minReduce will use double\n" ); - auto tmp = new double[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = static_cast( x[i] ); - call_minReduce( tmp, n, comm_rank_of_min ); - for ( int i = 0; i < n; i++ ) - y[i] = static_cast( tmp[i] ); - delete[] tmp; - } - PROFILE_STOP( "minReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_minReduce( - 
long long int *x, const int n, int *comm_rank_of_min ) const -{ - auto recv = new long long int[n]; - call_minReduce( x, recv, n, comm_rank_of_min ); - for ( int i = 0; i < n; i++ ) - x[i] = signed_to_unsigned( recv[i] ); - delete[] recv; -} -// float -template<> -void MPI_CLASS::call_minReduce( - const float *x, float *y, const int n, int *comm_rank_of_min ) const -{ - PROFILE_START( "minReduce1", profile_level ); - if ( comm_rank_of_min == nullptr ) { - MPI_Allreduce( (void *) x, (void *) y, n, MPI_INT, MPI_MIN, communicator ); - } else { - auto recv = new FloatIntStruct[n]; - auto send = new FloatIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].f = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_FLOAT_INT, MPI_MINLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - y[i] = recv[i].f; - comm_rank_of_min[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "minReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_minReduce( float *x, const int n, int *comm_rank_of_min ) const -{ - PROFILE_START( "minReduce2", profile_level ); - if ( comm_rank_of_min == nullptr ) { - auto send = x; - auto recv = new float[n]; - MPI_Allreduce( send, recv, n, MPI_FLOAT, MPI_MIN, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - } else { - auto recv = new FloatIntStruct[n]; - auto send = new FloatIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].f = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_FLOAT_INT, MPI_MINLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - x[i] = recv[i].f; - comm_rank_of_min[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "minReduce2", profile_level ); -} -// double -template<> -void MPI_CLASS::call_minReduce( - const double *x, double *y, const int n, int *comm_rank_of_min ) const -{ - PROFILE_START( "minReduce1", profile_level ); - if ( comm_rank_of_min == nullptr ) { - MPI_Allreduce( (void *) x, (void *) y, n, MPI_DOUBLE, MPI_MIN, communicator ); - } else { - auto recv = new DoubleIntStruct[n]; - auto send = new DoubleIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].d = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_DOUBLE_INT, MPI_MINLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - y[i] = recv[i].d; - comm_rank_of_min[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "minReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_minReduce( double *x, const int n, int *comm_rank_of_min ) const -{ - PROFILE_START( "minReduce2", profile_level ); - if ( comm_rank_of_min == nullptr ) { - auto send = x; - auto recv = new double[n]; - MPI_Allreduce( send, recv, n, MPI_DOUBLE, MPI_MIN, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - } else { - auto recv = new DoubleIntStruct[n]; - auto send = new DoubleIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].d = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_DOUBLE_INT, MPI_MINLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - x[i] = recv[i].d; - comm_rank_of_min[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "minReduce2", profile_level ); -} -#endif - - -/************************************************************************ - * call_maxReduce * - * Note: these specializations are only called when using MPI. 
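// Usage sketch for the call_minReduce specializations above when the rank holding
// the minimum is also requested: the int/long/float/double paths use MPI_MINLOC with
// the matching value/index pair type.  minReduce(double*,int,int*) is assumed to be
// the public wrapper in the class header that forwards to call_minReduce.
static void example_min_with_rank( const MPI_CLASS &comm, double localValue )
{
    int rankOfMin = -1;
    comm.minReduce( &localValue, 1, &rankOfMin ); // localValue becomes the global minimum
    (void) rankOfMin;                             // rank that owned the smallest value
}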
* - ************************************************************************/ -#ifdef USE_MPI -// unsigned char -template<> -void MPI_CLASS::call_maxReduce( - const unsigned char *send, unsigned char *recv, const int n, int *comm_rank_of_max ) const -{ - if ( comm_rank_of_max == nullptr ) { - PROFILE_START( "maxReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_MAX, communicator ); - PROFILE_STOP( "maxReduce1", profile_level ); - } else { - auto tmp = new int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = send[i]; - call_maxReduce( tmp, n, comm_rank_of_max ); - for ( int i = 0; i < n; i++ ) - recv[i] = static_cast( tmp[i] ); - delete[] tmp; - } -} -template<> -void MPI_CLASS::call_maxReduce( - unsigned char *x, const int n, int *comm_rank_of_max ) const -{ - if ( comm_rank_of_max == nullptr ) { - PROFILE_START( "maxReduce2", profile_level ); - auto send = x; - auto recv = new unsigned char[n]; - MPI_Allreduce( send, recv, n, MPI_UNSIGNED_CHAR, MPI_MAX, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "maxReduce2", profile_level ); - } else { - auto tmp = new int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = x[i]; - call_maxReduce( tmp, n, comm_rank_of_max ); - for ( int i = 0; i < n; i++ ) - x[i] = static_cast( tmp[i] ); - delete[] tmp; - } -} -// char -template<> -void MPI_CLASS::call_maxReduce( - const char *send, char *recv, const int n, int *comm_rank_of_max ) const -{ - if ( comm_rank_of_max == nullptr ) { - PROFILE_START( "maxReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_MAX, communicator ); - PROFILE_STOP( "maxReduce1", profile_level ); - } else { - auto tmp = new int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = send[i]; - call_maxReduce( tmp, n, comm_rank_of_max ); - for ( int i = 0; i < n; i++ ) - recv[i] = static_cast( tmp[i] ); - delete[] tmp; - } -} -template<> -void MPI_CLASS::call_maxReduce( char *x, const int n, int *comm_rank_of_max ) const -{ - if ( comm_rank_of_max == nullptr ) { - PROFILE_START( "maxReduce2", profile_level ); - auto send = x; - auto recv = new char[n]; - MPI_Allreduce( send, recv, n, MPI_SIGNED_CHAR, MPI_MAX, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "maxReduce2", profile_level ); - } else { - auto tmp = new int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = x[i]; - call_maxReduce( tmp, n, comm_rank_of_max ); - for ( int i = 0; i < n; i++ ) - x[i] = static_cast( tmp[i] ); - delete[] tmp; - } -} -// unsigned int -template<> -void MPI_CLASS::call_maxReduce( - const unsigned int *send, unsigned int *recv, const int n, int *comm_rank_of_max ) const -{ - if ( comm_rank_of_max == nullptr ) { - PROFILE_START( "maxReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_MAX, communicator ); - PROFILE_STOP( "maxReduce1", profile_level ); - } else { - auto tmp = new int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = unsigned_to_signed( send[i] ); - call_maxReduce( tmp, n, comm_rank_of_max ); - for ( int i = 0; i < n; i++ ) - recv[i] = signed_to_unsigned( tmp[i] ); - delete[] tmp; - } -} -template<> -void MPI_CLASS::call_maxReduce( - unsigned int *x, const int n, int *comm_rank_of_max ) const -{ - if ( comm_rank_of_max == nullptr ) { - PROFILE_START( "maxReduce2", profile_level ); - auto send = x; - auto recv = new unsigned int[n]; - MPI_Allreduce( send, recv, n, MPI_UNSIGNED, MPI_MAX, communicator ); - for ( int i = 0; 
i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "maxReduce2", profile_level ); - } else { - auto tmp = new int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = unsigned_to_signed( x[i] ); - call_maxReduce( tmp, n, comm_rank_of_max ); - for ( int i = 0; i < n; i++ ) - x[i] = signed_to_unsigned( tmp[i] ); - delete[] tmp; - } -} -// int -template<> -void MPI_CLASS::call_maxReduce( - const int *x, int *y, const int n, int *comm_rank_of_max ) const -{ - PROFILE_START( "maxReduce1", profile_level ); - if ( comm_rank_of_max == nullptr ) { - MPI_Allreduce( (void *) x, (void *) y, n, MPI_INT, MPI_MAX, communicator ); - } else { - auto recv = new IntIntStruct[n]; - auto send = new IntIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].j = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_2INT, MPI_MAXLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - y[i] = recv[i].j; - comm_rank_of_max[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "maxReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_maxReduce( int *x, const int n, int *comm_rank_of_max ) const -{ - PROFILE_START( "maxReduce2", profile_level ); - if ( comm_rank_of_max == nullptr ) { - int *send = x; - auto recv = new int[n]; - MPI_Allreduce( send, recv, n, MPI_INT, MPI_MAX, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - } else { - auto recv = new IntIntStruct[n]; - auto send = new IntIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].j = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_2INT, MPI_MAXLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - x[i] = recv[i].j; - comm_rank_of_max[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "maxReduce2", profile_level ); -} -// long int -template<> -void MPI_CLASS::call_maxReduce( - const long int *x, long int *y, const int n, int *comm_rank_of_max ) const -{ - PROFILE_START( "maxReduce1", profile_level ); - if ( comm_rank_of_max == nullptr ) { - MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG, MPI_MAX, communicator ); - } else { - auto recv = new LongIntStruct[n]; - auto send = new LongIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].j = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_LONG_INT, MPI_MAXLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - y[i] = recv[i].j; - comm_rank_of_max[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "maxReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_maxReduce( long int *x, const int n, int *comm_rank_of_max ) const -{ - PROFILE_START( "maxReduce2", profile_level ); - if ( comm_rank_of_max == nullptr ) { - auto send = x; - auto recv = new long int[n]; - MPI_Allreduce( send, recv, n, MPI_LONG, MPI_MAX, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - } else { - auto recv = new LongIntStruct[n]; - auto send = new LongIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].j = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_LONG_INT, MPI_MAXLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - x[i] = recv[i].j; - comm_rank_of_max[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "maxReduce2", profile_level ); -} -// unsigned long int -template<> -void MPI_CLASS::call_maxReduce( const unsigned long int *send, - unsigned long int *recv, const int n, int *comm_rank_of_max ) const -{ - if ( comm_rank_of_max == nullptr ) { - 
PROFILE_START( "maxReduce1", profile_level ); - MPI_Allreduce( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_MAX, communicator ); - PROFILE_STOP( "maxReduce1", profile_level ); - } else { - auto tmp = new long int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = unsigned_to_signed( send[i] ); - call_maxReduce( tmp, n, comm_rank_of_max ); - for ( int i = 0; i < n; i++ ) - recv[i] = signed_to_unsigned( tmp[i] ); - delete[] tmp; - } -} -template<> -void MPI_CLASS::call_maxReduce( - unsigned long int *x, const int n, int *comm_rank_of_max ) const -{ - if ( comm_rank_of_max == nullptr ) { - PROFILE_START( "maxReduce2", profile_level ); - auto send = x; - auto recv = new unsigned long int[n]; - MPI_Allreduce( send, recv, n, MPI_UNSIGNED_LONG, MPI_MAX, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - PROFILE_STOP( "maxReduce2", profile_level ); - } else { - auto tmp = new long int[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = unsigned_to_signed( x[i] ); - call_maxReduce( tmp, n, comm_rank_of_max ); - for ( int i = 0; i < n; i++ ) - x[i] = signed_to_unsigned( tmp[i] ); - delete[] tmp; - } -} -// unsigned long long int -template<> -void MPI_CLASS::call_maxReduce( const unsigned long long int *send, - unsigned long long int *recv, const int n, int *comm_rank_of_max ) const -{ - PROFILE_START( "maxReduce1", profile_level ); - if ( comm_rank_of_max == nullptr ) { - auto x = new long long int[n]; - auto y = new long long int[n]; - for ( int i = 0; i < n; i++ ) - x[i] = unsigned_to_signed( send[i] ); - MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG_LONG_INT, MPI_MAX, communicator ); - for ( int i = 0; i < n; i++ ) - recv[i] = signed_to_unsigned( y[i] ); - delete[] x; - delete[] y; - } else { - printf( "maxReduce will use double\n" ); - auto tmp = new double[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = static_cast( send[i] ); - call_maxReduce( tmp, n, comm_rank_of_max ); - for ( int i = 0; i < n; i++ ) - recv[i] = static_cast( tmp[i] ); - delete[] tmp; - } - PROFILE_STOP( "maxReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_maxReduce( - unsigned long long int *x, const int n, int *comm_rank_of_max ) const -{ - auto recv = new unsigned long long int[n]; - call_maxReduce( x, recv, n, comm_rank_of_max ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; -} -// long long int -template<> -void MPI_CLASS::call_maxReduce( - const long long int *x, long long int *y, const int n, int *comm_rank_of_max ) const -{ - PROFILE_START( "maxReduce1", profile_level ); - if ( comm_rank_of_max == nullptr ) { - MPI_Allreduce( (void *) x, (void *) y, n, MPI_LONG_LONG_INT, MPI_MAX, communicator ); - } else { - printf( "maxReduce will use double\n" ); - auto tmp = new double[n]; - for ( int i = 0; i < n; i++ ) - tmp[i] = static_cast( x[i] ); - call_maxReduce( tmp, n, comm_rank_of_max ); - for ( int i = 0; i < n; i++ ) - y[i] = static_cast( tmp[i] ); - delete[] tmp; - } - PROFILE_STOP( "maxReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_maxReduce( - long long int *x, const int n, int *comm_rank_of_max ) const -{ - auto recv = new long long int[n]; - call_maxReduce( x, recv, n, comm_rank_of_max ); - for ( int i = 0; i < n; i++ ) - x[i] = signed_to_unsigned( recv[i] ); - delete[] recv; -} -// float -template<> -void MPI_CLASS::call_maxReduce( - const float *x, float *y, const int n, int *comm_rank_of_max ) const -{ - PROFILE_START( "maxReduce1", profile_level ); - if ( comm_rank_of_max == nullptr ) { - MPI_Allreduce( (void *) x, 
(void *) y, n, MPI_FLOAT, MPI_MAX, communicator ); - } else { - auto recv = new FloatIntStruct[n]; - auto send = new FloatIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].f = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_FLOAT_INT, MPI_MAXLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - y[i] = recv[i].f; - comm_rank_of_max[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "maxReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_maxReduce( float *x, const int n, int *comm_rank_of_max ) const -{ - PROFILE_START( "maxReduce2", profile_level ); - if ( comm_rank_of_max == nullptr ) { - auto send = x; - auto recv = new float[n]; - MPI_Allreduce( send, recv, n, MPI_FLOAT, MPI_MAX, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - } else { - auto recv = new FloatIntStruct[n]; - auto send = new FloatIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].f = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_FLOAT_INT, MPI_MAXLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - x[i] = recv[i].f; - comm_rank_of_max[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "maxReduce2", profile_level ); -} -// double -template<> -void MPI_CLASS::call_maxReduce( - const double *x, double *y, const int n, int *comm_rank_of_max ) const -{ - PROFILE_START( "maxReduce1", profile_level ); - if ( comm_rank_of_max == nullptr ) { - MPI_Allreduce( (void *) x, (void *) y, n, MPI_DOUBLE, MPI_MAX, communicator ); - } else { - auto recv = new DoubleIntStruct[n]; - auto send = new DoubleIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].d = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_DOUBLE_INT, MPI_MAXLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - y[i] = recv[i].d; - comm_rank_of_max[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "maxReduce1", profile_level ); -} -template<> -void MPI_CLASS::call_maxReduce( double *x, const int n, int *comm_rank_of_max ) const -{ - PROFILE_START( "maxReduce2", profile_level ); - if ( comm_rank_of_max == nullptr ) { - auto send = x; - auto recv = new double[n]; - MPI_Allreduce( send, recv, n, MPI_DOUBLE, MPI_MAX, communicator ); - for ( int i = 0; i < n; i++ ) - x[i] = recv[i]; - delete[] recv; - } else { - auto recv = new DoubleIntStruct[n]; - auto send = new DoubleIntStruct[n]; - for ( int i = 0; i < n; ++i ) { - send[i].d = x[i]; - send[i].i = comm_rank; - } - MPI_Allreduce( send, recv, n, MPI_DOUBLE_INT, MPI_MAXLOC, communicator ); - for ( int i = 0; i < n; ++i ) { - x[i] = recv[i].d; - comm_rank_of_max[i] = recv[i].i; - } - delete[] recv; - delete[] send; - } - PROFILE_STOP( "maxReduce2", profile_level ); -} -#endif - - -/************************************************************************ - * bcast * - * Note: these specializations are only called when using MPI. 
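// Usage sketch for the call_maxReduce specializations above.  Note that the
// (unsigned) long long paths fall back to a double reduction when the rank of the
// maximum is requested, so exact 64-bit values may lose precision on that path.
// maxReduce(int*,int,int*) is assumed to be the public wrapper in the class header.
static void example_max_with_rank( const MPI_CLASS &comm, int localValue )
{
    int rankOfMax = -1;
    comm.maxReduce( &localValue, 1, &rankOfMax ); // localValue becomes the global maximum
    (void) rankOfMax;                             // rank that owned the largest value
}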
* - ************************************************************************/ -#ifdef USE_MPI -// char -template<> -void MPI_CLASS::call_bcast( unsigned char *x, const int n, const int root ) const -{ - PROFILE_START( "bcast", profile_level ); - MPI_Bcast( x, n, MPI_UNSIGNED_CHAR, root, communicator ); - PROFILE_STOP( "bcast", profile_level ); -} -template<> -void MPI_CLASS::call_bcast( char *x, const int n, const int root ) const -{ - PROFILE_START( "bcast", profile_level ); - MPI_Bcast( x, n, MPI_CHAR, root, communicator ); - PROFILE_STOP( "bcast", profile_level ); -} -// int -template<> -void MPI_CLASS::call_bcast( unsigned int *x, const int n, const int root ) const -{ - PROFILE_START( "bcast", profile_level ); - MPI_Bcast( x, n, MPI_UNSIGNED, root, communicator ); - PROFILE_STOP( "bcast", profile_level ); -} -template<> -void MPI_CLASS::call_bcast( int *x, const int n, const int root ) const -{ - PROFILE_START( "bcast", profile_level ); - MPI_Bcast( x, n, MPI_INT, root, communicator ); - PROFILE_STOP( "bcast", profile_level ); -} -// float -template<> -void MPI_CLASS::call_bcast( float *x, const int n, const int root ) const -{ - PROFILE_START( "bcast", profile_level ); - MPI_Bcast( x, n, MPI_FLOAT, root, communicator ); - PROFILE_STOP( "bcast", profile_level ); -} -// double -template<> -void MPI_CLASS::call_bcast( double *x, const int n, const int root ) const -{ - PROFILE_START( "bcast", profile_level ); - MPI_Bcast( x, n, MPI_DOUBLE, root, communicator ); - PROFILE_STOP( "bcast", profile_level ); -} -#else -// We need a concrete instantiation of bcast(x,n,root); -template<> -void MPI_CLASS::call_bcast( char *, const int, const int ) const -{ -} -#endif - - -/************************************************************************ - * Perform a global barrier across all processors. * - ************************************************************************/ -void MPI_CLASS::barrier() const -{ -#ifdef USE_MPI - MPI_Barrier( communicator ); -#endif -} - - -/************************************************************************ - * Send data array to another processor. * - * Note: these specializations are only called when using MPI. * - ************************************************************************/ -#ifdef USE_MPI -// char -template<> -void MPI_CLASS::send( - const char *buf, const int length, const int recv_proc_number, int tag ) const -{ - // Set the tag to 0 if it is < 0 - tag = ( tag >= 0 ) ? tag : 0; - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - // Send the data - PROFILE_START( "send", profile_level ); - MPI_Send( (void *) buf, length, MPI_CHAR, recv_proc_number, tag, communicator ); - PROFILE_STOP( "send", profile_level ); -} -// int -template<> -void MPI_CLASS::send( - const int *buf, const int length, const int recv_proc_number, int tag ) const -{ - // Set the tag to 0 if it is < 0 - tag = ( tag >= 0 ) ? tag : 0; - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - // Send the data - PROFILE_START( "send", profile_level ); - MPI_Send( (void *) buf, length, MPI_INT, recv_proc_number, tag, communicator ); - PROFILE_STOP( "send", profile_level ); -} -// float -template<> -void MPI_CLASS::send( - const float *buf, const int length, const int recv_proc_number, int tag ) const -{ - // Set the tag to 0 if it is < 0 - tag = ( tag >= 0 ) ? 
tag : 0; - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - // Send the data - PROFILE_START( "send", profile_level ); - MPI_Send( (void *) buf, length, MPI_FLOAT, recv_proc_number, tag, communicator ); - PROFILE_STOP( "send", profile_level ); -} -// double -template<> -void MPI_CLASS::send( - const double *buf, const int length, const int recv_proc_number, int tag ) const -{ - // Set the tag to 0 if it is < 0 - tag = ( tag >= 0 ) ? tag : 0; - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - // Send the data - PROFILE_START( "send", profile_level ); - MPI_Send( (void *) buf, length, MPI_DOUBLE, recv_proc_number, tag, communicator ); - PROFILE_STOP( "send", profile_level ); -} -#else -// We need a concrete instantiation of send for use without MPI -template<> -void MPI_CLASS::send( const char *buf, const int length, const int, int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - PROFILE_START( "send", profile_level ); - auto id = getRequest( communicator, tag ); - auto it = global_isendrecv_list.find( id ); - MPI_INSIST( it == global_isendrecv_list.end(), - "send must be paired with a previous call to irecv in serial" ); - MPI_ASSERT( it->second.status == 2 ); - memcpy( (char *) it->second.data, buf, length ); - global_isendrecv_list.erase( it ); - PROFILE_START( "send", profile_level ); -} -#endif - - -/************************************************************************ - * Non-blocking send data array to another processor. * - * Note: these specializations are only called when using MPI. * - ************************************************************************/ -#ifdef USE_MPI -// char -template<> -MPI_Request MPI_CLASS::Isend( - const char *buf, const int length, const int recv_proc, const int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - MPI_Request request; - PROFILE_START( "Isend", profile_level ); - MPI_Isend( (void *) buf, length, MPI_CHAR, recv_proc, tag, communicator, &request ); - PROFILE_STOP( "Isend", profile_level ); - return request; -} -// int -template<> -MPI_Request MPI_CLASS::Isend( - const int *buf, const int length, const int recv_proc, const int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - MPI_Request request; - PROFILE_START( "Isend", profile_level ); - MPI_Isend( (void *) buf, length, MPI_INT, recv_proc, tag, communicator, &request ); - PROFILE_STOP( "Isend", profile_level ); - return request; -} -// float -template<> -MPI_Request MPI_CLASS::Isend( - const float *buf, const int length, const int recv_proc, const int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - MPI_Request request; - PROFILE_START( "Isend", profile_level ); - MPI_Isend( (void *) buf, length, MPI_FLOAT, recv_proc, tag, communicator, &request ); - PROFILE_STOP( "Isend", profile_level ); - return request; -} -// double -template<> -MPI_Request MPI_CLASS::Isend( - const double *buf, const int length, const int recv_proc, const int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - MPI_Request request; - PROFILE_START( "Isend", profile_level ); - MPI_Isend( (void *) buf, length, MPI_DOUBLE, recv_proc, tag, communicator, &request ); - PROFILE_STOP( "Isend", profile_level ); - return 
request; -} -#else -// We need a concrete instantiation of send for use without mpi -template<> -MPI_Request MPI_CLASS::Isend( - const char *buf, const int length, const int, const int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - PROFILE_START( "Isend", profile_level ); - auto id = getRequest( communicator, tag ); - auto it = global_isendrecv_list.find( id ); - if ( it == global_isendrecv_list.end() ) { - // We are calling isend first - Isendrecv_struct data; - data.data = buf; - data.status = 1; - global_isendrecv_list.insert( std::pair( id, data ) ); - } else { - // We called irecv first - MPI_ASSERT( it->second.status == 2 ); - memcpy( (char *) it->second.data, buf, length ); - global_isendrecv_list.erase( it ); - } - PROFILE_STOP( "Isend", profile_level ); - return id; -} -#endif - - -/************************************************************************ - * Send byte array to another processor. * - ************************************************************************/ -void MPI_CLASS::sendBytes( - const void *buf, const int number_bytes, const int recv_proc_number, int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - send( (const char *) buf, number_bytes, recv_proc_number, tag ); -} - - -/************************************************************************ - * Non-blocking send byte array to another processor. * - ************************************************************************/ -MPI_Request MPI_CLASS::IsendBytes( - const void *buf, const int number_bytes, const int recv_proc, const int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - return Isend( (const char *) buf, number_bytes, recv_proc, tag ); -} - - -/************************************************************************ - * Recieve data array to another processor. * - * Note: these specializations are only called when using MPI. * - ************************************************************************/ -#ifdef USE_MPI -// char -template<> -void MPI_CLASS::recv( - char *buf, int &length, const int send_proc_number, const bool get_length, int tag ) const -{ - // Set the tag to 0 if it is < 0 - tag = ( tag >= 0 ) ? tag : 0; - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - PROFILE_START( "recv", profile_level ); - // Get the recieve length if necessary - if ( get_length ) { - int bytes = this->probe( send_proc_number, tag ); - int recv_length = bytes / sizeof( char ); - MPI_INSIST( length >= recv_length, "Recived length is larger than allocated array" ); - length = recv_length; - } - // Send the data - MPI_Status status; - MPI_Recv( (void *) buf, length, MPI_CHAR, send_proc_number, tag, communicator, &status ); - PROFILE_STOP( "recv", profile_level ); -} -// int -template<> -void MPI_CLASS::recv( - int *buf, int &length, const int send_proc_number, const bool get_length, int tag ) const -{ - // Set the tag to 0 if it is < 0 - tag = ( tag >= 0 ) ? 
tag : 0; - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - PROFILE_START( "recv", profile_level ); - // Get the recieve length if necessary - if ( get_length ) { - int bytes = this->probe( send_proc_number, tag ); - int recv_length = bytes / sizeof( int ); - MPI_INSIST( length >= recv_length, "Recived length is larger than allocated array" ); - length = recv_length; - } - // Send the data - MPI_Status status; - MPI_Recv( (void *) buf, length, MPI_INT, send_proc_number, tag, communicator, &status ); - PROFILE_STOP( "recv", profile_level ); -} -// float -template<> -void MPI_CLASS::recv( - float *buf, int &length, const int send_proc_number, const bool get_length, int tag ) const -{ - // Set the tag to 0 if it is < 0 - tag = ( tag >= 0 ) ? tag : 0; - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - PROFILE_START( "recv", profile_level ); - // Get the recieve length if necessary - if ( get_length ) { - int bytes = this->probe( send_proc_number, tag ); - int recv_length = bytes / sizeof( float ); - MPI_INSIST( length >= recv_length, "Recived length is larger than allocated array" ); - length = recv_length; - } - // Send the data - MPI_Status status; - MPI_Recv( (void *) buf, length, MPI_FLOAT, send_proc_number, tag, communicator, &status ); - PROFILE_STOP( "recv", profile_level ); -} -// double -template<> -void MPI_CLASS::recv( - double *buf, int &length, const int send_proc_number, const bool get_length, int tag ) const -{ - // Set the tag to 0 if it is < 0 - tag = ( tag >= 0 ) ? tag : 0; - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - PROFILE_START( "recv", profile_level ); - // Get the recieve length if necessary - if ( get_length ) { - int bytes = this->probe( send_proc_number, tag ); - int recv_length = bytes / sizeof( double ); - MPI_INSIST( length >= recv_length, "Recived length is larger than allocated array" ); - length = recv_length; - } - // Send the data - MPI_Status status; - MPI_Recv( (void *) buf, length, MPI_DOUBLE, send_proc_number, tag, communicator, &status ); - PROFILE_STOP( "recv", profile_level ); -} -#else -// We need a concrete instantiation of recv for use without mpi -template<> -void MPI_CLASS::recv( char *buf, int &length, const int, const bool, int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - PROFILE_START( "recv", profile_level ); - auto id = getRequest( communicator, tag ); - auto it = global_isendrecv_list.find( id ); - MPI_INSIST( it != global_isendrecv_list.end(), - "recv must be paired with a previous call to isend in serial" ); - MPI_ASSERT( it->second.status == 1 ); - memcpy( buf, it->second.data, length ); - global_isendrecv_list.erase( it ); - PROFILE_STOP( "recv", profile_level ); -} -#endif - - -/************************************************************************ - * Non-blocking recieve data array to another processor. * - * Note: these specializations are only called when using MPI. 
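// Usage sketch for the blocking send/recv specializations above: rank 0 sends ten
// doubles to rank 1.  The receive length is passed by reference and is only updated
// from probe() when get_length is true.  getRank() is assumed from the class header.
static void example_send_recv( const MPI_CLASS &comm )
{
    const int tag = 0;
    if ( comm.getRank() == 0 ) {
        double buf[10] = { 0 };
        comm.send( buf, 10, 1, tag );            // blocking MPI_Send to rank 1
    } else if ( comm.getRank() == 1 ) {
        double buf[10];
        int length = 10;
        comm.recv( buf, length, 0, false, tag ); // blocking MPI_Recv from rank 0
    }
}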
* - ************************************************************************/ -#ifdef USE_MPI -// char -template<> -MPI_Request MPI_CLASS::Irecv( - char *buf, const int length, const int send_proc, const int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - MPI_Request request; - PROFILE_START( "Irecv", profile_level ); - MPI_Irecv( (void *) buf, length, MPI_CHAR, send_proc, tag, communicator, &request ); - PROFILE_STOP( "Irecv", profile_level ); - return request; -} -// int -template<> -MPI_Request MPI_CLASS::Irecv( - int *buf, const int length, const int send_proc, const int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - MPI_Request request; - PROFILE_START( "Irecv", profile_level ); - MPI_Irecv( (void *) buf, length, MPI_INT, send_proc, tag, communicator, &request ); - PROFILE_STOP( "Irecv", profile_level ); - return request; -} -// float -template<> -MPI_Request MPI_CLASS::Irecv( - float *buf, const int length, const int send_proc, const int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - MPI_Request request; - PROFILE_START( "Irecv", profile_level ); - MPI_Irecv( (void *) buf, length, MPI_FLOAT, send_proc, tag, communicator, &request ); - PROFILE_STOP( "Irecv", profile_level ); - return request; -} -// double -template<> -MPI_Request MPI_CLASS::Irecv( - double *buf, const int length, const int send_proc, const int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - MPI_Request request; - PROFILE_START( "Irecv", profile_level ); - MPI_Irecv( (void *) buf, length, MPI_DOUBLE, send_proc, tag, communicator, &request ); - PROFILE_STOP( "Irecv", profile_level ); - return request; -} -#else -// We need a concrete instantiation of irecv for use without mpi -template<> -MPI_Request MPI_CLASS::Irecv( char *buf, const int length, const int, const int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - PROFILE_START( "Irecv", profile_level ); - auto id = getRequest( communicator, tag ); - auto it = global_isendrecv_list.find( id ); - if ( it == global_isendrecv_list.end() ) { - // We are calling Irecv first - Isendrecv_struct data; - data.data = buf; - data.status = 2; - global_isendrecv_list.insert( std::pair( id, data ) ); - } else { - // We called Isend first - MPI_ASSERT( it->second.status == 1 ); - memcpy( buf, it->second.data, length ); - global_isendrecv_list.erase( it ); - } - PROFILE_STOP( "Irecv", profile_level ); - return id; -} -#endif - - -/************************************************************************ - * Recieve byte array to another processor. * - ************************************************************************/ -void MPI_CLASS::recvBytes( void *buf, int &number_bytes, const int send_proc, int tag ) const -{ - recv( (char *) buf, number_bytes, send_proc, false, tag ); -} - - -/************************************************************************ - * Recieve byte array to another processor. 
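// Usage sketch for the non-blocking Isend/Irecv specializations above.  Each call
// returns an MPI_Request; when USE_MPI is defined the request can be completed with
// MPI_Wait (or with the wrapper's own wait helpers, if present in the class header).
// getRank() is assumed from the class header.
static void example_isend_irecv( const MPI_CLASS &comm, int partner )
{
    const int tag = 1;
    int sendBuf = comm.getRank();
    int recvBuf = -1;
    MPI_Request sendReq = comm.Isend( &sendBuf, 1, partner, tag );
    MPI_Request recvReq = comm.Irecv( &recvBuf, 1, partner, tag );
    MPI_Wait( &sendReq, MPI_STATUS_IGNORE );
    MPI_Wait( &recvReq, MPI_STATUS_IGNORE );
}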
* - ************************************************************************/ -MPI_Request MPI_CLASS::IrecvBytes( - void *buf, const int number_bytes, const int send_proc, const int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - return Irecv( (char *) buf, number_bytes, send_proc, tag ); -} - - -/************************************************************************ - * allGather * - * Note: these specializations are only called when using MPI. * - ************************************************************************/ -#ifdef USE_MPI -// unsigned char -template<> -void MPI_CLASS::call_allGather( - const unsigned char &x_in, unsigned char *x_out ) const -{ - PROFILE_START( "allGather", profile_level ); - MPI_Allgather( - (void *) &x_in, 1, MPI_UNSIGNED_CHAR, (void *) x_out, 1, MPI_UNSIGNED_CHAR, communicator ); - PROFILE_STOP( "allGather", profile_level ); -} -template<> -void MPI_CLASS::call_allGather( const unsigned char *x_in, int size_in, - unsigned char *x_out, int *size_out, int *disp_out ) const -{ - PROFILE_START( "allGatherv", profile_level ); - MPI_Allgatherv( (void *) x_in, size_in, MPI_CHAR, (void *) x_out, size_out, disp_out, MPI_CHAR, - communicator ); - PROFILE_STOP( "allGatherv", profile_level ); -} -// char -template<> -void MPI_CLASS::call_allGather( const char &x_in, char *x_out ) const -{ - PROFILE_START( "allGather", profile_level ); - MPI_Allgather( (void *) &x_in, 1, MPI_CHAR, (void *) x_out, 1, MPI_CHAR, communicator ); - PROFILE_STOP( "allGather", profile_level ); -} -template<> -void MPI_CLASS::call_allGather( - const char *x_in, int size_in, char *x_out, int *size_out, int *disp_out ) const -{ - PROFILE_START( "allGatherv", profile_level ); - MPI_Allgatherv( (void *) x_in, size_in, MPI_CHAR, (void *) x_out, size_out, disp_out, MPI_CHAR, - communicator ); - PROFILE_STOP( "allGatherv", profile_level ); -} -// unsigned int -template<> -void MPI_CLASS::call_allGather( const unsigned int &x_in, unsigned int *x_out ) const -{ - PROFILE_START( "allGather", profile_level ); - MPI_Allgather( (void *) &x_in, 1, MPI_UNSIGNED, (void *) x_out, 1, MPI_UNSIGNED, communicator ); - PROFILE_STOP( "allGather", profile_level ); -} -template<> -void MPI_CLASS::call_allGather( - const unsigned int *x_in, int size_in, unsigned int *x_out, int *size_out, int *disp_out ) const -{ - PROFILE_START( "allGatherv", profile_level ); - MPI_Allgatherv( (void *) x_in, size_in, MPI_UNSIGNED, (void *) x_out, size_out, disp_out, - MPI_UNSIGNED, communicator ); - PROFILE_STOP( "allGatherv", profile_level ); -} -// int -template<> -void MPI_CLASS::call_allGather( const int &x_in, int *x_out ) const -{ - PROFILE_START( "allGather", profile_level ); - MPI_Allgather( (void *) &x_in, 1, MPI_INT, (void *) x_out, 1, MPI_INT, communicator ); - PROFILE_STOP( "allGather", profile_level ); -} -template<> -void MPI_CLASS::call_allGather( - const int *x_in, int size_in, int *x_out, int *size_out, int *disp_out ) const -{ - PROFILE_START( "allGatherv", profile_level ); - MPI_Allgatherv( (void *) x_in, size_in, MPI_INT, (void *) x_out, size_out, disp_out, MPI_INT, - communicator ); - PROFILE_STOP( "allGatherv", profile_level ); -} -// unsigned long int -template<> -void MPI_CLASS::call_allGather( - const unsigned long int &x_in, unsigned long int *x_out ) const -{ - PROFILE_START( "allGather", profile_level ); - MPI_Allgather( - (void *) &x_in, 1, MPI_UNSIGNED_LONG, (void *) x_out, 1, MPI_UNSIGNED_LONG, communicator ); - 
PROFILE_STOP( "allGather", profile_level ); -} -template<> -void MPI_CLASS::call_allGather( const unsigned long int *x_in, int size_in, - unsigned long int *x_out, int *size_out, int *disp_out ) const -{ - PROFILE_START( "allGatherv", profile_level ); - MPI_Allgatherv( (void *) x_in, size_in, MPI_UNSIGNED_LONG, (void *) x_out, size_out, disp_out, - MPI_UNSIGNED_LONG, communicator ); - PROFILE_STOP( "allGatherv", profile_level ); -} -// long int -template<> -void MPI_CLASS::call_allGather( const long int &x_in, long int *x_out ) const -{ - PROFILE_START( "allGather", profile_level ); - MPI_Allgather( (void *) &x_in, 1, MPI_LONG, (void *) x_out, 1, MPI_LONG, communicator ); - PROFILE_STOP( "allGather", profile_level ); -} -template<> -void MPI_CLASS::call_allGather( - const long int *x_in, int size_in, long int *x_out, int *size_out, int *disp_out ) const -{ - PROFILE_START( "allGatherv", profile_level ); - MPI_Allgatherv( (void *) x_in, size_in, MPI_LONG, (void *) x_out, size_out, disp_out, MPI_LONG, - communicator ); - PROFILE_STOP( "allGatherv", profile_level ); -} -// float -template<> -void MPI_CLASS::call_allGather( const float &x_in, float *x_out ) const -{ - PROFILE_START( "allGather", profile_level ); - MPI_Allgather( (void *) &x_in, 1, MPI_FLOAT, (void *) x_out, 1, MPI_FLOAT, communicator ); - PROFILE_STOP( "allGather", profile_level ); -} -template<> -void MPI_CLASS::call_allGather( - const float *x_in, int size_in, float *x_out, int *size_out, int *disp_out ) const -{ - PROFILE_START( "allGatherv", profile_level ); - MPI_Allgatherv( (void *) x_in, size_in, MPI_FLOAT, (void *) x_out, size_out, disp_out, - MPI_FLOAT, communicator ); - PROFILE_STOP( "allGatherv", profile_level ); -} -// double -template<> -void MPI_CLASS::call_allGather( const double &x_in, double *x_out ) const -{ - PROFILE_START( "allGather", profile_level ); - MPI_Allgather( (void *) &x_in, 1, MPI_DOUBLE, (void *) x_out, 1, MPI_DOUBLE, communicator ); - PROFILE_STOP( "allGather", profile_level ); -} -template<> -void MPI_CLASS::call_allGather( - const double *x_in, int size_in, double *x_out, int *size_out, int *disp_out ) const -{ - PROFILE_START( "allGatherv", profile_level ); - MPI_Allgatherv( (void *) x_in, size_in, MPI_DOUBLE, (void *) x_out, size_out, disp_out, - MPI_DOUBLE, communicator ); - PROFILE_STOP( "allGatherv", profile_level ); -} -#else -// We need a concrete instantiation of call_allGather(x_in,size_in,x_out,size_out) -template<> -void MPI_CLASS::call_allGather( const char *, int, char *, int *, int * ) const -{ - MPI_ERROR( "Internal error in communicator (allGather) " ); -} -#endif - - -/************************************************************************ - * allToAll * - * Note: these specializations are only called when using MPI. 
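// Usage sketch for the call_allGather specializations above: every rank contributes
// one value and receives the value from all ranks.  allGather, getRank() and
// getSize() are assumed to be the public wrappers in the class header, and <vector>
// is assumed to be included.
static void example_all_gather( const MPI_CLASS &comm )
{
    std::vector<int> all( comm.getSize(), -1 );
    comm.allGather( comm.getRank(), &all[0] ); // afterwards all[i] == i on every rank
}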
* - ************************************************************************/ -#ifdef USE_MPI -template<> -void MPI_CLASS::allToAll( - const int n, const unsigned char *send, unsigned char *recv ) const -{ - PROFILE_START( "allToAll", profile_level ); - MPI_Alltoall( - (void *) send, n, MPI_UNSIGNED_CHAR, (void *) recv, n, MPI_UNSIGNED_CHAR, communicator ); - PROFILE_STOP( "allToAll", profile_level ); -} -template<> -void MPI_CLASS::allToAll( const int n, const char *send, char *recv ) const -{ - PROFILE_START( "allToAll", profile_level ); - MPI_Alltoall( (void *) send, n, MPI_CHAR, (void *) recv, n, MPI_CHAR, communicator ); - PROFILE_STOP( "allToAll", profile_level ); -} -template<> -void MPI_CLASS::allToAll( - const int n, const unsigned int *send, unsigned int *recv ) const -{ - PROFILE_START( "allToAll", profile_level ); - MPI_Alltoall( (void *) send, n, MPI_UNSIGNED, (void *) recv, n, MPI_UNSIGNED, communicator ); - PROFILE_STOP( "allToAll", profile_level ); -} -template<> -void MPI_CLASS::allToAll( const int n, const int *send, int *recv ) const -{ - PROFILE_START( "allToAll", profile_level ); - MPI_Alltoall( (void *) send, n, MPI_INT, (void *) recv, n, MPI_INT, communicator ); - PROFILE_STOP( "allToAll", profile_level ); -} -template<> -void MPI_CLASS::allToAll( - const int n, const unsigned long int *send, unsigned long int *recv ) const -{ - PROFILE_START( "allToAll", profile_level ); - MPI_Alltoall( - (void *) send, n, MPI_UNSIGNED_LONG, (void *) recv, n, MPI_UNSIGNED_LONG, communicator ); - PROFILE_STOP( "allToAll", profile_level ); -} -template<> -void MPI_CLASS::allToAll( const int n, const long int *send, long int *recv ) const -{ - PROFILE_START( "allToAll", profile_level ); - MPI_Alltoall( (void *) send, n, MPI_LONG, (void *) recv, n, MPI_LONG, communicator ); - PROFILE_STOP( "allToAll", profile_level ); -} -template<> -void MPI_CLASS::allToAll( const int n, const float *send, float *recv ) const -{ - PROFILE_START( "allToAll", profile_level ); - MPI_Alltoall( (void *) send, n, MPI_FLOAT, (void *) recv, n, MPI_FLOAT, communicator ); - PROFILE_STOP( "allToAll", profile_level ); -} -template<> -void MPI_CLASS::allToAll( const int n, const double *send, double *recv ) const -{ - PROFILE_START( "allToAll", profile_level ); - MPI_Alltoall( (void *) send, n, MPI_DOUBLE, (void *) recv, n, MPI_DOUBLE, communicator ); - PROFILE_STOP( "allToAll", profile_level ); -} -#endif - - -/************************************************************************ - * call_allToAll * - * Note: these specializations are only called when using MPI. 
* - ************************************************************************/ -#ifdef USE_MPI -// unsigned char -template<> -void MPI_CLASS::call_allToAll( const unsigned char *send_data, const int send_cnt[], - const int send_disp[], unsigned char *recv_data, const int *recv_cnt, - const int *recv_disp ) const -{ - PROFILE_START( "allToAllv", profile_level ); - MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_UNSIGNED_CHAR, - (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_UNSIGNED_CHAR, communicator ); - PROFILE_STOP( "allToAllv", profile_level ); -} -// char -template<> -void MPI_CLASS::call_allToAll( const char *send_data, const int send_cnt[], - const int send_disp[], char *recv_data, const int *recv_cnt, const int *recv_disp ) const -{ - PROFILE_START( "allToAllv", profile_level ); - MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_CHAR, - (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_CHAR, communicator ); - PROFILE_STOP( "allToAllv", profile_level ); -} -// unsigned int -template<> -void MPI_CLASS::call_allToAll( const unsigned int *send_data, const int send_cnt[], - const int send_disp[], unsigned int *recv_data, const int *recv_cnt, - const int *recv_disp ) const -{ - PROFILE_START( "allToAllv", profile_level ); - MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_UNSIGNED, - (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_UNSIGNED, communicator ); - PROFILE_STOP( "allToAllv", profile_level ); -} -// int -template<> -void MPI_CLASS::call_allToAll( const int *send_data, const int send_cnt[], - const int send_disp[], int *recv_data, const int *recv_cnt, const int *recv_disp ) const -{ - PROFILE_START( "allToAllv", profile_level ); - MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_INT, - (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_INT, communicator ); - PROFILE_STOP( "allToAllv", profile_level ); -} -// unsigned long int -template<> -void MPI_CLASS::call_allToAll( const unsigned long int *send_data, - const int send_cnt[], const int send_disp[], unsigned long int *recv_data, const int *recv_cnt, - const int *recv_disp ) const -{ - PROFILE_START( "allToAllv", profile_level ); - MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_UNSIGNED_LONG, - (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_UNSIGNED_LONG, communicator ); - PROFILE_STOP( "allToAllv", profile_level ); -} -// long int -template<> -void MPI_CLASS::call_allToAll( const long int *send_data, const int send_cnt[], - const int send_disp[], long int *recv_data, const int *recv_cnt, const int *recv_disp ) const -{ - PROFILE_START( "allToAllv", profile_level ); - MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_LONG, - (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_LONG, communicator ); - PROFILE_STOP( "allToAllv", profile_level ); -} -// float -template<> -void MPI_CLASS::call_allToAll( const float *send_data, const int send_cnt[], - const int send_disp[], float *recv_data, const int *recv_cnt, const int *recv_disp ) const -{ - PROFILE_START( "allToAllv", profile_level ); - MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_FLOAT, - (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_FLOAT, communicator ); - PROFILE_STOP( "allToAllv", profile_level ); -} -// double -template<> -void MPI_CLASS::call_allToAll( const double *send_data, const int send_cnt[], 
- const int send_disp[], double *recv_data, const int *recv_cnt, const int *recv_disp ) const -{ - PROFILE_START( "allToAllv", profile_level ); - MPI_Alltoallv( (void *) send_data, (int *) send_cnt, (int *) send_disp, MPI_DOUBLE, - (void *) recv_data, (int *) recv_cnt, (int *) recv_disp, MPI_DOUBLE, communicator ); - PROFILE_STOP( "allToAllv", profile_level ); -} -#else -// Default instatiation of unsigned char -template<> -void MPI_CLASS::call_allToAll( - const char *, const int[], const int[], char *, const int *, const int * ) const -{ - MPI_ERROR( "Should not reach this point" ); -} -#endif - - -/************************************************************************ - * call_sumScan * - * Note: these specializations are only called when using MPI. * - ************************************************************************/ -#ifdef USE_MPI -// unsigned char -template<> -void MPI_CLASS::call_sumScan( - const unsigned char *send, unsigned char *recv, int n ) const -{ - PROFILE_START( "sumScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_SUM, communicator ); - PROFILE_STOP( "sumScan", profile_level ); -} -// char -template<> -void MPI_CLASS::call_sumScan( const char *send, char *recv, int n ) const -{ - PROFILE_START( "sumScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_SUM, communicator ); - PROFILE_STOP( "sumScan", profile_level ); -} -// unsigned int -template<> -void MPI_CLASS::call_sumScan( - const unsigned int *send, unsigned int *recv, int n ) const -{ - PROFILE_START( "sumScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_SUM, communicator ); - PROFILE_STOP( "sumScan", profile_level ); -} -// int -template<> -void MPI_CLASS::call_sumScan( const int *send, int *recv, int n ) const -{ - PROFILE_START( "sumScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_INT, MPI_SUM, communicator ); - PROFILE_STOP( "sumScan", profile_level ); -} -// long int -template<> -void MPI_CLASS::call_sumScan( const long int *send, long int *recv, int n ) const -{ - PROFILE_START( "sumScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_LONG, MPI_SUM, communicator ); - PROFILE_STOP( "sumScan", profile_level ); -} -// unsigned long int -template<> -void MPI_CLASS::call_sumScan( - const unsigned long *send, unsigned long *recv, int n ) const -{ - PROFILE_START( "sumScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_SUM, communicator ); - PROFILE_STOP( "sumScan", profile_level ); -} -// size_t -#ifdef USE_WINDOWS -template<> -void MPI_CLASS::call_sumScan( const size_t *send, size_t *recv, int n ) const -{ - MPI_ASSERT( MPI_SIZE_T != 0 ); - PROFILE_START( "sumScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_SIZE_T, MPI_SUM, communicator ); - PROFILE_STOP( "sumScan", profile_level ); -} -#endif -// float -template<> -void MPI_CLASS::call_sumScan( const float *send, float *recv, int n ) const -{ - PROFILE_START( "sumScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_FLOAT, MPI_SUM, communicator ); - PROFILE_STOP( "sumScan", profile_level ); -} -// double -template<> -void MPI_CLASS::call_sumScan( const double *send, double *recv, int n ) const -{ - PROFILE_START( "sumScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_DOUBLE, MPI_SUM, communicator ); - PROFILE_STOP( "sumScan", profile_level ); -} -// std::complex -template<> -void 
MPI_CLASS::call_sumScan>( - const std::complex *x, std::complex *y, int n ) const -{ - auto send = new double[2 * n]; - auto recv = new double[2 * n]; - for ( int i = 0; i < n; i++ ) { - send[2 * i + 0] = real( x[i] ); - send[2 * i + 1] = imag( x[i] ); - } - MPI_Scan( (void *) send, (void *) recv, 2 * n, MPI_DOUBLE, MPI_SUM, communicator ); - for ( int i = 0; i < n; i++ ) - y[i] = std::complex( recv[2 * i + 0], recv[2 * i + 1] ); - delete[] send; - delete[] recv; -} -#endif - - -/************************************************************************ - * call_minScan * - * Note: these specializations are only called when using MPI. * - ************************************************************************/ -#ifdef USE_MPI -// unsigned char -template<> -void MPI_CLASS::call_minScan( - const unsigned char *send, unsigned char *recv, int n ) const -{ - PROFILE_START( "minScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_MIN, communicator ); - PROFILE_STOP( "minScan", profile_level ); -} -// char -template<> -void MPI_CLASS::call_minScan( const char *send, char *recv, int n ) const -{ - PROFILE_START( "minScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_MIN, communicator ); - PROFILE_STOP( "minScan", profile_level ); -} -// unsigned int -template<> -void MPI_CLASS::call_minScan( - const unsigned int *send, unsigned int *recv, int n ) const -{ - PROFILE_START( "minScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_MIN, communicator ); - PROFILE_STOP( "minScan", profile_level ); -} -// int -template<> -void MPI_CLASS::call_minScan( const int *send, int *recv, int n ) const -{ - PROFILE_START( "minScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_INT, MPI_MIN, communicator ); - PROFILE_STOP( "minScan", profile_level ); -} -// unsigned long int -template<> -void MPI_CLASS::call_minScan( - const unsigned long int *send, unsigned long int *recv, int n ) const -{ - PROFILE_START( "minScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_MIN, communicator ); - PROFILE_STOP( "minScan", profile_level ); -} -// long int -template<> -void MPI_CLASS::call_minScan( const long int *send, long int *recv, int n ) const -{ - PROFILE_START( "minScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_LONG, MPI_MIN, communicator ); - PROFILE_STOP( "minScan", profile_level ); -} -// size_t -#ifdef USE_WINDOWS -template<> -void MPI_CLASS::call_minScan( const size_t *send, size_t *recv, int n ) const -{ - MPI_ASSERT( MPI_SIZE_T != 0 ); - PROFILE_START( "minScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_SIZE_T, MPI_MIN, communicator ); - PROFILE_STOP( "minScan", profile_level ); -} -#endif -// float -template<> -void MPI_CLASS::call_minScan( const float *send, float *recv, int n ) const -{ - PROFILE_START( "minScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_FLOAT, MPI_MIN, communicator ); - PROFILE_STOP( "minScan", profile_level ); -} -// double -template<> -void MPI_CLASS::call_minScan( const double *send, double *recv, int n ) const -{ - PROFILE_START( "minScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_DOUBLE, MPI_MIN, communicator ); - PROFILE_STOP( "minScan", profile_level ); -} -#endif - - -/************************************************************************ - * call_maxScan * - * Note: these specializations are only called when using 
MPI. * - ************************************************************************/ -#ifdef USE_MPI -// unsigned char -template<> -void MPI_CLASS::call_maxScan( - const unsigned char *send, unsigned char *recv, int n ) const -{ - PROFILE_START( "maxScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_CHAR, MPI_MAX, communicator ); - PROFILE_STOP( "maxScan", profile_level ); -} -// char -template<> -void MPI_CLASS::call_maxScan( const char *send, char *recv, int n ) const -{ - PROFILE_START( "maxScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_SIGNED_CHAR, MPI_MAX, communicator ); - PROFILE_STOP( "maxScan", profile_level ); -} -// unsigned int -template<> -void MPI_CLASS::call_maxScan( - const unsigned int *send, unsigned int *recv, int n ) const -{ - PROFILE_START( "maxScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED, MPI_MAX, communicator ); - PROFILE_STOP( "maxScan", profile_level ); -} -// int -template<> -void MPI_CLASS::call_maxScan( const int *send, int *recv, int n ) const -{ - PROFILE_START( "maxScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_INT, MPI_MAX, communicator ); - PROFILE_STOP( "maxScan", profile_level ); -} -// long int -template<> -void MPI_CLASS::call_maxScan( const long int *send, long int *recv, int n ) const -{ - PROFILE_START( "maxScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_LONG, MPI_MAX, communicator ); - PROFILE_STOP( "maxScan", profile_level ); -} -// unsigned long int -template<> -void MPI_CLASS::call_maxScan( - const unsigned long int *send, unsigned long int *recv, int n ) const -{ - PROFILE_START( "maxScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_UNSIGNED_LONG, MPI_MAX, communicator ); - PROFILE_STOP( "maxScan", profile_level ); -} -// size_t -#ifdef USE_WINDOWS -template<> -void MPI_CLASS::call_maxScan( const size_t *send, size_t *recv, int n ) const -{ - MPI_ASSERT( MPI_SIZE_T != 0 ); - PROFILE_START( "maxScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_SIZE_T, MPI_MAX, communicator ); - PROFILE_STOP( "maxScan", profile_level ); -} -#endif -// float -template<> -void MPI_CLASS::call_maxScan( const float *send, float *recv, int n ) const -{ - PROFILE_START( "maxScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_INT, MPI_MAX, communicator ); - PROFILE_STOP( "maxScan", profile_level ); -} -// double -template<> -void MPI_CLASS::call_maxScan( const double *send, double *recv, int n ) const -{ - PROFILE_START( "maxScan", profile_level ); - MPI_Scan( (void *) send, (void *) recv, n, MPI_DOUBLE, MPI_MAX, communicator ); - PROFILE_STOP( "maxScan", profile_level ); -} -#endif - - -/************************************************************************ - * Communicate ranks for communication * - ************************************************************************/ -std::vector MPI_CLASS::commRanks( const std::vector &ranks ) const -{ -#ifdef USE_MPI - // Get a byte array with the ranks to communicate - auto data1 = new char[comm_size]; - auto data2 = new char[comm_size]; - memset( data1, 0, comm_size ); - memset( data2, 0, comm_size ); - for ( auto &rank : ranks ) - data1[rank] = 1; - MPI_Alltoall( data1, 1, MPI_CHAR, data2, 1, MPI_CHAR, communicator ); - int N = 0; - for ( int i = 0; i < comm_size; i++ ) - N += data2[i]; - std::vector ranks_out; - ranks_out.reserve( N ); - for ( int i = 0; i < comm_size; i++ ) { - if ( data2[i] ) - 
ranks_out.push_back( i ); - } - delete[] data1; - delete[] data2; - return ranks_out; -#else - return ranks; -#endif -} - - -/************************************************************************ - * Wait functions * - ************************************************************************/ -#ifdef USE_MPI -void MPI_CLASS::wait( MPI_Request request ) -{ - PROFILE_START( "wait", profile_level ); - MPI_Status status; - int flag = 0; - int err = MPI_Test( &request, &flag, &status ); - MPI_ASSERT( err == MPI_SUCCESS ); // Check that the first call is valid - while ( !flag ) { - // Put the current thread to sleep to allow other threads to run - sched_yield(); - // Check if the request has finished - MPI_Test( &request, &flag, &status ); - } - PROFILE_STOP( "wait", profile_level ); -} -int MPI_CLASS::waitAny( int count, MPI_Request *request ) -{ - if ( count == 0 ) - return -1; - PROFILE_START( "waitAny", profile_level ); - int index = -1; - int flag = 0; - auto status = new MPI_Status[count]; - int err = MPI_Testany( count, request, &index, &flag, status ); - MPI_ASSERT( err == MPI_SUCCESS ); // Check that the first call is valid - while ( !flag ) { - // Put the current thread to sleep to allow other threads to run - sched_yield(); - // Check if the request has finished - MPI_Testany( count, request, &index, &flag, status ); - } - MPI_ASSERT( index >= 0 ); // Check that the index is valid - delete[] status; - PROFILE_STOP( "waitAny", profile_level ); - return index; -} -void MPI_CLASS::waitAll( int count, MPI_Request *request ) -{ - if ( count == 0 ) - return; - PROFILE_START( "waitAll", profile_level ); - int flag = 0; - auto status = new MPI_Status[count]; - int err = MPI_Testall( count, request, &flag, status ); - MPI_ASSERT( err == MPI_SUCCESS ); // Check that the first call is valid - while ( !flag ) { - // Put the current thread to sleep to allow other threads to run - sched_yield(); - // Check if the request has finished - MPI_Testall( count, request, &flag, status ); - } - PROFILE_STOP( "waitAll", profile_level ); - delete[] status; -} -std::vector MPI_CLASS::waitSome( int count, MPI_Request *request ) -{ - if ( count == 0 ) - return std::vector(); - PROFILE_START( "waitSome", profile_level ); - std::vector indicies( count, -1 ); - auto *status = new MPI_Status[count]; - int outcount = 0; - int err = MPI_Testsome( count, request, &outcount, &indicies[0], status ); - MPI_ASSERT( err == MPI_SUCCESS ); // Check that the first call is valid - MPI_ASSERT( outcount != MPI_UNDEFINED ); // Check that the first call is valid - while ( outcount == 0 ) { - // Put the current thread to sleep to allow other threads to run - sched_yield(); - // Check if the request has finished - MPI_Testsome( count, request, &outcount, &indicies[0], status ); - } - indicies.resize( outcount ); - delete[] status; - PROFILE_STOP( "waitSome", profile_level ); - return indicies; -} -#else -void MPI_CLASS::wait( MPI_Request request ) -{ - PROFILE_START( "wait", profile_level ); - while ( 1 ) { - // Check if the request is in our list - if ( global_isendrecv_list.find( request ) == global_isendrecv_list.end() ) - break; - // Put the current thread to sleep to allow other threads to run - sched_yield(); - } - PROFILE_STOP( "wait", profile_level ); -} -int MPI_CLASS::waitAny( int count, MPI_Request *request ) -{ - if ( count == 0 ) - return -1; - PROFILE_START( "waitAny", profile_level ); - int index = 0; - while ( 1 ) { - // Check if the request is in our list - bool found_any = false; - for ( int i = 0; i < count; i++ 
) { - if ( global_isendrecv_list.find( request[i] ) == global_isendrecv_list.end() ) { - found_any = true; - index = i; - } - } - if ( found_any ) - break; - // Put the current thread to sleep to allow other threads to run - sched_yield(); - } - PROFILE_STOP( "waitAny", profile_level ); - return index; -} -void MPI_CLASS::waitAll( int count, MPI_Request *request ) -{ - if ( count == 0 ) - return; - PROFILE_START( "waitAll", profile_level ); - while ( 1 ) { - // Check if the request is in our list - bool found_all = true; - for ( int i = 0; i < count; i++ ) { - if ( global_isendrecv_list.find( request[i] ) != global_isendrecv_list.end() ) - found_all = false; - } - if ( found_all ) - break; - // Put the current thread to sleep to allow other threads to run - sched_yield(); - } - PROFILE_STOP( "waitAll", profile_level ); -} -std::vector MPI_CLASS::waitSome( int count, MPI_Request *request ) -{ - if ( count == 0 ) - return std::vector(); - PROFILE_START( "waitSome", profile_level ); - std::vector indicies; - while ( 1 ) { - // Check if the request is in our list - for ( int i = 0; i < count; i++ ) { - if ( global_isendrecv_list.find( request[i] ) == global_isendrecv_list.end() ) - indicies.push_back( i ); - } - if ( !indicies.empty() ) - break; - // Put the current thread to sleep to allow other threads to run - sched_yield(); - } - PROFILE_STOP( "waitSome", profile_level ); - return indicies; -} -#endif - - -/************************************************************************ - * Probe functions * - ************************************************************************/ -#ifdef USE_MPI -int MPI_CLASS::Iprobe( int source, int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - MPI_Status status; - int flag = 0; - MPI_Iprobe( source, tag, communicator, &flag, &status ); - if ( flag == 0 ) - return -1; - int count; - MPI_Get_count( &status, MPI_BYTE, &count ); - MPI_ASSERT( count >= 0 ); - return count; -} -int MPI_CLASS::probe( int source, int tag ) const -{ - MPI_INSIST( tag <= d_maxTag, "Maximum tag value exceeded" ); - MPI_INSIST( tag >= 0, "tag must be >= 0" ); - MPI_Status status; - MPI_Probe( source, tag, communicator, &status ); - int count; - MPI_Get_count( &status, MPI_BYTE, &count ); - MPI_ASSERT( count >= 0 ); - return count; -} -#else -int MPI_CLASS::Iprobe( int, int ) const -{ - MPI_ERROR( "Not implimented for serial codes (Iprobe)" ); - return 0; -} -int MPI_CLASS::probe( int, int ) const -{ - MPI_ERROR( "Not implimented for serial codes (probe)" ); - return 0; -} -#endif - - -/************************************************************************ - * Timer functions * - ************************************************************************/ -#ifdef USE_MPI -double MPI_CLASS::time() { return MPI_Wtime(); } -double MPI_CLASS::tick() { return MPI_Wtick(); } -#else -double MPI_CLASS::time() -{ - auto t = std::chrono::system_clock::now(); - auto ns = std::chrono::duration_cast( t.time_since_epoch() ); - return 1e-9 * ns.count(); -} -double MPI_CLASS::tick() -{ - auto period = std::chrono::system_clock::period(); - return static_cast( period.num ) / static_cast( period.den ); -} -#endif - - -/************************************************************************ - * Serialize a block of code across MPI processes * - ************************************************************************/ -void MPI_CLASS::serializeStart() -{ -#ifdef USE_MPI - using namespace std::chrono_literals; - if ( comm_rank == 
0 ) { - // Start rank 0 immediately - } else { - // Wait for a message from the previous rank - MPI_Request request; - MPI_Status status; - int flag = false, buf = 0; - MPI_Irecv( &buf, 1, MPI_INT, comm_rank - 1, 5627, MPI_COMM_WORLD, &request ); - while ( !flag ) { - MPI_Test( &request, &flag, &status ); - std::this_thread::sleep_for( 50ms ); - } - } -#endif -} -void MPI_CLASS::serializeStop() -{ -#ifdef USE_MPI - using namespace std::chrono_literals; - if ( comm_rank < comm_size - 1 ) { - // Send flag to next rank - MPI_Send( &comm_rank, 1, MPI_INT, comm_rank + 1, 5627, MPI_COMM_WORLD ); - // Wait for final finished flag - int flag = false, buf = 0; - MPI_Request request; - MPI_Status status; - MPI_Irecv( &buf, 1, MPI_INT, comm_size - 1, 5627, MPI_COMM_WORLD, &request ); - while ( !flag ) { - MPI_Test( &request, &flag, &status ); - std::this_thread::sleep_for( 50ms ); - } - } else { - // Send final flag to all ranks - for ( int i = 0; i < comm_size - 1; i++ ) - MPI_Send( &comm_rank, 1, MPI_INT, i, 5627, MPI_COMM_WORLD ); - } -#endif -} - - -/**************************************************************************** - * Function to start/stop MPI * - ****************************************************************************/ -#ifdef USE_EXT_MPI -static bool called_MPI_Init = false; -#endif -bool MPI_CLASS::MPI_Active() -{ -#ifdef USE_EXT_MPI - int MPI_initialized, MPI_finialized; - MPI_Initialized( &MPI_initialized ); - MPI_Finalized( &MPI_finialized ); - return MPI_initialized != 0 && MPI_finialized == 0; -#else - return false; -#endif -} -void MPI_CLASS::start_MPI( int argc, char *argv[], int profile_level ) -{ - changeProfileLevel( profile_level ); - NULL_USE( argc ); - NULL_USE( argv ); -#ifdef USE_EXT_MPI - if ( MPI_Active() ) { - called_MPI_Init = false; - } else { - int provided; - int result = MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &provided ); - if ( result != MPI_SUCCESS ) - MPI_ERROR( "Unable to initialize MPI" ); - if ( provided < MPI_THREAD_MULTIPLE ) - std::cerr << "Warning: Failed to start MPI with MPI_THREAD_MULTIPLE\n"; - called_MPI_Init = true; - } -#endif -} -void MPI_CLASS::stop_MPI() -{ -#ifdef USE_EXT_MPI - int finalized; - MPI_Finalized( &finalized ); - if ( called_MPI_Init && !finalized ) { - MPI_Barrier( MPI_COMM_WORLD ); - MPI_Finalize(); - called_MPI_Init = true; - } -#endif -} - - -} // namespace Utilities - diff --git a/common/MPI.h b/common/MPI.h deleted file mode 100644 index e3fd3e13..00000000 --- a/common/MPI.h +++ /dev/null @@ -1,1152 +0,0 @@ -// This file includes a wrapper class for MPI functions -// Note this is a modified version of the MPI class for the Advanced Multi-Physics Package -// Used with permission - -/* - -Copyright (c) 2012 UT-Battelle, LLC - -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -Collection of administrative costs for redistribution of the source code or binary form is allowed. However, collection of a royalty or other fee in excess of good faith amount for cost recovery for such redistribution is prohibited. 
- -*/ - -#ifndef included_LBPM_MPI -#define included_LBPM_MPI - - -#include -#include -#include -#include -#include -#include -#include - - -// Include mpi.h (or define MPI objects) -// clang-format off -#ifdef USE_MPI - #include "mpi.h" -#else - typedef int MPI_Comm; - typedef int MPI_Request; - typedef int MPI_Status; - typedef void *MPI_Errhandler; - enum MPI_TYPES { MPI_INT, MPI_FLOAT, MPI_DOUBLE }; - #define MPI_COMM_WORLD ( (MPI_Comm) 0xF4000010 ) - #define MPI_COMM_SELF ( (MPI_Comm) 0xF4000001 ) - #define MPI_COMM_NULL ( (MPI_Comm) 0xF4000000 ) -#endif -// clang-format on - - -namespace Utilities { - - -/** - * \class MPI - * - * @brief Provides C++ wrapper around MPI routines. - * - * Class MPI groups common MPI routines into one globally-accessible - * location. It provides small, simple routines that are common in MPI code. - * In some cases, the calling syntax has been simplified for convenience. - * Moreover, there is no reason to include the preprocessor ifdef/endif - * guards around these calls, since the MPI libraries are not called in - * these routines if the MPI libraries are not being used (e.g., when - * writing serial code). - * Note: Many of the communication routines are templated on type. When using - * unknown types the reduce calls will fail, the send and gather calls should - * succeed provided that the size of the data type object is a fixed size on - * all processors. sizeof(type) must be the same for all elements and processors. - */ -class MPI final -{ -public: - enum class ThreadSupport : int { SINGLE, FUNNELED, SERIALIZED, MULTIPLE }; - -public: // Constructors - /** - *\brief Is MPI active - *\details This returns true if MPI is initailized and not finalized - */ - static bool MPI_active(); - - /** - *\brief Empty constructor - *\details This creates an empty constructor that does not contain an MPI communicator. - */ - MPI(); - - - //! Empty destructor - ~MPI(); - - - /** - * \brief Constructor from existing MPI communicator - * \details This constructor creates a new communicator from an existing MPI communicator. - * This does not create a new internal MPI_Comm, but uses the existing comm. - * Note that by default, this will not free the MPI_Comm object and the user is - * responsible - * for free'ing the MPI_Comm when it is no longer used. This behavior is controlled by the - * optional manage argument. - * \param comm Existing MPI communicator - * \param manage Do we want to manage the comm (free the MPI_Comm when this object leaves - * scope) - */ - MPI( MPI_Comm comm, bool manage = false ); - - - /** - * \brief Constructor from existing communicator - * \details This constructor creates a new communicator from an existing communicator. - * This does not create a new internal MPI_Comm, but uses the existing comm. - * \param comm Existing communicator - */ - MPI( const MPI &comm ); - - - /*! - * Move constructor - * @param rhs Communicator to copy - */ - MPI( MPI &&rhs ); - - - /** - * \brief Assignment operator - * \details This operator overloads the assignment to correctly copy an communicator - * \param comm Existing MPI object - */ - MPI &operator=( const MPI &comm ); - - - /*! - * Move assignment operator - * @param rhs Communicator to copy - */ - MPI &operator=( MPI &&rhs ); - - - /** - * \brief Reset the object - * \details This resets the object to the empty state without an MPI_Comm - */ - void reset(); - - -public: // Member functions - /** - * \brief Get the node name - * \details This function returns a unique name for each node. 
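For orientation while reviewing the removal of this header, a minimal sketch of how the wrapper is typically brought up and queried. It assumes the header is included as "common/MPI.h" and that MPI startup goes through start_MPI; the function name example_hello and the explicit profile level 0 are illustrative, everything else (start_MPI, stop_MPI, getRank, getSize, getNodeName) comes from the declarations in this file.

    #include "common/MPI.h"
    #include <cstdio>

    void example_hello( int argc, char *argv[] )
    {
        Utilities::MPI::start_MPI( argc, argv, 0 );    // initializes MPI only if it is not already active
        Utilities::MPI comm( MPI_COMM_WORLD );         // wrap an existing communicator (not managed)
        std::printf( "rank %d of %d on node %s\n", comm.getRank(), comm.getSize(),
            Utilities::MPI::getNodeName().c_str() );
        Utilities::MPI::stop_MPI();                    // finalizes only if start_MPI called MPI_Init
    }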
- * It is a wrapper for MPI_Get_processor_name. - */ - static std::string getNodeName(); - - - //! Function to return the number of processors available - static int getNumberOfProcessors(); - - - //! Function to return the affinity of the current process - static std::vector getProcessAffinity(); - - - //! Function to set the affinity of the current process - static void setProcessAffinity( const std::vector &procs ); - - - /** - * \brief Load balance the processes within a node - * \details This function will redistribute the processes within a node using the - * process affinities to achieve the desired load balance. - * Note: this is a global operation on the given comm, and it is STRONGLY - * recommended to use COMM_WORLD. - * \param comm The communicator to use (Default is COMM_WORLD) - * \param method The desired load balance method to use: - * 1: Adjust the affinities so all processes share the given processors. - * This effectively allows the OS to handle the load balancing - * by migrating the processes as necessary. This is recommended - * for most users and use cases. (default) - * 2: Adjust the affinities so that the fewest number of processes overlap. - * This will try to give each process a unique set of processors while - * ensuring that each process has at least N_min processes. - * \param procs An optional list of processors to use. By default, setting this to an - * empty vector will use all available processors on the given node. - * \param N_min The minimum number of processors for any process (-1 indicates all available - * processors). - * \param N_max The maximum number of processors for any process (-1 indicates all available - * processors). - * - */ - static void balanceProcesses( const MPI &comm = MPI( MPI_COMM_WORLD ), const int method = 1, - const std::vector &procs = std::vector(), const int N_min = 1, - const int N_max = -1 ); - - - //! Query the level of thread support - static ThreadSupport queryThreadSupport(); - - - /** - * \brief Generate a random number - * \details This generates a random number that is consistent across the comm - */ - size_t rand() const; - - - /** - * \brief Split an existing communicator - * \details This creates a new communicator by splitting an existing communicator. - * See MPI_Comm_split for information on how the underlying split will occur. - * Note: the underlying MPI_Comm object will be free'd automatically when it is no longer - * used by any MPI objects. - * \param color Control of subset assignment (nonnegative integer). - * Processes with the same color are in the same new communicator . - * -1: processor will not be a member of any object (NULL object will be returned) - * \param key Control of rank assignment (integer). - * Note that, for a fixed color, the keys need not be unique. The processes will - * be sorted - * in ascending order according to this key, then all the processes in a given - * color will - * have the relative rank order as they did in their parent group. (See - * MPI_Comm_split) - */ - MPI split( int color, int key = -1 ) const; - - - /** - * \brief Split an existing communicator by node - * \details This creates a new communicator by splitting an existing communicator - * by the node. This will result in a separate MPI_Comm for each physical node. - * Internally this will use MPI_Get_processor_name to identify the nodes. - * Note: the underlying MPI_Comm object will be free'd automatically when it is no longer - * used by any MPI objects) - * \param key Control of rank assignment (integer). 
- * Note that, for a fixed color, the keys need not be unique. The processes will - * be sorted - * in ascending order according to this key, then all the processes in a given - * color will - * have the relative rank order as they did in their parent group. (See - * MPI_Comm_split) - */ - MPI splitByNode( int key = -1 ) const; - - - /** - * \brief Duplicate an existing communicator - * \details This creates a new communicator by duplicating an existing communicator. - * The resulting communicator will exist over the same processes, but have a different - * context. - * Note: the underlying MPI_Comm object will be free'd automatically when it is no longer - * used by any MPI objects. - */ - MPI dup() const; - - - /** - * \brief Create a communicator from the intersection of two communicators - * \details This creates a new communicator by intersecting two existing communicators. - * Any processors that do not contain the both communicators will receive a NULL communicator. - * There are 3 possible cases: - * The communicators are disjoint (a null communicator will be returned on all processors). - * One communicator is a sub communicator of another. This will require communication on - * the smaller communicator only. - * The communicators partially overlap. This will require communication on the first - * communicator. - */ - static MPI intersect( const MPI &comm1, const MPI &comm2 ); - - - /** - * Check if the current communicator is NULL - */ - bool isNull() const { return d_isNull; } - - - /** - * \brief Return the global ranks for the comm - * \details This returns a vector which contains the global ranks for each - * member of the communicator. The global ranks are defined according to WORLD comm. - */ - std::vector globalRanks() const; - - - /** - * Get the current MPI communicator. - * Note: The underlying MPI_Comm object may be free'd by the object when it is no - * longer used by any communicators. If the user has made a copy using the - * getCommunicator routine, then it may be free'd without user knowledge. The - * user is responsible for checking if the communicator is valid, or keeping a - * copy of the communicator that provided the MPI_Communicator. - */ - const MPI_Comm &getCommunicator() const { return communicator; } - - - /** - * \brief Overload operator == - * \details Overload operator comm1 == comm2. Two MPI objects are == if they share the same - * communicator. - * Note: this is a local operation. - */ - bool operator==( const MPI & ) const; - - - /** - * \brief Overload operator != - * \details Overload operator comm1 != comm2. Two MPI objects are != if they - * do not share the same communicator. - * Note: this is a local operation. - */ - bool operator!=( const MPI & ) const; - - - /** - * \brief Overload operator < - * \details Overload operator comm1 < comm2. One MPI object is < another iff all the - * processors in the first object are also in the second. Additionally, the second - * object must contain at least one processor that is not in the first object. - * This is a collective operation, based on the first communicator. - * As a result all processors on the first communicator will return the same value, - * while any processors that are not on the first communicator will return an unknown value. - * Additionally, all processors on the first object MUST call this routine and will be - * synchronized through this call (there is an internalallReduce). 
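A short sketch of the communicator-management calls documented above (split, splitByNode, dup); the color choice and variable names are illustrative only.

    #include "common/MPI.h"

    void example_split( const Utilities::MPI &world )
    {
        // Processes with the same color end up in the same sub-communicator
        int color = ( world.getRank() < world.getSize() / 2 ) ? 0 : 1;
        Utilities::MPI half = world.split( color );
        // One communicator per physical node (identified via MPI_Get_processor_name)
        Utilities::MPI node = world.splitByNode();
        // Same ranks, new context; keeps separate traffic from colliding on tags
        Utilities::MPI copy = world.dup();
    }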
- */ - bool operator<( const MPI & ) const; - - - /** - * \brief Overload operator <= - * \details Overload operator comm1 <= comm2. One MPI object is <= another iff all the - * processors in the first object are also in the second. This is a collective operation, - * based on the first communicator. As a result all processors on the first communicator - * will return the same value, while any processors that are not on the first communicator - * will return an unknown value. Additionally, all processors on the first object MUST - * call this routine and will be synchronized through this call (there is an internal - * allReduce). - */ - bool operator<=( const MPI & ) const; - - - /** - * \brief Overload operator > - * \details Overload operator comm1 > comm2. One MPI object is > another iff all the - * processors in the second object are also in the first. Additionally, the first object - * must contain at least one processor that is not in the second object. - * This is a collective operation, based on the first communicator. - * As a result all processors on the first communicator will return the same value, - * while any processors that are not on the first communicator will return an unknown value. - * Additionally, all processors on the first object MUST call this routine and will be - * synchronized through this call (there is an internal allReduce). - */ - bool operator>( const MPI & ) const; - - - /** - * \brief Overload operator >= - * \details Overload operator comm1 >= comm2. One MPI object is > another iff all the - * processors in the second object are also in the first. Additionally, the first object - * must contain at least one processor that is not in the second object. - * This is a collective operation, based on the first communicator. - * As a result all processors on the first communicator will return the same value, while any - * processors that are not on the first communicator will return an unknown value. - * Additionally, all processors on the first object MUST call this routine and will be - * synchronized through this call (there is an internal allReduce). - */ - bool operator>=( const MPI & ) const; - - - /** - * \brief Compare to another communicator - * \details This compares the current communicator to another communicator. - * This returns 1 if the two communicators are equal (they share the same MPI communicator), - * 2 if the contexts and groups are the same, 3 if different contexts but identical groups, - * 4 if different contexts but similar groups, and 0 otherwise. - * Note: this is a local operation. - */ - int compare( const MPI & ) const; - - - /** - * Return the processor rank (identifier) from 0 through the number of - * processors minus one. - */ - int getRank() const { return comm_rank; } - - - /** - * Return the number of processors. - */ - int getSize() const { return comm_size; } - - - /** - * Return the maximum tag - */ - int maxTag() const { return d_maxTag; } - - - /** - * \brief Return a new tag - * \details This routine will return an unused tag for communication. - * Note that this tag may match a user tag, but this function will - * not return two duplicate tags. This is a global operation. - */ - int newTag(); - - - /** - * Call MPI_Abort or exit depending on whether running with one or more - * processes and value set by function above, if called. The default is - * to call exit(-1) if running with one processor and to call MPI_Abort() - * otherwise. This function avoids having to guard abort calls in - * application code. 
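A brief sketch of the query interface above (getRank, getSize, maxTag, newTag, operator==). The exact relationship between a reserved tag and maxTag, and the comparison result for a duplicated communicator, are stated loosely here on purpose.

    void example_tags( Utilities::MPI &comm )
    {
        // Reserve an unused tag (global operation); expected to stay within [0, maxTag()]
        int tag = comm.newTag();
        Utilities::MPI other = comm.dup();
        // operator== compares the underlying MPI_Comm, so a duplicate is not equal
        bool same = ( comm == other );
        (void) tag; (void) same;
    }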
- */ - void abort() const; - - - /** - * Set boolean flag indicating whether exit or abort is called when running - * with one processor. Calling this function influences the behavior of - * calls to abort(). By default, the flag is true meaning that - * abort() will be called. Passing false means exit(-1) will be called. - */ - void setCallAbortInSerialInsteadOfExit( bool flag = true ); - - - /** - * \brief Boolean all reduce - * \details This function performs a boolean all reduce across all processors. - * It returns true iff all processor are true; - * \param value The input value for the all reduce - */ - bool allReduce( const bool value ) const; - - - /** - * \brief Boolean any reduce - * \details This function performs a boolean any reduce across all processors. - * It returns true if any processor is true; - * \param value The input value for the all reduce - */ - bool anyReduce( const bool value ) const; - - - /** - * \brief Sum Reduce - * \details This function performs a sum all reduce across all processor. - * It returns the sum across all processors; - * \param value The input value for the all reduce - */ - template - type sumReduce( const type value ) const; - - - /** - * \brief Sum Reduce - * \details Perform an array sum Reduce across all nodes. Each - * processor contributes an array of values, and the - * element-wise sum is returned in the same array. - * \param x The input/output array for the reduce - * \param n The number of values in the array (must match on all nodes) - */ - template - void sumReduce( type *x, const int n = 1 ) const; - - - /** - * \brief Sum Reduce - * \details Perform an array sum Reduce across all nodes. Each - * processor contributes an array of values, and the - * element-wise sum is returned in the same array. - * \param x The input array for the reduce - * \param y The output array for the reduce - * \param n The number of values in the array (must match on all nodes) - */ - template - void sumReduce( const type *x, type *y, const int n = 1 ) const; - - - /** - * \brief Min Reduce - * \details This function performs a min all reduce across all processor. - * It returns the minimum value across all processors; - * \param value The input value for the all reduce - */ - template - type minReduce( const type value ) const; - - - /** - * \brief Sum Reduce - * \details Perform an array min Reduce across all nodes. Each - * processor contributes an array of values, and the - * element-wise minimum is returned in the same array. - * - * If a 'rank_of_min' argument is provided, it will set the array to the - * rank of process holding the minimum value. Like the double argument, - * the size of the supplied 'rank_of_min' array should be n. - * \param x The input/output array for the reduce - * \param n The number of values in the array (must match on all nodes) - * \param rank_of_min Optional array indicating the rank of the processor containing the - * minimum value - */ - template - void minReduce( type *x, const int n = 1, int *rank_of_min = nullptr ) const; - - - /** - * \brief Sum Reduce - * \details Perform an array min Reduce across all nodes. Each - * processor contributes an array of values, and the - * element-wise minimum is returned in the same array. - * - * If a 'rank_of_min' argument is provided, it will set the array to the - * rank of process holding the minimum value. Like the double argument, - * the size of the supplied 'rank_of_min' array should be n. 
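For reference, a minimal sketch of the reduction interface documented above; local_err, tol, and the three-component array are hypothetical stand-ins.

    void example_reduce( const Utilities::MPI &comm, double local_err, double tol )
    {
        double total = comm.sumReduce( local_err );        // scalar form returns the global sum
        double v[3]  = { 1.0, 2.0, 3.0 };
        int    rank_of_min[3];
        comm.minReduce( v, 3, rank_of_min );               // in-place element-wise min, plus owning rank
        bool all_ok = comm.allReduce( local_err < tol );   // true only if every rank is below tol
        (void) total; (void) all_ok;
    }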
- * \param x The input array for the reduce - * \param y The output array for the reduce - * \param n The number of values in the array (must match on all nodes) - * \param rank_of_min Optional array indicating the rank of the processor containing the - * minimum value - */ - template - void minReduce( const type *x, type *y, const int n = 1, int *rank_of_min = nullptr ) const; - - - /** - * \brief Max Reduce - * \details This function performs a max all reduce across all processor. - * It returns the maximum value across all processors; - * \param value The input value for the all reduce - */ - template - type maxReduce( const type value ) const; - - - /** - * \brief Sum Reduce - * \details Perform an array max Reduce across all nodes. Each - * processor contributes an array of values, and the - * element-wise maximum is returned in the same array. - * - * If a 'rank_of_min' argument is provided, it will set the array to the - * rank of process holding the minimum value. Like the double argument, - * the size of the supplied 'rank_of_min' array should be n. - * \param x The input/output array for the reduce - * \param n The number of values in the array (must match on all nodes) - * \param rank_of_max Optional array indicating the rank of the processor containing the - * minimum value - */ - template - void maxReduce( type *x, const int n = 1, int *rank_of_max = nullptr ) const; - - - /** - * \brief Sum Reduce - * \details Perform an array max Reduce across all nodes. Each - * processor contributes an array of values, and the - * element-wise maximum is returned in the same array. - * - * If a 'rank_of_min' argument is provided, it will set the array to the - * rank of process holding the minimum value. Like the double argument, - * the size of the supplied 'rank_of_min' array should be n. - * \param x The input array for the reduce - * \param y The output array for the reduce - * \param n The number of values in the array (must match on all nodes) - * \param rank_of_max Optional array indicating the rank of the processor containing the - * minimum value - */ - template - void maxReduce( const type *x, type *y, const int n = 1, int *rank_of_max = nullptr ) const; - - - /** - * \brief Scan Sum Reduce - * \details Computes the sum scan (partial reductions) of data on a collection of processes. - * See MPI_Scan for more information. - * \param x The input array for the scan - * \param y The output array for the scan - * \param n The number of values in the array (must match on all nodes) - */ - template - void sumScan( const type *x, type *y, const int n = 1 ) const; - - - /** - * \brief Scan Min Reduce - * \details Computes the min scan (partial reductions) of data on a collection of processes. - * See MPI_Scan for more information. - * \param x The input array for the scan - * \param y The output array for the scan - * \param n The number of values in the array (must match on all nodes) - */ - template - void minScan( const type *x, type *y, const int n = 1 ) const; - - - /** - * \brief Scan Max Reduce - * \details Computes the max scan (partial reductions) of data on a collection of processes. - * See MPI_Scan for more information. 
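A small sketch of the scan interface documented above, used here for the common task of turning per-rank counts into global offsets; n_local is a hypothetical per-rank count.

    void example_scan( const Utilities::MPI &comm, long int n_local )
    {
        long int n_inclusive = 0;
        comm.sumScan( &n_local, &n_inclusive, 1 );   // inclusive prefix sum (MPI_Scan semantics)
        long int offset = n_inclusive - n_local;     // exclusive offset of this rank's block
        (void) offset;
    }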
- * \param x The input array for the scan - * \param y The output array for the scan - * \param n The number of values in the array (must match on all nodes) - */ - template - void maxScan( const type *x, type *y, const int n = 1 ) const; - - - /** - * \brief Broadcast - * \details This function broadcasts a value from root to all processors - * \param value The input value for the broadcast. - * \param root The processor performing the broadcast - */ - template - type bcast( const type &value, const int root ) const; - - - /** - * \brief Broadcast - * \details This function broadcasts an array from root to all processors - * \param value The input/output array for the broadcast - * \param n The number of values in the array (must match on all nodes) - * \param root The processor performing the broadcast - */ - template - void bcast( type *value, const int n, const int root ) const; - - - /** - * Perform a global barrier across all processors. - */ - void barrier() const; - - - /*! - * @brief This function sends an MPI message with an array to another processor. - * - * If the receiving processor knows in advance the length - * of the array, use "send_length = false;" otherwise, - * this processor will first send the length of the array, - * then send the data. This call must be paired with a - * matching call to recv. - * - * @param buf Pointer to array buffer with length integers. - * @param length Number of integers in buf that we want to send. - * @param recv Receiving processor number. - * @param tag Optional integer argument specifying an integer tag - * to be sent with this message. Default tag is 0. - * The matching recv must share this tag. - */ - template - void send( const type *buf, const int length, const int recv, int tag = 0 ) const; - - - /*! - * @brief This function sends an MPI message with an array of bytes - * (MPI_BYTES) to receiving_proc_number. - * - * This call must be paired with a matching call to recvBytes. - * - * @param buf Void pointer to an array of number_bytes bytes to send. - * @param N_bytes Integer number of bytes to send. - * @param recv Receiving processor number. - * @param tag Optional integer argument specifying an integer tag - * to be sent with this message. Default tag is 0. - * The matching recv must share this tag. - */ - void sendBytes( const void *buf, const int N_bytes, const int recv, int tag = 0 ) const; - - - /*! - * @brief This function sends an MPI message with an array - * to another processor using a non-blocking call. - * The receiving processor must know the length of the array. - * This call must be paired with a matching call to Irecv. - * - * @param buf Pointer to array buffer with length integers. - * @param length Number of integers in buf that we want to send. - * @param recv_proc Receiving processor number. - * @param tag Integer argument specifying an integer tag - * to be sent with this message. - */ - template - MPI_Request Isend( - const type *buf, const int length, const int recv_proc, const int tag ) const; - - - /*! - * @brief This function sends an MPI message with an array of bytes - * (MPI_BYTES) to receiving_proc_number using a non-blocking call. - * The receiving processor must know the number of bytes to receive. - * This call must be paired with a matching call to IrecvBytes. - * - * @param buf Void pointer to an array of number_bytes bytes to send. - * @param N_bytes Integer number of bytes to send. - * @param recv_proc Receiving processor number. 
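A minimal sketch of the broadcast and barrier calls documented above; the value 42 and root rank 0 are arbitrary.

    void example_bcast( const Utilities::MPI &comm )
    {
        int n = ( comm.getRank() == 0 ) ? 42 : 0;
        n = comm.bcast( n, 0 );     // scalar broadcast from root rank 0
        comm.barrier();             // global synchronization across the communicator
        (void) n;
    }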
- * @param tag Integer argument specifying an integer tag - * to be sent with this message. - */ - MPI_Request IsendBytes( - const void *buf, const int N_bytes, const int recv_proc, const int tag ) const; - - - /*! - * @brief This function receives an MPI message with a data - * array from another processor. - * - * If this processor knows in advance the length of the array, - * use "get_length = false;" otherwise we will get the return size. - * This call must be paired with a matching call to send. - * - * @param buf Pointer to integer array buffer with capacity of length integers. - * @param length If get_length==true: The number of elements to be received, otherwise - * the maximum number of values that can be stored in buf. - * On output the number of received elements. - * @param send Processor number of sender. - * @param tag Optional integer argument specifying a tag which must be matched - * by the tag of the incoming message. Default tag is 0. - */ - template - inline void recv( type *buf, int length, const int send, int tag ) const - { - int length2 = length; - recv( buf, length2, send, false, tag ); - } - - - /*! - * @brief This function receives an MPI message with a data - * array from another processor. - * - * If this processor knows in advance the length of the array, - * use "get_length = false;" otherwise we will get the return size. - * This call must be paired with a matching call to send. - * - * @param buf Pointer to integer array buffer with capacity of length integers. - * @param length If get_length==true: The number of elements to be received, otherwise - * the maximum number of values that can be stored in buf. - * On output the number of received elements. - * @param send Processor number of sender. - * @param get_length Optional boolean argument specifying if we first - * need to check the message size to get the size of the array. - * Default value is true. - * @param tag Optional integer argument specifying a tag which must be matched - * by the tag of the incoming message. Default tag is 0. - */ - template - void recv( type *buf, int &length, const int send, const bool get_length, int tag ) const; - - - /*! - * @brief This function receives an MPI message with an array of - * max size number_bytes (MPI_BYTES) from any processor. - * - * This call must be paired with a matching call to sendBytes. - * - * @param buf Void pointer to a buffer of size number_bytes bytes. - * @param N_bytes Integer number specifying size of buf in bytes. - * @param send Integer number specifying size of buf in bytes. - * @param tag Optional integer argument specifying a tag which - * must be matched by the tag of the incoming message. Default - * tag is 0. - */ - void recvBytes( void *buf, int &N_bytes, const int send, int tag = 0 ) const; - - - /*! - * @brief This function receives an MPI message with a data - * array from another processor using a non-blocking call. - * - * @param buf Pointer to integer array buffer with capacity of length integers. - * @param length Maximum number of values that can be stored in buf. - * @param send_proc Processor number of sender. - * @param tag Optional integer argument specifying a tag which must - * be matched by the tag of the incoming message. - */ - template - MPI_Request Irecv( type *buf, const int length, const int send_proc, const int tag ) const; - - - /*! - * @brief This function receives an MPI message with an array of - * max size number_bytes (MPI_BYTES) from any processor. 
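To illustrate the point-to-point interface above, a hedged sketch pairing a blocking send with a non-blocking receive; the buffer size and tag are arbitrary.

    #include <vector>

    void example_p2p( const Utilities::MPI &comm )
    {
        const int tag = 7;
        std::vector<double> buf( 100 );
        if ( comm.getRank() == 0 ) {
            comm.send( buf.data(), (int) buf.size(), 1, tag );         // blocking send to rank 1
        } else if ( comm.getRank() == 1 ) {
            MPI_Request req = comm.Irecv( buf.data(), 100, 0, tag );   // non-blocking receive from rank 0
            Utilities::MPI::wait( req );                               // spins on MPI_Test with sched_yield
        }
    }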
- * - * This call must be paired with a matching call to sendBytes. - * - * @param buf Void pointer to a buffer of size number_bytes bytes. - * @param N_bytes Integer number specifying size of buf in bytes. - * @param send_proc Processor number of sender. - * @param tag Integer argument specifying a tag which must - * be matched by the tag of the incoming message. - */ - MPI_Request IrecvBytes( - void *buf, const int N_bytes, const int send_proc, const int tag ) const; - - - /*! - * Each processor sends every other processor a single value. - * @param[in] x Input value for allGather - * @return Output array for allGather - */ - template - std::vector allGather( const type &x ) const; - - - /*! - * Each processor sends every other processor an array - * @param[in] x Input array for allGather - * @return Output array for allGather - */ - template - std::vector allGather( const std::vector &x_in ) const; - - - /*! - * Each processor sends every other processor a single value. - * The x_out array should be preallocated to a length equal - * to the number of processors. - * @param x_in Input value for allGather - * @param x_out Output array for allGather (must be preallocated to the size of the - * communicator) - */ - template - void allGather( const type &x_in, type *x_out ) const; - - - /*! - * Each processor sends an array of data to all other processors. - * Each processor receives the values from all processors and gathers them - * to a single array. If successful, the total number of received - * elements will be returned. - * @param send_data Input array - * @param send_cnt The number of values to send - * @param recv_data Output array of received values - * @param recv_cnt The number of values to receive from each processor (N). - * If known, this should be provided as an input. Otherwise - * it is an optional output that will return the number of - * received values from each processor. - * @param recv_disp The displacement (relative to the start of the array) - * from which to store the data received from processor i. - * If known, this should be provided as an input. Otherwise - * it is an optional output that will return the starting location - * (relative to the start of the array) for the received data from - * processor i. - * @param known_recv Are the received counts and displacements known. - * If the received sizes are known, then they must be provided, - * and an extra communication step is not necessary. If the received - * sizes are not known, then an extra communication step will occur - * internally - * and the sizes and displacements will be returned (if desired). - */ - template - int allGather( const type *send_data, const int send_cnt, type *recv_data, - int *recv_cnt = nullptr, int *recv_disp = nullptr, bool known_recv = false ) const; - - - /*! - * This function combines sets from different processors to create a single master set - * @param set Input/Output std::set for the gather. - */ - template - void setGather( std::set &set ) const; - - - /*! - * This function combines std::maps from different processors to create a single master std::map - * If two or more ranks share the same key, the lowest rank will be used - * @param map Input/Output std::map for the gather. - */ - template - void mapGather( std::map &map ) const; - - - /*! - * Each processor sends an array of n values to each processor. - * Each processor sends an array of n values to each processor. 
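For reference, a sketch of the gather helpers documented above; n_local and the set contents are hypothetical.

    #include <set>
    #include <vector>

    void example_gather( const Utilities::MPI &comm, int n_local )
    {
        // Scalar form: one value per rank, returned as a vector of length getSize()
        std::vector<int> counts = comm.allGather( n_local );
        // Set form: union of the per-rank sets; duplicates collapse to a single entry
        std::set<int> owned = { comm.getRank(), comm.getRank() + 1 };
        comm.setGather( owned );
        (void) counts;
    }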
- * The jth block of data is sent from processor i to processor j and placed - * in the ith block on the receiving processor. In the variable - * description, N is the size of the communicator. Note that this is a - * blocking global communication. - * @param n The number of elements in each data block to send. - * @param send_data Input array (nxN) - * @param recv_data Output array of received values (nxN) - */ - template - void allToAll( const int n, const type *send_data, type *recv_data ) const; - - - /*! - * Each processor sends an array of data to the different processors. - * Each processor may send any size array to any processor. In the variable - * description, N is the size of the communicator. Note that this is a - * blocking global communication. If successful, the total number of received - * elements will be returned. - * @param send_data Input array - * @param send_cnt The number of values to send to each processor (N) - * @param send_disp The displacement (relative to the start of the array) - * from which to send to processor i - * @param recv_data Output array of received values - * @param recv_cnt The number of values to receive from each processor (N). - * If known, this should be provided as an input. Otherwise - * it is an optional output that will return the number of - * received values from each processor. - * @param recv_disp The displacement (relative to the start of the array) - * from which to send to processor i. - * If known, this should be provided as an input. Otherwise - * it is an optional output that will return the starting location - * (relative to the start of the array) for the received data from - * processor i. - * @param known_recv Are the received counts and displacements known. - * If the received sizes are known, then they must be provided, - * and an extra communication step is not necessary. If the received - * sizes are not know, then an extra communication step will occur - * internally - * and the sizes and displacements will be returned (if desired). - */ - template - int allToAll( const type *send_data, const int send_cnt[], const int send_disp[], - type *recv_data, int *recv_cnt = nullptr, int *recv_disp = nullptr, - bool known_recv = false ) const; - - - /*! - * \brief Send a list of proccesor ids to communicate - * \details This function communicates a list of proccesors to communicate. - * Given a list of ranks that we want to send/receieve data to/from, this routine - * will communicate that set to the other ranks returning the list of processors - * that want to communication with the current rank. - * Note: this routine will involved global communication - * \param ranks List of ranks that the current rank wants to communicate with - * \return List of ranks that want to communicate with the current processor - */ - std::vector commRanks( const std::vector &ranks ) const; - - - /*! - * \brief Wait for a communication to finish - * \details Wait for a communication to finish. - * Note: this does not require a communicator. - * \param request Communication request to wait for (returned for Isend or Irecv) - */ - static void wait( MPI_Request request ); - - - /*! - * \brief Wait for any communication to finish. - * \details This function waits for any of the given communication requests to finish. - * It returns the index of the communication request that finished. - * Note: this does not require a communicator. 
- * \param count Number of communications to check - * \param request Array of communication requests to wait for (returned for Isend or Irecv) - */ - static int waitAny( int count, MPI_Request *request ); - - - /*! - * \brief Wait for all communications to finish. - * \details This function waits for all of the given communication requests to finish. - * Note: this does not require a communicator. - * \param count Number of communications to check - * \param request Array of communication requests to wait for (returned for Isend or Irecv) - */ - static void waitAll( int count, MPI_Request *request ); - - - /*! - * \brief Wait for some communications to finish. - * \details This function waits for one (or more) communications to finish. - * It returns an array of the indicies that have finished. - * Note: this does not require a communicator. - * \param count Number of communications to check - * \param request Array of communication requests to wait for (returned for Isend or Irecv) - */ - static std::vector waitSome( int count, MPI_Request *request ); - - - /*! - * \brief Nonblocking test for a message - * \details This function performs a non-blocking test for a message. - * It will return the number of bytes in the message if a message with - * the specified source and tag (on the current communicator) is available. - * Otherwise it will return -1. - * \param source source rank (-1: any source) - * \param tag tag (-1: any tag) - */ - int Iprobe( int source = -1, int tag = -1 ) const; - - - /*! - * \brief Blocking test for a message - * \details This function performs a blocking test for a message. - * It will return the number of bytes in the message when a message with - * the specified source and tag (on the current communicator) is available - * \param source source rank (-1: any source) - * \param tag tag (-1: any tag) - */ - int probe( int source = -1, int tag = -1 ) const; - - - /*! - * \brief Start a serial region - * \details This function will serialize MPI processes so that they run - * one at a time. A call to serializeStart must be followed by a call - * to serializeStop after the commands to be executed. - * Note: the ranks will be run in order. - */ - void serializeStart(); - - - /*! - * \brief Stop a serial region - * \details Stop a serial region. See serializeStart for more information. - */ - void serializeStop(); - - - /*! - * \brief Elapsed time - * \details This function returns the elapsed time on the calling processor - * since an arbitrary point in the past (seconds). It is a wrapper to MPI_Wtime. - * See "tick" for the timer resolution in seconds. - * The time may or may not be synchronized across processors depending on the MPI - * implementation. Refer to MPI documentation for the desired platform for more information. - */ - static double time(); - - - /*! - * \brief Timer resolution - * \details This function returns the timer resolution used by "time" - */ - static double tick(); - - - /*! - * \brief Change the level of the internal timers - * \details This function changes the level of the timers used to profile MPI - * \param level New level of the timers - */ - static void changeProfileLevel( int level ) { profile_level = level; } - - - //! Return the total number of MPI_Comm objects that have been created - static size_t MPI_Comm_created() { return N_MPI_Comm_created; } - - //! Return the total number of MPI_Comm objects that have been destroyed - static size_t MPI_Comm_destroyed() { return N_MPI_Comm_destroyed; } - - //! 
Return details about MPI - static std::string info(); - - //! Return the MPI version number { major, minor } - static std::array version(); - - //! Check if MPI is active - static bool MPI_Active(); - - //! Start MPI - static void start_MPI( int argc_in, char *argv_in[], int profile_level = 0 ); - - //! Stop MPI - static void stop_MPI(); - - -private: // Private helper functions for templated MPI operations; - template - void call_sumReduce( type *x, const int n = 1 ) const; - template - void call_sumReduce( const type *x, type *y, const int n = 1 ) const; - template - void call_minReduce( type *x, const int n = 1, int *rank_of_min = nullptr ) const; - template - void call_minReduce( - const type *x, type *y, const int n = 1, int *rank_of_min = nullptr ) const; - template - void call_maxReduce( type *x, const int n = 1, int *rank_of_max = nullptr ) const; - template - void call_maxReduce( - const type *x, type *y, const int n = 1, int *rank_of_max = nullptr ) const; - template - void call_bcast( type *x, const int n, const int root ) const; - template - void call_allGather( const type &x_in, type *x_out ) const; - template - void call_allGather( - const type *x_in, int size_in, type *x_out, int *size_out, int *disp_out ) const; - template - void call_sumScan( const type *x, type *y, int n = 1 ) const; - template - void call_minScan( const type *x, type *y, int n = 1 ) const; - template - void call_maxScan( const type *x, type *y, int n = 1 ) const; - template - void call_allToAll( const type *send_data, const int send_cnt[], const int send_disp[], - type *recv_data, const int *recv_cnt, const int *recv_disp ) const; - - -private: // data members - // The internal MPI communicator - MPI_Comm communicator; - - // Is the communicator NULL - bool d_isNull; - - // Do we want to manage this communicator - bool d_manage; - - // Do we want to call MPI_abort instead of exit - bool d_call_abort; - - // The level for the profiles of MPI - static short profile_level; - - // The rank and size of the communicator - int comm_rank, comm_size; - - // The ranks of the comm in the global comm - mutable int *volatile d_ranks; - - // Some attributes - int d_maxTag; - int *volatile d_currentTag; - - /* How many objects share the same underlying MPI communicator. - * When the count goes to 0, the MPI comm will be free'd (assuming it was created - * by an communicator). This may not be perfect, but is likely to be good enough. - * Note that for thread safety, any access to this variable should be blocked for thread safety. - * The value of count MUST be volatile to ensure the correct value is always used. - */ - std::atomic_int *volatile d_count; - - // Add a variable for data alignment (necessary for some Intel builds) - double tmp_alignment; - - /* We want to keep track of how many MPI_Comm objects we have created over time. - * Like the count, for thread safety this should be blocked, however the most likely error - * caused by not blocking is a slight error in the MPI count. Since this is just for reference - * we do not need to block (recognizing that the value may not be 100% accurate). 
- */
-    static volatile unsigned int N_MPI_Comm_created;
-    static volatile unsigned int N_MPI_Comm_destroyed;
-};
-
-
-} // namespace Utilities
-
-
-// Include the default instantiations
-// \cond HIDDEN_SYMBOLS
-#include "common/MPI.I"
-// \endcond
-
-
-#endif
diff --git a/common/MPI_Helpers.cpp b/common/MPI_Helpers.cpp
new file mode 100644
index 00000000..736a2f02
--- /dev/null
+++ b/common/MPI_Helpers.cpp
@@ -0,0 +1,266 @@
+#include "common/MPI_Helpers.h"
+#include "common/Utilities.h"
+
+
+/********************************************************
+* Return the MPI data type                              *
+********************************************************/
+template<> MPI_Datatype getMPItype<char>() {
+    return MPI_CHAR;
+}
+template<> MPI_Datatype getMPItype<unsigned char>() {
+    return MPI_UNSIGNED_CHAR;
+}
+template<> MPI_Datatype getMPItype<int>() {
+    return MPI_INT;
+}
+template<> MPI_Datatype getMPItype<long int>() {
+    return MPI_LONG;
+}
+template<> MPI_Datatype getMPItype<unsigned long>() {
+    return MPI_UNSIGNED_LONG;
+}
+template<> MPI_Datatype getMPItype<long long>() {
+    return MPI_LONG_LONG;
+}
+template<> MPI_Datatype getMPItype<float>() {
+    return MPI_FLOAT;
+}
+template<> MPI_Datatype getMPItype<double>() {
+    return MPI_DOUBLE;
+}
+
+
+/********************************************************
+* Concrete implementations for packing/unpacking        *
+********************************************************/
+// unsigned char
+template<>
+size_t packsize( const unsigned char& )
+{
+    return sizeof(unsigned char);
+}
+template<>
+void pack( const unsigned char& rhs, char *buffer )
+{
+    memcpy(buffer,&rhs,sizeof(unsigned char));
+}
+template<>
+void unpack( unsigned char& data, const char *buffer )
+{
+    memcpy(&data,buffer,sizeof(unsigned char));
+}
+// char
+template<>
+size_t packsize( const char& )
+{
+    return sizeof(char);
+}
+template<>
+void pack( const char& rhs, char *buffer )
+{
+    memcpy(buffer,&rhs,sizeof(char));
+}
+template<>
+void unpack( char& data, const char *buffer )
+{
+    memcpy(&data,buffer,sizeof(char));
+}
+// int
+template<>
+size_t packsize( const int& )
+{
+    return sizeof(int);
+}
+template<>
+void pack( const int& rhs, char *buffer )
+{
+    memcpy(buffer,&rhs,sizeof(int));
+}
+template<>
+void unpack( int& data, const char *buffer )
+{
+    memcpy(&data,buffer,sizeof(int));
+}
+// unsigned int
+template<>
+size_t packsize( const unsigned int& )
+{
+    return sizeof(unsigned int);
+}
+template<>
+void pack( const unsigned int& rhs, char *buffer )
+{
+    memcpy(buffer,&rhs,sizeof(int));
+}
+template<>
+void unpack( unsigned int& data, const char *buffer )
+{
+    memcpy(&data,buffer,sizeof(int));
+}
+// size_t
+template<>
+size_t packsize( const size_t& )
+{
+    return sizeof(size_t);
+}
+template<>
+void pack( const size_t& rhs, char *buffer )
+{
+    memcpy(buffer,&rhs,sizeof(size_t));
+}
+template<>
+void unpack( size_t& data, const char *buffer )
+{
+    memcpy(&data,buffer,sizeof(size_t));
+}
+// std::string
+template<>
+size_t packsize( const std::string& rhs )
+{
+    return rhs.size()+1;
+}
+template<>
+void pack( const std::string& rhs, char *buffer )
+{
+    memcpy(buffer,rhs.c_str(),rhs.size()+1);
+}
+template<>
+void unpack( std::string& data, const char *buffer )
+{
+    data = std::string(buffer);
+}
+
+
+/********************************************************
+* Fake MPI routines                                     *
+********************************************************/
+#ifndef USE_MPI
+int MPI_Init(int*,char***)
+{
+    return 0;
+}
+int MPI_Init_thread(int*,char***, int required, int *provided )
+{
+    *provided = required;
+    return 0;
+}
+int MPI_Finalize()
+{
+    return 0;
+}
+int MPI_Comm_size( MPI_Comm, int *size )
+{
+    *size = 1;
+    return 0;
+}
+int MPI_Comm_rank( MPI_Comm, int *rank )
+{
+    *rank = 0;
+    return 0;
+}
+int MPI_Barrier( MPI_Comm )
+{
+    return 0;
+}
+int MPI_Waitall( int, MPI_Request[], MPI_Status[] )
+{
+    return 0;
+}
+int MPI_Wait( MPI_Request*, MPI_Status* )
+{
+    return 0;
+}
+int MPI_Bcast( void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm )
+{
+    return 0;
+}
+int MPI_Send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag,
+             MPI_Comm comm)
+{
+    ERROR("Not implemented yet");
+    return 0;
+}
+int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag,
+             MPI_Comm comm, MPI_Status *status)
+{
+    ERROR("Not implemented yet");
+    return 0;
+}
+int MPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag,
+              MPI_Comm comm, MPI_Request *request)
+{
+    ERROR("Not implemented yet");
+    return 0;
+}
+int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source,
+              int tag, MPI_Comm comm, MPI_Request *request)
+{
+    ERROR("Not implemented yet");
+    return 0;
+}
+int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count,
+                  MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+{
+    ERROR("Not implemented yet");
+    return 0;
+}
+int MPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                  void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                  MPI_Comm comm)
+{
+    ERROR("Not implemented yet");
+    return 0;
+}
+int MPI_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                   void *recvbuf, const int *recvcounts, const int *displs,
+                   MPI_Datatype recvtype, MPI_Comm comm)
+{
+    ERROR("Not implemented yet");
+    return 0;
+}
+int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                 int dest, int sendtag,
+                 void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                 int source, int recvtag,
+                 MPI_Comm comm, MPI_Status *status)
+{
+    ERROR("Not implemented yet");
+    return 0;
+}
+int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+               MPI_Op op, int root, MPI_Comm comm)
+{
+    ERROR("Not implemented yet");
+    return 0;
+}
+int MPI_Comm_group(MPI_Comm comm, MPI_Group *group)
+{
+    ERROR("Not implemented yet");
+    return 0;
+}
+int MPI_Comm_create(MPI_Comm comm, MPI_Group group, MPI_Comm *newcomm)
+{
+    ERROR("Not implemented yet");
+    return 0;
+}
+int MPI_Comm_dup(MPI_Comm comm, MPI_Comm *newcomm)
+{
+    *newcomm = comm;
+    return 0;
+}
+double MPI_Wtime( void )
+{
+    return 0.0;
+}
+int MPI_Comm_free(MPI_Comm *group)
+{
+    return 0;
+}
+int MPI_Group_free(MPI_Group *group)
+{
+    return 0;
+}
+#endif
+
+
diff --git a/common/MPI_Helpers.h b/common/MPI_Helpers.h
new file mode 100644
index 00000000..1d20318e
--- /dev/null
+++ b/common/MPI_Helpers.h
@@ -0,0 +1,239 @@
+// This file contains wrappers for MPI routines and functions to pack/unpack data structures
+#ifndef MPI_WRAPPERS_INC
+#define MPI_WRAPPERS_INC
+
+#include <string.h>
+#include <vector>
+#include <set>
+#include <map>
+
+#ifdef USE_MPI
+    // Include MPI
+    #include "mpi.h"
+#else
+    // Create fake MPI types
+    typedef int MPI_Comm;
+    typedef int MPI_Request;
+    typedef int MPI_Status;
+    #define MPI_COMM_WORLD 0
+    #define MPI_COMM_SELF 0
+    #define MPI_COMM_NULL -1
+    #define MPI_GROUP_NULL -2
+    #define MPI_STATUS_IGNORE NULL
+    enum MPI_Datatype { MPI_LOGICAL, MPI_CHAR, MPI_UNSIGNED_CHAR, MPI_INT,
+        MPI_UNSIGNED, MPI_LONG, MPI_UNSIGNED_LONG, MPI_LONG_LONG, MPI_FLOAT, MPI_DOUBLE };
+    enum MPI_Op { MPI_MIN, MPI_MAX, MPI_SUM };
+    typedef int MPI_Group;
+    #define MPI_THREAD_SINGLE 0
+    #define MPI_THREAD_FUNNELED 1
+    #define MPI_THREAD_SERIALIZED 2
+    #define MPI_THREAD_MULTIPLE 3
+    // Fake MPI functions
+    int MPI_Init(int*,char***);
+    int MPI_Init_thread( int *argc, char ***argv, int required, int *provided );
+    int MPI_Finalize();
+    int MPI_Comm_size( MPI_Comm, int *size );
+    int MPI_Comm_rank( MPI_Comm, int *rank );
+    int MPI_Barrier(MPI_Comm);
+    int MPI_Wait(MPI_Request*,MPI_Status*);
+    int MPI_Waitall(int,MPI_Request[],MPI_Status[]);
+    int MPI_Bcast(void*,int,MPI_Datatype,int,MPI_Comm);
+    int MPI_Send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag,
+        MPI_Comm comm);
+    int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag,
+        MPI_Comm comm, MPI_Status *status);
+    int MPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag,
+        MPI_Comm comm, MPI_Request *request);
+    int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source,
+        int tag, MPI_Comm comm, MPI_Request *request);
+    int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count,
+        MPI_Datatype datatype, MPI_Op op, MPI_Comm comm);
+    int MPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        MPI_Comm comm);
+    int MPI_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, const int *recvcounts, const int *displs,
+        MPI_Datatype recvtype, MPI_Comm comm);
+    int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        int dest, int sendtag,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        int source, int recvtag,
+        MPI_Comm comm, MPI_Status *status);
+    int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, int root, MPI_Comm comm);
+    double MPI_Wtime( void );
+    int MPI_Comm_group(MPI_Comm comm, MPI_Group *group);
+    int MPI_Comm_create(MPI_Comm comm, MPI_Group group, MPI_Comm *newcomm);
+    int MPI_Comm_free(MPI_Comm *group);
+    int MPI_Group_free(MPI_Group *group);
+    int MPI_Comm_dup(MPI_Comm comm, MPI_Comm *newcomm);
+#endif
+
+
+//! Get the size of the MPI_Comm
+// Note: this is a thread and interrupt safe function
+inline int comm_size( MPI_Comm comm ) {
+    int size = 1;
+    MPI_Comm_size( comm, &size );
+    return size;
+}
+
+
+//! Get the rank of the MPI_Comm
+// Note: this is a thread and interrupt safe function
+inline int comm_rank( MPI_Comm comm ) {
+    int rank = 1;
+    MPI_Comm_rank( comm, &rank );
+    return rank;
+}
+
+
+//! Get the size of MPI_COMM_WORLD
+inline int MPI_WORLD_SIZE( ) {
+    return comm_size( MPI_COMM_WORLD );
+}
+
+//! Get the rank of MPI_COMM_WORLD
+inline int MPI_WORLD_RANK( ) {
+    return comm_rank( MPI_COMM_WORLD );
+}
+
+//! Return the appropriate MPI datatype for a class
+template<class TYPE>
+MPI_Datatype getMPItype();
+
+
+//! Template function to return the buffer size required to pack a class
+template<class TYPE>
+size_t packsize( const TYPE& rhs );
+
+//! Template function to pack a class to a buffer
+template<class TYPE>
+void pack( const TYPE& rhs, char *buffer );
+
+//! Template function to unpack a class from a buffer
+template<class TYPE>
+void unpack( TYPE& data, const char *buffer );
+
+
+//! Template function to return the buffer size required to pack a std::vector
+template<class TYPE>
+size_t packsize( const std::vector<TYPE>& rhs );
+
+//! Template function to pack a class to a buffer
+template<class TYPE>
+void pack( const std::vector<TYPE>& rhs, char *buffer );
+
+//! Template function to unpack a class from a buffer
+template<class TYPE>
+void unpack( std::vector<TYPE>& data, const char *buffer );
+
+
+//!
Template function to return the buffer size required to pack a std::pair +template +size_t packsize( const std::pair& rhs ); + +//! Template function to pack a class to a buffer +template +void pack( const std::pair& rhs, char *buffer ); + +//! Template function to pack a class to a buffer +template +void unpack( std::pair& data, const char *buffer ); + + +//! Template function to return the buffer size required to pack a std::map +template +size_t packsize( const std::map& rhs ); + +//! Template function to pack a class to a buffer +template +void pack( const std::map& rhs, char *buffer ); + +//! Template function to pack a class to a buffer +template +void unpack( std::map& data, const char *buffer ); + + +//! Template function to return the buffer size required to pack a std::set +template +size_t packsize( const std::set& rhs ); + +//! Template function to pack a class to a buffer +template +void pack( const std::set& rhs, char *buffer ); + +//! Template function to pack a class to a buffer +template +void unpack( std::set& data, const char *buffer ); + + + +// Helper functions +inline double sumReduce( MPI_Comm comm, double x ) +{ + double y = 0; + MPI_Allreduce(&x,&y,1,MPI_DOUBLE,MPI_SUM,comm); + return y; +} +inline float sumReduce( MPI_Comm comm, float x ) +{ + float y = 0; + MPI_Allreduce(&x,&y,1,MPI_FLOAT,MPI_SUM,comm); + return y; +} +inline int sumReduce( MPI_Comm comm, int x ) +{ + int y = 0; + MPI_Allreduce(&x,&y,1,MPI_INT,MPI_SUM,comm); + return y; +} +inline long long sumReduce( MPI_Comm comm, long long x ) +{ + long long y = 0; + MPI_Allreduce(&x,&y,1,MPI_LONG_LONG,MPI_SUM,comm); + return y; +} +inline bool sumReduce( MPI_Comm comm, bool x ) +{ + int y = sumReduce( comm, x?1:0 ); + return y>0; +} +inline std::vector sumReduce( MPI_Comm comm, const std::vector& x ) +{ + auto y = x; + MPI_Allreduce(x.data(),y.data(),x.size(),MPI_FLOAT,MPI_SUM,comm); + return y; +} +inline std::vector sumReduce( MPI_Comm comm, const std::vector& x ) +{ + auto y = x; + MPI_Allreduce(x.data(),y.data(),x.size(),MPI_INT,MPI_SUM,comm); + return y; +} +inline double maxReduce( MPI_Comm comm, double x ) +{ + double y = 0; + MPI_Allreduce(&x,&y,1,MPI_DOUBLE,MPI_MAX,comm); + return y; +} +inline float maxReduce( MPI_Comm comm, float x ) +{ + float y = 0; + MPI_Allreduce(&x,&y,1,MPI_FLOAT,MPI_MAX,comm); + return y; +} +inline int maxReduce( MPI_Comm comm, int x ) +{ + int y = 0; + MPI_Allreduce(&x,&y,1,MPI_INT,MPI_MAX,comm); + return y; +} + + +#endif + + +#include "common/MPI_Helpers.hpp" + + diff --git a/IO/PackData.hpp b/common/MPI_Helpers.hpp similarity index 95% rename from IO/PackData.hpp rename to common/MPI_Helpers.hpp index 006cdf73..85261cf1 100644 --- a/IO/PackData.hpp +++ b/common/MPI_Helpers.hpp @@ -1,9 +1,8 @@ -// This file functions to pack/unpack data structures -#ifndef included_PackData_hpp -#define included_PackData_hpp - -#include "IO/PackData.h" +// This file contains wrappers for MPI routines and functions to pack/unpack data structures +#ifndef MPI_WRAPPERS_HPP +#define MPI_WRAPPERS_HPP +#include "common/MPI_Helpers.h" #include #include #include diff --git a/common/ReadMicroCT.cpp b/common/ReadMicroCT.cpp index 2209e712..79ef241e 100644 --- a/common/ReadMicroCT.cpp +++ b/common/ReadMicroCT.cpp @@ -64,11 +64,11 @@ Array readMicroCT( const std::string& filename ) // Read the compressed micro CT data and distribute -Array readMicroCT( const Database& domain, const Utilities::MPI& comm ) +Array readMicroCT( const Database& domain, MPI_Comm comm ) { // Get the local problem info auto 
n = domain.getVector( "n" ); - int rank = comm.getRank(); + int rank = comm_rank(MPI_COMM_WORLD); auto nproc = domain.getVector( "nproc" ); RankInfoStruct rankInfo( rank, nproc[0], nproc[1], nproc[2] ); diff --git a/common/ReadMicroCT.h b/common/ReadMicroCT.h index c8acc379..f232740e 100644 --- a/common/ReadMicroCT.h +++ b/common/ReadMicroCT.h @@ -5,12 +5,11 @@ #include "common/Array.h" #include "common/Communication.h" #include "common/Database.h" -#include "common/MPI.h" Array readMicroCT( const std::string& filename ); -Array readMicroCT( const Database& domain, const Utilities::MPI& comm ); +Array readMicroCT( const Database& domain, MPI_Comm comm ); #endif diff --git a/common/ScaLBL.cpp b/common/ScaLBL.cpp index 6f2966e7..e8a75994 100644 --- a/common/ScaLBL.cpp +++ b/common/ScaLBL.cpp @@ -5,7 +5,9 @@ ScaLBL_Communicator::ScaLBL_Communicator(std::shared_ptr Dm){ Lock=false; // unlock the communicator //...................................................................................... // Create a separate copy of the communicator for the device - MPI_COMM_SCALBL = Dm->Comm.dup(); + //MPI_Comm_group(Dm->Comm,&Group); + //MPI_Comm_create(Dm->Comm,Group,&MPI_COMM_SCALBL); + MPI_Comm_dup(Dm->Comm,&MPI_COMM_SCALBL); //...................................................................................... // Copy the domain size and communication information directly from Dm Nx = Dm->Nx; @@ -213,7 +215,7 @@ ScaLBL_Communicator::ScaLBL_Communicator(std::shared_ptr Dm){ ScaLBL_CopyToZeroCopy(dvcRecvList_Yz,Dm->recvList_Yz,recvCount_Yz*sizeof(int)); //...................................................................................... - MPI_COMM_SCALBL.barrier(); + MPI_Barrier(MPI_COMM_SCALBL); //................................................................................... // Set up the recieve distribution lists @@ -286,7 +288,7 @@ ScaLBL_Communicator::ScaLBL_Communicator(std::shared_ptr Dm){ //................................................................................... //...................................................................................... - MPI_COMM_SCALBL.barrier(); + MPI_Barrier(MPI_COMM_SCALBL); ScaLBL_DeviceBarrier(); //...................................................................................... SendCount = sendCount_x+sendCount_X+sendCount_y+sendCount_Y+sendCount_z+sendCount_Z+ @@ -867,8 +869,8 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(12,dvcSendList_x,3*sendCount_x,sendCount_x,sendbuf_x,dist,N); ScaLBL_D3Q19_Pack(14,dvcSendList_x,4*sendCount_x,sendCount_x,sendbuf_x,dist,N); - req1[0] = MPI_COMM_SCALBL.Isend(sendbuf_x, 5*sendCount_x,rank_x,sendtag); - req2[0] = MPI_COMM_SCALBL.Irecv(recvbuf_X, 5*recvCount_X,rank_X,recvtag); + MPI_Isend(sendbuf_x, 5*sendCount_x,MPI_DOUBLE,rank_x,sendtag,MPI_COMM_SCALBL,&req1[0]); + MPI_Irecv(recvbuf_X, 5*recvCount_X,MPI_DOUBLE,rank_X,recvtag,MPI_COMM_SCALBL,&req2[0]); //...Packing for X face(1,7,9,11,13)................................ 
ScaLBL_D3Q19_Pack(1,dvcSendList_X,0,sendCount_X,sendbuf_X,dist,N); ScaLBL_D3Q19_Pack(7,dvcSendList_X,sendCount_X,sendCount_X,sendbuf_X,dist,N); @@ -876,8 +878,8 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(11,dvcSendList_X,3*sendCount_X,sendCount_X,sendbuf_X,dist,N); ScaLBL_D3Q19_Pack(13,dvcSendList_X,4*sendCount_X,sendCount_X,sendbuf_X,dist,N); - req1[1] = MPI_COMM_SCALBL.Isend(sendbuf_X, 5*sendCount_X,rank_X,sendtag); - req2[1] = MPI_COMM_SCALBL.Irecv(recvbuf_x, 5*recvCount_x,rank_x,recvtag); + MPI_Isend(sendbuf_X, 5*sendCount_X,MPI_DOUBLE,rank_X,sendtag,MPI_COMM_SCALBL,&req1[1]); + MPI_Irecv(recvbuf_x, 5*recvCount_x,MPI_DOUBLE,rank_x,recvtag,MPI_COMM_SCALBL,&req2[1]); //...Packing for y face(4,8,9,16,18)................................. ScaLBL_D3Q19_Pack(4,dvcSendList_y,0,sendCount_y,sendbuf_y,dist,N); ScaLBL_D3Q19_Pack(8,dvcSendList_y,sendCount_y,sendCount_y,sendbuf_y,dist,N); @@ -885,8 +887,8 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(16,dvcSendList_y,3*sendCount_y,sendCount_y,sendbuf_y,dist,N); ScaLBL_D3Q19_Pack(18,dvcSendList_y,4*sendCount_y,sendCount_y,sendbuf_y,dist,N); - req1[2] = MPI_COMM_SCALBL.Isend(sendbuf_y, 5*sendCount_y,rank_y,sendtag); - req2[2] = MPI_COMM_SCALBL.Irecv(recvbuf_Y, 5*recvCount_Y,rank_Y,recvtag); + MPI_Isend(sendbuf_y, 5*sendCount_y,MPI_DOUBLE,rank_y,sendtag,MPI_COMM_SCALBL,&req1[2]); + MPI_Irecv(recvbuf_Y, 5*recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,MPI_COMM_SCALBL,&req2[2]); //...Packing for Y face(3,7,10,15,17)................................. ScaLBL_D3Q19_Pack(3,dvcSendList_Y,0,sendCount_Y,sendbuf_Y,dist,N); ScaLBL_D3Q19_Pack(7,dvcSendList_Y,sendCount_Y,sendCount_Y,sendbuf_Y,dist,N); @@ -894,8 +896,8 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(15,dvcSendList_Y,3*sendCount_Y,sendCount_Y,sendbuf_Y,dist,N); ScaLBL_D3Q19_Pack(17,dvcSendList_Y,4*sendCount_Y,sendCount_Y,sendbuf_Y,dist,N); - req1[3] = MPI_COMM_SCALBL.Isend(sendbuf_Y, 5*sendCount_Y,rank_Y,sendtag); - req2[3] = MPI_COMM_SCALBL.Irecv(recvbuf_y, 5*recvCount_y,rank_y,recvtag); + MPI_Isend(sendbuf_Y, 5*sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,MPI_COMM_SCALBL,&req1[3]); + MPI_Irecv(recvbuf_y, 5*recvCount_y,MPI_DOUBLE,rank_y,recvtag,MPI_COMM_SCALBL,&req2[3]); //...Packing for z face(6,12,13,16,17)................................ ScaLBL_D3Q19_Pack(6,dvcSendList_z,0,sendCount_z,sendbuf_z,dist,N); ScaLBL_D3Q19_Pack(12,dvcSendList_z,sendCount_z,sendCount_z,sendbuf_z,dist,N); @@ -903,8 +905,8 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(16,dvcSendList_z,3*sendCount_z,sendCount_z,sendbuf_z,dist,N); ScaLBL_D3Q19_Pack(17,dvcSendList_z,4*sendCount_z,sendCount_z,sendbuf_z,dist,N); - req1[4] = MPI_COMM_SCALBL.Isend(sendbuf_z, 5*sendCount_z,rank_z,sendtag); - req2[4] = MPI_COMM_SCALBL.Irecv(recvbuf_Z, 5*recvCount_Z,rank_Z,recvtag); + MPI_Isend(sendbuf_z, 5*sendCount_z,MPI_DOUBLE,rank_z,sendtag,MPI_COMM_SCALBL,&req1[4]); + MPI_Irecv(recvbuf_Z, 5*recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,MPI_COMM_SCALBL,&req2[4]); //...Packing for Z face(5,11,14,15,18)................................ 
ScaLBL_D3Q19_Pack(5,dvcSendList_Z,0,sendCount_Z,sendbuf_Z,dist,N); @@ -913,57 +915,57 @@ void ScaLBL_Communicator::SendD3Q19AA(double *dist){ ScaLBL_D3Q19_Pack(15,dvcSendList_Z,3*sendCount_Z,sendCount_Z,sendbuf_Z,dist,N); ScaLBL_D3Q19_Pack(18,dvcSendList_Z,4*sendCount_Z,sendCount_Z,sendbuf_Z,dist,N); - req1[5] = MPI_COMM_SCALBL.Isend(sendbuf_Z, 5*sendCount_Z,rank_Z,sendtag); - req2[5] = MPI_COMM_SCALBL.Irecv(recvbuf_z, 5*recvCount_z,rank_z,recvtag); + MPI_Isend(sendbuf_Z, 5*sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,MPI_COMM_SCALBL,&req1[5]); + MPI_Irecv(recvbuf_z, 5*recvCount_z,MPI_DOUBLE,rank_z,recvtag,MPI_COMM_SCALBL,&req2[5]); //...Pack the xy edge (8)................................ ScaLBL_D3Q19_Pack(8,dvcSendList_xy,0,sendCount_xy,sendbuf_xy,dist,N); - req1[6] = MPI_COMM_SCALBL.Isend(sendbuf_xy, sendCount_xy,rank_xy,sendtag); - req2[6] = MPI_COMM_SCALBL.Irecv(recvbuf_XY, recvCount_XY,rank_XY,recvtag); + MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_xy,sendtag,MPI_COMM_SCALBL,&req1[6]); + MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,MPI_COMM_SCALBL,&req2[6]); //...Pack the Xy edge (9)................................ ScaLBL_D3Q19_Pack(9,dvcSendList_Xy,0,sendCount_Xy,sendbuf_Xy,dist,N); - req1[8] = MPI_COMM_SCALBL.Isend(sendbuf_Xy, sendCount_Xy,rank_Xy,sendtag); - req2[8] = MPI_COMM_SCALBL.Irecv(recvbuf_xY, recvCount_xY,rank_xY,recvtag); + MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_Xy,sendtag,MPI_COMM_SCALBL,&req1[8]); + MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,MPI_COMM_SCALBL,&req2[8]); //...Pack the xY edge (10)................................ ScaLBL_D3Q19_Pack(10,dvcSendList_xY,0,sendCount_xY,sendbuf_xY,dist,N); - req1[9] = MPI_COMM_SCALBL.Isend(sendbuf_xY, sendCount_xY,rank_xY,sendtag); - req2[9] = MPI_COMM_SCALBL.Irecv(recvbuf_Xy, recvCount_Xy,rank_Xy,recvtag); + MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_xY,sendtag,MPI_COMM_SCALBL,&req1[9]); + MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,MPI_COMM_SCALBL,&req2[9]); //...Pack the XY edge (7)................................ ScaLBL_D3Q19_Pack(7,dvcSendList_XY,0,sendCount_XY,sendbuf_XY,dist,N); - req1[7] = MPI_COMM_SCALBL.Isend(sendbuf_XY, sendCount_XY,rank_XY,sendtag); - req2[7] = MPI_COMM_SCALBL.Irecv(recvbuf_xy, recvCount_xy,rank_xy,recvtag); + MPI_Isend(sendbuf_XY, sendCount_XY,MPI_DOUBLE,rank_XY,sendtag,MPI_COMM_SCALBL,&req1[7]); + MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,MPI_COMM_SCALBL,&req2[7]); //...Pack the xz edge (12)................................ ScaLBL_D3Q19_Pack(12,dvcSendList_xz,0,sendCount_xz,sendbuf_xz,dist,N); - req1[10] = MPI_COMM_SCALBL.Isend(sendbuf_xz, sendCount_xz,rank_xz,sendtag); - req2[10] = MPI_COMM_SCALBL.Irecv(recvbuf_XZ, recvCount_XZ,rank_XZ,recvtag); + MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_xz,sendtag,MPI_COMM_SCALBL,&req1[10]); + MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,MPI_COMM_SCALBL,&req2[10]); //...Pack the xZ edge (14)................................ ScaLBL_D3Q19_Pack(14,dvcSendList_xZ,0,sendCount_xZ,sendbuf_xZ,dist,N); - req1[13] = MPI_COMM_SCALBL.Isend(sendbuf_xZ, sendCount_xZ,rank_xZ,sendtag); - req2[13] = MPI_COMM_SCALBL.Irecv(recvbuf_Xz, recvCount_Xz,rank_Xz,recvtag); + MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_xZ,sendtag,MPI_COMM_SCALBL,&req1[13]); + MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,MPI_COMM_SCALBL,&req2[13]); //...Pack the Xz edge (13)................................ 
ScaLBL_D3Q19_Pack(13,dvcSendList_Xz,0,sendCount_Xz,sendbuf_Xz,dist,N); - req1[12] = MPI_COMM_SCALBL.Isend(sendbuf_Xz, sendCount_Xz,rank_Xz,sendtag); - req2[12] = MPI_COMM_SCALBL.Irecv(recvbuf_xZ, recvCount_xZ,rank_xZ,recvtag); + MPI_Isend(sendbuf_Xz, sendCount_Xz,MPI_DOUBLE,rank_Xz,sendtag,MPI_COMM_SCALBL,&req1[12]); + MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,MPI_COMM_SCALBL,&req2[12]); //...Pack the XZ edge (11)................................ ScaLBL_D3Q19_Pack(11,dvcSendList_XZ,0,sendCount_XZ,sendbuf_XZ,dist,N); - req1[11] = MPI_COMM_SCALBL.Isend(sendbuf_XZ, sendCount_XZ,rank_XZ,sendtag); - req2[11] = MPI_COMM_SCALBL.Irecv(recvbuf_xz, recvCount_xz,rank_xz,recvtag); + MPI_Isend(sendbuf_XZ, sendCount_XZ,MPI_DOUBLE,rank_XZ,sendtag,MPI_COMM_SCALBL,&req1[11]); + MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,MPI_COMM_SCALBL,&req2[11]); //...Pack the yz edge (16)................................ ScaLBL_D3Q19_Pack(16,dvcSendList_yz,0,sendCount_yz,sendbuf_yz,dist,N); - req1[14] = MPI_COMM_SCALBL.Isend(sendbuf_yz, sendCount_yz,rank_yz,sendtag); - req2[14] = MPI_COMM_SCALBL.Irecv(recvbuf_YZ, recvCount_YZ,rank_YZ,recvtag); + MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_yz,sendtag,MPI_COMM_SCALBL,&req1[14]); + MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,MPI_COMM_SCALBL,&req2[14]); //...Pack the yZ edge (18)................................ ScaLBL_D3Q19_Pack(18,dvcSendList_yZ,0,sendCount_yZ,sendbuf_yZ,dist,N); - req1[17] = MPI_COMM_SCALBL.Isend(sendbuf_yZ, sendCount_yZ,rank_yZ,sendtag); - req2[17] = MPI_COMM_SCALBL.Irecv(recvbuf_Yz, recvCount_Yz,rank_Yz,recvtag); + MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_yZ,sendtag,MPI_COMM_SCALBL,&req1[17]); + MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,MPI_COMM_SCALBL,&req2[17]); //...Pack the Yz edge (17)................................ ScaLBL_D3Q19_Pack(17,dvcSendList_Yz,0,sendCount_Yz,sendbuf_Yz,dist,N); - req1[16] = MPI_COMM_SCALBL.Isend(sendbuf_Yz, sendCount_Yz,rank_Yz,sendtag); - req2[16] = MPI_COMM_SCALBL.Irecv(recvbuf_yZ, recvCount_yZ,rank_yZ,recvtag); + MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_Yz,sendtag,MPI_COMM_SCALBL,&req1[16]); + MPI_Irecv(recvbuf_yZ, recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,MPI_COMM_SCALBL,&req2[16]); //...Pack the YZ edge (15)................................ ScaLBL_D3Q19_Pack(15,dvcSendList_YZ,0,sendCount_YZ,sendbuf_YZ,dist,N); - req1[15] = MPI_COMM_SCALBL.Isend(sendbuf_YZ, sendCount_YZ,rank_YZ,sendtag); - req2[15] = MPI_COMM_SCALBL.Irecv(recvbuf_yz, recvCount_yz,rank_yz,recvtag); + MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_YZ,sendtag,MPI_COMM_SCALBL,&req1[15]); + MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,MPI_COMM_SCALBL,&req2[15]); //................................................................................... } @@ -973,8 +975,8 @@ void ScaLBL_Communicator::RecvD3Q19AA(double *dist){ // NOTE: the center distribution f0 must NOT be at the start of feven, provide offset to start of f2 //................................................................................... // Wait for completion of D3Q19 communication - MPI_COMM_SCALBL.waitAll(18,req1); - MPI_COMM_SCALBL.waitAll(18,req2); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); ScaLBL_DeviceBarrier(); //................................................................................... 
@@ -1057,8 +1059,8 @@ void ScaLBL_Communicator::RecvGrad(double *phi, double *grad){ // Recieves halo and incorporates into D3Q19 based stencil gradient computation //................................................................................... // Wait for completion of D3Q19 communication - MPI_COMM_SCALBL.waitAll(18,req1); - MPI_COMM_SCALBL.waitAll(18,req2); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); ScaLBL_DeviceBarrier(); //................................................................................... @@ -1151,36 +1153,36 @@ void ScaLBL_Communicator::BiSendD3Q7AA(double *Aq, double *Bq){ ScaLBL_D3Q19_Pack(2,dvcSendList_x,0,sendCount_x,sendbuf_x,Aq,N); ScaLBL_D3Q19_Pack(2,dvcSendList_x,sendCount_x,sendCount_x,sendbuf_x,Bq,N); - req1[0] = MPI_COMM_SCALBL.Isend(sendbuf_x, 2*sendCount_x,rank_x,sendtag); - req2[0] = MPI_COMM_SCALBL.Irecv(recvbuf_X, 2*recvCount_X,rank_X,recvtag); + MPI_Isend(sendbuf_x, 2*sendCount_x,MPI_DOUBLE,rank_x,sendtag,MPI_COMM_SCALBL,&req1[0]); + MPI_Irecv(recvbuf_X, 2*recvCount_X,MPI_DOUBLE,rank_X,recvtag,MPI_COMM_SCALBL,&req2[0]); //...Packing for X face(1,7,9,11,13)................................ ScaLBL_D3Q19_Pack(1,dvcSendList_X,0,sendCount_X,sendbuf_X,Aq,N); ScaLBL_D3Q19_Pack(1,dvcSendList_X,sendCount_X,sendCount_X,sendbuf_X,Bq,N); - req1[1] = MPI_COMM_SCALBL.Isend(sendbuf_X, 2*sendCount_X,rank_X,sendtag); - req2[1] = MPI_COMM_SCALBL.Irecv(recvbuf_x, 2*recvCount_x,rank_x,recvtag); + MPI_Isend(sendbuf_X, 2*sendCount_X,MPI_DOUBLE,rank_X,sendtag,MPI_COMM_SCALBL,&req1[1]); + MPI_Irecv(recvbuf_x, 2*recvCount_x,MPI_DOUBLE,rank_x,recvtag,MPI_COMM_SCALBL,&req2[1]); //...Packing for y face(4,8,9,16,18)................................. ScaLBL_D3Q19_Pack(4,dvcSendList_y,0,sendCount_y,sendbuf_y,Aq,N); ScaLBL_D3Q19_Pack(4,dvcSendList_y,sendCount_y,sendCount_y,sendbuf_y,Bq,N); - req1[2] = MPI_COMM_SCALBL.Isend(sendbuf_y, 2*sendCount_y,rank_y,sendtag); - req2[2] = MPI_COMM_SCALBL.Irecv(recvbuf_Y, 2*recvCount_Y,rank_Y,recvtag); + MPI_Isend(sendbuf_y, 2*sendCount_y,MPI_DOUBLE,rank_y,sendtag,MPI_COMM_SCALBL,&req1[2]); + MPI_Irecv(recvbuf_Y, 2*recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,MPI_COMM_SCALBL,&req2[2]); //...Packing for Y face(3,7,10,15,17)................................. ScaLBL_D3Q19_Pack(3,dvcSendList_Y,0,sendCount_Y,sendbuf_Y,Aq,N); ScaLBL_D3Q19_Pack(3,dvcSendList_Y,sendCount_Y,sendCount_Y,sendbuf_Y,Bq,N); - req1[3] = MPI_COMM_SCALBL.Isend(sendbuf_Y, 2*sendCount_Y,rank_Y,sendtag); - req2[3] = MPI_COMM_SCALBL.Irecv(recvbuf_y, 2*recvCount_y,rank_y,recvtag); + MPI_Isend(sendbuf_Y, 2*sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,MPI_COMM_SCALBL,&req1[3]); + MPI_Irecv(recvbuf_y, 2*recvCount_y,MPI_DOUBLE,rank_y,recvtag,MPI_COMM_SCALBL,&req2[3]); //...Packing for z face(6,12,13,16,17)................................ ScaLBL_D3Q19_Pack(6,dvcSendList_z,0,sendCount_z,sendbuf_z,Aq,N); ScaLBL_D3Q19_Pack(6,dvcSendList_z,sendCount_z,sendCount_z,sendbuf_z,Bq,N); - req1[4] = MPI_COMM_SCALBL.Isend(sendbuf_z, 2*sendCount_z,rank_z,sendtag); - req2[4] = MPI_COMM_SCALBL.Irecv(recvbuf_Z, 2*recvCount_Z,rank_Z,recvtag); + MPI_Isend(sendbuf_z, 2*sendCount_z,MPI_DOUBLE,rank_z,sendtag,MPI_COMM_SCALBL,&req1[4]); + MPI_Irecv(recvbuf_Z, 2*recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,MPI_COMM_SCALBL,&req2[4]); //...Packing for Z face(5,11,14,15,18)................................ 
ScaLBL_D3Q19_Pack(5,dvcSendList_Z,0,sendCount_Z,sendbuf_Z,Aq,N); @@ -1188,8 +1190,8 @@ void ScaLBL_Communicator::BiSendD3Q7AA(double *Aq, double *Bq){ //................................................................................... // Send all the distributions - req1[5] = MPI_COMM_SCALBL.Isend(sendbuf_Z, 2*sendCount_Z,rank_Z,sendtag); - req2[5] = MPI_COMM_SCALBL.Irecv(recvbuf_z, 2*recvCount_z,rank_z,recvtag); + MPI_Isend(sendbuf_Z, 2*sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,MPI_COMM_SCALBL,&req1[5]); + MPI_Irecv(recvbuf_z, 2*recvCount_z,MPI_DOUBLE,rank_z,recvtag,MPI_COMM_SCALBL,&req2[5]); } @@ -1199,8 +1201,8 @@ void ScaLBL_Communicator::BiRecvD3Q7AA(double *Aq, double *Bq){ // NOTE: the center distribution f0 must NOT be at the start of feven, provide offset to start of f2 //................................................................................... // Wait for completion of D3Q19 communication - MPI_COMM_SCALBL.waitAll(6,req1); - MPI_COMM_SCALBL.waitAll(6,req2); + MPI_Waitall(6,req1,stat1); + MPI_Waitall(6,req2,stat2); ScaLBL_DeviceBarrier(); //................................................................................... @@ -1291,18 +1293,18 @@ void ScaLBL_Communicator::TriSendD3Q7AA(double *Aq, double *Bq, double *Cq){ //................................................................................... // Send all the distributions - req1[0] = MPI_COMM_SCALBL.Isend(sendbuf_x, 3*sendCount_x,rank_x,sendtag); - req2[0] = MPI_COMM_SCALBL.Irecv(recvbuf_X, 3*recvCount_X,rank_X,recvtag); - req1[1] = MPI_COMM_SCALBL.Isend(sendbuf_X, 3*sendCount_X,rank_X,sendtag); - req2[1] = MPI_COMM_SCALBL.Irecv(recvbuf_x, 3*recvCount_x,rank_x,recvtag); - req1[2] = MPI_COMM_SCALBL.Isend(sendbuf_y, 3*sendCount_y,rank_y,sendtag); - req2[2] = MPI_COMM_SCALBL.Irecv(recvbuf_Y, 3*recvCount_Y,rank_Y,recvtag); - req1[3] = MPI_COMM_SCALBL.Isend(sendbuf_Y, 3*sendCount_Y,rank_Y,sendtag); - req2[3] = MPI_COMM_SCALBL.Irecv(recvbuf_y, 3*recvCount_y,rank_y,recvtag); - req1[4] = MPI_COMM_SCALBL.Isend(sendbuf_z, 3*sendCount_z,rank_z,sendtag); - req2[4] = MPI_COMM_SCALBL.Irecv(recvbuf_Z, 3*recvCount_Z,rank_Z,recvtag); - req1[5] = MPI_COMM_SCALBL.Isend(sendbuf_Z, 3*sendCount_Z,rank_Z,sendtag); - req2[5] = MPI_COMM_SCALBL.Irecv(recvbuf_z, 3*recvCount_z,rank_z,recvtag); + MPI_Isend(sendbuf_x, 3*sendCount_x,MPI_DOUBLE,rank_x,sendtag,MPI_COMM_SCALBL,&req1[0]); + MPI_Irecv(recvbuf_X, 3*recvCount_X,MPI_DOUBLE,rank_X,recvtag,MPI_COMM_SCALBL,&req2[0]); + MPI_Isend(sendbuf_X, 3*sendCount_X,MPI_DOUBLE,rank_X,sendtag,MPI_COMM_SCALBL,&req1[1]); + MPI_Irecv(recvbuf_x, 3*recvCount_x,MPI_DOUBLE,rank_x,recvtag,MPI_COMM_SCALBL,&req2[1]); + MPI_Isend(sendbuf_y, 3*sendCount_y,MPI_DOUBLE,rank_y,sendtag,MPI_COMM_SCALBL,&req1[2]); + MPI_Irecv(recvbuf_Y, 3*recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,MPI_COMM_SCALBL,&req2[2]); + MPI_Isend(sendbuf_Y, 3*sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,MPI_COMM_SCALBL,&req1[3]); + MPI_Irecv(recvbuf_y, 3*recvCount_y,MPI_DOUBLE,rank_y,recvtag,MPI_COMM_SCALBL,&req2[3]); + MPI_Isend(sendbuf_z, 3*sendCount_z,MPI_DOUBLE,rank_z,sendtag,MPI_COMM_SCALBL,&req1[4]); + MPI_Irecv(recvbuf_Z, 3*recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,MPI_COMM_SCALBL,&req2[4]); + MPI_Isend(sendbuf_Z, 3*sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,MPI_COMM_SCALBL,&req1[5]); + MPI_Irecv(recvbuf_z, 3*recvCount_z,MPI_DOUBLE,rank_z,recvtag,MPI_COMM_SCALBL,&req2[5]); } @@ -1312,8 +1314,8 @@ void ScaLBL_Communicator::TriRecvD3Q7AA(double *Aq, double *Bq, double *Cq){ // NOTE: the center distribution f0 must NOT be at the start of feven, provide offset 
to start of f2 //................................................................................... // Wait for completion of D3Q19 communication - MPI_COMM_SCALBL.waitAll(6,req1); - MPI_COMM_SCALBL.waitAll(6,req2); + MPI_Waitall(6,req1,stat1); + MPI_Waitall(6,req2,stat2); ScaLBL_DeviceBarrier(); //................................................................................... @@ -1407,49 +1409,49 @@ void ScaLBL_Communicator::SendHalo(double *data){ // Send / Recv all the phase indcator field values //................................................................................... - req1[0] = MPI_COMM_SCALBL.Isend(sendbuf_x, sendCount_x,rank_x,sendtag); - req2[0] = MPI_COMM_SCALBL.Irecv(recvbuf_X, recvCount_X,rank_X,recvtag); - req1[1] = MPI_COMM_SCALBL.Isend(sendbuf_X, sendCount_X,rank_X,sendtag); - req2[1] = MPI_COMM_SCALBL.Irecv(recvbuf_x, recvCount_x,rank_x,recvtag); - req1[2] = MPI_COMM_SCALBL.Isend(sendbuf_y, sendCount_y,rank_y,sendtag); - req2[2] = MPI_COMM_SCALBL.Irecv(recvbuf_Y, recvCount_Y,rank_Y,recvtag); - req1[3] = MPI_COMM_SCALBL.Isend(sendbuf_Y, sendCount_Y,rank_Y,sendtag); - req2[3] = MPI_COMM_SCALBL.Irecv(recvbuf_y, recvCount_y,rank_y,recvtag); - req1[4] = MPI_COMM_SCALBL.Isend(sendbuf_z, sendCount_z,rank_z,sendtag); - req2[4] = MPI_COMM_SCALBL.Irecv(recvbuf_Z, recvCount_Z,rank_Z,recvtag); - req1[5] = MPI_COMM_SCALBL.Isend(sendbuf_Z, sendCount_Z,rank_Z,sendtag); - req2[5] = MPI_COMM_SCALBL.Irecv(recvbuf_z, recvCount_z,rank_z,recvtag); - req1[6] = MPI_COMM_SCALBL.Isend(sendbuf_xy, sendCount_xy,rank_xy,sendtag); - req2[6] = MPI_COMM_SCALBL.Irecv(recvbuf_XY, recvCount_XY,rank_XY,recvtag); - req1[7] = MPI_COMM_SCALBL.Isend(sendbuf_XY, sendCount_XY,rank_XY,sendtag); - req2[7] = MPI_COMM_SCALBL.Irecv(recvbuf_xy, recvCount_xy,rank_xy,recvtag); - req1[8] = MPI_COMM_SCALBL.Isend(sendbuf_Xy, sendCount_Xy,rank_Xy,sendtag); - req2[8] = MPI_COMM_SCALBL.Irecv(recvbuf_xY, recvCount_xY,rank_xY,recvtag); - req1[9] = MPI_COMM_SCALBL.Isend(sendbuf_xY, sendCount_xY,rank_xY,sendtag); - req2[9] = MPI_COMM_SCALBL.Irecv(recvbuf_Xy, recvCount_Xy,rank_Xy,recvtag); - req1[10] = MPI_COMM_SCALBL.Isend(sendbuf_xz, sendCount_xz,rank_xz,sendtag); - req2[10] = MPI_COMM_SCALBL.Irecv(recvbuf_XZ, recvCount_XZ,rank_XZ,recvtag); - req1[11] = MPI_COMM_SCALBL.Isend(sendbuf_XZ, sendCount_XZ,rank_XZ,sendtag); - req2[11] = MPI_COMM_SCALBL.Irecv(recvbuf_xz, recvCount_xz,rank_xz,recvtag); - req1[12] = MPI_COMM_SCALBL.Isend(sendbuf_Xz, sendCount_Xz,rank_Xz,sendtag); - req2[12] = MPI_COMM_SCALBL.Irecv(recvbuf_xZ, recvCount_xZ,rank_xZ,recvtag); - req1[13] = MPI_COMM_SCALBL.Isend(sendbuf_xZ, sendCount_xZ,rank_xZ,sendtag); - req2[13] = MPI_COMM_SCALBL.Irecv(recvbuf_Xz, recvCount_Xz,rank_Xz,recvtag); - req1[14] = MPI_COMM_SCALBL.Isend(sendbuf_yz, sendCount_yz,rank_yz,sendtag); - req2[14] = MPI_COMM_SCALBL.Irecv(recvbuf_YZ, recvCount_YZ,rank_YZ,recvtag); - req1[15] = MPI_COMM_SCALBL.Isend(sendbuf_YZ, sendCount_YZ,rank_YZ,sendtag); - req2[15] = MPI_COMM_SCALBL.Irecv(recvbuf_yz, recvCount_yz,rank_yz,recvtag); - req1[16] = MPI_COMM_SCALBL.Isend(sendbuf_Yz, sendCount_Yz,rank_Yz,sendtag); - req2[16] = MPI_COMM_SCALBL.Irecv(recvbuf_yZ, recvCount_yZ,rank_yZ,recvtag); - req1[17] = MPI_COMM_SCALBL.Isend(sendbuf_yZ, sendCount_yZ,rank_yZ,sendtag); - req2[17] = MPI_COMM_SCALBL.Irecv(recvbuf_Yz, recvCount_Yz,rank_Yz,recvtag); + MPI_Isend(sendbuf_x, sendCount_x,MPI_DOUBLE,rank_x,sendtag,MPI_COMM_SCALBL,&req1[0]); + MPI_Irecv(recvbuf_X, recvCount_X,MPI_DOUBLE,rank_X,recvtag,MPI_COMM_SCALBL,&req2[0]); + MPI_Isend(sendbuf_X, 
sendCount_X,MPI_DOUBLE,rank_X,sendtag,MPI_COMM_SCALBL,&req1[1]); + MPI_Irecv(recvbuf_x, recvCount_x,MPI_DOUBLE,rank_x,recvtag,MPI_COMM_SCALBL,&req2[1]); + MPI_Isend(sendbuf_y, sendCount_y,MPI_DOUBLE,rank_y,sendtag,MPI_COMM_SCALBL,&req1[2]); + MPI_Irecv(recvbuf_Y, recvCount_Y,MPI_DOUBLE,rank_Y,recvtag,MPI_COMM_SCALBL,&req2[2]); + MPI_Isend(sendbuf_Y, sendCount_Y,MPI_DOUBLE,rank_Y,sendtag,MPI_COMM_SCALBL,&req1[3]); + MPI_Irecv(recvbuf_y, recvCount_y,MPI_DOUBLE,rank_y,recvtag,MPI_COMM_SCALBL,&req2[3]); + MPI_Isend(sendbuf_z, sendCount_z,MPI_DOUBLE,rank_z,sendtag,MPI_COMM_SCALBL,&req1[4]); + MPI_Irecv(recvbuf_Z, recvCount_Z,MPI_DOUBLE,rank_Z,recvtag,MPI_COMM_SCALBL,&req2[4]); + MPI_Isend(sendbuf_Z, sendCount_Z,MPI_DOUBLE,rank_Z,sendtag,MPI_COMM_SCALBL,&req1[5]); + MPI_Irecv(recvbuf_z, recvCount_z,MPI_DOUBLE,rank_z,recvtag,MPI_COMM_SCALBL,&req2[5]); + MPI_Isend(sendbuf_xy, sendCount_xy,MPI_DOUBLE,rank_xy,sendtag,MPI_COMM_SCALBL,&req1[6]); + MPI_Irecv(recvbuf_XY, recvCount_XY,MPI_DOUBLE,rank_XY,recvtag,MPI_COMM_SCALBL,&req2[6]); + MPI_Isend(sendbuf_XY, sendCount_XY,MPI_DOUBLE,rank_XY,sendtag,MPI_COMM_SCALBL,&req1[7]); + MPI_Irecv(recvbuf_xy, recvCount_xy,MPI_DOUBLE,rank_xy,recvtag,MPI_COMM_SCALBL,&req2[7]); + MPI_Isend(sendbuf_Xy, sendCount_Xy,MPI_DOUBLE,rank_Xy,sendtag,MPI_COMM_SCALBL,&req1[8]); + MPI_Irecv(recvbuf_xY, recvCount_xY,MPI_DOUBLE,rank_xY,recvtag,MPI_COMM_SCALBL,&req2[8]); + MPI_Isend(sendbuf_xY, sendCount_xY,MPI_DOUBLE,rank_xY,sendtag,MPI_COMM_SCALBL,&req1[9]); + MPI_Irecv(recvbuf_Xy, recvCount_Xy,MPI_DOUBLE,rank_Xy,recvtag,MPI_COMM_SCALBL,&req2[9]); + MPI_Isend(sendbuf_xz, sendCount_xz,MPI_DOUBLE,rank_xz,sendtag,MPI_COMM_SCALBL,&req1[10]); + MPI_Irecv(recvbuf_XZ, recvCount_XZ,MPI_DOUBLE,rank_XZ,recvtag,MPI_COMM_SCALBL,&req2[10]); + MPI_Isend(sendbuf_XZ, sendCount_XZ,MPI_DOUBLE,rank_XZ,sendtag,MPI_COMM_SCALBL,&req1[11]); + MPI_Irecv(recvbuf_xz, recvCount_xz,MPI_DOUBLE,rank_xz,recvtag,MPI_COMM_SCALBL,&req2[11]); + MPI_Isend(sendbuf_Xz, sendCount_Xz,MPI_DOUBLE,rank_Xz,sendtag,MPI_COMM_SCALBL,&req1[12]); + MPI_Irecv(recvbuf_xZ, recvCount_xZ,MPI_DOUBLE,rank_xZ,recvtag,MPI_COMM_SCALBL,&req2[12]); + MPI_Isend(sendbuf_xZ, sendCount_xZ,MPI_DOUBLE,rank_xZ,sendtag,MPI_COMM_SCALBL,&req1[13]); + MPI_Irecv(recvbuf_Xz, recvCount_Xz,MPI_DOUBLE,rank_Xz,recvtag,MPI_COMM_SCALBL,&req2[13]); + MPI_Isend(sendbuf_yz, sendCount_yz,MPI_DOUBLE,rank_yz,sendtag,MPI_COMM_SCALBL,&req1[14]); + MPI_Irecv(recvbuf_YZ, recvCount_YZ,MPI_DOUBLE,rank_YZ,recvtag,MPI_COMM_SCALBL,&req2[14]); + MPI_Isend(sendbuf_YZ, sendCount_YZ,MPI_DOUBLE,rank_YZ,sendtag,MPI_COMM_SCALBL,&req1[15]); + MPI_Irecv(recvbuf_yz, recvCount_yz,MPI_DOUBLE,rank_yz,recvtag,MPI_COMM_SCALBL,&req2[15]); + MPI_Isend(sendbuf_Yz, sendCount_Yz,MPI_DOUBLE,rank_Yz,sendtag,MPI_COMM_SCALBL,&req1[16]); + MPI_Irecv(recvbuf_yZ, recvCount_yZ,MPI_DOUBLE,rank_yZ,recvtag,MPI_COMM_SCALBL,&req2[16]); + MPI_Isend(sendbuf_yZ, sendCount_yZ,MPI_DOUBLE,rank_yZ,sendtag,MPI_COMM_SCALBL,&req1[17]); + MPI_Irecv(recvbuf_Yz, recvCount_Yz,MPI_DOUBLE,rank_Yz,recvtag,MPI_COMM_SCALBL,&req2[17]); //................................................................................... } void ScaLBL_Communicator::RecvHalo(double *data){ //................................................................................... - MPI_COMM_SCALBL.waitAll(18,req1); - MPI_COMM_SCALBL.waitAll(18,req2); + MPI_Waitall(18,req1,stat1); + MPI_Waitall(18,req2,stat2); ScaLBL_DeviceBarrier(); //................................................................................... 
//................................................................................... @@ -1562,7 +1564,7 @@ double ScaLBL_Communicator::D3Q19_Flux_BC_z(int *neighborList, double *fq, doubl LocInletArea = double(sendCount_z); else LocInletArea = 0.f; - InletArea = MPI_COMM_SCALBL.sumReduce( LocInletArea ); + MPI_Allreduce(&LocInletArea,&InletArea,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_SCALBL); //printf("Inlet area = %f \n", InletArea); // Set the flux BC @@ -1571,7 +1573,7 @@ double ScaLBL_Communicator::D3Q19_Flux_BC_z(int *neighborList, double *fq, doubl if (kproc == 0) locsum = ScaLBL_D3Q19_AAeven_Flux_BC_z(dvcSendList_z, fq, flux, InletArea, sendCount_z, N); - sum = MPI_COMM_SCALBL.sumReduce( locsum ); + MPI_Allreduce(&locsum,&sum,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_SCALBL); din = flux/InletArea + sum; //if (rank==0) printf("computed din (even) =%f \n",din); if (kproc == 0) @@ -1581,7 +1583,7 @@ double ScaLBL_Communicator::D3Q19_Flux_BC_z(int *neighborList, double *fq, doubl if (kproc == 0) locsum = ScaLBL_D3Q19_AAodd_Flux_BC_z(neighborList, dvcSendList_z, fq, flux, InletArea, sendCount_z, N); - sum = MPI_COMM_SCALBL.sumReduce( locsum ); + MPI_Allreduce(&locsum,&sum,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_SCALBL); din = flux/InletArea + sum; //if (rank==0) printf("computed din (odd)=%f \n",din); diff --git a/common/ScaLBL.h b/common/ScaLBL.h index 51195f5a..c737659c 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -202,8 +202,9 @@ private: // Give the object it's own MPI communicator RankInfoStruct rank_info; MPI_Group Group; // Group of processors associated with this domain - Utilities::MPI MPI_COMM_SCALBL; // MPI Communicator for this domain + MPI_Comm MPI_COMM_SCALBL; // MPI Communicator for this domain MPI_Request req1[18],req2[18]; + MPI_Status stat1[18],stat2[18]; //...................................................................................... // MPI ranks for all 18 neighbors //...................................................................................... 
diff --git a/common/SpherePack.cpp b/common/SpherePack.cpp index 18057653..a7246b72 100644 --- a/common/SpherePack.cpp +++ b/common/SpherePack.cpp @@ -9,6 +9,7 @@ #include "common/Array.h" #include "common/Utilities.h" +#include "common/MPI_Helpers.h" #include "common/Communication.h" #include "common/Database.h" #include "common/SpherePack.h" diff --git a/common/SpherePack.h b/common/SpherePack.h index 56284a40..5075b289 100644 --- a/common/SpherePack.h +++ b/common/SpherePack.h @@ -12,6 +12,7 @@ #include "common/Array.h" #include "common/Utilities.h" +#include "common/MPI_Helpers.h" #include "common/Communication.h" #include "common/Database.h" diff --git a/common/UnitTest.cpp b/common/UnitTest.cpp index aeb9026e..b995fa68 100755 --- a/common/UnitTest.cpp +++ b/common/UnitTest.cpp @@ -14,49 +14,44 @@ /******************************************************************** * Constructor/Destructor * ********************************************************************/ -UnitTest::UnitTest() : d_verbose( false ), d_comm( MPI_COMM_SELF ) +UnitTest::UnitTest() { - if ( Utilities::MPI::MPI_active() ) - d_comm = MPI_COMM_WORLD; +#ifdef USE_MPI + comm = MPI_COMM_WORLD; +#endif } UnitTest::~UnitTest() { reset(); } void UnitTest::reset() { - d_mutex.lock(); + mutex.lock(); // Clear the data forcing a reallocation - std::vector().swap( d_pass ); - std::vector().swap( d_fail ); - std::vector().swap( d_expected ); - d_mutex.unlock(); + std::vector().swap( pass_messages ); + std::vector().swap( fail_messages ); + std::vector().swap( expected_fail_messages ); + mutex.unlock(); } /******************************************************************** * Add a pass, fail, expected failure message in a thread-safe way * ********************************************************************/ -void UnitTest::passes( std::string in ) +void UnitTest::passes( const std::string &in ) { - d_mutex.lock(); - if ( d_verbose ) - printf( "UnitTest: %i passes: %s\n", d_comm.getRank(), in.data() ); - d_pass.emplace_back( std::move( in ) ); - d_mutex.unlock(); + mutex.lock(); + pass_messages.push_back( in ); + mutex.unlock(); } -void UnitTest::failure( std::string in ) +void UnitTest::failure( const std::string &in ) { - d_mutex.lock(); - if ( d_verbose ) - printf( "UnitTest: %i failed: %s\n", d_comm.getRank(), in.data() ); - d_fail.emplace_back( std::move( in ) ); - d_mutex.unlock(); + mutex.lock(); + fail_messages.push_back( in ); + mutex.unlock(); } -void UnitTest::expected_failure( std::string in ) +void UnitTest::expected_failure( const std::string &in ) { - d_mutex.lock(); - if ( d_verbose ) - printf( "UnitTest: %i expected_failure: %s\n", d_comm.getRank(), in.data() ); - d_expected.emplace_back( std::move( in ) ); - d_mutex.unlock(); + mutex.lock(); + expected_fail_messages.push_back( in ); + mutex.unlock(); } @@ -64,6 +59,23 @@ void UnitTest::expected_failure( std::string in ) * Print a global report * * Note: only rank 0 will print, all messages will be aggregated * ********************************************************************/ +inline std::vector UnitTest::allGather( int value ) const +{ + int size = getSize(); + std::vector data( size, value ); +#ifdef USE_MPI + if ( size > 1 ) + MPI_Allgather( &value, 1, MPI_INT, data.data(), 1, MPI_INT, comm ); +#endif + return data; +} +inline void UnitTest::barrier() const +{ +#ifdef USE_MPI + if ( getSize() > 1 ) + MPI_Barrier( comm ); +#endif +} static inline void print_messages( const std::vector> &messages ) { if ( messages.size() > 1 ) { @@ -81,27 +93,28 @@ static 
inline void print_messages( const std::vector> & } void UnitTest::report( const int level0 ) const { - d_mutex.lock(); - int size = d_comm.getSize(); - int rank = d_comm.getRank(); - // Give all processors a chance to print any remaining messages - d_comm.barrier(); - Utilities::sleep_ms( 10 ); + mutex.lock(); + int size = getSize(); + int rank = getRank(); // Broadcast the print level from rank 0 - int level = d_comm.bcast( level0, 0 ); + int level = level0; +#ifdef USE_MPI + if ( getSize() > 1 ) + MPI_Bcast( &level, 1, MPI_INT, 0, comm ); +#endif if ( level < 0 || level > 2 ) ERROR( "Invalid print level" ); // Perform a global all gather to get the number of failures per processor - auto N_pass = d_comm.allGather( d_pass.size() ); - auto N_fail = d_comm.allGather( d_fail.size() ); - auto N_expected = d_comm.allGather( d_expected.size() ); - int N_pass_tot = 0; - int N_fail_tot = 0; - int N_expected_tot = 0; + auto N_pass = allGather( pass_messages.size() ); + auto N_fail = allGather( fail_messages.size() ); + auto N_expected_fail = allGather( expected_fail_messages.size() ); + int N_pass_tot = 0; + int N_fail_tot = 0; + int N_expected_fail_tot = 0; for ( int i = 0; i < size; i++ ) { N_pass_tot += N_pass[i]; N_fail_tot += N_fail[i]; - N_expected_tot += N_expected[i]; + N_expected_fail_tot += N_expected_fail[i]; } // Send all messages to rank 0 (if needed) std::vector> pass_messages_rank( size ); @@ -109,13 +122,13 @@ void UnitTest::report( const int level0 ) const std::vector> expected_fail_rank( size ); // Get the pass messages if ( ( level == 1 && N_pass_tot <= 20 ) || level == 2 ) - pass_messages_rank = UnitTest::gatherMessages( d_pass, 1 ); + pass_messages_rank = UnitTest::gatherMessages( pass_messages, 1 ); // Get the fail messages if ( level == 1 || level == 2 ) - fail_messages_rank = UnitTest::gatherMessages( d_fail, 2 ); + fail_messages_rank = UnitTest::gatherMessages( fail_messages, 2 ); // Get the expected_fail messages - if ( ( level == 1 && N_expected_tot <= 50 ) || level == 2 ) - expected_fail_rank = UnitTest::gatherMessages( d_expected, 2 ); + if ( ( level == 1 && N_expected_fail_tot <= 50 ) || level == 2 ) + expected_fail_rank = UnitTest::gatherMessages( expected_fail_messages, 2 ); // Print the results of all messages (only rank 0 will print) if ( rank == 0 ) { pout << std::endl; @@ -161,31 +174,31 @@ void UnitTest::report( const int level0 ) const pout << std::endl; // Print the tests that expected failed pout << "Tests expected failed" << std::endl; - if ( level == 0 || ( level == 1 && N_expected_tot > 50 ) ) { + if ( level == 0 || ( level == 1 && N_expected_fail_tot > 50 ) ) { // We want to print a summary if ( size > 8 ) { // Print 1 summary for all processors printp( " %i tests expected failed (use report level 2 for more detail)\n", - N_expected_tot ); + N_expected_fail_tot ); } else { // Print a summary for each processor for ( int i = 0; i < size; i++ ) printp( " %i tests expected failed (proc %i) (use report level 2 for more " "detail)\n", - N_expected[i], i ); + N_expected_fail[i], i ); } } else { // We want to print all messages for ( int i = 0; i < size; i++ ) - ASSERT( (int) expected_fail_rank[i].size() == N_expected[i] ); + ASSERT( (int) expected_fail_rank[i].size() == N_expected_fail[i] ); print_messages( expected_fail_rank ); } pout << std::endl; } // Add a barrier to synchronize all processors (rank 0 is much slower) - d_comm.barrier(); + barrier(); Utilities::sleep_ms( 10 ); // Need a brief pause to allow any printing to finish - d_mutex.unlock(); + 
mutex.unlock(); } @@ -195,8 +208,8 @@ void UnitTest::report( const int level0 ) const std::vector> UnitTest::gatherMessages( const std::vector &local_messages, int tag ) const { - const int rank = d_comm.getRank(); - const int size = d_comm.getSize(); + const int rank = getRank(); + const int size = getSize(); std::vector> messages( size ); if ( rank == 0 ) { // Rank 0 should receive all messages @@ -220,6 +233,7 @@ std::vector> UnitTest::gatherMessages( void UnitTest::pack_message_stream( const std::vector &messages, const int rank, const int tag ) const { +#ifdef USE_MPI // Get the size of the messages auto N_messages = (int) messages.size(); auto *msg_size = new int[N_messages]; @@ -240,11 +254,18 @@ void UnitTest::pack_message_stream( k += msg_size[i]; } // Send the message stream (using a non-blocking send) - auto request = d_comm.Isend( data, size_data, rank, tag ); + MPI_Request request; + MPI_Isend( data, size_data, MPI_CHAR, rank, tag, comm, &request ); // Wait for the communication to send and free the temporary memory - d_comm.wait( request ); + MPI_Status status; + MPI_Wait( &request, &status ); delete[] data; delete[] msg_size; +#else + NULL_USE( messages ); + NULL_USE( rank ); + NULL_USE( tag ); +#endif } @@ -253,15 +274,20 @@ void UnitTest::pack_message_stream( ********************************************************************/ std::vector UnitTest::unpack_message_stream( const int rank, const int tag ) const { +#ifdef USE_MPI // Probe the message to get the message size - int size_data = d_comm.probe( rank, tag ); + MPI_Status status; + MPI_Probe( rank, tag, comm, &status ); + int size_data = -1; + MPI_Get_count( &status, MPI_BYTE, &size_data ); ASSERT( size_data >= 0 ); // Allocate memory to receive the data auto *data = new char[size_data]; // receive the data (using a non-blocking receive) - auto request = d_comm.Irecv( data, size_data, rank, tag ); + MPI_Request request; + MPI_Irecv( data, size_data, MPI_CHAR, rank, tag, comm, &request ); // Wait for the communication to be received - d_comm.wait( request ); + MPI_Wait( &request, &status ); // Unpack the message stream int N_messages = 0; memcpy( &N_messages, data, sizeof( int ) ); @@ -277,16 +303,77 @@ std::vector UnitTest::unpack_message_stream( const int rank, const messages[i] = std::string( &data[k], msg_size[i] ); k += msg_size[i]; } - // Delete the temporary memory delete[] data; return messages; +#else + NULL_USE( rank ); + NULL_USE( tag ); + return std::vector(); +#endif } /******************************************************************** * Other functions * ********************************************************************/ -size_t UnitTest::NumPassGlobal() const { return d_comm.sumReduce( d_pass.size() ); } -size_t UnitTest::NumFailGlobal() const { return d_comm.sumReduce( d_fail.size() ); } -size_t UnitTest::NumExpectedFailGlobal() const { return d_comm.sumReduce( d_expected.size() ); } - +int UnitTest::getRank() const +{ + int rank = 0; +#ifdef USE_MPI + int flag = 0; + MPI_Initialized( &flag ); + if ( flag ) + MPI_Comm_rank( comm, &rank ); +#endif + return rank; +} +int UnitTest::getSize() const +{ + int size = 1; +#ifdef USE_MPI + int flag = 0; + MPI_Initialized( &flag ); + if ( flag ) + MPI_Comm_size( comm, &size ); +#endif + return size; +} +size_t UnitTest::NumPassGlobal() const +{ + size_t num = pass_messages.size(); +#ifdef USE_MPI + if ( getSize() > 1 ) { + auto send = static_cast( num ); + int sum = 0; + MPI_Allreduce( &send, &sum, 1, MPI_INT, MPI_SUM, comm ); + num = static_cast( sum ); + } 
+#endif + return num; +} +size_t UnitTest::NumFailGlobal() const +{ + size_t num = fail_messages.size(); +#ifdef USE_MPI + if ( getSize() > 1 ) { + auto send = static_cast( num ); + int sum = 0; + MPI_Allreduce( &send, &sum, 1, MPI_INT, MPI_SUM, comm ); + num = static_cast( sum ); + } +#endif + return num; +} +size_t UnitTest::NumExpectedFailGlobal() const +{ + size_t num = expected_fail_messages.size(); +#ifdef USE_MPI + if ( getSize() > 1 ) { + auto send = static_cast( num ); + int sum = 0; + MPI_Allreduce( &send, &sum, 1, MPI_INT, MPI_SUM, comm ); + num = static_cast( sum ); + } +#endif + return num; +} diff --git a/common/UnitTest.h b/common/UnitTest.h index 9d452747..80503d19 100755 --- a/common/UnitTest.h +++ b/common/UnitTest.h @@ -1,11 +1,13 @@ #ifndef included_UnitTest #define included_UnitTest -#include "common/MPI.h" - #include +#include #include #include +#ifdef USE_MPI +#include "mpi.h" +#endif /*! @@ -26,47 +28,47 @@ * \endcode */ -class UnitTest final +class UnitTest { public: //! Constructor UnitTest(); //! Destructor - ~UnitTest(); - - // Copy constructor - UnitTest( const UnitTest & ) = delete; - - // Assignment operator - UnitTest &operator=( const UnitTest & ) = delete; + virtual ~UnitTest(); //! Indicate a passed test (thread-safe) - void passes( std::string in ); + virtual void passes( const std::string &in ); //! Indicate a failed test (thread-safe) - void failure( std::string in ); + virtual void failure( const std::string &in ); //! Indicate an expected failed test (thread-safe) - void expected_failure( std::string in ); + virtual void expected_failure( const std::string &in ); //! Return the number of passed tests locally - inline size_t NumPassLocal() const { return d_pass.size(); } + virtual size_t NumPassLocal() const { return pass_messages.size(); } //! Return the number of failed tests locally - inline size_t NumFailLocal() const { return d_fail.size(); } + virtual size_t NumFailLocal() const { return fail_messages.size(); } //! Return the number of expected failed tests locally - inline size_t NumExpectedFailLocal() const { return d_expected.size(); } + virtual size_t NumExpectedFailLocal() const { return expected_fail_messages.size(); } //! Return the number of passed tests locally - size_t NumPassGlobal() const; + virtual size_t NumPassGlobal() const; //! Return the number of failed tests locally - size_t NumFailGlobal() const; + virtual size_t NumFailGlobal() const; //! Return the number of expected failed tests locally - size_t NumExpectedFailGlobal() const; + virtual size_t NumExpectedFailGlobal() const; + + //! Return the rank of the current processor + int getRank() const; + + //! Return the number of processors + int getSize() const; /*! * Print a report of the passed and failed tests. @@ -75,28 +77,29 @@ public: * to print correctly). * @param level Optional integer specifying the level of reporting (default: 1) * 0: Report the number of tests passed, failed, and expected failures. - * 1: Report the passed tests (if <=20) or number passed, - * Report all failures, - * Report the expected failed tests (if <=50) or the number passed. + * 1: Report the number of passed tests (if <=20) or the number passed + * otherwise, report all failures, report the number of expected + * failed tests (if <=50) or the number passed otherwise. * 2: Report all passed, failed, and expected failed tests. */ - void report( const int level = 1 ) const; + virtual void report( const int level = 1 ) const; //! Clear the messages void reset(); - //! 
Make the unit test operator verbose? - void verbose( bool verbose = true ) { d_verbose = verbose; } +protected: + std::vector pass_messages; + std::vector fail_messages; + std::vector expected_fail_messages; + mutable std::mutex mutex; +#ifdef USE_MPI + MPI_Comm comm; +#endif private: - std::vector d_pass; - std::vector d_fail; - std::vector d_expected; - bool d_verbose; - mutable std::mutex d_mutex; - Utilities::MPI d_comm; + // Make the copy constructor private + UnitTest( const UnitTest & ) {} -private: // Function to pack the messages into a single data stream and send to the given processor // Note: This function does not return until the message stream has been sent void pack_message_stream( @@ -106,7 +109,9 @@ private: // Note: This function does not return until the message stream has been received std::vector unpack_message_stream( const int rank, const int tag ) const; - // Gather the messages + // Helper functions + inline void barrier() const; + inline std::vector allGather( int value ) const; inline std::vector> gatherMessages( const std::vector &local_messages, int tag ) const; }; diff --git a/common/UtilityMacros.h b/common/UtilityMacros.h index 2c374ef1..bfac172f 100644 --- a/common/UtilityMacros.h +++ b/common/UtilityMacros.h @@ -143,43 +143,35 @@ * Be sure to follow with ENABLE_WARNINGS */ // clang-format off -#ifndef DISABLE_WARNINGS -#if defined( USING_MSVC ) +#ifdef DISABLE_WARNINGS + // Macros previously defined +#elif defined( USING_MSVC ) #define DISABLE_WARNINGS __pragma( warning( push, 0 ) ) #define ENABLE_WARNINGS __pragma( warning( pop ) ) #elif defined( USING_CLANG ) #define DISABLE_WARNINGS \ - _Pragma( "clang diagnostic push" ) \ - _Pragma( "clang diagnostic ignored \"-Wall\"" ) \ + _Pragma( "clang diagnostic push" ) _Pragma( "clang diagnostic ignored \"-Wall\"" ) \ _Pragma( "clang diagnostic ignored \"-Wextra\"" ) \ _Pragma( "clang diagnostic ignored \"-Wunused-private-field\"" ) \ - _Pragma( "clang diagnostic ignored \"-Wdeprecated-declarations\"" ) \ - _Pragma( "clang diagnostic ignored \"-Winteger-overflow\"" ) + _Pragma( "clang diagnostic ignored \"-Wmismatched-new-delete\"" ) #define ENABLE_WARNINGS _Pragma( "clang diagnostic pop" ) #elif defined( USING_GCC ) + // Note: We cannot disable the -Wliteral-suffix message with this macro because the + // pragma command cannot suppress warnings from the C++ preprocessor. See gcc bug #53431. 
#define DISABLE_WARNINGS \ - _Pragma( "GCC diagnostic push" ) \ - _Pragma( "GCC diagnostic ignored \"-Wpragmas\"" ) \ - _Pragma( "GCC diagnostic ignored \"-Wall\"" ) \ + _Pragma( "GCC diagnostic push" ) _Pragma( "GCC diagnostic ignored \"-Wall\"" ) \ _Pragma( "GCC diagnostic ignored \"-Wextra\"" ) \ - _Pragma( "GCC diagnostic ignored \"-Wpedantic\"" ) \ + _Pragma( "GCC diagnostic ignored \"-Wpragmas\"" ) \ _Pragma( "GCC diagnostic ignored \"-Wunused-local-typedefs\"" ) \ _Pragma( "GCC diagnostic ignored \"-Woverloaded-virtual\"" ) \ _Pragma( "GCC diagnostic ignored \"-Wunused-parameter\"" ) \ - _Pragma( "GCC diagnostic ignored \"-Wdeprecated-declarations\"" ) \ - _Pragma( "GCC diagnostic ignored \"-Wvirtual-move-assign\"" ) \ - _Pragma( "GCC diagnostic ignored \"-Wunused-function\"" ) \ - _Pragma( "GCC diagnostic ignored \"-Woverflow\"" ) \ - _Pragma( "GCC diagnostic ignored \"-Wunused-variable\"" ) \ - _Pragma( "GCC diagnostic ignored \"-Wignored-qualifiers\"" ) \ - _Pragma( "GCC diagnostic ignored \"-Wenum-compare\"" ) \ + _Pragma( "GCC diagnostic ignored \"-Warray-bounds\"" ) \ _Pragma( "GCC diagnostic ignored \"-Wterminate\"" ) #define ENABLE_WARNINGS _Pragma( "GCC diagnostic pop" ) #else #define DISABLE_WARNINGS #define ENABLE_WARNINGS #endif -#endif // clang-format on diff --git a/cpu/BGK.cpp b/cpu/BGK.cpp index bccc5b77..436ab381 100644 --- a/cpu/BGK.cpp +++ b/cpu/BGK.cpp @@ -1,4 +1,5 @@ extern "C" void ScaLBL_D3Q19_AAeven_BGK(double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){ + int n; // conserved momemnts double rho,ux,uy,uz,uu; // non-conserved moments @@ -110,12 +111,14 @@ extern "C" void ScaLBL_D3Q19_AAeven_BGK(double *dist, int start, int finish, int } extern "C" void ScaLBL_D3Q19_AAodd_BGK(int *neighborList, double *dist, int start, int finish, int Np, double rlx, double Fx, double Fy, double Fz){ + int n; // conserved momemnts double rho,ux,uy,uz,uu; // non-conserved moments double f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18; int nr1,nr2,nr3,nr4,nr5,nr6,nr7,nr8,nr9,nr10,nr11,nr12,nr13,nr14,nr15,nr16,nr17,nr18; + int nread; for (int n=start; n 0 ){ // Get the density value (Streaming already performed) - double Na = Den[n]; - double Nb = Den[N+n]; + Na = Den[n]; + Nb = Den[N+n]; Phi[n] = (Na-Nb)/(Na+Nb); } } + //................................................................... 
} extern "C" void ScaLBL_SetSlice_z(double *Phi, double value, int Nx, int Ny, int Nz, int Slice){ - for (int n=Slice*Nx*Ny; n<(Slice+1)*Nx*Ny; n++){ + int n; + for (n=Slice*Nx*Ny; n<(Slice+1)*Nx*Ny; n++){ Phi[n] = value; } } @@ -1246,7 +1255,7 @@ extern "C" void ScaLBL_D3Q19_AAeven_Color(int *Map, double *dist, double *Aq, do double *Vel, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, double Fx, double Fy, double Fz, int strideY, int strideZ, int start, int finish, int Np){ - int ijk,nn; + int ijk,nn,n; double fq; // conserved momemnts double rho,jx,jy,jz; @@ -1829,7 +1838,7 @@ extern "C" void ScaLBL_D3Q19_AAodd_Color(int *neighborList, int *Map, double *di double *Phi, double *Vel, double rhoA, double rhoB, double tauA, double tauB, double alpha, double beta, double Fx, double Fy, double Fz, int strideY, int strideZ, int start, int finish, int Np){ - int nn,ijk,nread; + int n,nn,ijk,nread; int nr1,nr2,nr3,nr4,nr5,nr6; int nr7,nr8,nr9,nr10; int nr11,nr12,nr13,nr14; @@ -2483,7 +2492,7 @@ extern "C" void ScaLBL_D3Q19_AAodd_Color(int *neighborList, int *Map, double *di extern "C" void ScaLBL_D3Q7_AAodd_PhaseField(int *neighborList, int *Map, double *Aq, double *Bq, double *Den, double *Phi, int start, int finish, int Np){ - int idx, nread; + int idx,n,nread; double fq,nA,nB; for (int n=start; n #include -ScaLBL_ColorModel::ScaLBL_ColorModel(int RANK, int NP, const Utilities::MPI& COMM): +ScaLBL_ColorModel::ScaLBL_ColorModel(int RANK, int NP, MPI_Comm COMM): rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tauA(0),tauB(0),rhoA(0),rhoB(0),alpha(0),beta(0), Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0),inletA(0),inletB(0),outletA(0),outletB(0), Nx(0),Ny(0),Nz(0),N(0),Np(0),nprocx(0),nprocy(0),nprocz(0),BoundaryCondition(0),Lx(0),Ly(0),Lz(0),comm(COMM) @@ -167,9 +167,9 @@ void ScaLBL_ColorModel::SetDomain(){ for (int i=0; iid[i] = 1; // initialize this way //Averages = std::shared_ptr ( new TwoPhase(Dm) ); // TwoPhase analysis object Averages = std::shared_ptr ( new SubPhase(Dm) ); // TwoPhase analysis object - comm.barrier(); + MPI_Barrier(comm); Dm->CommInit(); - comm.barrier(); + MPI_Barrier(comm); // Read domain parameters rank = Dm->rank(); nprocx = Dm->nprocx(); @@ -292,7 +292,7 @@ void ScaLBL_ColorModel::AssignComponentLabels(double *phase) for (int i=0; iid[i] = Mask->id[i]; for (size_t idx=0; idxComm.sumReduce( label_count[idx] ); + label_count_global[idx]=sumReduce( Dm->Comm, label_count[idx]); if (rank==0){ printf("Component labels: %lu \n",NLABELS); @@ -333,7 +333,7 @@ void ScaLBL_ColorModel::Create(){ Map.resize(Nx,Ny,Nz); Map.fill(-2); auto neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Mask->id,Np); - comm.barrier(); + MPI_Barrier(comm); //........................................................................... // MAIN VARIABLES ALLOCATED HERE @@ -465,7 +465,7 @@ void ScaLBL_ColorModel::Initialize(){ ScaLBL_CopyToDevice(Phi,cPhi,N*sizeof(double)); ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); } if (rank==0) printf ("Initializing phase field \n"); @@ -651,7 +651,7 @@ void ScaLBL_ColorModel::Run(){ //.......create and start timer............ double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); starttime = MPI_Wtime(); //......................................... 
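The sumReduce( Dm->Comm, ... ) calls introduced in the ColorModel.cpp hunks above are thin free-function wrappers declared in common/MPI_Helpers.h, replacing the old Utilities::MPI member calls. The following is only a minimal sketch of that pattern, assuming a double overload; the actual declarations in MPI_Helpers.h may use different overloads or signatures.

#include <mpi.h>

// Hypothetical sketch of the sumReduce helper used in place of the old
// Utilities::MPI wrapper: sum a local value over all ranks of comm.
// The real helper lives in common/MPI_Helpers.h.
inline double sumReduce( MPI_Comm comm, double x )
{
    double sum = 0.0;
    MPI_Allreduce( &x, &sum, 1, MPI_DOUBLE, MPI_SUM, comm );
    return sum;
}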
@@ -700,8 +700,7 @@ void ScaLBL_ColorModel::Run(){ } ScaLBL_D3Q19_AAodd_Color(NeighborList, dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); - comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); // *************EVEN TIMESTEP************* timestep++; @@ -736,10 +735,10 @@ void ScaLBL_ColorModel::Run(){ } ScaLBL_D3Q19_AAeven_Color(dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); - comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); //************************************************************************ + MPI_Barrier(comm); PROFILE_STOP("Update"); if (rank==0 && timestep%analysis_interval == 0 && BoundaryCondition > 0){ @@ -980,7 +979,7 @@ void ScaLBL_ColorModel::Run(){ //morph_delta *= (-1.0); REVERSE_FLOW_DIRECTION = false; } - comm.barrier(); + MPI_Barrier(comm); } morph_timesteps += analysis_interval; } @@ -990,7 +989,7 @@ void ScaLBL_ColorModel::Run(){ PROFILE_SAVE("lbpm_color_simulator",1); //************************************************************************ ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep @@ -1035,17 +1034,17 @@ double ScaLBL_ColorModel::ImageInit(std::string Filename){ } } - Count = Dm->Comm.sumReduce( Count ); - PoreCount = Dm->Comm.sumReduce( PoreCount ); + Count=sumReduce( Dm->Comm, Count); + PoreCount=sumReduce( Dm->Comm, PoreCount); if (rank==0) printf(" new saturation: %f (%f / %f) \n", Count / PoreCount, Count, PoreCount); ScaLBL_CopyToDevice(Phi, PhaseLabel, Nx*Ny*Nz*sizeof(double)); - comm.barrier(); + MPI_Barrier(comm); ScaLBL_D3Q19_Init(fq, Np); ScaLBL_PhaseField_Init(dvcMap, Phi, Den, Aq, Bq, 0, ScaLBL_Comm->LastExterior(), Np); ScaLBL_PhaseField_Init(dvcMap, Phi, Den, Aq, Bq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np); - comm.barrier(); + MPI_Barrier(comm); ScaLBL_CopyToHost(Averages->Phi.data(),Phi,Nx*Ny*Nz*sizeof(double)); @@ -1077,7 +1076,7 @@ double ScaLBL_ColorModel::MorphOpenConnected(double target_volume_change){ BlobIDstruct new_index; double vF=0.0; double vS=0.0; ComputeGlobalBlobIDs(nx-2,ny-2,nz-2,Dm->rank_info,phase,Averages->SDs,vF,vS,phase_label,Dm->Comm); - Dm->Comm.barrier(); + MPI_Barrier(Dm->Comm); long long count_connected=0; long long count_porespace=0; @@ -1099,9 +1098,9 @@ double ScaLBL_ColorModel::MorphOpenConnected(double target_volume_change){ } } } - count_connected = Dm->Comm.sumReduce( count_connected); - count_porespace = Dm->Comm.sumReduce( count_porespace); - count_water = Dm->Comm.sumReduce( count_water); + count_connected=sumReduce( Dm->Comm, count_connected); + count_porespace=sumReduce( Dm->Comm, count_porespace); + count_water=sumReduce( Dm->Comm, count_water); for (int k=0; kComm.sumReduce( count_morphopen); + count_morphopen=sumReduce( Dm->Comm, count_morphopen); volume_change = double(count_morphopen - count_connected); if (rank==0) printf(" opening of connected oil %f \n",volume_change/count_connected); @@ -1279,8 +1278,8 @@ double ScaLBL_ColorModel::SeedPhaseField(const double seed_water_in_oil){ mass_loss += random_value*seed_water_in_oil; } - count = Dm->Comm.sumReduce( count ); - mass_loss = Dm->Comm.sumReduce( mass_loss ); + count= sumReduce( Dm->Comm, count); + mass_loss= 
sumReduce( Dm->Comm, mass_loss); if (rank == 0) printf("Remove mass %f from %f voxels \n",mass_loss,count); // Need to initialize Aq, Bq, Den, Phi directly @@ -1317,7 +1316,7 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta } } } - double volume_initial = Dm->Comm.sumReduce( count); + double volume_initial = sumReduce( Dm->Comm, count); /* sprintf(LocalRankFilename,"phi_initial.%05i.raw",rank); FILE *INPUT = fopen(LocalRankFilename,"wb"); @@ -1327,7 +1326,7 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta // 2. Identify connected components of phase field -> phase_label BlobIDstruct new_index; ComputeGlobalBlobIDs(Nx-2,Ny-2,Nz-2,rank_info,phase,Averages->SDs,vF,vS,phase_label,comm); - comm.barrier(); + MPI_Barrier(comm); // only operate on component "0" count = 0.0; @@ -1349,8 +1348,8 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta } } } - double volume_connected = Dm->Comm.sumReduce( count ); - second_biggest = Dm->Comm.sumReduce( second_biggest ); + double volume_connected = sumReduce( Dm->Comm, count); + second_biggest = sumReduce( Dm->Comm, second_biggest); /*int reach_x, reach_y, reach_z; for (int k=0; kComm.sumReduce( count ); + double volume_final= sumReduce( Dm->Comm, count); delta_volume = (volume_final-volume_initial); if (rank == 0) printf("MorphInit: change fluid volume fraction by %f \n", delta_volume/volume_initial); diff --git a/models/ColorModel.h b/models/ColorModel.h index c52f04c3..a3b3a124 100644 --- a/models/ColorModel.h +++ b/models/ColorModel.h @@ -12,13 +12,13 @@ Implementation of color lattice boltzmann model #include "common/Communication.h" #include "analysis/TwoPhase.h" #include "analysis/runAnalysis.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "ProfilerApp.h" #include "threadpool/thread_pool.h" class ScaLBL_ColorModel{ public: - ScaLBL_ColorModel(int RANK, int NP, const Utilities::MPI& COMM); + ScaLBL_ColorModel(int RANK, int NP, MPI_Comm COMM); ~ScaLBL_ColorModel(); // functions in they should be run @@ -68,7 +68,7 @@ public: double *Pressure; private: - Utilities::MPI comm; + MPI_Comm comm; int dist_mem_size; int neighborSize; diff --git a/models/DFHModel.cpp b/models/DFHModel.cpp index ced5853f..4eb03bea 100644 --- a/models/DFHModel.cpp +++ b/models/DFHModel.cpp @@ -3,7 +3,7 @@ color lattice boltzmann model */ #include "models/DFHModel.h" -ScaLBL_DFHModel::ScaLBL_DFHModel(int RANK, int NP, const Utilities::MPI& COMM): +ScaLBL_DFHModel::ScaLBL_DFHModel(int RANK, int NP, MPI_Comm COMM): rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tauA(0),tauB(0),rhoA(0),rhoB(0),alpha(0),beta(0), Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0),inletA(0),inletB(0),outletA(0),outletB(0), Nx(0),Ny(0),Nz(0),N(0),Np(0),nprocx(0),nprocy(0),nprocz(0),BoundaryCondition(0),Lx(0),Ly(0),Lz(0),comm(COMM) @@ -100,16 +100,16 @@ void ScaLBL_DFHModel::ReadParams(string filename){ } void ScaLBL_DFHModel::SetDomain(){ - Dm = std::make_shared(domain_db,comm); // full domain for analysis - Mask = std::make_shared(domain_db,comm); // mask domain removes immobile phases + Dm = std::shared_ptr(new Domain(domain_db,comm)); // full domain for analysis + Mask = std::shared_ptr(new Domain(domain_db,comm)); // mask domain removes immobile phases Nx+=2; Ny+=2; Nz += 2; N = Nx*Ny*Nz; id = new char [N]; - for (int i=0; iid[i] = 1; // initialize this way - Averages = std::make_shared( Dm ); // TwoPhase analysis object - comm.barrier(); + for (int i=0; iid[i] = 1; // 
initialize this way + Averages = std::shared_ptr ( new TwoPhase(Dm) ); // TwoPhase analysis object + MPI_Barrier(comm); Dm->CommInit(); - comm.barrier(); + MPI_Barrier(comm); rank = Dm->rank(); } @@ -131,7 +131,7 @@ void ScaLBL_DFHModel::ReadInput(){ sprintf(LocalRankString,"%05d",rank); sprintf(LocalRankFilename,"%s%s","SignDist.",LocalRankString); ReadBinaryFile(LocalRankFilename, Averages->SDs.data(), N); - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; } @@ -206,7 +206,7 @@ void ScaLBL_DFHModel::Create(){ Map.resize(Nx,Ny,Nz); Map.fill(-2); auto neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Mask->id,Np); - comm.barrier(); + MPI_Barrier(comm); //........................................................................... // MAIN VARIABLES ALLOCATED HERE @@ -424,7 +424,7 @@ void ScaLBL_DFHModel::Initialize(){ } } } - count_wet_global = comm.sumReduce( count_wet ); + MPI_Allreduce(&count_wet,&count_wet_global,1,MPI_DOUBLE,MPI_SUM,comm); if (rank==0) printf("Wetting phase volume fraction =%f \n",count_wet_global/double(Nx*Ny*Nz*nprocs)); // initialize phi based on PhaseLabel (include solid component labels) ScaLBL_CopyToDevice(Phi, PhaseLabel, Np*sizeof(double)); @@ -446,7 +446,7 @@ void ScaLBL_DFHModel::Initialize(){ timestep=0; } } - comm.bcast(×tep,1,0); + MPI_Bcast(×tep,1,MPI_INT,0,comm); // Read in the restart file to CPU buffers double *cPhi = new double[Np]; double *cDist = new double[19*Np]; @@ -468,7 +468,7 @@ void ScaLBL_DFHModel::Initialize(){ ScaLBL_DeviceBarrier(); delete [] cPhi; delete [] cDist; - comm.barrier(); + MPI_Barrier(comm); } if (rank==0) printf ("Initializing phase field \n"); @@ -486,7 +486,7 @@ void ScaLBL_DFHModel::Run(){ //.......create and start timer............ double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); starttime = MPI_Wtime(); //......................................... 
//************ MAIN ITERATION LOOP ***************************************/ @@ -532,8 +532,7 @@ void ScaLBL_DFHModel::Run(){ } ScaLBL_D3Q19_AAodd_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, SolidPotential, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); - comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); // *************EVEN TIMESTEP************* timestep++; @@ -569,9 +568,9 @@ void ScaLBL_DFHModel::Run(){ } ScaLBL_D3Q19_AAeven_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, SolidPotential, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); - comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); //************************************************************************ + MPI_Barrier(comm); PROFILE_STOP("Update"); // Run the analysis @@ -582,7 +581,7 @@ void ScaLBL_DFHModel::Run(){ PROFILE_SAVE("lbpm_color_simulator",1); //************************************************************************ ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep diff --git a/models/DFHModel.h b/models/DFHModel.h index 00e6e6b3..883ec6f8 100644 --- a/models/DFHModel.h +++ b/models/DFHModel.h @@ -12,13 +12,13 @@ Implementation of color lattice boltzmann model #include "common/Communication.h" #include "analysis/TwoPhase.h" #include "analysis/runAnalysis.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "ProfilerApp.h" #include "threadpool/thread_pool.h" class ScaLBL_DFHModel{ public: - ScaLBL_DFHModel(int RANK, int NP, const Utilities::MPI& COMM); + ScaLBL_DFHModel(int RANK, int NP, MPI_Comm COMM); ~ScaLBL_DFHModel(); // functions in they should be run @@ -66,7 +66,7 @@ public: double *Pressure; private: - Utilities::MPI comm; + MPI_Comm comm; int dist_mem_size; int neighborSize; diff --git a/models/MRTModel.cpp b/models/MRTModel.cpp index 23925930..c1db7c1c 100644 --- a/models/MRTModel.cpp +++ b/models/MRTModel.cpp @@ -5,7 +5,7 @@ #include "analysis/distance.h" #include "common/ReadMicroCT.h" -ScaLBL_MRTModel::ScaLBL_MRTModel(int RANK, int NP, const Utilities::MPI& COMM): +ScaLBL_MRTModel::ScaLBL_MRTModel(int RANK, int NP, MPI_Comm COMM): rank(RANK), nprocs(NP), Restart(0),timestep(0),timestepMax(0),tau(0), Fx(0),Fy(0),Fz(0),flux(0),din(0),dout(0),mu(0), Nx(0),Ny(0),Nz(0),N(0),Np(0),nprocx(0),nprocy(0),nprocz(0),BoundaryCondition(0),Lx(0),Ly(0),Lz(0),comm(COMM) @@ -83,9 +83,9 @@ void ScaLBL_MRTModel::SetDomain(){ for (int i=0; iid[i] = 1; // initialize this way //Averages = std::shared_ptr ( new TwoPhase(Dm) ); // TwoPhase analysis object - comm.barrier(); + MPI_Barrier(comm); Dm->CommInit(); - comm.barrier(); + MPI_Barrier(comm); rank = Dm->rank(); nprocx = Dm->nprocx(); @@ -171,7 +171,7 @@ void ScaLBL_MRTModel::Create(){ Map.resize(Nx,Ny,Nz); Map.fill(-2); auto neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Mask->id,Np); - comm.barrier(); + MPI_Barrier(comm); //........................................................................... // MAIN VARIABLES ALLOCATED HERE //........................................................................... 
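The ScaLBL_MRTModel::Run() hunks that follow swap the wrapper reductions for direct MPI_Allreduce calls when averaging the velocity field. A minimal sketch of that reduction pattern, with the accumulation over pore voxels elided and variable names taken from the diff:

#include <mpi.h>

// Sketch: reduce per-rank partial velocity sums and the pore-voxel count,
// then form the global mean velocity components.
static void averageVelocity( MPI_Comm comm,
                             double vax_loc, double vay_loc, double vaz_loc,
                             double count_loc,
                             double &vax, double &vay, double &vaz )
{
    double count = 0.0;
    MPI_Allreduce( &vax_loc,   &vax,   1, MPI_DOUBLE, MPI_SUM, comm );
    MPI_Allreduce( &vay_loc,   &vay,   1, MPI_DOUBLE, MPI_SUM, comm );
    MPI_Allreduce( &vaz_loc,   &vaz,   1, MPI_DOUBLE, MPI_SUM, comm );
    MPI_Allreduce( &count_loc, &count, 1, MPI_DOUBLE, MPI_SUM, comm );
    vax /= count;
    vay /= count;
    vaz /= count;
}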
@@ -190,7 +190,7 @@ void ScaLBL_MRTModel::Create(){ if (rank==0) printf ("Setting up device map and neighbor list \n"); // copy the neighbor list ScaLBL_CopyToDevice(NeighborList, neighborList, neighborSize); - comm.barrier(); + MPI_Barrier(comm); } @@ -225,8 +225,7 @@ void ScaLBL_MRTModel::Run(){ //.......create and start timer............ double starttime,stoptime,cputime; - ScaLBL_DeviceBarrier(); - comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); starttime = MPI_Wtime(); if (rank==0) printf("Beginning AA timesteps, timestepMax = %i \n", timestepMax); if (rank==0) printf("********************************************************\n"); @@ -240,21 +239,18 @@ void ScaLBL_MRTModel::Run(){ ScaLBL_D3Q19_AAodd_MRT(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAodd_MRT(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); - comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); timestep++; ScaLBL_Comm->SendD3Q19AA(fq); //READ FORM NORMAL ScaLBL_D3Q19_AAeven_MRT(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAeven_MRT(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); - comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); //************************************************************************/ if (timestep%1000==0){ ScaLBL_D3Q19_Momentum(fq,Velocity, Np); - ScaLBL_DeviceBarrier(); - comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); ScaLBL_Comm->RegularLayout(Map,&Velocity[0],Velocity_x); ScaLBL_Comm->RegularLayout(Map,&Velocity[Np],Velocity_y); ScaLBL_Comm->RegularLayout(Map,&Velocity[2*Np],Velocity_z); @@ -276,10 +272,10 @@ void ScaLBL_MRTModel::Run(){ } } } - vax = Mask->Comm.sumReduce( vax_loc ); - vay = Mask->Comm.sumReduce( vay_loc ); - vaz = Mask->Comm.sumReduce( vaz_loc ); - count = Mask->Comm.sumReduce( count_loc ); + MPI_Allreduce(&vax_loc,&vax,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&vay_loc,&vay,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&vaz_loc,&vaz,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&count_loc,&count,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); vax /= count; vay /= count; @@ -309,10 +305,10 @@ void ScaLBL_MRTModel::Run(){ double As = Morphology.A(); double Hs = Morphology.H(); double Xs = Morphology.X(); - Vs = Dm->Comm.sumReduce( Vs); - As = Dm->Comm.sumReduce( As); - Hs = Dm->Comm.sumReduce( Hs); - Xs = Dm->Comm.sumReduce( Xs); + Vs=sumReduce( Dm->Comm, Vs); + As=sumReduce( Dm->Comm, As); + Hs=sumReduce( Dm->Comm, Hs); + Xs=sumReduce( Dm->Comm, Xs); double h = Dm->voxel_length; double absperm = h*h*mu*Mask->Porosity()*flow_rate / force_mag; if (rank==0) { @@ -346,8 +342,7 @@ void ScaLBL_MRTModel::VelocityField(){ /* Minkowski Morphology(Mask); int SIZE=Np*sizeof(double); ScaLBL_D3Q19_Momentum(fq,Velocity, Np); - ScaLBL_DeviceBarrier();. 
- comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); ScaLBL_CopyToHost(&VELOCITY[0],&Velocity[0],3*SIZE); memcpy(Morphology.SDn.data(), Distance.data(), Nx*Ny*Nz*sizeof(double)); @@ -374,10 +369,10 @@ void ScaLBL_MRTModel::VelocityField(){ vaz_loc += VELOCITY[2*Np+n]; count_loc+=1.0; } - vax = Mask->Comm.sumReduce( vax_loc ); - vay = Mask->Comm.sumReduce( vay_loc ); - vaz = Mask->Comm.sumReduce( vaz_loc ); - count = Mask->Comm.sumReduce( count_loc ); + MPI_Allreduce(&vax_loc,&vax,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&vay_loc,&vay,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&vaz_loc,&vaz,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); + MPI_Allreduce(&count_loc,&count,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); vax /= count; vay /= count; diff --git a/models/MRTModel.h b/models/MRTModel.h index 7e23cc44..aa4ee1f0 100644 --- a/models/MRTModel.h +++ b/models/MRTModel.h @@ -11,13 +11,13 @@ #include "common/ScaLBL.h" #include "common/Communication.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "analysis/Minkowski.h" #include "ProfilerApp.h" class ScaLBL_MRTModel{ public: - ScaLBL_MRTModel(int RANK, int NP, const Utilities::MPI& COMM); + ScaLBL_MRTModel(int RANK, int NP, MPI_Comm COMM); ~ScaLBL_MRTModel(); // functions in they should be run @@ -63,7 +63,7 @@ public: DoubleArray Velocity_y; DoubleArray Velocity_z; private: - Utilities::MPI comm; + MPI_Comm comm; // filenames char LocalRankString[8]; diff --git a/tests/BlobAnalyzeParallel.cpp b/tests/BlobAnalyzeParallel.cpp index 48e9e230..c9e3f8fc 100644 --- a/tests/BlobAnalyzeParallel.cpp +++ b/tests/BlobAnalyzeParallel.cpp @@ -100,10 +100,11 @@ inline void WriteBlobStates(TwoPhase TCAT, double D, double porosity){ int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); Utilities::setAbortBehavior( true, true, true ); Utilities::setErrorHandlers(); PROFILE_ENABLE(0); @@ -136,7 +137,7 @@ int main(int argc, char **argv) domain >> Ly; domain >> Lz; } - comm.barrier(); + MPI_Barrier(comm); // Computational domain MPI_Bcast(&nx,1,MPI_INT,0,comm); MPI_Bcast(&ny,1,MPI_INT,0,comm); @@ -149,7 +150,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. - comm.barrier(); + MPI_Barrier(comm); // Check that the number of processors >= the number of ranks if ( rank==0 ) { @@ -208,7 +209,7 @@ int main(int argc, char **argv) // WriteLocalSolidID(LocalRankFilename, id, N); sprintf(LocalRankFilename,"%s%s","SignDist.",LocalRankString); ReadBinaryFile(LocalRankFilename, Averages.SDs.get(), N); - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; //....................................................................... //copies of data needed to perform checkpointing from cpu @@ -220,7 +221,7 @@ int main(int argc, char **argv) if (rank==0) printf("Reading restart file! \n"); // Read in the restart file to CPU buffers ReadCheckpoint(LocalRestartFile, Den, DistEven, DistOdd, N); - comm.barrier(); + MPI_Barrier(comm); //......................................................................... 
// Populate the arrays needed to perform averaging if (rank==0) printf("Populate arrays \n"); @@ -328,14 +329,14 @@ int main(int argc, char **argv) // BlobContainer Blobs; DoubleArray RecvBuffer(dimx); // MPI_Allreduce(&Averages.BlobAverages.get(),&Blobs.get(),1,MPI_DOUBLE,MPI_SUM,Dm.Comm); - comm.barrier(); + MPI_Barrier(comm); if (rank==0) printf("Number of components is %i \n",dimy); for (int b=0; b 0.0){ double Vn,pn,awn,ans,Jwn,Kwn,lwns,cwns,trawn,trJwn; @@ -481,7 +482,7 @@ int main(int argc, char **argv) fclose(BLOBS);*/ PROFILE_STOP("main"); PROFILE_SAVE("BlobIdentifyParallel",false); - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return 0; } diff --git a/tests/BlobIdentifyParallel.cpp b/tests/BlobIdentifyParallel.cpp index b8929a11..f93371cb 100644 --- a/tests/BlobIdentifyParallel.cpp +++ b/tests/BlobIdentifyParallel.cpp @@ -47,10 +47,11 @@ void readRankData( int proc, int nx, int ny, int nz, DoubleArray& Phase, DoubleA int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); #ifdef PROFILE PROFILE_ENABLE(0); PROFILE_DISABLE_TRACE(); @@ -128,7 +129,7 @@ int main(int argc, char **argv) PROFILE_STOP("main"); PROFILE_SAVE("BlobIdentifyParallel",false); #endif - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return 0; } diff --git a/tests/ColorToBinary.cpp b/tests/ColorToBinary.cpp index fae156d1..7ac740bc 100644 --- a/tests/ColorToBinary.cpp +++ b/tests/ColorToBinary.cpp @@ -114,10 +114,11 @@ inline void ReadFromRank(char *FILENAME, DoubleArray &Phase, int nx, int ny, int int main(int argc, char **argv) { // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); printf("----------------------------------------------------------\n"); printf("Creating single Binary file from restart (8-bit integer)\n"); @@ -275,7 +276,7 @@ int main(int argc, char **argv) */ // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** } diff --git a/tests/ComponentLabel.cpp b/tests/ComponentLabel.cpp index 624ce8f4..07ef6555 100644 --- a/tests/ComponentLabel.cpp +++ b/tests/ComponentLabel.cpp @@ -119,10 +119,11 @@ inline void ReadFromRank(char *FILENAME, DoubleArray &Phase, DoubleArray &Pressu int main(int argc, char **argv) { // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); printf("----------------------------------------------------------\n"); printf("COMPUTING TCAT ANALYSIS FOR NON-WETTING PHASE FEATURES \n"); @@ -432,7 +433,7 @@ int main(int argc, char **argv) fclose(DISTANCE); */ // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** } diff --git a/tests/GenerateSphereTest.cpp b/tests/GenerateSphereTest.cpp index 43434092..d1917619 100644 --- a/tests/GenerateSphereTest.cpp +++ b/tests/GenerateSphereTest.cpp @@ -9,7 +9,7 @@ //#include 
"common/pmmc.h" #include "common/Domain.h" #include "common/SpherePack.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Communication.h" /* @@ -70,8 +70,8 @@ inline void MorphOpen(DoubleArray SignDist, char *id, Domain &Dm, int nx, int ny } } // total Global is the number of nodes in the pore-space - totalGlobal = Dm.Comm.sumReduce( count ); - maxdistGlobal = Dm.Comm.sumReduce( maxdist ); + MPI_Allreduce(&count,&totalGlobal,1,MPI_DOUBLE,MPI_SUM,Dm.Comm); + MPI_Allreduce(&maxdist,&maxdistGlobal,1,MPI_DOUBLE,MPI_MAX,Dm.Comm); double volume=double(nprocx*nprocy*nprocz)*double(nx-2)*double(ny-2)*double(nz-2); double porosity=totalGlobal/volume; if (rank==0) printf("Media Porosity: %f \n",porosity); @@ -148,6 +148,7 @@ inline void MorphOpen(DoubleArray SignDist, char *id, Domain &Dm, int nx, int ny double Rcrit_old=0.0; double Rcrit_new=0.0; + double GlobalNumber = 1.f; int imin,jmin,kmin,imax,jmax,kmax; Rcrit_new = maxdistGlobal; @@ -214,41 +215,41 @@ inline void MorphOpen(DoubleArray SignDist, char *id, Domain &Dm, int nx, int ny PackID(Dm.sendList_YZ, Dm.sendCount_YZ ,sendID_YZ, id); //...................................................................................... MPI_Sendrecv(sendID_x,Dm.sendCount_x,MPI_CHAR,Dm.rank_x(),sendtag, - recvID_X,Dm.recvCount_X,MPI_CHAR,Dm.rank_X(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_X,Dm.recvCount_X,MPI_CHAR,Dm.rank_X(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_X,Dm.sendCount_X,MPI_CHAR,Dm.rank_X(),sendtag, - recvID_x,Dm.recvCount_x,MPI_CHAR,Dm.rank_x(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_x,Dm.recvCount_x,MPI_CHAR,Dm.rank_x(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_y,Dm.sendCount_y,MPI_CHAR,Dm.rank_y(),sendtag, - recvID_Y,Dm.recvCount_Y,MPI_CHAR,Dm.rank_Y(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_Y,Dm.recvCount_Y,MPI_CHAR,Dm.rank_Y(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Y,Dm.sendCount_Y,MPI_CHAR,Dm.rank_Y(),sendtag, - recvID_y,Dm.recvCount_y,MPI_CHAR,Dm.rank_y(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_y,Dm.recvCount_y,MPI_CHAR,Dm.rank_y(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_z,Dm.sendCount_z,MPI_CHAR,Dm.rank_z(),sendtag, - recvID_Z,Dm.recvCount_Z,MPI_CHAR,Dm.rank_Z(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_Z,Dm.recvCount_Z,MPI_CHAR,Dm.rank_Z(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Z,Dm.sendCount_Z,MPI_CHAR,Dm.rank_Z(),sendtag, - recvID_z,Dm.recvCount_z,MPI_CHAR,Dm.rank_z(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_z,Dm.recvCount_z,MPI_CHAR,Dm.rank_z(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xy,Dm.sendCount_xy,MPI_CHAR,Dm.rank_xy(),sendtag, - recvID_XY,Dm.recvCount_XY,MPI_CHAR,Dm.rank_XY(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_XY,Dm.recvCount_XY,MPI_CHAR,Dm.rank_XY(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_XY,Dm.sendCount_XY,MPI_CHAR,Dm.rank_XY(),sendtag, - recvID_xy,Dm.recvCount_xy,MPI_CHAR,Dm.rank_xy(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_xy,Dm.recvCount_xy,MPI_CHAR,Dm.rank_xy(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Xy,Dm.sendCount_Xy,MPI_CHAR,Dm.rank_Xy(),sendtag, - recvID_xY,Dm.recvCount_xY,MPI_CHAR,Dm.rank_xY(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_xY,Dm.recvCount_xY,MPI_CHAR,Dm.rank_xY(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); 
MPI_Sendrecv(sendID_xY,Dm.sendCount_xY,MPI_CHAR,Dm.rank_xY(),sendtag, - recvID_Xy,Dm.recvCount_Xy,MPI_CHAR,Dm.rank_Xy(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_Xy,Dm.recvCount_Xy,MPI_CHAR,Dm.rank_Xy(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xz,Dm.sendCount_xz,MPI_CHAR,Dm.rank_xz(),sendtag, - recvID_XZ,Dm.recvCount_XZ,MPI_CHAR,Dm.rank_XZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_XZ,Dm.recvCount_XZ,MPI_CHAR,Dm.rank_XZ(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_XZ,Dm.sendCount_XZ,MPI_CHAR,Dm.rank_XZ(),sendtag, - recvID_xz,Dm.recvCount_xz,MPI_CHAR,Dm.rank_xz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_xz,Dm.recvCount_xz,MPI_CHAR,Dm.rank_xz(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Xz,Dm.sendCount_Xz,MPI_CHAR,Dm.rank_Xz(),sendtag, - recvID_xZ,Dm.recvCount_xZ,MPI_CHAR,Dm.rank_xZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_xZ,Dm.recvCount_xZ,MPI_CHAR,Dm.rank_xZ(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_xZ,Dm.sendCount_xZ,MPI_CHAR,Dm.rank_xZ(),sendtag, - recvID_Xz,Dm.recvCount_Xz,MPI_CHAR,Dm.rank_Xz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_Xz,Dm.recvCount_Xz,MPI_CHAR,Dm.rank_Xz(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_yz,Dm.sendCount_yz,MPI_CHAR,Dm.rank_yz(),sendtag, - recvID_YZ,Dm.recvCount_YZ,MPI_CHAR,Dm.rank_YZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_YZ,Dm.recvCount_YZ,MPI_CHAR,Dm.rank_YZ(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_YZ,Dm.sendCount_YZ,MPI_CHAR,Dm.rank_YZ(),sendtag, - recvID_yz,Dm.recvCount_yz,MPI_CHAR,Dm.rank_yz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_yz,Dm.recvCount_yz,MPI_CHAR,Dm.rank_yz(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_Yz,Dm.sendCount_Yz,MPI_CHAR,Dm.rank_Yz(),sendtag, - recvID_yZ,Dm.recvCount_yZ,MPI_CHAR,Dm.rank_yZ(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_yZ,Dm.recvCount_yZ,MPI_CHAR,Dm.rank_yZ(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); MPI_Sendrecv(sendID_yZ,Dm.sendCount_yZ,MPI_CHAR,Dm.rank_yZ(),sendtag, - recvID_Yz,Dm.recvCount_Yz,MPI_CHAR,Dm.rank_Yz(),recvtag,Dm.Comm.getCommunicator(),MPI_STATUS_IGNORE); + recvID_Yz,Dm.recvCount_Yz,MPI_CHAR,Dm.rank_Yz(),recvtag,Dm.Comm,MPI_STATUS_IGNORE); //...................................................................................... UnpackID(Dm.recvList_x, Dm.recvCount_x ,recvID_x, id); UnpackID(Dm.recvList_X, Dm.recvCount_X ,recvID_X, id); @@ -270,7 +271,7 @@ inline void MorphOpen(DoubleArray SignDist, char *id, Domain &Dm, int nx, int ny UnpackID(Dm.recvList_YZ, Dm.recvCount_YZ ,recvID_YZ, id); //...................................................................................... - //double GlobalNumber = Dm.Comm.sumReduce( LocalNumber ); + MPI_Allreduce(&LocalNumber,&GlobalNumber,1,MPI_DOUBLE,MPI_SUM,Dm.Comm); count = 0.f; for (int k=1; k= the number of ranks if ( rank==0 ) { @@ -253,14 +254,14 @@ int main(int argc, char **argv) cz[0]=0.25*Lz; cx[1]=0.75*Lz; cx[2]=0.25*Lz; cx[3]=0.25*Lz; rad[0]=rad[1]=rad[2]=rad[3]=0.1*Lx; - comm.barrier(); + MPI_Barrier(comm); // Broadcast the sphere packing to all processes MPI_Bcast(cx,nspheres,MPI_DOUBLE,0,comm); MPI_Bcast(cy,nspheres,MPI_DOUBLE,0,comm); MPI_Bcast(cz,nspheres,MPI_DOUBLE,0,comm); MPI_Bcast(rad,nspheres,MPI_DOUBLE,0,comm); //........................................................................... 
- comm.barrier(); + MPI_Barrier(comm); //....................................................................... SignedDistance(Averages.Phase.data(),nspheres,cx,cy,cz,rad,Lx,Ly,Lz,Nx,Ny,Nz, Dm->iproc(),Dm->jproc(),Dm->kproc(),Dm->nprocx(),Dm->nprocy(),Dm->nprocz()); @@ -316,7 +317,7 @@ int main(int argc, char **argv) delete [] rad; } // Limit scope so variables that contain communicators will free before MPI_Finialize - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return 0; } diff --git a/tests/TestBlobIdentify.cpp b/tests/TestBlobIdentify.cpp index 7eb5c270..ccfc6afc 100644 --- a/tests/TestBlobIdentify.cpp +++ b/tests/TestBlobIdentify.cpp @@ -23,19 +23,21 @@ inline double rand2() // Test if all ranks agree on a value -bool allAgree( int x, const Utilities::MPI& comm ) { +bool allAgree( int x, MPI_Comm comm ) { int x2 = x; - comm.bcast(&x2,1,0); + MPI_Bcast(&x2,1,MPI_INT,0,comm); int diff = x==x2 ? 0:1; - int diff2 = comm.sumReduce( diff ); + int diff2 = 0; + MPI_Allreduce(&diff,&diff2,1,MPI_INT,MPI_SUM,comm); return diff2==0; } template -bool allAgree( const std::vector& x, const Utilities::MPI& comm ) { +bool allAgree( const std::vector& x, MPI_Comm comm ) { std::vector x2 = x; - comm.bcast(&x2[0],x.size()*sizeof(T)/sizeof(int),0); + MPI_Bcast(&x2[0],x.size()*sizeof(T)/sizeof(int),MPI_INT,0,comm); int diff = x==x2 ? 0:1; - int diff2 = comm.sumReduce( diff ); + int diff2 = 0; + MPI_Allreduce(&diff,&diff2,1,MPI_INT,MPI_SUM,comm); return diff2==0; } @@ -72,9 +74,9 @@ struct bubble_struct { // Create a random set of bubles -std::vector create_bubbles( int N_bubbles, double Lx, double Ly, double Lz, const Utilities::MPI& comm ) +std::vector create_bubbles( int N_bubbles, double Lx, double Ly, double Lz, MPI_Comm comm ) { - int rank = comm.getRank(); + int rank = comm_rank(comm); std::vector bubbles(N_bubbles); if ( rank == 0 ) { double R0 = 0.2*Lx*Ly*Lz/pow((double)N_bubbles,0.333); @@ -89,7 +91,7 @@ std::vector create_bubbles( int N_bubbles, double Lx, double Ly, } } size_t N_bytes = N_bubbles*sizeof(bubble_struct); - comm.bcast((char*)&bubbles[0],N_bytes,0); + MPI_Bcast((char*)&bubbles[0],N_bytes,MPI_CHAR,0,comm); return bubbles; } @@ -122,7 +124,7 @@ void fillBubbleData( const std::vector& bubbles, DoubleArray& Pha // Shift all of the data by the given number of cells -void shift_data( DoubleArray& data, int sx, int sy, int sz, const RankInfoStruct& rank_info, const Utilities::MPI& comm ) +void shift_data( DoubleArray& data, int sx, int sy, int sz, const RankInfoStruct& rank_info, MPI_Comm comm ) { int nx = data.size(0)-2; int ny = data.size(1)-2; @@ -152,10 +154,11 @@ void shift_data( DoubleArray& data, int sx, int sy, int sz, const RankInfoStruct int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); PROFILE_ENABLE(1); PROFILE_DISABLE_TRACE(); PROFILE_SYNCHRONIZE(); @@ -294,7 +297,7 @@ int main(int argc, char **argv) velocity[i].z = bubbles[i].radius*(2*rand2()-1); } } - comm.bcast((char*)&velocity[0],bubbles.size()*sizeof(Point),0); + MPI_Bcast((char*)&velocity[0],bubbles.size()*sizeof(Point),MPI_CHAR,0,comm); fillBubbleData( bubbles, Phase, SignDist, Lx, Ly, Lz, rank_info ); fillData.fill(Phase); fillData.fill(SignDist); @@ -388,8 +391,8 @@ int main(int argc, char **argv) printf("\n"); } } - comm.bcast(&N1,1,0); - comm.bcast(&N2,1,0); + 
MPI_Bcast(&N1,1,MPI_INT,0,comm); + MPI_Bcast(&N2,1,MPI_INT,0,comm); if ( N1!=nblobs || N2!=nblobs2 ) { if ( rank==0 ) printf("Error, blob ids do not map in moving bubble test (%i,%i,%i,%i)\n", @@ -409,7 +412,7 @@ int main(int argc, char **argv) // Finished PROFILE_STOP("main"); PROFILE_SAVE("TestBlobIdentify",false); - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return N_errors; } diff --git a/tests/TestBlobIdentifyCorners.cpp b/tests/TestBlobIdentifyCorners.cpp index 904e52e0..4795f610 100644 --- a/tests/TestBlobIdentifyCorners.cpp +++ b/tests/TestBlobIdentifyCorners.cpp @@ -18,9 +18,10 @@ int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm_rank(MPI_COMM_WORLD,&rank); + MPI_Comm_size(MPI_COMM_WORLD,&nprocs); /*if ( nprocs != 8 ) { printf("This tests requires 8 processors\n"); return -1; diff --git a/tests/TestBubble.cpp b/tests/TestBubble.cpp index e7e0ced8..c03e5dea 100644 --- a/tests/TestBubble.cpp +++ b/tests/TestBubble.cpp @@ -7,7 +7,7 @@ #include "analysis/pmmc.h" #include "common/ScaLBL.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Communication.h" #include "IO/Mesh.h" #include "IO/Writer.h" @@ -32,15 +32,14 @@ int main(int argc, char **argv) // Initialize MPI int provided_thread_support = -1; MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE,&provided_thread_support); - + MPI_Comm comm; + MPI_Comm_dup(MPI_COMM_WORLD,&comm); + int rank = comm_rank(comm); + int nprocs = comm_size(comm); + if ( rank==0 && provided_thread_support(domain_db,comm); - comm.barrier(); + MPI_Barrier(comm); Nx+=2; Ny+=2; Nz += 2; int N = Nx*Ny*Nz; @@ -249,7 +250,7 @@ int main(int argc, char **argv) IntArray Map(Nx,Ny,Nz); auto neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Mask->id,Np); - comm.barrier(); + MPI_Barrier(comm); //........................................................................... // MAIN VARIABLES ALLOCATED HERE @@ -386,7 +387,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); starttime = MPI_Wtime(); //......................................... 
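The updated TestBubble above duplicates MPI_COMM_WORLD rather than using it directly, and frees the copy before MPI_Finalize so that no communicator outlives the MPI runtime. A minimal sketch of that lifecycle, assuming comm_rank and comm_size are the small helpers provided by common/MPI_Helpers.h:

#include <mpi.h>
#include "common/MPI_Helpers.h"   // assumed to declare comm_rank / comm_size

// Sketch of the communicator lifecycle adopted by the updated tests:
// duplicate the world communicator, use the copy inside a scope, and
// free it before MPI_Finalize.
int main( int argc, char **argv )
{
    int provided = -1;
    MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &provided );
    {
        MPI_Comm comm;
        MPI_Comm_dup( MPI_COMM_WORLD, &comm );
        int rank   = comm_rank( comm );   // helper from MPI_Helpers.h
        int nprocs = comm_size( comm );   // helper from MPI_Helpers.h
        // ... run the test body using comm, rank, nprocs ...
        MPI_Barrier( comm );
        MPI_Comm_free( &comm );           // free before MPI_Finalize
    }
    MPI_Finalize();
    return 0;
}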
@@ -436,7 +437,7 @@ int main(int argc, char **argv) } ScaLBL_D3Q19_AAodd_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, SolidPotential, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->next, Np); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); // *************EVEN TIMESTEP************* timestep++; @@ -472,9 +473,9 @@ int main(int argc, char **argv) } ScaLBL_D3Q19_AAeven_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, SolidPotential, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->next, Np); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); //************************************************************************ - comm.barrier(); + MPI_Barrier(comm); PROFILE_STOP("Update"); // Run the analysis @@ -486,7 +487,7 @@ int main(int argc, char **argv) PROFILE_SAVE("lbpm_color_simulator",1); //************************************************************************ ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep @@ -546,8 +547,9 @@ int main(int argc, char **argv) PROFILE_STOP("Main"); PROFILE_SAVE("lbpm_color_simulator",1); // **************************************************** - comm.barrier(); + MPI_Barrier(comm); } // Limit scope so variables that contain communicators will free before MPI_Finialize + MPI_Comm_free(&comm); MPI_Finalize(); return check; } diff --git a/tests/TestColorBubble.cpp b/tests/TestColorBubble.cpp index 1f42a71e..0e6ea25a 100644 --- a/tests/TestColorBubble.cpp +++ b/tests/TestColorBubble.cpp @@ -7,7 +7,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "models/ColorModel.h" using namespace std; @@ -64,11 +64,15 @@ inline void InitializeBubble(ScaLBL_ColorModel &ColorModel, double BubbleRadius) //*************************************************************************************** int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); int check=0; { if (rank == 0){ @@ -93,7 +97,7 @@ int main(int argc, char **argv) ColorModel.WriteDebug(); } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** diff --git a/tests/TestColorGrad.cpp b/tests/TestColorGrad.cpp index df1c1daf..5cd6d924 100644 --- a/tests/TestColorGrad.cpp +++ b/tests/TestColorGrad.cpp @@ -7,7 +7,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" using namespace std; @@ -15,11 +15,15 @@ using namespace std; //*************************************************************************************** int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = 
MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); int check; { // parallel domain size (# of sub-domains) @@ -112,7 +116,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - comm.barrier(); + MPI_Barrier(comm); //................................................. MPI_Bcast(&Nx,1,MPI_INT,0,comm); MPI_Bcast(&Ny,1,MPI_INT,0,comm); @@ -125,7 +129,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. - comm.barrier(); + MPI_Barrier(comm); // ************************************************************** // ************************************************************** @@ -142,7 +146,7 @@ int main(int argc, char **argv) printf("********************************************************\n"); } - comm.barrier(); + MPI_Barrier(comm); double iVol_global = 1.0/Nx/Ny/Nz/nprocx/nprocy/nprocz; int BoundaryCondition=0; @@ -171,7 +175,7 @@ int main(int argc, char **argv) } } Dm.CommInit(); - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; if (rank==0) printf ("Create ScaLBL_Communicator \n"); @@ -188,7 +192,7 @@ int main(int argc, char **argv) neighborList= new int[18*Np]; ScaLBL_Comm.MemoryOptimizedLayoutAA(Map,neighborList,Dm.id,Np); - comm.barrier(); + MPI_Barrier(comm); //......................device distributions................................. int dist_mem_size = Np*sizeof(double); @@ -256,7 +260,7 @@ int main(int argc, char **argv) } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** diff --git a/tests/TestColorGradDFH.cpp b/tests/TestColorGradDFH.cpp index b04aebce..d6376d82 100644 --- a/tests/TestColorGradDFH.cpp +++ b/tests/TestColorGradDFH.cpp @@ -7,7 +7,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" using namespace std; @@ -25,11 +25,15 @@ std::shared_ptr loadInputs( int nprocs ) //*************************************************************************************** int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); int check=0; { // parallel domain size (# of sub-domains) @@ -78,7 +82,7 @@ int main(int argc, char **argv) } } Dm->CommInit(); - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; if (rank==0) printf ("Create ScaLBL_Communicator \n"); @@ -101,7 +105,7 @@ int main(int argc, char **argv) IntArray Map(Nx,Ny,Nz); neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Dm->id,Np); - comm.barrier(); + MPI_Barrier(comm); //......................device distributions................................. 
int neighborSize=18*Np*sizeof(int); @@ -207,7 +211,7 @@ int main(int argc, char **argv) } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** diff --git a/tests/TestColorMassBounceback.cpp b/tests/TestColorMassBounceback.cpp index 78508f9b..c05c245e 100644 --- a/tests/TestColorMassBounceback.cpp +++ b/tests/TestColorMassBounceback.cpp @@ -7,7 +7,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" using namespace std; @@ -15,11 +15,15 @@ using namespace std; //*************************************************************************************** int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); int check=0; { // parallel domain size (# of sub-domains) @@ -38,7 +42,7 @@ int main(int argc, char **argv) // Initialize compute device // int device=ScaLBL_SetDevice(rank); ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); Utilities::setErrorHandlers(); // Variables that specify the computational domain @@ -73,7 +77,7 @@ int main(int argc, char **argv) // Get the rank info const RankInfoStruct rank_info(rank,nprocx,nprocy,nprocz); - comm.barrier(); + MPI_Barrier(comm); if (nprocs != nprocx*nprocy*nprocz){ printf("nprocx = %i \n",nprocx); @@ -117,7 +121,7 @@ int main(int argc, char **argv) std::shared_ptr Dm(new Domain(domain_db,comm)); for (int i=0; iNx*Dm->Ny*Dm->Nz; i++) Dm->id[i] = 1; Dm->CommInit(); - comm.barrier(); + MPI_Barrier(comm); Nx+=2; Ny+=2; Nz += 2; int N = Nx*Ny*Nz; @@ -149,7 +153,7 @@ int main(int argc, char **argv) } } Dm->CommInit(); - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; if (rank==0) printf ("Create ScaLBL_Communicator \n"); @@ -166,7 +170,7 @@ int main(int argc, char **argv) Npad=Np+32; neighborList= new int[18*Npad]; Np=ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Dm->id,Np); - comm.barrier(); + MPI_Barrier(comm); //......................device distributions................................. 
int dist_mem_size = Np*sizeof(double); @@ -268,7 +272,7 @@ int main(int argc, char **argv) ScaLBL_D3Q19_AAodd_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); timestep++; @@ -328,7 +332,7 @@ int main(int argc, char **argv) ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAeven_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); timestep++; //************************************************************************ printf("Check after even time \n"); @@ -411,7 +415,7 @@ int main(int argc, char **argv) ScaLBL_D3Q19_AAodd_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); timestep++; @@ -472,7 +476,7 @@ int main(int argc, char **argv) ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAeven_DFH(NeighborList, fq, Aq, Bq, Den, Phi, Gradient, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); timestep++; //************************************************************************ printf("Check after even time \n"); @@ -519,7 +523,7 @@ int main(int argc, char **argv) } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** return check; diff --git a/tests/TestColorSquareTube.cpp b/tests/TestColorSquareTube.cpp index cf8a9566..9807f0e8 100644 --- a/tests/TestColorSquareTube.cpp +++ b/tests/TestColorSquareTube.cpp @@ -7,7 +7,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "models/ColorModel.h" std::shared_ptr loadInputs( int nprocs ) @@ -84,11 +84,15 @@ void InitializeSquareTube(ScaLBL_ColorModel &ColorModel){ //*************************************************************************************** int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); int check=0; { if (rank == 0){ @@ -109,7 +113,7 @@ int main(int argc, char **argv) } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** diff --git a/tests/TestCommD3Q19.cpp b/tests/TestCommD3Q19.cpp index d2799355..e1fa821f 100644 --- a/tests/TestCommD3Q19.cpp +++ b/tests/TestCommD3Q19.cpp @@ -6,7 +6,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" using namespace std; @@ -164,10 +164,11 @@ inline void UnpackID(int *list, int count, char *recvbuf, char *ID){ int main(int argc, char **argv) { // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD 
); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); int check; { @@ -262,14 +263,14 @@ int main(int argc, char **argv) } } } - sum = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,comm); double iVol_global=1.f/double((Nx-2)*(Ny-2)*(Nz-2)*nprocx*nprocy*nprocz); porosity = 1.0-sum*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); //....................................................................... //........................................................................... - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; //........................................................................... @@ -284,7 +285,7 @@ int main(int argc, char **argv) IntArray Map(Nx,Ny,Nz); Map.fill(-2); Np = ScaLBL_Comm.MemoryOptimizedLayoutAA(Map,neighborList,Dm->id,Np); - comm.barrier(); + MPI_Barrier(comm); int neighborSize=18*Np*sizeof(int); //......................device distributions................................. dist_mem_size = Np*sizeof(double); @@ -354,7 +355,7 @@ int main(int argc, char **argv) GlobalFlipScaLBL_D3Q19_Init(fq_host, Map, Np, Nx-2, Ny-2, Nz-2, iproc,jproc,kproc,nprocx,nprocy,nprocz); ScaLBL_CopyToDevice(fq, fq_host, 19*dist_mem_size); ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); //************************************************************************* // First timestep ScaLBL_Comm.SendD3Q19AA(fq); //READ FROM NORMAL @@ -377,7 +378,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; - comm.barrier(); + MPI_Barrier(comm); starttime = MPI_Wtime(); //......................................... @@ -397,7 +398,7 @@ int main(int argc, char **argv) //********************************************* ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); // Iteration completed! timestep++; //................................................................... 
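The hunks above and below all make the same substitution in the test drivers: the Utilities::MPI wrapper calls (getRank, getSize, barrier, sumReduce) become plain MPI calls. A minimal, self-contained sketch of that plain-MPI boilerplate is shown here for reference; it is illustrative only (the variable names sum_local/sum are placeholders) and is not part of the patch.

// Illustrative sketch only, not part of the patch: the plain-MPI pattern
// these hunks converge on in place of the Utilities::MPI wrapper.
#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv)
{
    int rank, nprocs;
    MPI_Init(&argc, &argv);
    MPI_Comm comm = MPI_COMM_WORLD;
    MPI_Comm_rank(comm, &rank);   // replaces comm.getRank()
    MPI_Comm_size(comm, &nprocs); // replaces comm.getSize()

    // reductions use MPI_Allreduce instead of comm.sumReduce(...)
    double sum_local = 1.0, sum = 0.0;
    MPI_Allreduce(&sum_local, &sum, 1, MPI_DOUBLE, MPI_SUM, comm);
    if (rank == 0) printf("global sum = %f over %i ranks\n", sum, nprocs);

    MPI_Barrier(comm);            // replaces comm.barrier()
    MPI_Finalize();
    return 0;
}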
@@ -426,7 +427,7 @@ int main(int argc, char **argv) if (rank==0) printf("Aggregated communication bandwidth = %f Gbit/sec \n",nprocs*ScaLBL_Comm.CommunicationCount*64*timestep/1e9); } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** diff --git a/tests/TestDatabase.cpp b/tests/TestDatabase.cpp index ced704e2..00bf87e2 100644 --- a/tests/TestDatabase.cpp +++ b/tests/TestDatabase.cpp @@ -9,7 +9,7 @@ #include "common/UnitTest.h" #include "common/Utilities.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Database.h" #include "ProfilerApp.h" @@ -17,8 +17,11 @@ // Main int main(int argc, char **argv) { + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); Utilities::setAbortBehavior(true,2); Utilities::setErrorHandlers(); UnitTest ut; @@ -66,7 +69,7 @@ int main(int argc, char **argv) // Finished PROFILE_SAVE("TestDatabase",true); - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return err; } diff --git a/tests/TestFluxBC.cpp b/tests/TestFluxBC.cpp index 3e999715..020bbd89 100644 --- a/tests/TestFluxBC.cpp +++ b/tests/TestFluxBC.cpp @@ -1,5 +1,5 @@ #include -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Utilities.h" #include "common/ScaLBL.h" @@ -18,9 +18,9 @@ std::shared_ptr loadInputs( int nprocs ) int main (int argc, char **argv) { MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + int rank = MPI_WORLD_RANK(); + int nprocs = MPI_WORLD_SIZE(); // set the error code // Note: the error code should be consistent across all processors @@ -89,7 +89,7 @@ int main (int argc, char **argv) neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Dm->id,Np); - comm.barrier(); + MPI_Barrier(comm); //......................device distributions................................. 
int dist_mem_size = Np*sizeof(double); @@ -149,7 +149,7 @@ int main (int argc, char **argv) double *VEL; VEL= new double [3*Np]; int SIZE=3*Np*sizeof(double); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); ScaLBL_CopyToHost(&VEL[0],&dvc_vel[0],SIZE); double Q = 0.f; @@ -192,7 +192,7 @@ int main (int argc, char **argv) din = ScaLBL_Comm->D3Q19_Flux_BC_z(NeighborList, fq, flux, timestep); ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); ScaLBL_D3Q19_AAodd_MRT(NeighborList, fq, 0, ScaLBL_Comm->next, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); timestep++; ScaLBL_Comm->SendD3Q19AA(fq); //READ FORM NORMAL @@ -201,7 +201,7 @@ int main (int argc, char **argv) din = ScaLBL_Comm->D3Q19_Flux_BC_z(NeighborList, fq, flux, timestep); ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); ScaLBL_D3Q19_AAeven_MRT(fq, 0, ScaLBL_Comm->next, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); timestep++; //************************************************************************/ @@ -265,7 +265,7 @@ int main (int argc, char **argv) } // Finished - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return error; } diff --git a/tests/TestForceD3Q19.cpp b/tests/TestForceD3Q19.cpp index f8569624..b8f88aae 100644 --- a/tests/TestForceD3Q19.cpp +++ b/tests/TestForceD3Q19.cpp @@ -1,5 +1,5 @@ #include -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Utilities.h" #include @@ -443,9 +443,8 @@ inline void MRT_Transform(double *dist, int Np, double Fx, double Fy, double Fz) int main (int argc, char **argv) { MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + int rank = MPI_WORLD_RANK(); + int nprocs = MPI_WORLD_SIZE(); for (int i=0; i #include #include "common/ScaLBL.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" using namespace std; @@ -46,11 +46,15 @@ std::shared_ptr loadInputs( int nprocs ) //*************************************************************************************** int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); int check=0; { // parallel domain size (# of sub-domains) @@ -94,7 +98,7 @@ int main(int argc, char **argv) printf("********************************************************\n"); } - comm.barrier(); + MPI_Barrier(comm); kproc = rank/(nprocx*nprocy); jproc = (rank-nprocx*nprocy*kproc)/nprocx; iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -102,7 +106,7 @@ int main(int argc, char **argv) if (rank == 0) { printf("i,j,k proc=%d %d %d \n",iproc,jproc,kproc); } - comm.barrier(); + MPI_Barrier(comm); if (rank == 1){ printf("i,j,k proc=%d %d %d \n",iproc,jproc,kproc); printf("\n\n"); @@ -139,7 +143,7 @@ int main(int argc, char **argv) } } Dm->CommInit(); - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." 
<< endl; int Np=0; // number of local pore nodes @@ -184,7 +188,7 @@ int main(int argc, char **argv) if (rank == 0) PrintNeighborList(neighborList,Np, rank); - comm.barrier(); + MPI_Barrier(comm); //......................device distributions................................. int dist_mem_size = Np*sizeof(double); @@ -209,13 +213,13 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); starttime = MPI_Wtime(); /************ MAIN ITERATION LOOP (timing communications)***************************************/ //ScaLBL_Comm->SendD3Q19(dist, &dist[10*Np]); //ScaLBL_Comm->RecvD3Q19(dist, &dist[10*Np]); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); if (rank==0) printf("Beginning AA timesteps...\n"); if (rank==0) printf("********************************************************\n"); @@ -227,14 +231,14 @@ int main(int argc, char **argv) ScaLBL_D3Q19_AAodd_MRT(NeighborList, dist, ScaLBL_Comm->first_interior, ScaLBL_Comm->last_interior, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_Comm->RecvD3Q19AA(dist); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAodd_MRT(NeighborList, dist, 0, ScaLBL_Comm->next, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); timestep++; ScaLBL_Comm->SendD3Q19AA(dist); //READ FORM NORMAL ScaLBL_D3Q19_AAeven_MRT(dist, ScaLBL_Comm->first_interior, ScaLBL_Comm->last_interior, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_Comm->RecvD3Q19AA(dist); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAeven_MRT(dist, 0, ScaLBL_Comm->next, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); timestep++; //************************************************************************/ @@ -327,7 +331,7 @@ int main(int argc, char **argv) } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** diff --git a/tests/TestInterfaceSpeed.cpp b/tests/TestInterfaceSpeed.cpp index d2c901df..40d53b47 100644 --- a/tests/TestInterfaceSpeed.cpp +++ b/tests/TestInterfaceSpeed.cpp @@ -2,7 +2,7 @@ #include #include "analysis/TwoPhase.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Communication.h" #include "IO/Mesh.h" #include "IO/Writer.h" @@ -18,9 +18,13 @@ int main (int argc, char *argv[]) { // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); + + int i,j,k; // Load inputs string FILENAME = argv[1]; @@ -36,7 +40,7 @@ int main (int argc, char *argv[]) Nx+=2; Ny+=2; Nz+=2; - for (int i=0; iid[i] = 1; + for (i=0; iid[i] = 1; Dm->CommInit(); @@ -47,9 +51,9 @@ int main (int argc, char *argv[]) double dist1,dist2; Cx = Cy = Cz = N*0.5; - for (int k=0; k #include #include "common/ScaLBL.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" using namespace std; @@ -488,11 +488,15 @@ inline void UnpackID(int *list, int count, char *recvbuf, char *ID){ //*************************************************************************************** int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // 
Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); int check; { // parallel domain size (# of sub-domains) @@ -578,7 +582,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - comm.barrier(); + MPI_Barrier(comm); //................................................. MPI_Bcast(&Nx,1,MPI_INT,0,comm); MPI_Bcast(&Ny,1,MPI_INT,0,comm); @@ -591,7 +595,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. - comm.barrier(); + MPI_Barrier(comm); // ************************************************************** // ************************************************************** @@ -609,7 +613,7 @@ int main(int argc, char **argv) printf("********************************************************\n"); } - comm.barrier(); + MPI_Barrier(comm); kproc = rank/(nprocx*nprocy); jproc = (rank-nprocx*nprocy*kproc)/nprocx; iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -617,7 +621,7 @@ int main(int argc, char **argv) if (rank == 0) { printf("i,j,k proc=%d %d %d \n",iproc,jproc,kproc); } - comm.barrier(); + MPI_Barrier(comm); if (rank == 1){ printf("i,j,k proc=%d %d %d \n",iproc,jproc,kproc); printf("\n\n"); @@ -646,7 +650,7 @@ int main(int argc, char **argv) fread(Dm.id,1,N,IDFILE); fclose(IDFILE); - comm.barrier(); + MPI_Barrier(comm); Dm.CommInit(); //....................................................................... @@ -667,12 +671,12 @@ int main(int argc, char **argv) } } } - comm.barrier(); + MPI_Barrier(comm); MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,comm); porosity = sum*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; if (rank==0) printf ("Create ScaLBL_Communicator \n"); @@ -702,7 +706,7 @@ int main(int argc, char **argv) neighborList= new int[18*Np]; ScaLBL_Comm.MemoryOptimizedLayoutAA(Map,neighborList,Dm.id,Np); - comm.barrier(); + MPI_Barrier(comm); //......................device distributions................................. int dist_mem_size = Np*sizeof(double); @@ -730,7 +734,7 @@ int main(int argc, char **argv) //.......create and start timer............ 
double starttime,stoptime,cputime; - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); starttime = MPI_Wtime(); while (timestep < timesteps) { @@ -739,14 +743,14 @@ int main(int argc, char **argv) ScaLBL_D3Q19_AAodd_MRT(NeighborList, dist, ScaLBL_Comm.next, Np, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_Comm.RecvD3Q19AA(dist); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAodd_MRT(NeighborList, dist, 0, ScaLBL_Comm.next, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); timestep++; ScaLBL_Comm.SendD3Q19AA(dist); //READ FORM NORMAL ScaLBL_D3Q19_AAeven_MRT(dist, ScaLBL_Comm.next, Np, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_Comm.RecvD3Q19AA(dist); //WRITE INTO OPPOSITE ScaLBL_D3Q19_AAeven_MRT(dist, 0, ScaLBL_Comm.next, Np, rlx_setA, rlx_setB, Fx, Fy, Fz); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); timestep++; //************************************************************************/ @@ -779,7 +783,7 @@ int main(int argc, char **argv) VEL= new double [3*Np]; int SIZE=3*Np*sizeof(double); ScaLBL_D3Q19_Momentum(dist,Velocity, Np); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); ScaLBL_CopyToHost(&VEL[0],&Velocity[0],SIZE); sum_local=0.f; @@ -801,7 +805,7 @@ int main(int argc, char **argv) } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** diff --git a/tests/TestMap.cpp b/tests/TestMap.cpp index f3010081..a47c0d9e 100644 --- a/tests/TestMap.cpp +++ b/tests/TestMap.cpp @@ -7,7 +7,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" using namespace std; @@ -26,9 +26,15 @@ std::shared_ptr loadInputs( int nprocs ) //*************************************************************************************** int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); int check=0; { @@ -39,7 +45,6 @@ int main(int argc, char **argv) {1,0,1},{-1,0,-1},{1,0,-1},{-1,0,1}, {0,1,1},{0,-1,-1},{0,1,-1},{0,-1,1}}; - int rank = comm.getRank(); if (rank == 0){ printf("********************************************************\n"); printf("Running unit test: TestMap \n"); @@ -47,7 +52,7 @@ int main(int argc, char **argv) } // Load inputs - auto db = loadInputs( comm.getSize() ); + auto db = loadInputs( nprocs ); int Nx = db->getVector( "n" )[0]; int Ny = db->getVector( "n" )[1]; int Nz = db->getVector( "n" )[2]; @@ -89,7 +94,7 @@ int main(int argc, char **argv) neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Dm->id,Np); - comm.barrier(); + MPI_Barrier(comm); // Check the neighborlist printf("Check neighborlist: exterior %i, first interior %i last interior %i \n",ScaLBL_Comm->LastExterior(),ScaLBL_Comm->FirstInterior(),ScaLBL_Comm->LastInterior()); @@ -192,7 +197,7 @@ int main(int argc, char **argv) } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** diff --git a/tests/TestMassConservationD3Q7.cpp 
b/tests/TestMassConservationD3Q7.cpp index 68183cd2..bbfe8cae 100644 --- a/tests/TestMassConservationD3Q7.cpp +++ b/tests/TestMassConservationD3Q7.cpp @@ -8,7 +8,7 @@ #include #include "common/ScaLBL.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "models/ColorModel.h" inline void InitializeBubble(ScaLBL_ColorModel &ColorModel, double BubbleRadius){ @@ -67,10 +67,11 @@ inline void InitializeBubble(ScaLBL_ColorModel &ColorModel, double BubbleRadius) int main(int argc, char **argv) { // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); // parallel domain size (# of sub-domains) if (rank == 0){ @@ -265,7 +266,7 @@ int main(int argc, char **argv) } } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** } diff --git a/tests/TestMicroCTReader.cpp b/tests/TestMicroCTReader.cpp index 9a54610c..4a4c6aac 100644 --- a/tests/TestMicroCTReader.cpp +++ b/tests/TestMicroCTReader.cpp @@ -1,6 +1,6 @@ // Test reading high-resolution files from the microct database -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/UnitTest.h" #include "common/Database.h" #include "common/Domain.h" @@ -13,14 +13,12 @@ void testReadMicroCT( const std::string& filename, UnitTest& ut ) { - Utilities::MPI comm( MPI_COMM_WORLD ); - // Get the domain info auto db = std::make_shared( filename ); auto domain_db = db->getDatabase( "Domain" ); // Test reading microCT files - auto data = readMicroCT( *domain_db, comm ); + auto data = readMicroCT( *domain_db, MPI_COMM_WORLD ); // Check if we loaded the data correctly if ( data.size() == domain_db->getVector( "n" ) ) @@ -32,7 +30,7 @@ void testReadMicroCT( const std::string& filename, UnitTest& ut ) auto n = domain_db->getVector( "n" ); auto nproc = domain_db->getVector( "nproc" ); int N[3] = { n[0]*nproc[0], n[1]*nproc[1], n[2]*nproc[2] }; - int rank = comm.getRank(); + int rank = comm_rank(MPI_COMM_WORLD); RankInfoStruct rankInfo( rank, nproc[0], nproc[1], nproc[2] ); std::vector meshData( 1 ); auto Var = std::make_shared(); @@ -43,7 +41,7 @@ void testReadMicroCT( const std::string& filename, UnitTest& ut ) meshData[0].meshName = "grid"; meshData[0].mesh = std::make_shared(rankInfo,n[0],n[1],n[2],N[0],N[1],N[2]); meshData[0].vars.push_back(Var); - IO::writeData( 0, meshData, comm ); + IO::writeData( 0, meshData, MPI_COMM_WORLD ); } diff --git a/tests/TestMomentsD3Q19.cpp b/tests/TestMomentsD3Q19.cpp index 6bd3e8ff..b26d7bed 100644 --- a/tests/TestMomentsD3Q19.cpp +++ b/tests/TestMomentsD3Q19.cpp @@ -1,5 +1,5 @@ #include -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Utilities.h" #include @@ -463,14 +463,13 @@ inline void MRT_Transform(double *dist, int Np) { int main (int argc, char **argv) { MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + int rank = MPI_WORLD_RANK(); + int nprocs = MPI_WORLD_SIZE(); for (int i=0; i tmp = netcdf::getVar( fid, "tmp" ); @@ -96,8 +95,7 @@ int main(int argc, char **argv) { // Initialize MPI MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - const int rank = comm.getRank(); + int rank = comm_rank(MPI_COMM_WORLD); UnitTest ut; PROFILE_START("Main"); diff --git 
a/tests/TestPoiseuille.cpp b/tests/TestPoiseuille.cpp index 744d292d..e69507e1 100644 --- a/tests/TestPoiseuille.cpp +++ b/tests/TestPoiseuille.cpp @@ -7,7 +7,7 @@ #include #include #include "common/ScaLBL.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "models/MRTModel.h" void ParallelPlates(ScaLBL_MRTModel &MRT){ @@ -47,11 +47,15 @@ void ParallelPlates(ScaLBL_MRTModel &MRT){ //*************************************************************************************** int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); int check=0; { if (rank == 0){ @@ -73,7 +77,7 @@ int main(int argc, char **argv) int SIZE=MRT.Np*sizeof(double); ScaLBL_D3Q19_Momentum(MRT.fq,MRT.Velocity, MRT.Np); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); ScaLBL_CopyToHost(&Vz[0],&MRT.Velocity[0],3*SIZE); if (rank == 0) printf("Force: %f,%f,%f \n",MRT.Fx,MRT.Fy,MRT.Fz); @@ -87,7 +91,7 @@ int main(int argc, char **argv) j=Ny/2; k=Nz/2; if (rank == 0) printf("Channel width=%f \n",W); if (rank == 0) printf("ID flag vz analytical\n"); - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) { for (i=0;i #include #include "common/ScaLBL.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" //*************************************************************************************** int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); int check=0; { if (rank == 0){ @@ -45,7 +50,7 @@ int main(int argc, char **argv) printf("********************************************************\n"); } - comm.barrier(); + MPI_Barrier(comm); int kproc = rank/(nprocx*nprocy); int jproc = (rank-nprocx*nprocy*kproc)/nprocx; int iproc = rank-nprocx*nprocy*kproc-nprocz*jproc; @@ -53,7 +58,7 @@ int main(int argc, char **argv) if (rank == 0) { printf("i,j,k proc=%d %d %d \n",iproc,jproc,kproc); } - comm.barrier(); + MPI_Barrier(comm); if (rank == 1){ printf("i,j,k proc=%d %d %d \n",iproc,jproc,kproc); printf("\n\n"); @@ -97,11 +102,11 @@ int main(int argc, char **argv) } } } - sum = comm.sumReduce( sum_local ); + MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,comm); porosity = sum*iVol_global; if (rank==0) printf("Media porosity = %f \n",porosity); - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; if (rank==0) printf ("Create ScaLBL_Communicator \n"); @@ -128,7 +133,7 @@ int main(int argc, char **argv) IntArray Map(Nx,Ny,Nz); neighborList= new int[18*Npad]; Np = ScaLBL_Comm->MemoryOptimizedLayoutAA(Map,neighborList,Dm->id,Np); - comm.barrier(); + MPI_Barrier(comm); //......................device distributions................................. 
if (rank==0) printf ("Allocating distributions \n"); @@ -189,7 +194,7 @@ int main(int argc, char **argv) } } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** return check; diff --git a/tests/TestSegDist.cpp b/tests/TestSegDist.cpp index b5e23ec8..ece3222d 100644 --- a/tests/TestSegDist.cpp +++ b/tests/TestSegDist.cpp @@ -39,10 +39,11 @@ std::shared_ptr loadInputs( int nprocs ) int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { @@ -97,7 +98,7 @@ int main(int argc, char **argv) } } - comm.barrier(); + MPI_Barrier(comm); if (rank==0) printf("Initialized! Converting to Signed Distance function \n"); double t1 = MPI_Wtime(); @@ -115,7 +116,7 @@ int main(int argc, char **argv) } } } - err = Dm.Comm.sumReduce( err ); + err = sumReduce( Dm.Comm, err ); err = sqrt( err / (nx*ny*nz*nprocs) ); if (rank==0) printf("Mean error %0.4f \n", err); @@ -141,7 +142,7 @@ int main(int argc, char **argv) IO::writeData( "testSegDist", data, MPI_COMM_WORLD ); } - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return 0; diff --git a/tests/TestSubphase.cpp b/tests/TestSubphase.cpp index 9738812f..fd6383be 100644 --- a/tests/TestSubphase.cpp +++ b/tests/TestSubphase.cpp @@ -26,10 +26,11 @@ std::shared_ptr loadInputs( int nprocs ) int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { // Limit scope so variables that contain communicators will free before MPI_Finialize if ( rank==0 ) { @@ -136,7 +137,7 @@ int main(int argc, char **argv) // Averages->Reduce(); } // Limit scope so variables that contain communicators will free before MPI_Finialize - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return 0; } diff --git a/tests/TestTopo3D.cpp b/tests/TestTopo3D.cpp index 948bb1d6..8d00ef5a 100644 --- a/tests/TestTopo3D.cpp +++ b/tests/TestTopo3D.cpp @@ -26,10 +26,11 @@ std::shared_ptr loadInputs( int nprocs ) int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { // Limit scope so variables that contain communicators will free before MPI_Finialize if ( rank==0 ) { @@ -225,7 +226,7 @@ int main(int argc, char **argv) IO::writeData( timestep, visData, comm ); } // Limit scope so variables that contain communicators will free before MPI_Finialize - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return 0; } diff --git a/tests/TestTorus.cpp b/tests/TestTorus.cpp index 5125ce92..2d486774 100644 --- a/tests/TestTorus.cpp +++ b/tests/TestTorus.cpp @@ -26,10 +26,11 @@ std::shared_ptr loadInputs( int nprocs ) int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { // 
Limit scope so variables that contain communicators will free before MPI_Finialize if ( rank==0 ) { @@ -164,7 +165,7 @@ int main(int argc, char **argv) // Averages->Reduce(); } // Limit scope so variables that contain communicators will free before MPI_Finialize - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return 0; } diff --git a/tests/TestTorusEvolve.cpp b/tests/TestTorusEvolve.cpp index 32cf7fd8..1a65d268 100644 --- a/tests/TestTorusEvolve.cpp +++ b/tests/TestTorusEvolve.cpp @@ -26,10 +26,11 @@ std::shared_ptr loadInputs( int nprocs ) int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { // Limit scope so variables that contain communicators will free before MPI_Finialize if ( rank==0 ) { @@ -156,7 +157,7 @@ int main(int argc, char **argv) } } // Limit scope so variables that contain communicators will free before MPI_Finialize - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return 0; } diff --git a/tests/TestTwoPhase.cpp b/tests/TestTwoPhase.cpp index fa54d98d..a979314a 100644 --- a/tests/TestTwoPhase.cpp +++ b/tests/TestTwoPhase.cpp @@ -8,7 +8,7 @@ #include #include "analysis/TwoPhase.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Communication.h" #include "IO/Mesh.h" #include "IO/Writer.h" @@ -17,10 +17,11 @@ int main(int argc, char **argv) { // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { // Limit scope so Domain can free it's communicator printf("Running two-phase averaging test on %i processors \n",nprocs); @@ -109,7 +110,7 @@ int main(int argc, char **argv) fclose(PHASE); } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); } // Limit scope so Domain will free it's communicator MPI_Finalize(); return 0; diff --git a/tests/TestWriter.cpp b/tests/TestWriter.cpp index 37858202..8936aaff 100644 --- a/tests/TestWriter.cpp +++ b/tests/TestWriter.cpp @@ -8,7 +8,7 @@ #include "common/UnitTest.h" #include "common/Utilities.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "IO/MeshDatabase.h" #include "IO/Reader.h" #include "IO/Writer.h" @@ -34,9 +34,11 @@ inline double distance( const Point& p ) // Test writing and reading the given format void testWriter( const std::string& format, std::vector& meshData, UnitTest& ut ) { - Utilities::MPI comm( MPI_COMM_WORLD ); - int nprocs = comm.getSize(); - comm.barrier(); + int rank, nprocs; + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); + MPI_Barrier(comm); // Get the format std::string format2 = format; @@ -61,7 +63,7 @@ void testWriter( const std::string& format, std::vector& mes IO::initialize( "test_"+format, format2, false ); IO::writeData( 0, meshData, comm ); IO::writeData( 3, meshData, comm ); - comm.barrier(); + MPI_Barrier(comm); PROFILE_STOP(format+"-write"); // Get the summary name for reading @@ -226,10 +228,11 @@ void testWriter( const std::string& format, std::vector& mes // Main int main(int argc, char **argv) { + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = 
comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); Utilities::setAbortBehavior(true,2); Utilities::setErrorHandlers(); UnitTest ut; @@ -386,7 +389,7 @@ int main(int argc, char **argv) ut.report(); PROFILE_SAVE("TestWriter",true); int N_errors = ut.NumFailGlobal(); - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return N_errors; } diff --git a/tests/convertIO.cpp b/tests/convertIO.cpp index 27605237..0937729f 100644 --- a/tests/convertIO.cpp +++ b/tests/convertIO.cpp @@ -5,7 +5,7 @@ #include #include -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Communication.h" #include "common/Utilities.h" #include "IO/Mesh.h" @@ -17,10 +17,11 @@ int main(int argc, char **argv) { // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); Utilities::setErrorHandlers(); PROFILE_ENABLE(2); PROFILE_ENABLE_TRACE(); @@ -69,20 +70,20 @@ int main(int argc, char **argv) i++; } - comm.barrier(); + MPI_Barrier(comm); PROFILE_STOP("Read"); // Save the mesh data to a new file PROFILE_START("Write"); IO::writeData( timestep, meshData, MPI_COMM_WORLD ); - comm.barrier(); + MPI_Barrier(comm); PROFILE_STOP("Write"); } } // Limit scope PROFILE_STOP("Main"); PROFILE_SAVE("convertData",true); - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return 0; } diff --git a/tests/hello_world.cpp b/tests/hello_world.cpp index 810d3a9c..d236bf0e 100644 --- a/tests/hello_world.cpp +++ b/tests/hello_world.cpp @@ -1,19 +1,18 @@ #include -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Utilities.h" int main (int argc, char **argv) { MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + int rank = MPI_WORLD_RANK(); + int nprocs = MPI_WORLD_SIZE(); for (int i=0; i loadInputs( ) @@ -24,11 +24,15 @@ std::shared_ptr loadInputs( ) //*************************************************************************************** int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { //***************************************** // MPI ranks for all 18 neighbors @@ -92,7 +96,7 @@ int main(int argc, char **argv) rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - comm.barrier(); + MPI_Barrier(comm); Nz += 2; Nx = Ny = Nz; // Cubic domain @@ -181,7 +185,7 @@ int main(int argc, char **argv) } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_color_macro_simulator.cpp b/tests/lbpm_color_macro_simulator.cpp index 97df6812..1c619c5a 100644 --- a/tests/lbpm_color_macro_simulator.cpp +++ b/tests/lbpm_color_macro_simulator.cpp @@ -9,7 +9,7 @@ #include "common/Communication.h" #include "analysis/TwoPhase.h" #include "analysis/runAnalysis.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "ProfilerApp.h" 
#include "threadpool/thread_pool.h" @@ -30,9 +30,10 @@ int main(int argc, char **argv) // Initialize MPI int provided_thread_support = -1; MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE,&provided_thread_support); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm; + MPI_Comm_dup(MPI_COMM_WORLD,&comm); + int rank = comm_rank(comm); + int nprocs = comm_size(comm); { // Limit scope so variables that contain communicators will free before MPI_Finialize // parallel domain size (# of sub-domains) @@ -51,7 +52,7 @@ int main(int argc, char **argv) // int device=ScaLBL_SetDevice(rank); //printf("Using GPU ID %i for rank %i \n",device,rank); ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); PROFILE_ENABLE(1); //PROFILE_ENABLE_TRACE(); @@ -170,7 +171,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - comm.barrier(); + MPI_Barrier(comm); //................................................. MPI_Bcast(&tauA,1,MPI_DOUBLE,0,comm); MPI_Bcast(&tauB,1,MPI_DOUBLE,0,comm); @@ -206,7 +207,7 @@ int main(int argc, char **argv) // Get the rank info const RankInfoStruct rank_info(rank,nprocx,nprocy,nprocz); - comm.barrier(); + MPI_Barrier(comm); if (nprocs != nprocx*nprocy*nprocz){ printf("nprocx = %i \n",nprocx); @@ -261,7 +262,7 @@ int main(int argc, char **argv) // Mask that excludes the solid phase Domain Mask(Nx,Ny,Nz,rank,nprocx,nprocy,nprocz,Lx,Ly,Lz,BoundaryCondition); - comm.barrier(); + MPI_Barrier(comm); Nx+=2; Ny+=2; Nz += 2; int N = Nx*Ny*Nz; @@ -296,7 +297,7 @@ int main(int argc, char **argv) sprintf(LocalRankString,"%05d",rank); sprintf(LocalRankFilename,"%s%s","SignDist.",LocalRankString); ReadBinaryFile(LocalRankFilename, Averages->SDs.data(), N); - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; if (rank==0) printf("Initialize from segmented data: solid=0, NWP=1, WP=2 \n"); @@ -340,7 +341,7 @@ int main(int argc, char **argv) delete [] cDen; delete [] cfq; */ - comm.barrier(); + MPI_Barrier(comm); } fflush(stdout); @@ -415,7 +416,7 @@ int main(int argc, char **argv) neighborList= new int[18*Npad]; Np = ScaLBL_Comm.MemoryOptimizedLayoutAA(Map,neighborList,Mask.id,Np); if (rank==0) printf ("Set up memory efficient layout Npad=%i, Np=%i \n",Npad,Np); - comm.barrier(); + MPI_Barrier(comm); //........................................................................... // MAIN VARIABLES ALLOCATED HERE //........................................................................... @@ -536,7 +537,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); starttime = MPI_Wtime(); //......................................... 
@@ -588,7 +589,7 @@ int main(int argc, char **argv) } ScaLBL_D3Q19_AAodd_Color(NeighborList, dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm.next, Np); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); // *************EVEN TIMESTEP************* timestep++; @@ -621,10 +622,10 @@ int main(int argc, char **argv) } ScaLBL_D3Q19_AAeven_Color(dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm.next, Np); - ScaLBL_DeviceBarrier(); comm.barrier(); + ScaLBL_DeviceBarrier(); MPI_Barrier(comm); //************************************************************************ - comm.barrier(); + MPI_Barrier(comm); PROFILE_STOP("Update"); // Run the analysis @@ -636,7 +637,7 @@ int main(int argc, char **argv) PROFILE_SAVE("lbpm_color_simulator",1); //************************************************************************ ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep @@ -656,8 +657,9 @@ int main(int argc, char **argv) PROFILE_STOP("Main"); PROFILE_SAVE("lbpm_color_simulator",1); // **************************************************** - comm.barrier(); + MPI_Barrier(comm); } // Limit scope so variables that contain communicators will free before MPI_Finialize + MPI_Comm_free(&comm); MPI_Finalize(); } diff --git a/tests/lbpm_color_simulator.cpp b/tests/lbpm_color_simulator.cpp index cef13189..1f63c653 100644 --- a/tests/lbpm_color_simulator.cpp +++ b/tests/lbpm_color_simulator.cpp @@ -28,9 +28,10 @@ int main(int argc, char **argv) { // Limit scope so variables that contain communicators will free before MPI_Finialize - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm; + MPI_Comm_dup(MPI_COMM_WORLD,&comm); + int rank = comm_rank(comm); + int nprocs = comm_size(comm); if (rank == 0){ printf("********************************************************\n"); @@ -40,7 +41,7 @@ int main(int argc, char **argv) // Initialize compute device ScaLBL_SetDevice(rank); ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); PROFILE_ENABLE(1); //PROFILE_ENABLE_TRACE(); @@ -50,7 +51,7 @@ int main(int argc, char **argv) Utilities::setErrorHandlers(); auto filename = argv[1]; - ScaLBL_ColorModel ColorModel(rank,nprocs,comm.dup()); + ScaLBL_ColorModel ColorModel(rank,nprocs,comm); ColorModel.ReadParams(filename); ColorModel.SetDomain(); ColorModel.ReadInput(); @@ -63,7 +64,8 @@ int main(int argc, char **argv) PROFILE_SAVE("lbpm_color_simulator",1); // **************************************************** - comm.barrier(); + MPI_Barrier(comm); + MPI_Comm_free(&comm); } // Limit scope so variables that contain communicators will free before MPI_Finialize diff --git a/tests/lbpm_dfh_simulator.cpp b/tests/lbpm_dfh_simulator.cpp index 0d5902df..1e8dc0f9 100644 --- a/tests/lbpm_dfh_simulator.cpp +++ b/tests/lbpm_dfh_simulator.cpp @@ -26,9 +26,10 @@ int main(int argc, char **argv) // Initialize MPI int provided_thread_support = -1; MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE,&provided_thread_support); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm; + MPI_Comm_dup(MPI_COMM_WORLD,&comm); + int rank = comm_rank(comm); + int nprocs = comm_size(comm); if ( rank==0 && 
provided_thread_support 1) depth=atoi(argv[1]); @@ -218,7 +222,7 @@ int main(int argc, char **argv) rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - comm.barrier(); + MPI_Barrier(comm); Nx += 2; Ny += 2; @@ -273,13 +277,13 @@ int main(int argc, char **argv) //....................................................................... if (rank == 0) printf("Reading the disc packing \n"); if (rank == 0) ReadDiscPacking(ndiscs,cx,cy,rad); - comm.barrier(); + MPI_Barrier(comm); // Broadcast the sphere packing to all processes MPI_Bcast(cx,ndiscs,MPI_DOUBLE,0,comm); MPI_Bcast(cy,ndiscs,MPI_DOUBLE,0,comm); MPI_Bcast(rad,ndiscs,MPI_DOUBLE,0,comm); //........................................................................... - comm.barrier(); + MPI_Barrier(comm); if (rank == 0){ cout << "Domain set." << endl; printf("************ \n"); @@ -384,7 +388,7 @@ int main(int argc, char **argv) //...................................................................... // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_inkbottle_pp.cpp b/tests/lbpm_inkbottle_pp.cpp index 669ab8c0..3c39219d 100644 --- a/tests/lbpm_inkbottle_pp.cpp +++ b/tests/lbpm_inkbottle_pp.cpp @@ -9,15 +9,19 @@ #include "common/ScaLBL.h" #include "common/Communication.h" #include "analysis/TwoPhase.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; @@ -79,7 +83,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - comm.barrier(); + MPI_Barrier(comm); // Computational domain MPI_Bcast(&Nx,1,MPI_INT,0,comm); MPI_Bcast(&Ny,1,MPI_INT,0,comm); @@ -92,7 +96,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. 
- comm.barrier(); + MPI_Barrier(comm); // ************************************************************** if (nprocs != nprocx*nprocy*nprocz){ @@ -119,7 +123,7 @@ int main(int argc, char **argv) rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - comm.barrier(); + MPI_Barrier(comm); Nz += 2; Nx = Ny = Nz; // Cubic domain @@ -217,7 +221,7 @@ int main(int argc, char **argv) } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_juanes_bench_disc_pp.cpp b/tests/lbpm_juanes_bench_disc_pp.cpp index 47d8cb84..6f04cffa 100644 --- a/tests/lbpm_juanes_bench_disc_pp.cpp +++ b/tests/lbpm_juanes_bench_disc_pp.cpp @@ -9,7 +9,7 @@ #include "analysis/pmmc.h" #include "common/Domain.h" #include "common/Communication.h" -#include "common/MPI.h" // This includes mpi.h +#include "common/MPI_Helpers.h" // This includes mpi.h #include "common/SpherePack.h" /* @@ -130,11 +130,15 @@ inline void SignedDistanceDiscPack(double *Distance, int ndiscs, double *List_cx int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; int iproc,jproc,kproc; @@ -190,7 +194,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - comm.barrier(); + MPI_Barrier(comm); //................................................. // Computational domain MPI_Bcast(&Nx,1,MPI_INT,0,comm); @@ -204,7 +208,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. - comm.barrier(); + MPI_Barrier(comm); // ************************************************************** double Rin,Rout; @@ -236,7 +240,7 @@ int main(int argc, char **argv) rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - comm.barrier(); + MPI_Barrier(comm); Nx += 2; Ny += 2; Nz += 2; int N = Nx*Ny*Nz; @@ -290,13 +294,13 @@ int main(int argc, char **argv) //....................................................................... if (rank == 0) printf("Reading the disc packing \n"); if (rank == 0) ReadDiscPacking(ndiscs,cx,cy,rad); - comm.barrier(); + MPI_Barrier(comm); // Broadcast the sphere packing to all processes MPI_Bcast(cx,ndiscs,MPI_DOUBLE,0,comm); MPI_Bcast(cy,ndiscs,MPI_DOUBLE,0,comm); MPI_Bcast(rad,ndiscs,MPI_DOUBLE,0,comm); //........................................................................... - comm.barrier(); + MPI_Barrier(comm); /* if (rank == 0){ cout << "Domain set." << endl; printf("************ \n"); @@ -308,7 +312,7 @@ int main(int argc, char **argv) } */ - comm.barrier(); + MPI_Barrier(comm); if (nprocz > 1 && rank==0) printf("Disc packs are 2D -- are you sure you want nprocz > 1? \n"); if (rank ==0) printf("Compute the signed distance part I \n"); //....................................................................... 
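Several of the preprocessor drivers above repeat the same "broadcast simulation parameters from rank 0" block. The sketch below isolates that pattern; the function name broadcast_domain and the literal values (64, 1.0) are placeholders standing in for the rank-0 input read, not code from the repository.

// Sketch only: rank 0 reads the domain parameters, then broadcasts them.
#include <mpi.h>
#include <cstdio>

static void broadcast_domain(MPI_Comm comm, int rank,
                             int &Nx, int &Ny, int &Nz,
                             double &Lx, double &Ly, double &Lz)
{
    if (rank == 0) {
        // rank 0 would parse the input database here; fixed values stand in
        Nx = Ny = Nz = 64;
        Lx = Ly = Lz = 1.0;
    }
    MPI_Barrier(comm);
    MPI_Bcast(&Nx, 1, MPI_INT, 0, comm);
    MPI_Bcast(&Ny, 1, MPI_INT, 0, comm);
    MPI_Bcast(&Nz, 1, MPI_INT, 0, comm);
    MPI_Bcast(&Lx, 1, MPI_DOUBLE, 0, comm);
    MPI_Bcast(&Ly, 1, MPI_DOUBLE, 0, comm);
    MPI_Bcast(&Lz, 1, MPI_DOUBLE, 0, comm);
    MPI_Barrier(comm);
}

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    MPI_Comm comm = MPI_COMM_WORLD;
    int rank, nprocs;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &nprocs);

    int Nx = 0, Ny = 0, Nz = 0;
    double Lx = 0, Ly = 0, Lz = 0;
    broadcast_domain(comm, rank, Nx, Ny, Nz, Lx, Ly, Lz);
    if (rank == nprocs - 1)
        printf("rank %i sees Nx=%i Lx=%f\n", rank, Nx, Lx);

    MPI_Finalize();
    return 0;
}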
@@ -486,7 +490,7 @@ int main(int argc, char **argv) //...................................................................... // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_minkowski_scalar.cpp b/tests/lbpm_minkowski_scalar.cpp index 721207a1..3e3ede6d 100644 --- a/tests/lbpm_minkowski_scalar.cpp +++ b/tests/lbpm_minkowski_scalar.cpp @@ -14,7 +14,7 @@ #include "common/Array.h" #include "common/Domain.h" #include "common/Communication.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "IO/MeshDatabase.h" #include "IO/Mesh.h" #include "IO/Writer.h" @@ -28,11 +28,13 @@ int main(int argc, char **argv) { + // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { Utilities::setErrorHandlers(); PROFILE_START("Main"); @@ -85,7 +87,7 @@ int main(int argc, char **argv) fclose(SEGDAT); printf("Read segmented data from %s \n",Filename.c_str()); } - comm.barrier(); + MPI_Barrier(comm); // Get the rank info int N = (nx+2)*(ny+2)*(nz+2); @@ -150,7 +152,7 @@ int main(int argc, char **argv) } else{ printf("Sending data to process %i \n", rnk); - comm.send(tmp,N,rnk,15); + MPI_Send(tmp,N,MPI_CHAR,rnk,15,comm); } } } @@ -159,12 +161,13 @@ int main(int argc, char **argv) else{ // Recieve the subdomain from rank = 0 printf("Ready to recieve data %i at process %i \n", N,rank); - comm.recv(Dm->id,N,0,15); + MPI_Recv(Dm->id,N,MPI_CHAR,0,15,comm,MPI_STATUS_IGNORE); } - comm.barrier(); + MPI_Barrier(comm); // Compute the Minkowski functionals - auto Averages = std::make_shared(Dm); + MPI_Barrier(comm); + std::shared_ptr Averages(new Minkowski(Dm)); // Calculate the distance // Initialize the domain and communication @@ -209,7 +212,7 @@ int main(int argc, char **argv) } PROFILE_STOP("Main"); PROFILE_SAVE("Minkowski",true); - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return 0; } diff --git a/tests/lbpm_morph_pp.cpp b/tests/lbpm_morph_pp.cpp index 939fdc32..8fe8b228 100644 --- a/tests/lbpm_morph_pp.cpp +++ b/tests/lbpm_morph_pp.cpp @@ -23,9 +23,11 @@ int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { //....................................................................... 
// Reading the domain information file @@ -125,13 +127,13 @@ int main(int argc, char **argv) if (rank==0) printf("Initialized solid phase -- Converting to Signed Distance function \n"); CalcDist(SignDist,id_solid,*Dm); - comm.barrier(); + MPI_Barrier(comm); // Extract only the connected part of NWP BlobIDstruct new_index; double vF=0.0; double vS=0.0; ComputeGlobalBlobIDs(nx-2,ny-2,nz-2,Dm->rank_info,phase,SignDist,vF,vS,phase_label,Dm->Comm); - Dm->Comm.barrier(); + MPI_Barrier(Dm->Comm); int count_connected=0; int count_porespace=0; @@ -153,9 +155,9 @@ int main(int argc, char **argv) } } } - count_connected = Dm->Comm.sumReduce( count_connected ); - count_porespace = Dm->Comm.sumReduce( count_porespace ); - count_water = Dm->Comm.sumReduce( count_water ); + count_connected=sumReduce( Dm->Comm, count_connected); + count_porespace=sumReduce( Dm->Comm, count_porespace); + count_water=sumReduce( Dm->Comm, count_water); for (int k=0; kComm.sumReduce( count_water ); + count_water=sumReduce( Dm->Comm, count_water); SW = double(count_water) / count_porespace; if(rank==0) printf("Final saturation: %f \n", SW); @@ -234,13 +236,13 @@ int main(int argc, char **argv) } } } - comm.barrier(); + MPI_Barrier(comm); auto filename2 = READFILE + ".morph.raw"; if (rank==0) printf("Writing file to: %s \n", filename2.c_str()); Mask->AggregateLabels(filename2); } - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); } diff --git a/tests/lbpm_morphdrain_pp.cpp b/tests/lbpm_morphdrain_pp.cpp index d3c5a428..8d73b1e4 100644 --- a/tests/lbpm_morphdrain_pp.cpp +++ b/tests/lbpm_morphdrain_pp.cpp @@ -23,9 +23,11 @@ int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { //....................................................................... // Reading the domain information file @@ -119,7 +121,7 @@ int main(int argc, char **argv) if (rank==0) printf("Initialized solid phase -- Converting to Signed Distance function \n"); CalcDist(SignDist,id_solid,*Dm); - comm.barrier(); + MPI_Barrier(comm); // Run the morphological opening MorphDrain(SignDist, id, Dm, SW); @@ -194,13 +196,13 @@ int main(int argc, char **argv) } } } - comm.barrier(); + MPI_Barrier(comm); auto filename2 = READFILE + ".morphdrain.raw"; if (rank==0) printf("Writing file to: %s \n", filename2.data() ); Mask->AggregateLabels( filename2 ); } - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); } diff --git a/tests/lbpm_morphopen_pp.cpp b/tests/lbpm_morphopen_pp.cpp index a6209240..f8819348 100644 --- a/tests/lbpm_morphopen_pp.cpp +++ b/tests/lbpm_morphopen_pp.cpp @@ -23,9 +23,11 @@ int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { //....................................................................... 
// Reading the domain information file @@ -121,7 +123,7 @@ int main(int argc, char **argv) if (rank==0) printf("Initialized solid phase -- Converting to Signed Distance function \n"); CalcDist(SignDist,id_solid,*Dm); - comm.barrier(); + MPI_Barrier(comm); // Run the morphological opening MorphOpen(SignDist, id, Dm, SW, ErodeLabel, OpenLabel); @@ -196,13 +198,13 @@ int main(int argc, char **argv) } } } - comm.barrier(); + MPI_Barrier(comm); auto filename2 = READFILE + ".morphopen.raw"; if (rank==0) printf("Writing file to: %s \n", filename2.data()); Mask->AggregateLabels(filename2); } - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); } diff --git a/tests/lbpm_nondarcy_simulator.cpp b/tests/lbpm_nondarcy_simulator.cpp index 096dc790..40672375 100644 --- a/tests/lbpm_nondarcy_simulator.cpp +++ b/tests/lbpm_nondarcy_simulator.cpp @@ -9,7 +9,7 @@ #include "common/ScaLBL.h" #include "common/Communication.h" #include "analysis/TwoPhase.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" //#define WRITE_SURFACES @@ -77,11 +77,15 @@ int main(int argc, char **argv) } else { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; @@ -156,7 +160,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - comm.barrier(); + MPI_Barrier(comm); //................................................. MPI_Bcast(&tau,1,MPI_DOUBLE,0,comm); //MPI_Bcast(&pBC,1,MPI_LOGICAL,0,comm); @@ -181,7 +185,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. - comm.barrier(); + MPI_Barrier(comm); RESTART_INTERVAL=interval; // ************************************************************** @@ -218,7 +222,7 @@ int main(int argc, char **argv) rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - comm.barrier(); + MPI_Barrier(comm); Nx += 2; Ny += 2; Nz += 2; @@ -258,7 +262,7 @@ int main(int argc, char **argv) // WriteLocalSolidID(LocalRankFilename, id, N); sprintf(LocalRankFilename,"%s%s","SignDist.",LocalRankString); ReadBinaryFile(LocalRankFilename, Averages.SDs.data(), N); - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; //....................................................................... @@ -432,7 +436,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; - comm.barrier(); + MPI_Barrier(comm); starttime = MPI_Wtime(); //......................................... @@ -481,7 +485,7 @@ int main(int argc, char **argv) } //................................................................................... ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); // Timestep completed! 
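The broadcast hunks in lbpm_nondarcy_simulator.cpp keep the usual read-on-rank-0-then-broadcast idiom for the simulation parameters; only the surrounding barrier calls change. A compact sketch of that idiom with two representative parameters (file and variable names here are illustrative):

    #include <mpi.h>
    #include <fstream>

    int main(int argc, char **argv)
    {
        MPI_Init(&argc, &argv);
        MPI_Comm comm = MPI_COMM_WORLD;
        int rank;
        MPI_Comm_rank(comm, &rank);

        int Nx = 0;
        double tau = 1.0;
        if (rank == 0) {
            // Only the root rank reads the input file.
            std::ifstream input("Domain.in");
            input >> Nx >> tau;
        }
        MPI_Barrier(comm);
        // Every other rank receives the values before the domain is built.
        MPI_Bcast(&Nx, 1, MPI_INT, 0, comm);
        MPI_Bcast(&tau, 1, MPI_DOUBLE, 0, comm);
        MPI_Barrier(comm);

        MPI_Finalize();
        return 0;
    }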
@@ -553,7 +557,7 @@ int main(int argc, char **argv) //************************************************************************/ fclose(NONDARCY); ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep @@ -571,7 +575,7 @@ int main(int argc, char **argv) NULL_USE(RESTART_INTERVAL); } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_nonnewtonian_simulator.cpp b/tests/lbpm_nonnewtonian_simulator.cpp index ff8792e7..5c33841f 100644 --- a/tests/lbpm_nonnewtonian_simulator.cpp +++ b/tests/lbpm_nonnewtonian_simulator.cpp @@ -9,7 +9,7 @@ #include "common/ScaLBL.h" #include "common/Communication.h" #include "common/TwoPhase.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "ProfilerApp.h" #include "threadpool/thread_pool.h" @@ -99,12 +99,21 @@ inline void ZeroHalo(double *Data, int Nx, int Ny, int Nz) int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + //MPI_Init(&argc,&argv); + + /* + * Definitely seems to be an issue - let's hope James gets back to me... + */ int provided_thread_support = -1; MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE,&provided_thread_support); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm; + MPI_Comm_dup(MPI_COMM_WORLD,&comm); + int rank = comm_rank(comm); + int nprocs = comm_size(comm); if ( rank==0 && provided_thread_supportSDs.data(), N); - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; /* 3 */ //....................................................................... @@ -589,14 +598,14 @@ int main(int argc, char **argv) delete [] cDen; delete [] cDistEven; delete [] cDistOdd; - comm.barrier(); + MPI_Barrier(comm); } /* 14 */ // //...................................................................... // ScaLBL_D3Q7_Init(ID, A_even, A_odd, &Den[0], Nx, Ny, Nz); // ScaLBL_D3Q7_Init(ID, B_even, B_odd, &Den[N], Nx, Ny, Nz); // ScaLBL_DeviceBarrier(); -// comm.barrier(); /* 15 */ +// MPI_Barrier(comm); /* 15 */ //....................................................................... // Once phase has been initialized, map solid to account for 'smeared' interface @@ -622,7 +631,7 @@ int main(int argc, char **argv) // ScaLBL_Comm.SendHalo(Phi); // ScaLBL_Comm.RecvHalo(Phi); // ScaLBL_DeviceBarrier(); -// comm.barrier(); +// MPI_Barrier(comm); // //************************************************************************* /* 18 */ @@ -661,7 +670,7 @@ int main(int argc, char **argv) //.......create and start timer............ double starttime,stoptime,cputime; - comm.barrier(); + MPI_Barrier(comm); starttime = MPI_Wtime(); /* @@ -795,7 +804,7 @@ int main(int argc, char **argv) } //................................................................................... ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); // Timestep completed! 
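The timing hunks in both simulators keep the same measurement idiom: a barrier lines the ranks up, MPI_Wtime() brackets the time-stepping loop, and a device barrier plus a second MPI barrier guarantee every rank has finished before the clock is stopped. A reduced sketch with the loop body omitted:

    #include <mpi.h>
    #include <cstdio>

    int main(int argc, char **argv)
    {
        MPI_Init(&argc, &argv);
        MPI_Comm comm = MPI_COMM_WORLD;
        int rank;
        MPI_Comm_rank(comm, &rank);

        const int timestepMax = 1000;     // illustrative loop length
        MPI_Barrier(comm);                // start all ranks together
        double starttime = MPI_Wtime();

        for (int timestep = 1; timestep <= timestepMax; timestep++) {
            // collision, streaming and halo exchange would run here
        }

        MPI_Barrier(comm);                // wait for the slowest rank
        double stoptime = MPI_Wtime();
        double cputime = (stoptime - starttime) / timestepMax;
        if (rank == 0) printf("CPU time per timestep = %e s\n", cputime);

        MPI_Finalize();
        return 0;
    }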
timestep++; @@ -809,7 +818,7 @@ int main(int argc, char **argv) } //************************************************************************/ ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep @@ -826,7 +835,7 @@ int main(int argc, char **argv) NULL_USE(RESTART_INTERVAL); } - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); //**************************************************** } diff --git a/tests/lbpm_nonnewtonian_simulator.h b/tests/lbpm_nonnewtonian_simulator.h index 4df5e628..20da1ac3 100644 --- a/tests/lbpm_nonnewtonian_simulator.h +++ b/tests/lbpm_nonnewtonian_simulator.h @@ -1,7 +1,7 @@ // Run the analysis, blob identification, and write restart files #include "common/Array.h" #include "common/Communication.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "IO/MeshDatabase.h" //#define ANALYSIS_INTERVAL 6 @@ -9,9 +9,20 @@ #define BLOBID_INTERVAL 1000 + + + + enum AnalysisType{ AnalyzeNone=0, IdentifyBlobs=0x01, CopyPhaseIndicator=0x02, CopySimState=0x04, ComputeAverages=0x08, CreateRestart=0x10, WriteVis=0x20 }; + + + + + + + template void DeleteArray( const TYPE *p ) { @@ -19,6 +30,12 @@ void DeleteArray( const TYPE *p ) } + + + + + + // Structure used to store ids struct AnalysisWaitIdStruct { ThreadPool::thread_id_t blobID; @@ -28,6 +45,7 @@ struct AnalysisWaitIdStruct { }; + // Helper class to write the restart file from a seperate thread class WriteRestartWorkItem: public ThreadPool::WorkItem { @@ -66,9 +84,9 @@ typedef std::shared_ptr > BlobIDList; // timestep(timestep_), Nx(Nx_), Ny(Ny_), Nz(Nz_), rank_info(rank_info_), // phase(phase_), dist(dist_), last_id(last_id_), new_index(new_index_), new_id(new_id_), new_list(new_list_) // { -// newcomm = Utilities::MPI(MPI_COMM_WORLD).dup(); +// MPI_Comm_dup(MPI_COMM_WORLD,&newcomm); // } -// ~BlobIdentificationWorkItem1() {} +// ~BlobIdentificationWorkItem1() { MPI_Comm_free(&newcomm); } // virtual void run() { // // Compute the global blob id and compare to the previous version // PROFILE_START("Identify blobs",1); @@ -88,7 +106,7 @@ typedef std::shared_ptr > BlobIDList; // const DoubleArray& dist; // BlobIDstruct last_id, new_index, new_id; // BlobIDList new_list; -// Utilities::MPI newcomm; +// MPI_Comm newcomm; //}; // @@ -104,9 +122,9 @@ typedef std::shared_ptr > BlobIDList; // timestep(timestep_), Nx(Nx_), Ny(Ny_), Nz(Nz_), rank_info(rank_info_), // phase(phase_), dist(dist_), last_id(last_id_), new_index(new_index_), new_id(new_id_), new_list(new_list_) // { -// newcomm = Utilities::MPI(MPI_COMM_WORLD).dup(); +// MPI_Comm_dup(MPI_COMM_WORLD,&newcomm); // } -// ~BlobIdentificationWorkItem2() { } +// ~BlobIdentificationWorkItem2() { MPI_Comm_free(&newcomm); } // virtual void run() { // // Compute the global blob id and compare to the previous version // PROFILE_START("Identify blobs maps",1); @@ -140,7 +158,7 @@ typedef std::shared_ptr > BlobIDList; // const DoubleArray& dist; // BlobIDstruct last_id, new_index, new_id; // BlobIDList new_list; -// Utilities::MPI newcomm; +// MPI_Comm newcomm; //}; // @@ -153,9 +171,9 @@ public: TwoPhase& Avgerages_, fillHalo& fillData_ ): timestep(timestep_), visData(visData_), Averages(Avgerages_), fillData(fillData_) { - newcomm = Utilities::MPI(MPI_COMM_WORLD).dup(); + MPI_Comm_dup(MPI_COMM_WORLD,&newcomm); } - ~WriteVisWorkItem() {} + ~WriteVisWorkItem() { MPI_Comm_free(&newcomm); } virtual void run() 
{ PROFILE_START("Save Vis",1); ASSERT(visData[0].vars[0]->name=="phase"); @@ -180,7 +198,7 @@ private: std::vector& visData; TwoPhase& Averages; fillHalo& fillData; - Utilities::MPI newcomm; + MPI_Comm newcomm; }; @@ -400,7 +418,7 @@ void run_analysis( int timestep, int restart_interval, // Spawn a thread to write the restart file if ( (type&CreateRestart) != 0 ) { - int rank = comm.getRank(); + int rank = MPI_WORLD_RANK(); // Wait for previous restart files to finish writing (not necessary, but helps to ensure memory usage is limited) tpool.wait(wait.restart); diff --git a/tests/lbpm_permeability_simulator.cpp b/tests/lbpm_permeability_simulator.cpp index eb5e6d4b..dbcfb96b 100644 --- a/tests/lbpm_permeability_simulator.cpp +++ b/tests/lbpm_permeability_simulator.cpp @@ -9,7 +9,7 @@ #include "common/ScaLBL.h" #include "common/Communication.h" #include "analysis/TwoPhase.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "models/MRTModel.h" //#define WRITE_SURFACES @@ -24,10 +24,11 @@ using namespace std; int main(int argc, char **argv) { // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { if (rank == 0){ printf("********************************************************\n"); @@ -38,7 +39,7 @@ int main(int argc, char **argv) int device=ScaLBL_SetDevice(rank); NULL_USE( device ); ScaLBL_DeviceBarrier(); - comm.barrier(); + MPI_Barrier(comm); ScaLBL_MRTModel MRT(rank,nprocs,comm); auto filename = argv[1]; @@ -51,7 +52,7 @@ int main(int argc, char **argv) MRT.VelocityField(); } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_plates_pp.cpp b/tests/lbpm_plates_pp.cpp index acd64f52..8344df47 100644 --- a/tests/lbpm_plates_pp.cpp +++ b/tests/lbpm_plates_pp.cpp @@ -9,15 +9,19 @@ #include "common/ScaLBL.h" #include "common/Communication.h" #include "analysis/TwoPhase.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; @@ -75,7 +79,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - comm.barrier(); + MPI_Barrier(comm); // Computational domain MPI_Bcast(&Nx,1,MPI_INT,0,comm); MPI_Bcast(&Ny,1,MPI_INT,0,comm); @@ -88,7 +92,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. 
- comm.barrier(); + MPI_Barrier(comm); // ************************************************************** if (nprocs != nprocx*nprocy*nprocz){ @@ -112,7 +116,7 @@ int main(int argc, char **argv) std::shared_ptr Averages( new TwoPhase(Dm) ); - comm.barrier(); + MPI_Barrier(comm); Nz += 2; Nx = Ny = Nz; // Cubic domain @@ -196,7 +200,7 @@ int main(int argc, char **argv) } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_porenetwork_pp.cpp b/tests/lbpm_porenetwork_pp.cpp index 4a6ccda7..496f9d86 100644 --- a/tests/lbpm_porenetwork_pp.cpp +++ b/tests/lbpm_porenetwork_pp.cpp @@ -9,15 +9,19 @@ #include "common/ScaLBL.h" #include "common/Communication.h" #include "analysis/TwoPhase.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; @@ -65,7 +69,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - comm.barrier(); + MPI_Barrier(comm); // Computational domain MPI_Bcast(&Nx,1,MPI_INT,0,comm); MPI_Bcast(&Ny,1,MPI_INT,0,comm); @@ -78,7 +82,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. - comm.barrier(); + MPI_Barrier(comm); // ************************************************************** if (nprocs != nprocx*nprocy*nprocz){ @@ -104,7 +108,7 @@ int main(int argc, char **argv) Dm->CommInit(); std::shared_ptr Averages( new TwoPhase(Dm) ); - comm.barrier(); + MPI_Barrier(comm); Nx += 2; Ny += 2; Nz += 2; @@ -289,7 +293,7 @@ int main(int argc, char **argv) } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_random_pp.cpp b/tests/lbpm_random_pp.cpp index ad4b83cc..07c56e6f 100644 --- a/tests/lbpm_random_pp.cpp +++ b/tests/lbpm_random_pp.cpp @@ -52,10 +52,11 @@ inline void UnpackID(int *list, int count, char *recvbuf, char *ID){ int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); int InitialWetting; double Saturation; @@ -96,7 +97,7 @@ int main(int argc, char **argv) domain >> Lz; } - comm.barrier(); + MPI_Barrier(comm); // Computational domain MPI_Bcast(&nx,1,MPI_INT,0,comm); MPI_Bcast(&ny,1,MPI_INT,0,comm); @@ -109,7 +110,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. 
- comm.barrier(); + MPI_Barrier(comm); // Check that the number of processors >= the number of ranks if ( rank==0 ) { @@ -421,7 +422,7 @@ int main(int argc, char **argv) fwrite(id,1,N,ID); fclose(ID); - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return 0; } diff --git a/tests/lbpm_refine_pp.cpp b/tests/lbpm_refine_pp.cpp index 149ae673..d90dbb04 100644 --- a/tests/lbpm_refine_pp.cpp +++ b/tests/lbpm_refine_pp.cpp @@ -16,10 +16,11 @@ int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { //....................................................................... @@ -421,7 +422,7 @@ int main(int argc, char **argv) } - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return 0; } diff --git a/tests/lbpm_segmented_decomp.cpp b/tests/lbpm_segmented_decomp.cpp index 1bc89adb..3384e454 100644 --- a/tests/lbpm_segmented_decomp.cpp +++ b/tests/lbpm_segmented_decomp.cpp @@ -18,10 +18,12 @@ int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { @@ -82,7 +84,7 @@ int main(int argc, char **argv) image >> zStart; } - comm.barrier(); + MPI_Barrier(comm); // Computational domain //................................................. MPI_Bcast(&nx,1,MPI_INT,0,comm); @@ -103,7 +105,7 @@ int main(int argc, char **argv) MPI_Bcast(&yStart,1,MPI_INT,0,comm); MPI_Bcast(&zStart,1,MPI_INT,0,comm); //................................................. - comm.barrier(); + MPI_Barrier(comm); // Check that the number of processors >= the number of ranks if ( rank==0 ) { @@ -127,7 +129,7 @@ int main(int argc, char **argv) fclose(SEGDAT); printf("Read segmented data from %s \n",Filename); } - comm.barrier(); + MPI_Barrier(comm); // Get the rank info int N = (nx+2)*(ny+2)*(nz+2); @@ -202,7 +204,7 @@ int main(int argc, char **argv) printf("Ready to recieve data %i at process %i \n", N,rank); MPI_Recv(Dm.id,N,MPI_CHAR,0,15,comm,MPI_STATUS_IGNORE); } - comm.barrier(); + MPI_Barrier(comm); nx+=2; ny+=2; nz+=2; N=nx*ny*nz; @@ -338,7 +340,7 @@ int main(int argc, char **argv) if (!MULTINPUT){ if (rank==0) printf("Writing symmetric domain reflection\n"); - comm.barrier(); + MPI_Barrier(comm); int symrank,sympz; sympz = 2*nprocz - Dm.kproc() -1; symrank = sympz*nprocx*nprocy + Dm.jproc()*nprocx + Dm.iproc(); @@ -364,6 +366,6 @@ int main(int argc, char **argv) fclose(SYMID); } } - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); } diff --git a/tests/lbpm_segmented_pp.cpp b/tests/lbpm_segmented_pp.cpp index 39cf0bd1..007ff9d1 100644 --- a/tests/lbpm_segmented_pp.cpp +++ b/tests/lbpm_segmented_pp.cpp @@ -115,10 +115,11 @@ double ReadFromBlock( char *ID, int iproc, int jproc, int kproc, int Nx, int Ny, int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { //....................................................................... 
// Reading the domain information file @@ -230,7 +231,7 @@ int main(int argc, char **argv) fclose(DIST); } - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return 0; diff --git a/tests/lbpm_sphere_pp.cpp b/tests/lbpm_sphere_pp.cpp index 2e053eed..98778b8d 100644 --- a/tests/lbpm_sphere_pp.cpp +++ b/tests/lbpm_sphere_pp.cpp @@ -9,7 +9,7 @@ #include "analysis/pmmc.h" #include "common/Domain.h" #include "common/SpherePack.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Communication.h" /* @@ -22,11 +22,15 @@ using namespace std; int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); // parallel domain size (# of sub-domains) int iproc,jproc,kproc; int sendtag,recvtag; @@ -123,14 +127,14 @@ int main(int argc, char **argv) //....................................................................... if (rank == 0) printf("Reading the sphere packing \n"); if (rank == 0) ReadSpherePacking(nspheres,cx,cy,cz,rad); - comm.barrier(); + MPI_Barrier(comm); // Broadcast the sphere packing to all processes MPI_Bcast(cx,nspheres,MPI_DOUBLE,0,comm); MPI_Bcast(cy,nspheres,MPI_DOUBLE,0,comm); MPI_Bcast(cz,nspheres,MPI_DOUBLE,0,comm); MPI_Bcast(rad,nspheres,MPI_DOUBLE,0,comm); //........................................................................... - comm.barrier(); + MPI_Barrier(comm); if (rank == 0) cout << "Domain set." << endl; if (rank == 0){ // Compute the Sauter mean diameter @@ -213,7 +217,7 @@ int main(int argc, char **argv) fclose(ID); // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_squaretube_pp.cpp b/tests/lbpm_squaretube_pp.cpp index c1f05aee..42715773 100644 --- a/tests/lbpm_squaretube_pp.cpp +++ b/tests/lbpm_squaretube_pp.cpp @@ -9,15 +9,19 @@ #include "common/ScaLBL.h" #include "common/Communication.h" #include "analysis/TwoPhase.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" int main(int argc, char **argv) { + //***************************************** + // ***** MPI STUFF **************** + //***************************************** // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { // parallel domain size (# of sub-domains) int nprocx,nprocy,nprocz; @@ -81,7 +85,7 @@ int main(int argc, char **argv) } // ************************************************************** // Broadcast simulation parameters from rank 0 to all other procs - comm.barrier(); + MPI_Barrier(comm); // Computational domain MPI_Bcast(&Nx,1,MPI_INT,0,comm); MPI_Bcast(&Ny,1,MPI_INT,0,comm); @@ -94,7 +98,7 @@ int main(int argc, char **argv) MPI_Bcast(&Ly,1,MPI_DOUBLE,0,comm); MPI_Bcast(&Lz,1,MPI_DOUBLE,0,comm); //................................................. 
- comm.barrier(); + MPI_Barrier(comm); // ************************************************************** if (nprocs != nprocx*nprocy*nprocz){ @@ -121,7 +125,7 @@ int main(int argc, char **argv) rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - comm.barrier(); + MPI_Barrier(comm); Nz += 2; Nx = Ny = Nz; // Cubic domain @@ -255,7 +259,7 @@ int main(int argc, char **argv) } // **************************************************** - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); // **************************************************** } diff --git a/tests/lbpm_uCT_maskfilter.cpp b/tests/lbpm_uCT_maskfilter.cpp index 857bc4e0..cff41ad7 100644 --- a/tests/lbpm_uCT_maskfilter.cpp +++ b/tests/lbpm_uCT_maskfilter.cpp @@ -14,7 +14,7 @@ #include "common/Array.h" #include "common/Domain.h" #include "common/Communication.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "IO/MeshDatabase.h" #include "IO/Mesh.h" #include "IO/Writer.h" @@ -30,11 +30,13 @@ int main(int argc, char **argv) { + // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); Utilities::setErrorHandlers(); PROFILE_START("Main"); @@ -149,7 +151,7 @@ int main(int argc, char **argv) } netcdf::close( distid ); - comm.barrier(); + MPI_Barrier(comm); PROFILE_STOP("ReadDistance"); if (rank==0) printf("Finished reading distance =\n"); @@ -182,7 +184,7 @@ int main(int argc, char **argv) fillFloat[0]->fill( LOCVOL[0] ); } netcdf::close( fid ); - comm.barrier(); + MPI_Barrier(comm); PROFILE_STOP("ReadVolume"); if (rank==0) printf("Read complete\n"); @@ -445,7 +447,7 @@ int main(int argc, char **argv) PROFILE_STOP("Main"); PROFILE_SAVE("lbpm_uCT_maskfilter",true); - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return 0; } diff --git a/tests/lbpm_uCT_pp.cpp b/tests/lbpm_uCT_pp.cpp index 6e8d1bde..0285b864 100644 --- a/tests/lbpm_uCT_pp.cpp +++ b/tests/lbpm_uCT_pp.cpp @@ -14,7 +14,7 @@ #include "common/Array.h" #include "common/Domain.h" #include "common/Communication.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "IO/MeshDatabase.h" #include "IO/Mesh.h" #include "IO/Writer.h" @@ -31,10 +31,11 @@ int main(int argc, char **argv) { // Initialize MPI + int rank, nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); { Utilities::setErrorHandlers(); PROFILE_START("Main"); @@ -187,7 +188,7 @@ int main(int argc, char **argv) fillFloat[0]->fill( LOCVOL[0] ); } netcdf::close( fid ); - comm.barrier(); + MPI_Barrier(comm); PROFILE_STOP("ReadVolume"); if (rank==0) printf("Read complete\n"); @@ -250,15 +251,15 @@ int main(int argc, char **argv) } } } - count_plus = Dm[0]->Comm.sumReduce( count_plus); - count_minus = Dm[0]->Comm.sumReduce( count_minus); + count_plus=sumReduce( Dm[0]->Comm, count_plus); + count_minus=sumReduce( Dm[0]->Comm, count_minus); if (rank==0) printf("minimum value=%f, max value=%f \n",min_value,max_value); if (rank==0) printf("plus=%i, minus=%i \n",count_plus,count_minus); ASSERT( count_plus > 0 && count_minus > 0 ); - comm.barrier(); - mean_plus = Dm[0]->Comm.sumReduce( mean_plus ) / count_plus; - mean_minus = Dm[0]->Comm.sumReduce( mean_minus ) / 
count_minus; - comm.barrier(); + MPI_Barrier(comm); + mean_plus = sumReduce( Dm[0]->Comm, mean_plus ) / count_plus; + mean_minus = sumReduce( Dm[0]->Comm, mean_minus ) / count_minus; + MPI_Barrier(comm); if (rank==0) printf(" Region 1 mean (+): %f, Region 2 mean (-): %f \n",mean_plus, mean_minus); //if (rank==0) printf("Scale the input data (size = %i) \n",LOCVOL[0].length()); @@ -279,7 +280,7 @@ int main(int argc, char **argv) // Fill the source data for the coarse meshes if (rank==0) printf("Coarsen the mesh for N_levels=%i \n",N_levels); - comm.barrier(); + MPI_Barrier(comm); PROFILE_START("CoarsenMesh"); for (int i=1; i filter(ratio[0],ratio[1],ratio[2]); @@ -295,7 +296,7 @@ int main(int argc, char **argv) printf(" filter_x=%i, filter_y=%i, filter_z=%i \n",int(filter.size(0)),int(filter.size(1)),int(filter.size(2)) ); printf(" ratio= %i,%i,%i \n",int(ratio[0]),int(ratio[1]),int(ratio[2]) ); } - comm.barrier(); + MPI_Barrier(comm); } PROFILE_STOP("CoarsenMesh"); @@ -307,7 +308,7 @@ int main(int argc, char **argv) NonLocalMean.back(), *fillFloat.back(), *Dm.back(), nprocx, rough_cutoff, lamda, nlm_sigsq, nlm_depth); PROFILE_STOP("Solve coarse mesh"); - comm.barrier(); + MPI_Barrier(comm); // Refine the solution PROFILE_START("Refine distance"); @@ -321,7 +322,7 @@ int main(int argc, char **argv) rough_cutoff, lamda, nlm_sigsq, nlm_depth); } PROFILE_STOP("Refine distance"); - comm.barrier(); + MPI_Barrier(comm); // Perform a final filter PROFILE_START("Filtering final domains"); @@ -417,14 +418,14 @@ int main(int argc, char **argv) meshData[0].vars.push_back(filter_Dist2_var); fillDouble[0]->copy( filter_Dist2, filter_Dist2_var->data ); #endif - comm.barrier(); + MPI_Barrier(comm); if (rank==0) printf("Writing output \n"); // Write visulization data IO::writeData( 0, meshData, comm ); if (rank==0) printf("Finished. 
\n"); // Compute the Minkowski functionals - comm.barrier(); + MPI_Barrier(comm); auto Averages = std::make_shared(Dm[0]); Array phase_label(Nx[0]+2,Ny[0]+2,Nz[0]+2); @@ -456,7 +457,7 @@ int main(int argc, char **argv) } PROFILE_STOP("Main"); PROFILE_SAVE("lbpm_uCT_pp",true); - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return 0; } diff --git a/tests/testCommunication.cpp b/tests/testCommunication.cpp index 911ef1c5..57ce0959 100644 --- a/tests/testCommunication.cpp +++ b/tests/testCommunication.cpp @@ -6,7 +6,7 @@ #include #include "common/Communication.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Array.h" using namespace std; @@ -15,9 +15,11 @@ using namespace std; //*************************************************************************************** -int test_communication( const Utilities::MPI& comm, int nprocx, int nprocy, int nprocz ) +int test_communication( MPI_Comm comm, int nprocx, int nprocy, int nprocz ) { - int rank = comm.getRank(); + int rank,nprocs; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); int iproc,jproc,kproc; int sendtag,recvtag; if (rank==0) printf("\nRunning test %i %i %i\n",nprocx,nprocy,nprocz); @@ -36,7 +38,7 @@ int test_communication( const Utilities::MPI& comm, int nprocx, int nprocy, int rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - comm.barrier(); + MPI_Barrier(comm); //********************************** @@ -83,7 +85,7 @@ int test_communication( const Utilities::MPI& comm, int nprocx, int nprocy, int sendCount_xy = sendCount_yz = sendCount_xz = sendCount_Xy = sendCount_Yz = sendCount_xZ = 0; sendCount_xY = sendCount_yZ = sendCount_Xz = sendCount_XY = sendCount_YZ = sendCount_XZ = 0; - comm.barrier(); + MPI_Barrier(comm); if (rank==0) printf ("SendLists are ready on host\n"); //...................................................................................... 
// Use MPI to fill in the recvCounts form the associated processes @@ -156,7 +158,7 @@ int test_communication( const Utilities::MPI& comm, int nprocx, int nprocy, int recvCount_yz, recvCount_YZ, recvCount_yZ, recvCount_Yz, rank_x, rank_y, rank_z, rank_X, rank_Y, rank_Z, rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz, rank_yz, rank_YZ, rank_yZ, rank_Yz ); - comm.barrier(); + MPI_Barrier(comm); if (rank==0) printf ("RecvLists finished\n"); // Free memory @@ -179,9 +181,11 @@ int test_communication( const Utilities::MPI& comm, int nprocx, int nprocy, int template -int testHalo( const Utilities::MPI& comm, int nprocx, int nprocy, int nprocz, int depth ) +int testHalo( MPI_Comm comm, int nprocx, int nprocy, int nprocz, int depth ) { - int rank = comm.getRank(); + int rank,nprocs; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); if ( rank==0 ) printf("\nRunning Halo test %i %i %i %i\n",nprocx,nprocy,nprocz,depth); @@ -251,10 +255,11 @@ int testHalo( const Utilities::MPI& comm, int nprocx, int nprocy, int nprocz, in int main(int argc, char **argv) { // Initialize MPI + int rank,nprocs; MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - int rank = comm.getRank(); - int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&nprocs); // Run the test with different domains int N_errors = 0; @@ -284,9 +289,10 @@ int main(int argc, char **argv) } // Finished - comm.barrier(); - int N_errors_global = comm.sumReduce( N_errors ); - comm.barrier(); + MPI_Barrier(comm); + int N_errors_global=0; + MPI_Allreduce( &N_errors, &N_errors_global, 1, MPI_INT, MPI_SUM, comm ); + MPI_Barrier(comm); MPI_Finalize(); if ( rank==0 ) { if ( N_errors_global==0 ) diff --git a/tests/test_dcel_minkowski.cpp b/tests/test_dcel_minkowski.cpp index 2669b522..0d6cbca9 100644 --- a/tests/test_dcel_minkowski.cpp +++ b/tests/test_dcel_minkowski.cpp @@ -26,9 +26,9 @@ std::shared_ptr loadInputs( ) int main(int argc, char **argv) { MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); - //int rank = comm.getRank(); - //int nprocs = comm.getSize(); + MPI_Comm comm = MPI_COMM_WORLD; + //int rank = MPI_WORLD_RANK(); + //int nprocs = MPI_WORLD_SIZE(); int toReturn = 0; { int i,j,k; @@ -99,7 +99,7 @@ int main(int argc, char **argv) } PROFILE_SAVE("test_dcel_minkowski"); - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return toReturn; } diff --git a/tests/test_dcel_tri_normal.cpp b/tests/test_dcel_tri_normal.cpp index b6497140..1e85b1f3 100644 --- a/tests/test_dcel_tri_normal.cpp +++ b/tests/test_dcel_tri_normal.cpp @@ -26,7 +26,7 @@ std::shared_ptr loadInputs( ) int main(int argc, char **argv) { MPI_Init(&argc,&argv); - Utilities::MPI comm( MPI_COMM_WORLD ); + MPI_Comm comm = MPI_COMM_WORLD; int toReturn = 0; { int i,j,k; @@ -136,7 +136,7 @@ int main(int argc, char **argv) if (count_check > 0) toReturn=2; else printf("Succeeded. 
\n"); } - comm.barrier(); + MPI_Barrier(comm); MPI_Finalize(); return toReturn; } From 679c53a4690876755a64a44c79d4db856c58dd01 Mon Sep 17 00:00:00 2001 From: James E McClure Date: Thu, 19 Mar 2020 13:35:10 -0400 Subject: [PATCH 059/121] Add wall factor to morphgrow to change solid penalty term --- analysis/morphology.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/analysis/morphology.cpp b/analysis/morphology.cpp index 72a17892..8f658328 100644 --- a/analysis/morphology.cpp +++ b/analysis/morphology.cpp @@ -692,6 +692,8 @@ double MorphGrow(DoubleArray &BoundaryDist, DoubleArray &Dist, Array &id, int Nz = Dm->Nz; int rank = Dm->rank(); + double WALL_FACTOR = 0.0; // 1.0 if you want to penalize movements close to solid + double count=0.0; for (int k=1; k &id, for (int j=1; j MAX_DISPLACEMENT) MAX_DISPLACEMENT= fabs(wallweight*morph_delta); if (Dist(i,j,k) - wallweight*morph_delta < 0.0){ @@ -769,7 +770,7 @@ double MorphGrow(DoubleArray &BoundaryDist, DoubleArray &Dist, Array &id, for (int j=1; j Date: Thu, 19 Mar 2020 13:41:31 -0400 Subject: [PATCH 060/121] Add wall factor to morphgrow to change solid penalty term --- analysis/morphology.cpp | 10 ++++------ analysis/morphology.h | 2 +- models/ColorModel.cpp | 3 ++- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/analysis/morphology.cpp b/analysis/morphology.cpp index 8f658328..f6bb3469 100644 --- a/analysis/morphology.cpp +++ b/analysis/morphology.cpp @@ -685,15 +685,13 @@ double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptr &id, std::shared_ptr Dm, double TargetGrowth) +double MorphGrow(DoubleArray &BoundaryDist, DoubleArray &Dist, Array &id, std::shared_ptr Dm, double TargetGrowth, double WallFactor) { int Nx = Dm->Nx; int Ny = Dm->Ny; int Nz = Dm->Nz; int rank = Dm->rank(); - - double WALL_FACTOR = 0.0; // 1.0 if you want to penalize movements close to solid - + double count=0.0; for (int k=1; k &id, for (int j=1; j MAX_DISPLACEMENT) MAX_DISPLACEMENT= fabs(wallweight*morph_delta); if (Dist(i,j,k) - wallweight*morph_delta < 0.0){ @@ -770,7 +768,7 @@ double MorphGrow(DoubleArray &BoundaryDist, DoubleArray &Dist, Array &id, for (int j=1; j Dm, double VoidFraction, signed char ErodeLabel, signed char ReplaceLabel); double MorphDrain(DoubleArray &SignDist, signed char *id, std::shared_ptr Dm, double VoidFraction); -double MorphGrow(DoubleArray &BoundaryDist, DoubleArray &Dist, Array &id, std::shared_ptr Dm, double TargetVol); +double MorphGrow(DoubleArray &BoundaryDist, DoubleArray &Dist, Array &id, std::shared_ptr Dm, double TargetVol, double WallFactor); diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 4ef7573f..05004110 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -1296,6 +1296,7 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta double vF = 0.f; double vS = 0.f; double delta_volume; + double WallFactor = 0.0; DoubleArray phase(Nx,Ny,Nz); IntArray phase_label(Nx,Ny,Nz);; @@ -1395,7 +1396,7 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta double target_delta_volume_incremental = target_delta_volume; if (fabs(target_delta_volume) > 0.01*volume_initial) target_delta_volume_incremental = 0.01*volume_initial*target_delta_volume/fabs(target_delta_volume); - delta_volume = MorphGrow(Averages->SDs,phase_distance,phase_id,Averages->Dm, target_delta_volume_incremental); + delta_volume = MorphGrow(Averages->SDs,phase_distance,phase_id,Averages->Dm, target_delta_volume_incremental, 
WallFactor); for (int k=0; k Date: Sat, 21 Mar 2020 09:45:43 -0400 Subject: [PATCH 061/121] make sure input database is updated across all ranks --- analysis/runAnalysis.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/analysis/runAnalysis.cpp b/analysis/runAnalysis.cpp index 6c76f58b..384d4d69 100644 --- a/analysis/runAnalysis.cpp +++ b/analysis/runAnalysis.cpp @@ -907,9 +907,8 @@ void runAnalysis::run(int timestep, std::shared_ptr input_db, TwoPhase // Spawn a thread to write the restart file // if ( matches(type,AnalysisType::CreateRestart) ) { if (timestep%d_restart_interval==0){ - + input_db->putScalar( "Restart", true ); if (d_rank==0) { - input_db->putScalar( "Restart", true ); std::ofstream OutStream("Restart.db"); input_db->print(OutStream, ""); OutStream.close(); @@ -1010,10 +1009,11 @@ void runAnalysis::basic(int timestep, std::shared_ptr input_db, SubPha ScaLBL_CopyToHost(cfq.get(),fq,19*d_Np*sizeof(double)); ScaLBL_CopyToHost(cDen.get(),Den,2*d_Np*sizeof(double)); + color_db->putScalar("timestep",timestep); + color_db->putScalar( "Restart", true ); + input_db->putDatabase("Color", color_db); + if (d_rank==0) { - color_db->putScalar("timestep",timestep); - color_db->putScalar( "Restart", true ); - input_db->putDatabase("Color", color_db); std::ofstream OutStream("Restart.db"); input_db->print(OutStream, ""); OutStream.close(); From 8d9f35d1d384e26ba84fb2e3bcdc7318a43eac4f Mon Sep 17 00:00:00 2001 From: James E McClure Date: Sat, 21 Mar 2020 09:45:57 -0400 Subject: [PATCH 062/121] updating R helper functions --- example/Workflow/HelperFunctions.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/example/Workflow/HelperFunctions.R b/example/Workflow/HelperFunctions.R index 6c8bd903..669b28fe 100644 --- a/example/Workflow/HelperFunctions.R +++ b/example/Workflow/HelperFunctions.R @@ -7,19 +7,20 @@ ReadDatabase<-function(FILE){ INPUT<-gsub(';','',readLines(FILE)) S<-gsub('tauA = ','',gsub("\\s+"," ",(grep("tauA",INPUT,value=TRUE)))) - TAU_A = as.numeric(S) + TAU_A = as.numeric(gsub("/.*","",S)) S<-gsub('tauB = ','',gsub("\\s+"," ",(grep("tauB",INPUT,value=TRUE)))) - TAU_B = as.numeric(S) + TAU_B = as.numeric(gsub("/.*","",S)) S<-gsub('rhoA = ','',gsub("\\s+"," ",(grep("rhoA",INPUT,value=TRUE)))) - RHO_A = as.numeric(S) + RHO_A = as.numeric(gsub("/.*","",S)) S<-gsub('rhoB = ','',gsub("\\s+"," ",(grep("rhoB",INPUT,value=TRUE)))) - RHO_B = as.numeric(S) + RHO_B = as.numeric(gsub("/.*","",S)) S<-gsub('alpha = ','',gsub("\\s+"," ",(grep("alpha",INPUT,value=TRUE)))) - ALPHA = as.numeric(S) + ALPHA = as.numeric(gsub("/.*","",S)) # Read the affinity S<-gsub('ComponentAffinity = ','',gsub("\\s+"," ",(grep("ComponentAffinity",INPUT,value=TRUE)))) + S<-gsub("/.*","",S) AFFINITY<-as.numeric(unlist(strsplit(S,", "))) PARAMETERS<-c(TAU_A,TAU_B,RHO_A,RHO_B,ALPHA,AFFINITY) From ad20322f31c01d3604b321553a113e55bc972e49 Mon Sep 17 00:00:00 2001 From: James E McClure Date: Sat, 21 Mar 2020 09:53:47 -0400 Subject: [PATCH 063/121] refactor refine pp tool --- tests/lbpm_refine_pp.cpp | 41 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/tests/lbpm_refine_pp.cpp b/tests/lbpm_refine_pp.cpp index d90dbb04..ad729aa2 100644 --- a/tests/lbpm_refine_pp.cpp +++ b/tests/lbpm_refine_pp.cpp @@ -40,7 +40,6 @@ int main(int argc, char **argv) auto domain_db = db->getDatabase( "Domain" ); // Read domain parameters - auto L = domain_db->getVector( "L" ); auto size = domain_db->getVector( "n" ); auto nproc = 
domain_db->getVector( "nproc" ); auto ReadValues = domain_db->getVector( "ReadValues" ); @@ -92,8 +91,42 @@ int main(int argc, char **argv) } } Dm.CommInit(); - + + Domain Mask(rnx,rny,rnz,rank,nprocx,nprocy,nprocz,Lx,Ly,Lz,BoundaryCondition); + Mask->ReadIDs(); + Mask.CommInit(); + for (int i=0; iid[i]; // save what was read + + // Generate the signed distance map + // Initialize the domain and communication + Array Labels(nx,ny,nz); DoubleArray SignDist(nx,ny,nz); + + // Solve for the position of the solid phase + for (int k=0;kid[n]; + if (label > 0) Labels(i,j,k) = 1; + else Labels(i,j,k) = 0; + } + } + } + // Initialize the signed distance function + for (int k=0;kSDs(i,j,k) = 2.0*double(Labels(i,j,k))-1.0; + } + } + } + // MeanFilter(Averages->SDs); + if (rank==0) printf("Initialized solid phase -- Converting to Signed Distance function \n"); + CalcDist(SignDist,Labels,*Mask); + // Read the signed distance from file sprintf(LocalRankFilename,"SignDist.%05i",rank); FILE *DIST = fopen(LocalRankFilename,"rb"); @@ -102,7 +135,7 @@ int main(int argc, char **argv) if (ReadSignDist != size_t(N)) printf("lbpm_refine_pp: Error reading signed distance function (rank=%i)\n",rank); fclose(DIST); - char *Labels; + /* char *Labels; Labels = new char[N]; sprintf(LocalRankFilename,"ID.%05i",rank); FILE *LABELS = fopen(LocalRankFilename,"rb"); @@ -110,7 +143,7 @@ int main(int argc, char **argv) ReadLabels=fread(Labels,1,N,LABELS); if (ReadLabels != size_t(N)) printf("lbpm_refine_pp: Error reading ID (rank=%i)\n",rank); fclose(LABELS); - +*/ if ( rank==0 ) printf("Set up Domain, read input distance \n"); DoubleArray RefinedSignDist(rnx,rny,rnz); From afbef5075208d4fcf8308a0bc1f8b5034480a322 Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Sat, 21 Mar 2020 10:03:20 -0400 Subject: [PATCH 064/121] update lbpm_refine_pp --- tests/lbpm_refine_pp.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/lbpm_refine_pp.cpp b/tests/lbpm_refine_pp.cpp index ad729aa2..5f1c5875 100644 --- a/tests/lbpm_refine_pp.cpp +++ b/tests/lbpm_refine_pp.cpp @@ -12,6 +12,7 @@ #include "common/Communication.h" #include "common/Domain.h" #include "analysis/pmmc.h" +#include "analysis/distance.h" int main(int argc, char **argv) { @@ -93,10 +94,9 @@ int main(int argc, char **argv) Dm.CommInit(); Domain Mask(rnx,rny,rnz,rank,nprocx,nprocy,nprocz,Lx,Ly,Lz,BoundaryCondition); - Mask->ReadIDs(); + Mask.ReadIDs(); Mask.CommInit(); - for (int i=0; iid[i]; // save what was read - + // Generate the signed distance map // Initialize the domain and communication Array Labels(nx,ny,nz); @@ -108,7 +108,7 @@ int main(int argc, char **argv) for (int i=0;iid[n]; + signed char label = Mask.id[n]; if (label > 0) Labels(i,j,k) = 1; else Labels(i,j,k) = 0; } @@ -119,13 +119,13 @@ int main(int argc, char **argv) for (int j=0;jSDs(i,j,k) = 2.0*double(Labels(i,j,k))-1.0; + SignDist(i,j,k) = 2.0*double(Labels(i,j,k))-1.0; } } } // MeanFilter(Averages->SDs); if (rank==0) printf("Initialized solid phase -- Converting to Signed Distance function \n"); - CalcDist(SignDist,Labels,*Mask); + CalcDist(SignDist,Labels,Mask); // Read the signed distance from file sprintf(LocalRankFilename,"SignDist.%05i",rank); @@ -178,7 +178,7 @@ int main(int argc, char **argv) pt.y=0.5*(rj-1)+1.f; pt.z=0.5*(rk-1)+1.f; RefinedSignDist(ri,rj,rk) = LocalApprox.eval(pt); - RefineLabel(ri,rj,rk) = Labels[k*nx*ny+j*nx+i]; + RefineLabel(ri,rj,rk) = Labels(i,j,k); } } } From b206ad80a22d5d82112bd3137f0af7cf9d12ca1c Mon Sep 17 00:00:00 2001 From: 
James E McClure Date: Sat, 21 Mar 2020 10:06:36 -0400 Subject: [PATCH 065/121] use Filename in refine pp --- tests/lbpm_refine_pp.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/lbpm_refine_pp.cpp b/tests/lbpm_refine_pp.cpp index 5f1c5875..be2ba346 100644 --- a/tests/lbpm_refine_pp.cpp +++ b/tests/lbpm_refine_pp.cpp @@ -94,7 +94,13 @@ int main(int argc, char **argv) Dm.CommInit(); Domain Mask(rnx,rny,rnz,rank,nprocx,nprocy,nprocz,Lx,Ly,Lz,BoundaryCondition); - Mask.ReadIDs(); + if (domain_db->keyExists( "Filename" )){ + auto Filename = domain_db->getScalar( "Filename" ); + Mask.Decomp(Filename); + } + else{ + Mask.ReadIDs(); + } Mask.CommInit(); // Generate the signed distance map From dbbd8e30b7e28951231f9a4c7445894d6e779750 Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Sat, 21 Mar 2020 10:32:16 -0400 Subject: [PATCH 066/121] fix refine pp --- tests/lbpm_refine_pp.cpp | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/tests/lbpm_refine_pp.cpp b/tests/lbpm_refine_pp.cpp index be2ba346..0f0ffdda 100644 --- a/tests/lbpm_refine_pp.cpp +++ b/tests/lbpm_refine_pp.cpp @@ -52,6 +52,7 @@ int main(int argc, char **argv) int nprocx = nproc[0]; int nprocy = nproc[1]; int nprocz = nproc[2]; + int BoundaryCondition=0; // Check that the number of processors >= the number of ranks if ( rank==0 ) { @@ -63,15 +64,26 @@ int main(int argc, char **argv) ERROR("Insufficient number of processors"); } - char LocalRankFilename[40]; + //Domain Mask(nx,ny,nz,rank,nprocx,nprocy,nprocz,Lx,Ly,Lz,BoundaryCondition); + Domain Mask(domain_db,MPI_COMM_WORLD); + if (domain_db->keyExists( "Filename" )){ + auto Filename = domain_db->getScalar( "Filename" ); + if (rank==0) printf("Reading domain from %s \n",Filename.c_str()); + Mask.Decomp(Filename); + if (rank==0) printf("Complete. 
\n"); + } + else{ + Mask.ReadIDs(); + } + Mask.CommInit(); + char LocalRankFilename[40]; int rnx=2*nx; int rny=2*ny; int rnz=2*nz; if (rank==0) printf("Refining mesh to %i x %i x %i \n",rnx,rny,rnz); - int BoundaryCondition=0; Domain Dm(rnx,rny,rnz,rank,nprocx,nprocy,nprocz,Lx,Ly,Lz,BoundaryCondition); // Communication the halos @@ -83,6 +95,7 @@ int main(int argc, char **argv) int N = nx*ny*nz; // Define communication sub-domain -- everywhere + if (rank==0) printf("Initialize refined domain \n"); for (int k=0; kkeyExists( "Filename" )){ - auto Filename = domain_db->getScalar( "Filename" ); - Mask.Decomp(Filename); - } - else{ - Mask.ReadIDs(); - } - Mask.CommInit(); - // Generate the signed distance map // Initialize the domain and communication Array Labels(nx,ny,nz); @@ -133,7 +136,7 @@ int main(int argc, char **argv) if (rank==0) printf("Initialized solid phase -- Converting to Signed Distance function \n"); CalcDist(SignDist,Labels,Mask); - // Read the signed distance from file + /* // Read the signed distance from file sprintf(LocalRankFilename,"SignDist.%05i",rank); FILE *DIST = fopen(LocalRankFilename,"rb"); size_t ReadSignDist; @@ -141,7 +144,7 @@ int main(int argc, char **argv) if (ReadSignDist != size_t(N)) printf("lbpm_refine_pp: Error reading signed distance function (rank=%i)\n",rank); fclose(DIST); - /* char *Labels; + char *Labels; Labels = new char[N]; sprintf(LocalRankFilename,"ID.%05i",rank); FILE *LABELS = fopen(LocalRankFilename,"rb"); From 05ed256b30ab969000fc76750cedad22fe4ded05 Mon Sep 17 00:00:00 2001 From: James E McClure Date: Sat, 21 Mar 2020 10:37:32 -0400 Subject: [PATCH 067/121] adding refine options --- tests/lbpm_refine_pp.cpp | 433 ++++++++++++++++----------------------- 1 file changed, 173 insertions(+), 260 deletions(-) diff --git a/tests/lbpm_refine_pp.cpp b/tests/lbpm_refine_pp.cpp index 0f0ffdda..1a7ff05b 100644 --- a/tests/lbpm_refine_pp.cpp +++ b/tests/lbpm_refine_pp.cpp @@ -192,277 +192,190 @@ int main(int argc, char **argv) } } fillData.fill(RefinedSignDist); - // sprintf(LocalRankFilename,"ID.%05i",rank); - //FILE *ID = fopen(LocalRankFilename,"wb"); - //fwrite(id,1,N,ID); - //fclose(ID); -/* - sprintf(LocalRankFilename,"RefineDist.%05i",rank); - FILE *REFINEDIST = fopen(LocalRankFilename,"wb"); - fwrite(RefinedSignDist.data(),8,rnx*rny*rnz,REFINEDIST); - fclose(REFINEDIST); -*/ - if ( rank==0 ) printf("Write output \n"); - DoubleArray BlockDist(nx,ny,nz); - FILE *WRITEID, *REFINEDIST; - char * id; - id = new char [N]; - int writerank; - // Write output blocks with the same sub-domain size as origina - // refinement increases the size of the process grid - writerank = 8*Dm.kproc()*nprocx*nprocy + 4*Dm.jproc()*nprocx + 2*Dm.iproc(); - for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; - else id[k*nx*ny + j*nx + i] = RefineLabel(i,j,k); + if (domain_db->keyExists( "Filename" )){ + auto Filename = domain_db->getScalar( "Filename" ); + if ( rank==0 ) printf("Write output \n"); + sprintf(LocalRankFilename,Filename.c_str(),".refine"); + WRITEID = fopen(LocalRankFilename,"wb"); + fwrite(RefineLabel.data(),1,rnx*rny*rnz,WRITEID); + fclose(WRITEID); + } + else{ + DoubleArray BlockDist(nx,ny,nz); + FILE *WRITEID, *REFINEDIST; + char * id; + id = new char [N]; + int writerank; + + // Write output blocks with the same sub-domain size as origina + // refinement increases the size of the process grid + writerank = 8*Dm.kproc()*nprocx*nprocy + 4*Dm.jproc()*nprocx + 2*Dm.iproc(); + for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; + else id[k*nx*ny + j*nx + i] = 
RefineLabel(i,j,k); + } } } - } - sprintf(LocalRankFilename,"RefineDist.%05i",writerank); - REFINEDIST = fopen(LocalRankFilename,"wb"); - fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); - fclose(REFINEDIST); - -/* for (int k=0; k 0.f) - id[k*nx*ny + j*nx + i]=2; - else - id[k*nx*ny + j*nx + i]= 0; - } - } - } - */ - sprintf(LocalRankFilename,"RefineID.%05i",writerank); - WRITEID = fopen(LocalRankFilename,"wb"); - fwrite(id,1,nx*ny*nz,WRITEID); - fclose(WRITEID); - - writerank = 8*Dm.kproc()*nprocx*nprocy + 4*Dm.jproc()*nprocx + 2*Dm.iproc()+1; - for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; - else id[k*nx*ny + j*nx + i] = RefineLabel(i+nx-2,j,k); - } - } - } - sprintf(LocalRankFilename,"RefineDist.%05i",writerank); - REFINEDIST = fopen(LocalRankFilename,"wb"); - fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); - fclose(REFINEDIST); - -/* for (int k=0; k 0.f) - id[k*nx*ny + j*nx + i]=2; - else - id[k*nx*ny + j*nx + i]=0; - } - } - } - */ - sprintf(LocalRankFilename,"RefineID.%05i",writerank); - WRITEID = fopen(LocalRankFilename,"wb"); - fwrite(id,1,nx*ny*nz,WRITEID); - fclose(WRITEID); + sprintf(LocalRankFilename,"RefineDist.%05i",writerank); + REFINEDIST = fopen(LocalRankFilename,"wb"); + fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); + fclose(REFINEDIST); - writerank = (2*Dm.kproc())*4*nprocx*nprocy + (2*Dm.jproc()+1)*2*nprocx + 2*Dm.iproc()+1; - for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; - else id[k*nx*ny + j*nx + i] = RefineLabel(i+nx-2,j+ny-2,k); - } - } - } - sprintf(LocalRankFilename,"RefineDist.%05i",writerank); - REFINEDIST = fopen(LocalRankFilename,"wb"); - fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); - fclose(REFINEDIST); + sprintf(LocalRankFilename,"RefineID.%05i",writerank); + WRITEID = fopen(LocalRankFilename,"wb"); + fwrite(id,1,nx*ny*nz,WRITEID); + fclose(WRITEID); -/* for (int k=0; k 0.f) - id[k*nx*ny + j*nx + i]=2; - else - id[k*nx*ny + j*nx + i]=0; + writerank = 8*Dm.kproc()*nprocx*nprocy + 4*Dm.jproc()*nprocx + 2*Dm.iproc()+1; + for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; + else id[k*nx*ny + j*nx + i] = RefineLabel(i+nx-2,j,k); + } } } - } - */ - sprintf(LocalRankFilename,"RefineID.%05i",writerank); - WRITEID = fopen(LocalRankFilename,"wb"); - fwrite(id,1,nx*ny*nz,WRITEID); - fclose(WRITEID); - - writerank = (2*Dm.kproc())*4*nprocx*nprocy + (2*Dm.jproc()+1)*2*nprocx + 2*Dm.iproc(); - for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; - else id[k*nx*ny + j*nx + i] = RefineLabel(i,j+ny-2,k); - } - } - } - sprintf(LocalRankFilename,"RefineDist.%05i",writerank); - REFINEDIST = fopen(LocalRankFilename,"wb"); - fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); - fclose(REFINEDIST); -/* - for (int k=0; k 0.f) - id[k*nx*ny + j*nx + i]=2; - else - id[k*nx*ny + j*nx + i]=0; - } - } - } - */ - sprintf(LocalRankFilename,"RefineID.%05i",writerank); - WRITEID = fopen(LocalRankFilename,"wb"); - fwrite(id,1,nx*ny*nz,WRITEID); - fclose(WRITEID); - - writerank = (2*Dm.kproc()+1)*4*nprocx*nprocy + (2*Dm.jproc())*2*nprocx + 2*Dm.iproc(); - for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; - else id[k*nx*ny + j*nx + i] = RefineLabel(i,j,k+nz-2); - } - } - } - sprintf(LocalRankFilename,"RefineDist.%05i",writerank); - REFINEDIST = fopen(LocalRankFilename,"wb"); - fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); - fclose(REFINEDIST); -/* - for (int k=0; k 0.f) - id[k*nx*ny + j*nx + i]=2; - else - id[k*nx*ny + j*nx + i]=0; - } - } - } - */ - sprintf(LocalRankFilename,"RefineID.%05i",writerank); - WRITEID = fopen(LocalRankFilename,"wb"); - fwrite(id,1,nx*ny*nz,WRITEID); - fclose(WRITEID); - - writerank = 
(2*Dm.kproc()+1)*4*nprocx*nprocy + (2*Dm.jproc())*2*nprocx + 2*Dm.iproc()+1; - for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; - else id[k*nx*ny + j*nx + i] = RefineLabel(i+nx-2,j,k+nz-2); - } - } - } - sprintf(LocalRankFilename,"RefineDist.%05i",writerank); - REFINEDIST = fopen(LocalRankFilename,"wb"); - fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); - fclose(REFINEDIST); - -/* for (int k=0; k 0.f) - id[k*nx*ny + j*nx + i]=2; - else - id[k*nx*ny + j*nx + i]=0; - } - } - } - */ - sprintf(LocalRankFilename,"RefineID.%05i",writerank); - WRITEID = fopen(LocalRankFilename,"wb"); - fwrite(id,1,nx*ny*nz,WRITEID); - fclose(WRITEID); - - writerank = (2*Dm.kproc()+1)*4*nprocx*nprocy + (2*Dm.jproc()+1)*2*nprocx + 2*Dm.iproc(); - for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; - else id[k*nx*ny + j*nx + i] = RefineLabel(i,j+ny-2,k+nz-2); - } - } - } - sprintf(LocalRankFilename,"RefineDist.%05i",writerank); - REFINEDIST = fopen(LocalRankFilename,"wb"); - fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); - fclose(REFINEDIST); -/* - for (int k=0; k 0.f) - id[k*nx*ny + j*nx + i]=2; - else - id[k*nx*ny + j*nx + i]=0; - } - } - } - */ - sprintf(LocalRankFilename,"RefineID.%05i",writerank); - WRITEID = fopen(LocalRankFilename,"wb"); - fwrite(id,1,nx*ny*nz,WRITEID); - fclose(WRITEID); - - writerank = (2*Dm.kproc()+1)*4*nprocx*nprocy + (2*Dm.jproc()+1)*2*nprocx + 2*Dm.iproc()+1; - for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; - else id[k*nx*ny + j*nx + i] = RefineLabel(i+nx-2,j+ny-2,k+nz-2); - } - } - } - - sprintf(LocalRankFilename,"RefineDist.%05i",writerank); - REFINEDIST = fopen(LocalRankFilename,"wb"); - fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); - fclose(REFINEDIST); - -/* for (int k=0; k 0.f) - id[k*nx*ny + j*nx + i]=2; - else - id[k*nx*ny + j*nx + i]=0; - } - } - } - */ - sprintf(LocalRankFilename,"RefineID.%05i",writerank); - WRITEID = fopen(LocalRankFilename,"wb"); - fwrite(id,1,nx*ny*nz,WRITEID); - fclose(WRITEID); + sprintf(LocalRankFilename,"RefineDist.%05i",writerank); + REFINEDIST = fopen(LocalRankFilename,"wb"); + fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); + fclose(REFINEDIST); + sprintf(LocalRankFilename,"RefineID.%05i",writerank); + WRITEID = fopen(LocalRankFilename,"wb"); + fwrite(id,1,nx*ny*nz,WRITEID); + fclose(WRITEID); + + + writerank = (2*Dm.kproc())*4*nprocx*nprocy + (2*Dm.jproc()+1)*2*nprocx + 2*Dm.iproc()+1; + for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; + else id[k*nx*ny + j*nx + i] = RefineLabel(i+nx-2,j+ny-2,k); + } + } + } + sprintf(LocalRankFilename,"RefineDist.%05i",writerank); + REFINEDIST = fopen(LocalRankFilename,"wb"); + fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); + fclose(REFINEDIST); + + + sprintf(LocalRankFilename,"RefineID.%05i",writerank); + WRITEID = fopen(LocalRankFilename,"wb"); + fwrite(id,1,nx*ny*nz,WRITEID); + fclose(WRITEID); + + writerank = (2*Dm.kproc())*4*nprocx*nprocy + (2*Dm.jproc()+1)*2*nprocx + 2*Dm.iproc(); + for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; + else id[k*nx*ny + j*nx + i] = RefineLabel(i,j+ny-2,k); + } + } + } + sprintf(LocalRankFilename,"RefineDist.%05i",writerank); + REFINEDIST = fopen(LocalRankFilename,"wb"); + fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); + fclose(REFINEDIST); + + sprintf(LocalRankFilename,"RefineID.%05i",writerank); + WRITEID = fopen(LocalRankFilename,"wb"); + fwrite(id,1,nx*ny*nz,WRITEID); + fclose(WRITEID); + + writerank = (2*Dm.kproc()+1)*4*nprocx*nprocy + (2*Dm.jproc())*2*nprocx + 2*Dm.iproc(); + for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; + else id[k*nx*ny + j*nx + i] = RefineLabel(i,j,k+nz-2); + } + } + } 
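The block-writing code in lbpm_refine_pp.cpp doubles the mesh in every direction: each refined node (ri,rj,rk) samples the local approximation at the coarse coordinate 0.5*(r-1)+1 in each direction, labels are copied from the coarse cell, and each coarse rank emits eight refined sub-blocks so the effective process grid becomes (2*nprocx, 2*nprocy, 2*nprocz). The writerank expressions encode which refined rank owns each octant. A small sketch of that index arithmetic, equivalent to the formulas above (helper name illustrative):

    #include <cstdio>

    // Rank that owns octant (di,dj,dk), with di/dj/dk in {0,1}, of a coarse
    // rank's sub-domain once the process grid is doubled in every direction.
    static int refinedWriterRank(int iproc, int jproc, int kproc,
                                 int nprocx, int nprocy,
                                 int di, int dj, int dk)
    {
        const int rnx = 2 * nprocx;   // refined process grid in x
        const int rny = 2 * nprocy;   // refined process grid in y
        return (2 * kproc + dk) * rnx * rny
             + (2 * jproc + dj) * rnx
             + (2 * iproc + di);
    }

    int main()
    {
        // Coarse rank (1,0,0) of a 2x2x2 grid: its +x octant is written as
        // rank 3 of the refined 4x4x4 grid, matching
        // 8*kproc*nprocx*nprocy + 4*jproc*nprocx + 2*iproc + 1.
        printf("%d\n", refinedWriterRank(1, 0, 0, 2, 2, 1, 0, 0));
        return 0;
    }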
+ sprintf(LocalRankFilename,"RefineDist.%05i",writerank); + REFINEDIST = fopen(LocalRankFilename,"wb"); + fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); + fclose(REFINEDIST); + + sprintf(LocalRankFilename,"RefineID.%05i",writerank); + WRITEID = fopen(LocalRankFilename,"wb"); + fwrite(id,1,nx*ny*nz,WRITEID); + fclose(WRITEID); + + writerank = (2*Dm.kproc()+1)*4*nprocx*nprocy + (2*Dm.jproc())*2*nprocx + 2*Dm.iproc()+1; + for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; + else id[k*nx*ny + j*nx + i] = RefineLabel(i+nx-2,j,k+nz-2); + } + } + } + sprintf(LocalRankFilename,"RefineDist.%05i",writerank); + REFINEDIST = fopen(LocalRankFilename,"wb"); + fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); + fclose(REFINEDIST); + + sprintf(LocalRankFilename,"RefineID.%05i",writerank); + WRITEID = fopen(LocalRankFilename,"wb"); + fwrite(id,1,nx*ny*nz,WRITEID); + fclose(WRITEID); + + writerank = (2*Dm.kproc()+1)*4*nprocx*nprocy + (2*Dm.jproc()+1)*2*nprocx + 2*Dm.iproc(); + for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; + else id[k*nx*ny + j*nx + i] = RefineLabel(i,j+ny-2,k+nz-2); + } + } + } + sprintf(LocalRankFilename,"RefineDist.%05i",writerank); + REFINEDIST = fopen(LocalRankFilename,"wb"); + fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); + fclose(REFINEDIST); + + sprintf(LocalRankFilename,"RefineID.%05i",writerank); + WRITEID = fopen(LocalRankFilename,"wb"); + fwrite(id,1,nx*ny*nz,WRITEID); + fclose(WRITEID); + + writerank = (2*Dm.kproc()+1)*4*nprocx*nprocy + (2*Dm.jproc()+1)*2*nprocx + 2*Dm.iproc()+1; + for (int k=0; k 0) id[k*nx*ny + j*nx + i]=2; + else id[k*nx*ny + j*nx + i] = RefineLabel(i+nx-2,j+ny-2,k+nz-2); + } + } + } + + sprintf(LocalRankFilename,"RefineDist.%05i",writerank); + REFINEDIST = fopen(LocalRankFilename,"wb"); + fwrite(BlockDist.data(),8,nx*ny*nz,REFINEDIST); + fclose(REFINEDIST); + + sprintf(LocalRankFilename,"RefineID.%05i",writerank); + WRITEID = fopen(LocalRankFilename,"wb"); + fwrite(id,1,nx*ny*nz,WRITEID); + fclose(WRITEID); + } } MPI_Barrier(comm); MPI_Finalize(); From 8645d0b2a778a47e4c8d9c73d092375b1f19034f Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Sat, 21 Mar 2020 10:42:45 -0400 Subject: [PATCH 068/121] write full refined ID --- tests/lbpm_refine_pp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lbpm_refine_pp.cpp b/tests/lbpm_refine_pp.cpp index 1a7ff05b..df6edbf1 100644 --- a/tests/lbpm_refine_pp.cpp +++ b/tests/lbpm_refine_pp.cpp @@ -198,7 +198,7 @@ int main(int argc, char **argv) auto Filename = domain_db->getScalar( "Filename" ); if ( rank==0 ) printf("Write output \n"); sprintf(LocalRankFilename,Filename.c_str(),".refine"); - WRITEID = fopen(LocalRankFilename,"wb"); + FILE *WRITEID = fopen("refine.raw","wb"); fwrite(RefineLabel.data(),1,rnx*rny*rnz,WRITEID); fclose(WRITEID); } From 28f3f9dcf8e7d09c2949d1d8cc727dce27fc5165 Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Sat, 21 Mar 2020 13:02:17 -0400 Subject: [PATCH 069/121] using aggregator to write 1x 2x data --- tests/lbpm_refine_pp.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/lbpm_refine_pp.cpp b/tests/lbpm_refine_pp.cpp index df6edbf1..4734c19b 100644 --- a/tests/lbpm_refine_pp.cpp +++ b/tests/lbpm_refine_pp.cpp @@ -187,7 +187,8 @@ int main(int argc, char **argv) pt.y=0.5*(rj-1)+1.f; pt.z=0.5*(rk-1)+1.f; RefinedSignDist(ri,rj,rk) = LocalApprox.eval(pt); - RefineLabel(ri,rj,rk) = Labels(i,j,k); + RefineLabel(ri,rj,rk) = Labels(i,j,k); + Dm.id[n] = Labels(i,j,k); } } } @@ -197,10 +198,11 @@ int main(int argc, char **argv) if 
(domain_db->keyExists( "Filename" )){ auto Filename = domain_db->getScalar( "Filename" ); if ( rank==0 ) printf("Write output \n"); - sprintf(LocalRankFilename,Filename.c_str(),".refine"); - FILE *WRITEID = fopen("refine.raw","wb"); - fwrite(RefineLabel.data(),1,rnx*rny*rnz,WRITEID); - fclose(WRITEID); + Dm.AggregateLabels("id_2x.raw"); + Mask.AggregateLabels("id.raw"); + //FILE *WRITEID = fopen("refine.raw","wb"); + //fwrite(RefineLabel.data(),1,rnx*rny*rnz,WRITEID); + //fclose(WRITEID); } else{ DoubleArray BlockDist(nx,ny,nz); From 7258867983a421957435ae2934c81bc61607c8d6 Mon Sep 17 00:00:00 2001 From: James McClure Date: Tue, 31 Mar 2020 15:43:36 -0400 Subject: [PATCH 070/121] update shell aggregation protocol --- models/ColorModel.cpp | 64 +++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 20 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 05004110..a7d06409 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -1297,6 +1297,7 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta double vS = 0.f; double delta_volume; double WallFactor = 0.0; + bool USE_CONNECTED_NWP = false; DoubleArray phase(Nx,Ny,Nz); IntArray phase_label(Nx,Ny,Nz);; @@ -1325,32 +1326,55 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta fclose(INPUT); */ // 2. Identify connected components of phase field -> phase_label - BlobIDstruct new_index; - ComputeGlobalBlobIDs(Nx-2,Ny-2,Nz-2,rank_info,phase,Averages->SDs,vF,vS,phase_label,comm); - MPI_Barrier(comm); - // only operate on component "0" - count = 0.0; - double second_biggest = 0.0; + if (USE_CONNECTED_NWP){ + BlobIDstruct new_index; + ComputeGlobalBlobIDs(Nx-2,Ny-2,Nz-2,rank_info,phase,Averages->SDs,vF,vS,phase_label,comm); + MPI_Barrier(comm); - for (int k=0; kComm, count); + second_biggest = sumReduce( Dm->Comm, second_biggest); + } + else { + // use the whole NWP + for (int k=0; kSDs(i,j,k) > 0.f){ + if (phase(i,j,k) > 0.f ){ + phase_id(i,j,k) = 0; + } + else { + phase_id(i,j,k) = 1; + } + } + else { + phase_id(i,j,k) = 1; + } } } } - } - double volume_connected = sumReduce( Dm->Comm, count); - second_biggest = sumReduce( Dm->Comm, second_biggest); + } /*int reach_x, reach_y, reach_z; for (int k=0; k Date: Tue, 31 Mar 2020 15:55:05 -0400 Subject: [PATCH 071/121] cloning databse for restart --- analysis/runAnalysis.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/analysis/runAnalysis.cpp b/analysis/runAnalysis.cpp index 384d4d69..51b0214c 100644 --- a/analysis/runAnalysis.cpp +++ b/analysis/runAnalysis.cpp @@ -907,10 +907,11 @@ void runAnalysis::run(int timestep, std::shared_ptr input_db, TwoPhase // Spawn a thread to write the restart file // if ( matches(type,AnalysisType::CreateRestart) ) { if (timestep%d_restart_interval==0){ + auto Restart_db = input_db->clone(); input_db->putScalar( "Restart", true ); if (d_rank==0) { std::ofstream OutStream("Restart.db"); - input_db->print(OutStream, ""); + Restart_db->print(OutStream, ""); OutStream.close(); } // Write the restart file (using a seperate thread) @@ -1008,22 +1009,21 @@ void runAnalysis::basic(int timestep, std::shared_ptr input_db, SubPha cfq = std::shared_ptr(new double[19*d_Np],DeleteArray); ScaLBL_CopyToHost(cfq.get(),fq,19*d_Np*sizeof(double)); ScaLBL_CopyToHost(cDen.get(),Den,2*d_Np*sizeof(double)); - - color_db->putScalar("timestep",timestep); - color_db->putScalar( "Restart", true ); - input_db->putDatabase("Color", color_db); 
- + // clone the input database to avoid modifying shared data + auto Restart_db = input_db->clone(); + auto tmp_color_db = Restart_db.getDatabase( "Color" ); + tmp_color_db.putScalar("timestep",timestep); + tmp_color_db.putScalar( "Restart", true ); + Restart_db.putDatabase("Color", tmp_color_db); if (d_rank==0) { std::ofstream OutStream("Restart.db"); - input_db->print(OutStream, ""); + Restart_db.print(OutStream, ""); OutStream.close(); - } // Write the restart file (using a seperate thread) auto work1 = new WriteRestartWorkItem(d_restartFile.c_str(),cDen,cfq,d_Np); work1->add_dependency(d_wait_restart); d_wait_restart = d_tpool.add_work(work1); - } if (timestep%d_visualization_interval==0){ From c4f15d8727516207d48ca63031c55ddcfc06945e Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Tue, 31 Mar 2020 16:12:24 -0400 Subject: [PATCH 072/121] fixed issue cloning db --- analysis/runAnalysis.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/analysis/runAnalysis.cpp b/analysis/runAnalysis.cpp index 51b0214c..7150ab31 100644 --- a/analysis/runAnalysis.cpp +++ b/analysis/runAnalysis.cpp @@ -907,12 +907,12 @@ void runAnalysis::run(int timestep, std::shared_ptr input_db, TwoPhase // Spawn a thread to write the restart file // if ( matches(type,AnalysisType::CreateRestart) ) { if (timestep%d_restart_interval==0){ - auto Restart_db = input_db->clone(); - input_db->putScalar( "Restart", true ); + auto Restart_db = input_db->cloneDatabase(); + // Restart_db->putScalar( "Restart", true ); if (d_rank==0) { - std::ofstream OutStream("Restart.db"); - Restart_db->print(OutStream, ""); - OutStream.close(); + // std::ofstream OutStream("Restart.db"); + // Restart_db->print(OutStream, ""); + // OutStream.close(); } // Write the restart file (using a seperate thread) auto work = new WriteRestartWorkItem(d_restartFile.c_str(),cDen,cfq,d_Np); @@ -1010,14 +1010,14 @@ void runAnalysis::basic(int timestep, std::shared_ptr input_db, SubPha ScaLBL_CopyToHost(cfq.get(),fq,19*d_Np*sizeof(double)); ScaLBL_CopyToHost(cDen.get(),Den,2*d_Np*sizeof(double)); // clone the input database to avoid modifying shared data - auto Restart_db = input_db->clone(); - auto tmp_color_db = Restart_db.getDatabase( "Color" ); - tmp_color_db.putScalar("timestep",timestep); - tmp_color_db.putScalar( "Restart", true ); - Restart_db.putDatabase("Color", tmp_color_db); + auto Restart_db = input_db->cloneDatabase(); + auto tmp_color_db = Restart_db->getDatabase( "Color" ); + tmp_color_db->putScalar("timestep",timestep); + tmp_color_db->putScalar( "Restart", true ); + Restart_db->putDatabase("Color", tmp_color_db); if (d_rank==0) { std::ofstream OutStream("Restart.db"); - Restart_db.print(OutStream, ""); + Restart_db->print(OutStream, ""); OutStream.close(); } // Write the restart file (using a seperate thread) From 7b67f2acfc7c0dfcd570d5b74fca52d81726eea1 Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Tue, 31 Mar 2020 18:05:32 -0400 Subject: [PATCH 073/121] refactor shell aggregation --- models/ColorModel.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index a7d06409..25716a1e 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -1327,6 +1327,8 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta */ // 2. 
Identify connected components of phase field -> phase_label + double volume_connected = 0.0; + double second_biggest = 0.0; if (USE_CONNECTED_NWP){ BlobIDstruct new_index; ComputeGlobalBlobIDs(Nx-2,Ny-2,Nz-2,rank_info,phase,Averages->SDs,vF,vS,phase_label,comm); @@ -1334,7 +1336,6 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta // only operate on component "0" count = 0.0; - double second_biggest = 0.0; for (int k=0; kComm, count); + volume_connected = sumReduce( Dm->Comm, count); second_biggest = sumReduce( Dm->Comm, second_biggest); } else { @@ -1409,13 +1410,16 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta } } + if (USE_CONNECTED_NWP){ if (volume_connected - second_biggest < 2.0*fabs(target_delta_volume) && target_delta_volume < 0.0){ // if connected volume is less than 2% just delete the whole thing if (rank==0) printf("Connected region has shrunk! \n"); REVERSE_FLOW_DIRECTION = true; } + /* else{*/ if (rank==0) printf("Pathway volume / next largest ganglion %f \n",volume_connected/second_biggest ); + } if (rank==0) printf("MorphGrow with target volume fraction change %f \n", target_delta_volume/volume_initial); double target_delta_volume_incremental = target_delta_volume; if (fabs(target_delta_volume) > 0.01*volume_initial) From 0d493275b4eb54253c9cc300f012861224e97a8e Mon Sep 17 00:00:00 2001 From: James McClure Date: Wed, 1 Apr 2020 08:19:59 -0400 Subject: [PATCH 074/121] use kr as target for morph change --- models/ColorModel.cpp | 47 +++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 25716a1e..c6cb563c 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -782,6 +782,20 @@ void ScaLBL_ColorModel::Run(){ double flow_rate_B = volB*(vB_x*dir_x + vB_y*dir_y + vB_z*dir_z); double Ca = fabs(muA*flow_rate_A + muB*flow_rate_B)/(5.796*alpha); + if (SET_CAPILLARY_NUMBER && CURRENT_STEADY_TIMESTEPS%MIN_STEADY_TIMESTEPS < analysis_interval ){ + Fx *= capillary_number / Ca; + Fy *= capillary_number / Ca; + Fz *= capillary_number / Ca; + if (force_mag > 1e-3){ + Fx *= 1e-3/force_mag; // impose ceiling for stability + Fy *= 1e-3/force_mag; + Fz *= 1e-3/force_mag; + } + if (rank == 0) printf(" -- adjust force by factor %f \n ",capillary_number / Ca); + Averages->SetParams(rhoA,rhoB,tauA,tauB,Fx,Fy,Fz,alpha,beta); + color_db->putVector("F",{Fx,Fy,Fz}); + } + if ( morph_timesteps > morph_interval ){ bool isSteady = false; @@ -789,28 +803,21 @@ void ScaLBL_ColorModel::Run(){ isSteady = true; if (CURRENT_STEADY_TIMESTEPS > MAX_STEADY_TIMESTEPS) isSteady = true; - - if (SET_CAPILLARY_NUMBER && RESCALE_FORCE_COUNT < RESCALE_FORCE_MAX){ - RESCALE_FORCE_COUNT++; - Fx *= capillary_number / Ca; - Fy *= capillary_number / Ca; - Fz *= capillary_number / Ca; - - if (force_mag > 1e-3){ - Fx *= 1e-3/force_mag; // impose ceiling for stability - Fy *= 1e-3/force_mag; - Fz *= 1e-3/force_mag; - } - - if (rank == 0) printf(" -- adjust force by factor %f \n ",capillary_number / Ca); - Averages->SetParams(rhoA,rhoB,tauA,tauB,Fx,Fy,Fz,alpha,beta); - color_db->putVector("F",{Fx,Fy,Fz}); - } if ( isSteady ){ MORPH_ADAPT = true; CURRENT_MORPH_TIMESTEPS=0; - delta_volume_target = Dm->Volume*volA *morph_delta; // set target volume change + //delta_volume_target = Dm->Volume*volA *morph_delta; // set target volume change + /** morphological target based on relative permeability for A **/ + double krA_TMP= fabs(muA*flow_rate_A / 
force_mag); + log_krA = log(krA_TMP); + log_krA_target = log(KRA_MORPH_FACTOR*(krA_TMP)); + slope_krA_volume = (log_krA - log_krA_prev)/(Dm->Volume*(volA - volA_prev)); + delta_volume_target=Dm->Volume*(volA+(log_krA_target - log_krA)/slope_krA_volume); + log_krA_prev = log_krA; + volA_prev = volA; + printf(" log(kr)=%f, volume=%f, TARGET log(kr)=%f, volume change=%f \n",log_krA, volA, log_krA_target, delta_volume_target/(volA*Dm->Volume)); + /** compute averages & write data **/ Averages->Full(); Averages->Write(timestep); analysis.WriteVisData(timestep, current_db, *Averages, Phi, Pressure, Velocity, fq, Den ); @@ -884,7 +891,6 @@ void ScaLBL_ColorModel::Run(){ Fx *= capillary_number / Ca; Fy *= capillary_number / Ca; Fz *= capillary_number / Ca; - RESCALE_FORCE_COUNT = 1; if (force_mag > 1e-3){ Fx *= 1e-3/force_mag; // impose ceiling for stability Fy *= 1e-3/force_mag; @@ -904,6 +910,7 @@ void ScaLBL_ColorModel::Run(){ Averages->SetParams(rhoA,rhoB,tauA,tauB,Fx,Fy,Fz,alpha,beta); color_db->putVector("F",{Fx,Fy,Fz}); } + CURRENT_STEADY_TIMESTEPS = 0; } else{ @@ -979,7 +986,7 @@ void ScaLBL_ColorModel::Run(){ //morph_delta *= (-1.0); REVERSE_FLOW_DIRECTION = false; } - MPI_Barrier(comm); + comm.barrier(); } morph_timesteps += analysis_interval; } From abfe86152f23b2334800a170415a76e6cf3a8ed0 Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Wed, 1 Apr 2020 12:13:04 -0400 Subject: [PATCH 075/121] fix bug --- models/ColorModel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 09c8b946..f6d15b43 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -1002,7 +1002,7 @@ void ScaLBL_ColorModel::Run(){ //morph_delta *= (-1.0); REVERSE_FLOW_DIRECTION = false; } - comm.barrier(); + MPI_Barrier(comm); } morph_timesteps += analysis_interval; } From e50a099c13eab93a6a302c82c0824c7f4fb89bcc Mon Sep 17 00:00:00 2001 From: James McClure Date: Wed, 1 Apr 2020 12:20:21 -0400 Subject: [PATCH 076/121] cleaning up barriers in color model --- models/ColorModel.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index f6d15b43..a62ec927 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -716,7 +716,8 @@ void ScaLBL_ColorModel::Run(){ } ScaLBL_D3Q19_AAodd_Color(NeighborList, dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); + MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL); // *************EVEN TIMESTEP************* timestep++; @@ -751,10 +752,9 @@ void ScaLBL_ColorModel::Run(){ } ScaLBL_D3Q19_AAeven_Color(dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm->LastExterior(), Np); - ScaLBL_DeviceBarrier(); MPI_Barrier(comm); + ScaLBL_DeviceBarrier(); + MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL); //************************************************************************ - - MPI_Barrier(comm); PROFILE_STOP("Update"); if (rank==0 && timestep%analysis_interval == 0 && BoundaryCondition > 0){ @@ -763,7 +763,6 @@ void ScaLBL_ColorModel::Run(){ // Run the analysis analysis.basic(timestep, current_db, *Averages, Phi, Pressure, Velocity, fq, Den ); - // allow initial ramp-up to get closer to steady state if (timestep > RAMP_TIMESTEPS && timestep%analysis_interval == 0 && USE_MORPH){ analysis.finish(); @@ -1002,17 +1001,17 @@ void 
ScaLBL_ColorModel::Run(){ //morph_delta *= (-1.0); REVERSE_FLOW_DIRECTION = false; } - MPI_Barrier(comm); } morph_timesteps += analysis_interval; } + MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL); } analysis.finish(); PROFILE_STOP("Loop"); PROFILE_SAVE("lbpm_color_simulator",1); //************************************************************************ ScaLBL_DeviceBarrier(); - MPI_Barrier(comm); + MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL); stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep @@ -1062,12 +1061,12 @@ double ScaLBL_ColorModel::ImageInit(std::string Filename){ if (rank==0) printf(" new saturation: %f (%f / %f) \n", Count / PoreCount, Count, PoreCount); ScaLBL_CopyToDevice(Phi, PhaseLabel, Nx*Ny*Nz*sizeof(double)); - MPI_Barrier(comm); + MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL); ScaLBL_D3Q19_Init(fq, Np); ScaLBL_PhaseField_Init(dvcMap, Phi, Den, Aq, Bq, 0, ScaLBL_Comm->LastExterior(), Np); ScaLBL_PhaseField_Init(dvcMap, Phi, Den, Aq, Bq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np); - MPI_Barrier(comm); + MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL); ScaLBL_CopyToHost(Averages->Phi.data(),Phi,Nx*Ny*Nz*sizeof(double)); @@ -1442,7 +1441,7 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta if (USE_CONNECTED_NWP){ BlobIDstruct new_index; ComputeGlobalBlobIDs(Nx-2,Ny-2,Nz-2,rank_info,phase,Averages->SDs,vF,vS,phase_label,comm); - MPI_Barrier(comm); + MPI_Barrier(Dm->comm); // only operate on component "0" count = 0.0; From 7ef292e2fcc2a1864a626c1ef5475b6c708eda4e Mon Sep 17 00:00:00 2001 From: James McClure Date: Wed, 1 Apr 2020 12:23:00 -0400 Subject: [PATCH 077/121] cleaning up barriers in color model --- models/ColorModel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index a62ec927..b86c0918 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -1441,7 +1441,7 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta if (USE_CONNECTED_NWP){ BlobIDstruct new_index; ComputeGlobalBlobIDs(Nx-2,Ny-2,Nz-2,rank_info,phase,Averages->SDs,vF,vS,phase_label,comm); - MPI_Barrier(Dm->comm); + MPI_Barrier(Dm->Comm); // only operate on component "0" count = 0.0; From 64a19a718bc5a6725ef11bb249c5352745549646 Mon Sep 17 00:00:00 2001 From: James McClure Date: Wed, 1 Apr 2020 12:26:55 -0400 Subject: [PATCH 078/121] make ScaLBL communicator public for --- common/ScaLBL.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/ScaLBL.h b/common/ScaLBL.h index a50ab7ed..0d2ee0cf 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -134,6 +134,7 @@ public: //ScaLBL_Communicator(Domain &Dm, IntArray &Map); ~ScaLBL_Communicator(); //...................................................................................... + MPI_Comm MPI_COMM_SCALBL; // MPI Communicator unsigned long int CommunicationCount,SendCount,RecvCount; int Nx,Ny,Nz,N; int BoundaryCondition; @@ -207,7 +208,6 @@ private: // Give the object it's own MPI communicator RankInfoStruct rank_info; MPI_Group Group; // Group of processors associated with this domain - MPI_Comm MPI_COMM_SCALBL; // MPI Communicator for this domain MPI_Request req1[18],req2[18]; MPI_Status stat1[18],stat2[18]; //...................................................................................... 
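The common thread in patches 076-078 is that barriers inside the time loop now synchronize on the communicator owned by the ScaLBL_Communicator object, MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL), rather than on the global comm, which is why MPI_COMM_SCALBL is promoted to a public member above. A minimal, self-contained illustration of the underlying MPI pattern follows; it assumes the member is created by duplicating the parent (domain) communicator, which is the usual reason a solver keeps its own handle, and it is a sketch rather than the LBPM constructor itself.

#include <mpi.h>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    MPI_Comm comm_scalbl;
    /* duplicate the parent communicator so the solver's collectives cannot
       interleave with unrelated traffic on MPI_COMM_WORLD */
    MPI_Comm_dup(MPI_COMM_WORLD, &comm_scalbl);
    /* ... device kernels and halo exchange would run here ... */
    MPI_Barrier(comm_scalbl);      /* synchronize only this solver's ranks */
    MPI_Comm_free(&comm_scalbl);
    MPI_Finalize();
    return 0;
}

Duplicating the communicator keeps the solver's barriers and collectives ordered independently of any other traffic on the parent communicator.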
From 50b84071456fb6806c734735730c6982384420e9 Mon Sep 17 00:00:00 2001 From: James McClure Date: Wed, 1 Apr 2020 13:13:57 -0400 Subject: [PATCH 079/121] clean up ScaLBL barriers --- cpu/Extras.cpp | 2 +- gpu/Extras.cu | 1 + models/ColorModel.cpp | 4 +--- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpu/Extras.cpp b/cpu/Extras.cpp index 71f5c04a..efe820d3 100644 --- a/cpu/Extras.cpp +++ b/cpu/Extras.cpp @@ -49,5 +49,5 @@ extern "C" void ScaLBL_CopyToZeroCopy(void* dest, const void* source, size_t siz } extern "C" void ScaLBL_DeviceBarrier(){ -// cudaDeviceSynchronize(); + MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL) } diff --git a/gpu/Extras.cu b/gpu/Extras.cu index 8aeedc87..cd9c265c 100644 --- a/gpu/Extras.cu +++ b/gpu/Extras.cu @@ -59,4 +59,5 @@ extern "C" void ScaLBL_CopyToHost(void* dest, const void* source, size_t size){ extern "C" void ScaLBL_DeviceBarrier(){ cudaDeviceSynchronize(); + MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL) } diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index b86c0918..57e50411 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -717,7 +717,7 @@ void ScaLBL_ColorModel::Run(){ ScaLBL_D3Q19_AAodd_Color(NeighborList, dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm->LastExterior(), Np); ScaLBL_DeviceBarrier(); - MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL); + ; // *************EVEN TIMESTEP************* timestep++; @@ -753,7 +753,6 @@ void ScaLBL_ColorModel::Run(){ ScaLBL_D3Q19_AAeven_Color(dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm->LastExterior(), Np); ScaLBL_DeviceBarrier(); - MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL); //************************************************************************ PROFILE_STOP("Update"); @@ -1011,7 +1010,6 @@ void ScaLBL_ColorModel::Run(){ PROFILE_SAVE("lbpm_color_simulator",1); //************************************************************************ ScaLBL_DeviceBarrier(); - MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL); stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep From f12f8154b12b5aa4fc2a124a1a615ec3735bb176 Mon Sep 17 00:00:00 2001 From: James McClure Date: Wed, 1 Apr 2020 13:15:24 -0400 Subject: [PATCH 080/121] Revert "clean up ScaLBL barriers" This reverts commit 50b84071456fb6806c734735730c6982384420e9. 
--- cpu/Extras.cpp | 2 +- gpu/Extras.cu | 1 - models/ColorModel.cpp | 4 +++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpu/Extras.cpp b/cpu/Extras.cpp index efe820d3..71f5c04a 100644 --- a/cpu/Extras.cpp +++ b/cpu/Extras.cpp @@ -49,5 +49,5 @@ extern "C" void ScaLBL_CopyToZeroCopy(void* dest, const void* source, size_t siz } extern "C" void ScaLBL_DeviceBarrier(){ - MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL) +// cudaDeviceSynchronize(); } diff --git a/gpu/Extras.cu b/gpu/Extras.cu index cd9c265c..8aeedc87 100644 --- a/gpu/Extras.cu +++ b/gpu/Extras.cu @@ -59,5 +59,4 @@ extern "C" void ScaLBL_CopyToHost(void* dest, const void* source, size_t size){ extern "C" void ScaLBL_DeviceBarrier(){ cudaDeviceSynchronize(); - MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL) } diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 57e50411..b86c0918 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -717,7 +717,7 @@ void ScaLBL_ColorModel::Run(){ ScaLBL_D3Q19_AAodd_Color(NeighborList, dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm->LastExterior(), Np); ScaLBL_DeviceBarrier(); - ; + MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL); // *************EVEN TIMESTEP************* timestep++; @@ -753,6 +753,7 @@ void ScaLBL_ColorModel::Run(){ ScaLBL_D3Q19_AAeven_Color(dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm->LastExterior(), Np); ScaLBL_DeviceBarrier(); + MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL); //************************************************************************ PROFILE_STOP("Update"); @@ -1010,6 +1011,7 @@ void ScaLBL_ColorModel::Run(){ PROFILE_SAVE("lbpm_color_simulator",1); //************************************************************************ ScaLBL_DeviceBarrier(); + MPI_Barrier(ScaLBL_Comm->MPI_COMM_SCALBL); stoptime = MPI_Wtime(); if (rank==0) printf("-------------------------------------------------------------------\n"); // Compute the walltime per timestep From d1d626ac414eff6306669c1d70ba904a73238b45 Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Thu, 2 Apr 2020 10:38:14 -0400 Subject: [PATCH 081/121] fix header in greyscale --- models/GreyscaleModel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/models/GreyscaleModel.h b/models/GreyscaleModel.h index a99925b1..ea807048 100644 --- a/models/GreyscaleModel.h +++ b/models/GreyscaleModel.h @@ -10,7 +10,8 @@ Implementation of color lattice boltzmann model #include #include "common/Communication.h" -#include "common/MPI.h" +//#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "common/Database.h" #include "common/ScaLBL.h" #include "ProfilerApp.h" From ce7d348a206aa2fc70089202e522a54a27334863 Mon Sep 17 00:00:00 2001 From: James McClure Date: Thu, 2 Apr 2020 10:43:10 -0400 Subject: [PATCH 082/121] fix sumReduce --- models/GreyscaleModel.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/models/GreyscaleModel.cpp b/models/GreyscaleModel.cpp index 11d92c80..c28c88c5 100644 --- a/models/GreyscaleModel.cpp +++ b/models/GreyscaleModel.cpp @@ -261,8 +261,7 @@ void ScaLBL_GreyscaleModel::AssignComponentLabels(double *Porosity, double *Perm // Set Dm to match Mask for (int i=0; iid[i] = Mask->id[i]; - for (int idx=0; idxComm.sumReduce(label_count[idx]); - + for (int idx=0; idxComm, label_count[idx]); //Initialize a weighted porosity after considering grey voxels GreyPorosity=0.0; for (unsigned int idx=0; 
idxComm); //MPI_Allreduce(&count_loc,&count,1,MPI_DOUBLE,MPI_SUM,Mask->Comm); - vax = Mask->Comm.sumReduce( vax_loc ); - vay = Mask->Comm.sumReduce( vay_loc ); - vaz = Mask->Comm.sumReduce( vaz_loc ); - count = Mask->Comm.sumReduce( count_loc ); + vax = sumReduce( Mask->Comm, vax_loc); + vay = sumReduce( Mask->Comm, vay_loc); + vaz = sumReduce( Mask->Comm, vaz_loc); + count = sumReduce( Mask->Comm, count_loc); vax /= count; vay /= count; @@ -634,10 +633,10 @@ void ScaLBL_GreyscaleModel::Run(){ double As = Morphology.A(); double Hs = Morphology.H(); double Xs = Morphology.X(); - Vs = Dm->Comm.sumReduce( Vs); - As = Dm->Comm.sumReduce( As); - Hs = Dm->Comm.sumReduce( Hs); - Xs = Dm->Comm.sumReduce( Xs); + Vs = sumReduce( Dm->Comm, Vs); + As = sumReduce( Dm->Comm, As); + Hs = sumReduce( Dm->Comm, Hs); + Xs = sumReduce( Dm->Comm, Xs); double h = Dm->voxel_length; //double absperm = h*h*mu*Mask->Porosity()*flow_rate / force_mag; From b4a51e266b43f838ae96af26f702cea6486b810f Mon Sep 17 00:00:00 2001 From: JamesEMcclure Date: Thu, 2 Apr 2020 10:55:04 -0400 Subject: [PATCH 083/121] remove warnings for greyscale --- tests/lbpm_greyscale_simulator.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/lbpm_greyscale_simulator.cpp b/tests/lbpm_greyscale_simulator.cpp index a54b6fc4..b17778ce 100644 --- a/tests/lbpm_greyscale_simulator.cpp +++ b/tests/lbpm_greyscale_simulator.cpp @@ -8,7 +8,7 @@ #include "common/ScaLBL.h" #include "common/Communication.h" -#include "common/MPI.h" +#include "common/MPI_Helpers.h" #include "models/GreyscaleModel.h" //#define WRITE_SURFACES @@ -33,8 +33,6 @@ int main(int argc, char **argv) MPI_Comm_size(comm,&nprocs); { // parallel domain size (# of sub-domains) - int nprocx,nprocy,nprocz; - int iproc,jproc,kproc; if (rank == 0){ printf("********************************************************\n"); @@ -43,6 +41,7 @@ int main(int argc, char **argv) } // Initialize compute device int device=ScaLBL_SetDevice(rank); + NULL_USE(device); ScaLBL_DeviceBarrier(); MPI_Barrier(comm); From 4398b09cc08ae3191cc429e3cea331a8ee5919e2 Mon Sep 17 00:00:00 2001 From: James McClure Date: Thu, 2 Apr 2020 12:53:54 -0400 Subject: [PATCH 084/121] enabling endpoint adaptation for color model --- models/ColorModel.cpp | 50 ++++++++++++++++++++--------------------- models/GreyscaleModel.h | 1 - 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index b86c0918..7af1cafe 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -500,6 +500,7 @@ void ScaLBL_ColorModel::Run(){ bool USE_SEED = false; bool USE_DIRECT = false; bool USE_MORPHOPEN_OIL = false; + bool USE_TARGET_VOLUME_CHANGE = false; int MAX_MORPH_TIMESTEPS = 50000; // maximum number of LBM timesteps to spend in morphological adaptation routine int MIN_STEADY_TIMESTEPS = 100000; int MAX_STEADY_TIMESTEPS = 200000; @@ -523,11 +524,11 @@ void ScaLBL_ColorModel::Run(){ bool USE_BUMP_RATE = false; /* history for morphological algoirthm */ - double KRA_MORPH_FACTOR=0.8; + double KRA_MORPH_FACTOR=0.5; double volA_prev = 0.0; double log_krA_prev = 1.0; double log_krA_target = 1.0; - double log_krA = 0.0; + double log_krA = 1.0; double slope_krA_volume = 0.0; if (color_db->keyExists( "vol_A_previous" )){ volA_prev = color_db->getScalar( "vol_A_previous" ); @@ -555,17 +556,19 @@ void ScaLBL_ColorModel::Run(){ seed_water = 0.01; USE_SEED = true; USE_MORPH = true; + USE_TARGET_VOLUME_CHANGE = true; } else if (protocol == "open connected oil"){ morph_delta = 
0.05; USE_MORPH = true; USE_MORPHOPEN_OIL = true; + USE_TARGET_VOLUME_CHANGE = true; } else if (protocol == "shell aggregation"){ morph_delta = 0.05; USE_MORPH = true; + USE_TARGET_VOLUME_CHANGE = true; } - if (color_db->keyExists( "residual_endpoint_threshold" )){ RESIDUAL_ENDPOINT_THRESHOLD = color_db->getScalar( "residual_endpoint_threshold" ); } @@ -822,16 +825,28 @@ void ScaLBL_ColorModel::Run(){ if ( isSteady ){ MORPH_ADAPT = true; CURRENT_MORPH_TIMESTEPS=0; - //delta_volume_target = Dm->Volume*volA *morph_delta; // set target volume change - /** morphological target based on relative permeability for A **/ + delta_volume_target = Dm->Volume*volA *morph_delta; // set target volume change + //****** ENDPOINT ADAPTATION ********/ double krA_TMP= fabs(muA*flow_rate_A / force_mag); + double krB_TMP= fabs(muB*flow_rate_B / force_mag); log_krA = log(krA_TMP); - log_krA_target = log(KRA_MORPH_FACTOR*(krA_TMP)); - slope_krA_volume = (log_krA - log_krA_prev)/(Dm->Volume*(volA - volA_prev)); - delta_volume_target=Dm->Volume*(volA+(log_krA_target - log_krA)/slope_krA_volume); + if (krA_TMP < 0.0){ + // cannot do endpoint adaptation if kr is negative + log_krA = log_krA_prev; + } + else if (krA_TMP < krB_TMP && morph_delta > 0.0){ + /** morphological target based on relative permeability for A **/ + log_krA_target = log(KRA_MORPH_FACTOR*(krA_TMP)); + slope_krA_volume = (log_krA - log_krA_prev)/(Dm->Volume*(volA - volA_prev)); + delta_volume_target=min(delta_volume_target,Dm->Volume*(volA+(log_krA_target - log_krA)/slope_krA_volume)); + if (rank==0){ + printf(" Enabling endpoint adaptation: krA = %f, krB = %f \n",krA_TMP,krB_TMP); + printf(" log(kr)=%f, volume=%f, TARGET log(kr)=%f, volume change=%f \n",log_krA, volA, log_krA_target, delta_volume_target/(volA*Dm->Volume)); + } + } log_krA_prev = log_krA; volA_prev = volA; - printf(" log(kr)=%f, volume=%f, TARGET log(kr)=%f, volume change=%f \n",log_krA, volA, log_krA_target, delta_volume_target/(volA*Dm->Volume)); + //******************************** **/ /** compute averages & write data **/ Averages->Full(); Averages->Write(timestep); @@ -977,14 +992,6 @@ void ScaLBL_ColorModel::Run(){ CURRENT_STEADY_TIMESTEPS=0; initial_volume = volA*Dm->Volume; delta_volume = 0.0; - if (USE_DIRECT){ - //BoundaryCondition = 0; - //ScaLBL_Comm->BoundaryCondition = 0; - //ScaLBL_Comm_Regular->BoundaryCondition = 0; - //Fx = capillary_number*dir_x*force_mag / Ca; - //Fy = capillary_number*dir_y*force_mag / Ca; - //Fz = capillary_number*dir_z*force_mag / Ca; - } } else if (!(USE_DIRECT) && CURRENT_MORPH_TIMESTEPS > MAX_MORPH_TIMESTEPS) { MORPH_ADAPT = false; @@ -992,15 +999,6 @@ void ScaLBL_ColorModel::Run(){ initial_volume = volA*Dm->Volume; delta_volume = 0.0; } - if ( REVERSE_FLOW_DIRECTION ){ - //if (rank==0) printf("*****REVERSE FLOW DIRECTION***** \n"); - delta_volume = 0.0; - // flow direction will reverse after next steady point - MORPH_ADAPT = false; - CURRENT_STEADY_TIMESTEPS=0; - //morph_delta *= (-1.0); - REVERSE_FLOW_DIRECTION = false; - } } morph_timesteps += analysis_interval; } diff --git a/models/GreyscaleModel.h b/models/GreyscaleModel.h index ea807048..c670239f 100644 --- a/models/GreyscaleModel.h +++ b/models/GreyscaleModel.h @@ -10,7 +10,6 @@ Implementation of color lattice boltzmann model #include #include "common/Communication.h" -//#include "common/MPI.h" #include "common/MPI_Helpers.h" #include "common/Database.h" #include "common/ScaLBL.h" From 7f83f55e1bc3e476ad2d8c85a0f1fa166083caf2 Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 
3 Apr 2020 07:16:44 -0400 Subject: [PATCH 085/121] clean up target Ca --- models/ColorModel.cpp | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 7af1cafe..ddf669bb 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -800,20 +800,6 @@ void ScaLBL_ColorModel::Run(){ double flow_rate_B = volB*(vB_x*dir_x + vB_y*dir_y + vB_z*dir_z); double Ca = fabs(muA*flow_rate_A + muB*flow_rate_B)/(5.796*alpha); - if (SET_CAPILLARY_NUMBER && CURRENT_STEADY_TIMESTEPS%MIN_STEADY_TIMESTEPS < analysis_interval ){ - Fx *= capillary_number / Ca; - Fy *= capillary_number / Ca; - Fz *= capillary_number / Ca; - if (force_mag > 1e-3){ - Fx *= 1e-3/force_mag; // impose ceiling for stability - Fy *= 1e-3/force_mag; - Fz *= 1e-3/force_mag; - } - if (rank == 0) printf(" -- adjust force by factor %f \n ",capillary_number / Ca); - Averages->SetParams(rhoA,rhoB,tauA,tauB,Fx,Fy,Fz,alpha,beta); - color_db->putVector("F",{Fx,Fy,Fz}); - } - if ( morph_timesteps > morph_interval ){ bool isSteady = false; @@ -926,16 +912,6 @@ void ScaLBL_ColorModel::Run(){ Fy *= 1e-3/force_mag; Fz *= 1e-3/force_mag; } - if (flow_rate_A < NOISE_THRESHOLD && USE_BUMP_RATE){ - if (rank==0) printf("Hit noise threshold (%f): bumping capillary number by %f X \n",NOISE_THRESHOLD,BUMP_RATE); - Fx *= BUMP_RATE; // impose bump condition - Fy *= BUMP_RATE; - Fz *= BUMP_RATE; - capillary_number *= BUMP_RATE; - color_db->putScalar("capillary_number",capillary_number); - current_db->putDatabase("Color", color_db); - MORPH_ADAPT = false; // re-run current point if below noise threshold - } if (rank == 0) printf(" -- adjust force by factor %f \n ",capillary_number / Ca); Averages->SetParams(rhoA,rhoB,tauA,tauB,Fx,Fy,Fz,alpha,beta); color_db->putVector("F",{Fx,Fy,Fz}); @@ -1293,7 +1269,7 @@ double ScaLBL_ColorModel::SeedPhaseField(const double seed_water_in_oil){ count= sumReduce( Dm->Comm, count); mass_loss= sumReduce( Dm->Comm, mass_loss); - if (rank == 0) printf("Remove mass %f from %f voxels \n",mass_loss,count); + if (rank == 0) printf("Remove mass %f from %f voxels \n",mass_lojavascript:void(0)ss,count); // Need to initialize Aq, Bq, Den, Phi directly //ScaLBL_CopyToDevice(Phi,phase.data(),7*Np*sizeof(double)); From e7e14a9b642a1ce6b1812fdc04660cfbf0b93e6f Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 3 Apr 2020 07:29:14 -0400 Subject: [PATCH 086/121] clean up flow adapt --- models/ColorModel.cpp | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index ddf669bb..a35c3c44 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -500,7 +500,6 @@ void ScaLBL_ColorModel::Run(){ bool USE_SEED = false; bool USE_DIRECT = false; bool USE_MORPHOPEN_OIL = false; - bool USE_TARGET_VOLUME_CHANGE = false; int MAX_MORPH_TIMESTEPS = 50000; // maximum number of LBM timesteps to spend in morphological adaptation routine int MIN_STEADY_TIMESTEPS = 100000; int MAX_STEADY_TIMESTEPS = 200000; @@ -518,10 +517,6 @@ void ScaLBL_ColorModel::Run(){ double initial_volume = 0.0; double delta_volume = 0.0; double delta_volume_target = 0.0; - double RESIDUAL_ENDPOINT_THRESHOLD = 0.04; - double NOISE_THRESHOLD = 0.0; - double BUMP_RATE = 2.0; - bool USE_BUMP_RATE = false; /* history for morphological algoirthm */ double KRA_MORPH_FACTOR=0.5; @@ -569,18 +564,6 @@ void ScaLBL_ColorModel::Run(){ USE_MORPH = true; USE_TARGET_VOLUME_CHANGE = true; } - if (color_db->keyExists( 
"residual_endpoint_threshold" )){ - RESIDUAL_ENDPOINT_THRESHOLD = color_db->getScalar( "residual_endpoint_threshold" ); - } - NULL_USE( RESIDUAL_ENDPOINT_THRESHOLD ); - if (color_db->keyExists( "noise_threshold" )){ - NOISE_THRESHOLD = color_db->getScalar( "noise_threshold" ); - USE_BUMP_RATE = true; - } - if (color_db->keyExists( "bump_rate" )){ - BUMP_RATE = color_db->getScalar( "bump_rate" ); - USE_BUMP_RATE = true; - } if (color_db->keyExists( "capillary_number" )){ capillary_number = color_db->getScalar( "capillary_number" ); SET_CAPILLARY_NUMBER=true; @@ -1269,7 +1252,7 @@ double ScaLBL_ColorModel::SeedPhaseField(const double seed_water_in_oil){ count= sumReduce( Dm->Comm, count); mass_loss= sumReduce( Dm->Comm, mass_loss); - if (rank == 0) printf("Remove mass %f from %f voxels \n",mass_lojavascript:void(0)ss,count); + if (rank == 0) printf("Remove mass %f from %f voxels \n",mass_loss,count); // Need to initialize Aq, Bq, Den, Phi directly //ScaLBL_CopyToDevice(Phi,phase.data(),7*Np*sizeof(double)); From 377d259884a61cc815f1c4b230328f3c8b834a71 Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 3 Apr 2020 07:30:17 -0400 Subject: [PATCH 087/121] clean up flow adapt --- models/ColorModel.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index a35c3c44..02ff6844 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -551,18 +551,15 @@ void ScaLBL_ColorModel::Run(){ seed_water = 0.01; USE_SEED = true; USE_MORPH = true; - USE_TARGET_VOLUME_CHANGE = true; } else if (protocol == "open connected oil"){ morph_delta = 0.05; USE_MORPH = true; USE_MORPHOPEN_OIL = true; - USE_TARGET_VOLUME_CHANGE = true; } else if (protocol == "shell aggregation"){ morph_delta = 0.05; USE_MORPH = true; - USE_TARGET_VOLUME_CHANGE = true; } if (color_db->keyExists( "capillary_number" )){ capillary_number = color_db->getScalar( "capillary_number" ); From 3d31d2672256af61c25c05a25a9743091f970494 Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 3 Apr 2020 07:37:29 -0400 Subject: [PATCH 088/121] clean up flow adapt --- models/ColorModel.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 02ff6844..954bca7e 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -566,9 +566,6 @@ void ScaLBL_ColorModel::Run(){ SET_CAPILLARY_NUMBER=true; //RESCALE_FORCE_MAX = 1; } -// if (analysis_db->keyExists( "rescale_force_count" )){ -// RESCALE_FORCE_MAX = analysis_db->getScalar( "rescale_force_count" ); -// } if (color_db->keyExists( "timestep" )){ timestep = color_db->getScalar( "timestep" ); } From 32f1bae784af2fe8c7e3e380c26cce9e6059ae58 Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 3 Apr 2020 08:24:28 -0400 Subject: [PATCH 089/121] don't unpack distributions when external BC are applied (D3Q7/D3Q19) --- common/ScaLBL.cpp | 148 +++++++++++++++++++++++++++++----------------- 1 file changed, 94 insertions(+), 54 deletions(-) diff --git a/common/ScaLBL.cpp b/common/ScaLBL.cpp index 21656757..3e2d0f07 100644 --- a/common/ScaLBL.cpp +++ b/common/ScaLBL.cpp @@ -1011,19 +1011,6 @@ void ScaLBL_Communicator::RecvD3Q19AA(double *dist){ ScaLBL_D3Q19_Unpack(15,dvcRecvDist_Y,3*recvCount_Y,recvCount_Y,recvbuf_Y,dist,N); ScaLBL_D3Q19_Unpack(17,dvcRecvDist_Y,4*recvCount_Y,recvCount_Y,recvbuf_Y,dist,N); //................................................................................... - //...Packing for z face(6,12,13,16,17)................................ 
- ScaLBL_D3Q19_Unpack(6,dvcRecvDist_z,0,recvCount_z,recvbuf_z,dist,N); - ScaLBL_D3Q19_Unpack(12,dvcRecvDist_z,recvCount_z,recvCount_z,recvbuf_z,dist,N); - ScaLBL_D3Q19_Unpack(13,dvcRecvDist_z,2*recvCount_z,recvCount_z,recvbuf_z,dist,N); - ScaLBL_D3Q19_Unpack(16,dvcRecvDist_z,3*recvCount_z,recvCount_z,recvbuf_z,dist,N); - ScaLBL_D3Q19_Unpack(17,dvcRecvDist_z,4*recvCount_z,recvCount_z,recvbuf_z,dist,N); - //...Packing for Z face(5,11,14,15,18)................................ - ScaLBL_D3Q19_Unpack(5,dvcRecvDist_Z,0,recvCount_Z,recvbuf_Z,dist,N); - ScaLBL_D3Q19_Unpack(11,dvcRecvDist_Z,recvCount_Z,recvCount_Z,recvbuf_Z,dist,N); - ScaLBL_D3Q19_Unpack(14,dvcRecvDist_Z,2*recvCount_Z,recvCount_Z,recvbuf_Z,dist,N); - ScaLBL_D3Q19_Unpack(15,dvcRecvDist_Z,3*recvCount_Z,recvCount_Z,recvbuf_Z,dist,N); - ScaLBL_D3Q19_Unpack(18,dvcRecvDist_Z,4*recvCount_Z,recvCount_Z,recvbuf_Z,dist,N); - //.................................................................................. //...Pack the xy edge (8)................................ ScaLBL_D3Q19_Unpack(8,dvcRecvDist_xy,0,recvCount_xy,recvbuf_xy,dist,N); //...Pack the Xy edge (9)................................ @@ -1032,22 +1019,75 @@ void ScaLBL_Communicator::RecvD3Q19AA(double *dist){ ScaLBL_D3Q19_Unpack(10,dvcRecvDist_xY,0,recvCount_xY,recvbuf_xY,dist,N); //...Pack the XY edge (7)................................ ScaLBL_D3Q19_Unpack(7,dvcRecvDist_XY,0,recvCount_XY,recvbuf_XY,dist,N); - //...Pack the xz edge (12)................................ - ScaLBL_D3Q19_Unpack(12,dvcRecvDist_xz,0,recvCount_xz,recvbuf_xz,dist,N); - //...Pack the xZ edge (14)................................ - ScaLBL_D3Q19_Unpack(14,dvcRecvDist_xZ,0,recvCount_xZ,recvbuf_xZ,dist,N); - //...Pack the Xz edge (13)................................ - ScaLBL_D3Q19_Unpack(13,dvcRecvDist_Xz,0,recvCount_Xz,recvbuf_Xz,dist,N); - //...Pack the XZ edge (11)................................ - ScaLBL_D3Q19_Unpack(11,dvcRecvDist_XZ,0,recvCount_XZ,recvbuf_XZ,dist,N); - //...Pack the yz edge (16)................................ - ScaLBL_D3Q19_Unpack(16,dvcRecvDist_yz,0,recvCount_yz,recvbuf_yz,dist,N); - //...Pack the yZ edge (18)................................ - ScaLBL_D3Q19_Unpack(18,dvcRecvDist_yZ,0,recvCount_yZ,recvbuf_yZ,dist,N); - //...Pack the Yz edge (17)................................ - ScaLBL_D3Q19_Unpack(17,dvcRecvDist_Yz,0,recvCount_Yz,recvbuf_Yz,dist,N); - //...Pack the YZ edge (15)................................ - ScaLBL_D3Q19_Unpack(15,dvcRecvDist_YZ,0,recvCount_YZ,recvbuf_YZ,dist,N); + + if (BoundaryCondition > 0){ + if (kproc != 0){ + //...Packing for z face(6,12,13,16,17)................................ + ScaLBL_D3Q19_Unpack(6,dvcRecvDist_z,0,recvCount_z,recvbuf_z,dist,N); + ScaLBL_D3Q19_Unpack(12,dvcRecvDist_z,recvCount_z,recvCount_z,recvbuf_z,dist,N); + ScaLBL_D3Q19_Unpack(13,dvcRecvDist_z,2*recvCount_z,recvCount_z,recvbuf_z,dist,N); + ScaLBL_D3Q19_Unpack(16,dvcRecvDist_z,3*recvCount_z,recvCount_z,recvbuf_z,dist,N); + ScaLBL_D3Q19_Unpack(17,dvcRecvDist_z,4*recvCount_z,recvCount_z,recvbuf_z,dist,N); + //...Pack the xz edge (12)................................ + ScaLBL_D3Q19_Unpack(12,dvcRecvDist_xz,0,recvCount_xz,recvbuf_xz,dist,N); + //...Pack the Xz edge (13)................................ + ScaLBL_D3Q19_Unpack(13,dvcRecvDist_Xz,0,recvCount_Xz,recvbuf_Xz,dist,N); + //...Pack the yz edge (16)................................ + ScaLBL_D3Q19_Unpack(16,dvcRecvDist_yz,0,recvCount_yz,recvbuf_yz,dist,N); + //...Pack the Yz edge (17)................................ 
+ ScaLBL_D3Q19_Unpack(17,dvcRecvDist_Yz,0,recvCount_Yz,recvbuf_Yz,dist,N); + //.................................................................................. + } + if (kproc != nprocz-1){ + //...Packing for Z face(5,11,14,15,18)................................ + ScaLBL_D3Q19_Unpack(5,dvcRecvDist_Z,0,recvCount_Z,recvbuf_Z,dist,N); + ScaLBL_D3Q19_Unpack(11,dvcRecvDist_Z,recvCount_Z,recvCount_Z,recvbuf_Z,dist,N); + ScaLBL_D3Q19_Unpack(14,dvcRecvDist_Z,2*recvCount_Z,recvCount_Z,recvbuf_Z,dist,N); + ScaLBL_D3Q19_Unpack(15,dvcRecvDist_Z,3*recvCount_Z,recvCount_Z,recvbuf_Z,dist,N); + ScaLBL_D3Q19_Unpack(18,dvcRecvDist_Z,4*recvCount_Z,recvCount_Z,recvbuf_Z,dist,N); + //...Pack the xZ edge (14)................................ + ScaLBL_D3Q19_Unpack(14,dvcRecvDist_xZ,0,recvCount_xZ,recvbuf_xZ,dist,N); + //...Pack the XZ edge (11)................................ + ScaLBL_D3Q19_Unpack(11,dvcRecvDist_XZ,0,recvCount_XZ,recvbuf_XZ,dist,N); + //...Pack the yZ edge (18)................................ + ScaLBL_D3Q19_Unpack(18,dvcRecvDist_yZ,0,recvCount_yZ,recvbuf_yZ,dist,N); + //...Pack the YZ edge (15)................................ + ScaLBL_D3Q19_Unpack(15,dvcRecvDist_YZ,0,recvCount_YZ,recvbuf_YZ,dist,N); + //.................................................................................. + } + } + else { + //...Packing for z face(6,12,13,16,17)................................ + ScaLBL_D3Q19_Unpack(6,dvcRecvDist_z,0,recvCount_z,recvbuf_z,dist,N); + ScaLBL_D3Q19_Unpack(12,dvcRecvDist_z,recvCount_z,recvCount_z,recvbuf_z,dist,N); + ScaLBL_D3Q19_Unpack(13,dvcRecvDist_z,2*recvCount_z,recvCount_z,recvbuf_z,dist,N); + ScaLBL_D3Q19_Unpack(16,dvcRecvDist_z,3*recvCount_z,recvCount_z,recvbuf_z,dist,N); + ScaLBL_D3Q19_Unpack(17,dvcRecvDist_z,4*recvCount_z,recvCount_z,recvbuf_z,dist,N); + //...Packing for Z face(5,11,14,15,18)................................ + ScaLBL_D3Q19_Unpack(5,dvcRecvDist_Z,0,recvCount_Z,recvbuf_Z,dist,N); + ScaLBL_D3Q19_Unpack(11,dvcRecvDist_Z,recvCount_Z,recvCount_Z,recvbuf_Z,dist,N); + ScaLBL_D3Q19_Unpack(14,dvcRecvDist_Z,2*recvCount_Z,recvCount_Z,recvbuf_Z,dist,N); + ScaLBL_D3Q19_Unpack(15,dvcRecvDist_Z,3*recvCount_Z,recvCount_Z,recvbuf_Z,dist,N); + ScaLBL_D3Q19_Unpack(18,dvcRecvDist_Z,4*recvCount_Z,recvCount_Z,recvbuf_Z,dist,N); + //.................................................................................. + //...Pack the xz edge (12)................................ + ScaLBL_D3Q19_Unpack(12,dvcRecvDist_xz,0,recvCount_xz,recvbuf_xz,dist,N); + //...Pack the xZ edge (14)................................ + ScaLBL_D3Q19_Unpack(14,dvcRecvDist_xZ,0,recvCount_xZ,recvbuf_xZ,dist,N); + //...Pack the Xz edge (13)................................ + ScaLBL_D3Q19_Unpack(13,dvcRecvDist_Xz,0,recvCount_Xz,recvbuf_Xz,dist,N); + //...Pack the XZ edge (11)................................ + ScaLBL_D3Q19_Unpack(11,dvcRecvDist_XZ,0,recvCount_XZ,recvbuf_XZ,dist,N); + //...Pack the yz edge (16)................................ + ScaLBL_D3Q19_Unpack(16,dvcRecvDist_yz,0,recvCount_yz,recvbuf_yz,dist,N); + //...Pack the yZ edge (18)................................ + ScaLBL_D3Q19_Unpack(18,dvcRecvDist_yZ,0,recvCount_yZ,recvbuf_yZ,dist,N); + //...Pack the Yz edge (17)................................ + ScaLBL_D3Q19_Unpack(17,dvcRecvDist_Yz,0,recvCount_Yz,recvbuf_Yz,dist,N); + //...Pack the YZ edge (15)................................ + ScaLBL_D3Q19_Unpack(15,dvcRecvDist_YZ,0,recvCount_YZ,recvbuf_YZ,dist,N); + } + //................................................................................... 
Lock=false; // unlock the communicator after communications complete //................................................................................... @@ -1225,18 +1265,18 @@ void ScaLBL_Communicator::BiRecvD3Q7AA(double *Aq, double *Bq){ ScaLBL_D3Q7_Unpack(3,dvcRecvDist_Y,0,recvCount_Y,recvbuf_Y,Aq,N); ScaLBL_D3Q7_Unpack(3,dvcRecvDist_Y,recvCount_Y,recvCount_Y,recvbuf_Y,Bq,N); //................................................................................... - - if (BoundaryCondition > 0 && kproc == 0){ - // don't unpack little z - //...Packing for Z face(5,11,14,15,18)................................ - ScaLBL_D3Q7_Unpack(5,dvcRecvDist_Z,0,recvCount_Z,recvbuf_Z,Aq,N); - ScaLBL_D3Q7_Unpack(5,dvcRecvDist_Z,recvCount_Z,recvCount_Z,recvbuf_Z,Bq,N); - } - else if (BoundaryCondition > 0 && kproc == nprocz-1){ - // don't unpack big z - //...Packing for z face(6,12,13,16,17)................................ - ScaLBL_D3Q7_Unpack(6,dvcRecvDist_z,0,recvCount_z,recvbuf_z,Aq,N); - ScaLBL_D3Q7_Unpack(6,dvcRecvDist_z,recvCount_z,recvCount_z,recvbuf_z,Bq,N); + + if (BoundaryCondition > 0){ + if (kproc != 0){ + //...Packing for z face(6,12,13,16,17)................................ + ScaLBL_D3Q7_Unpack(6,dvcRecvDist_z,0,recvCount_z,recvbuf_z,Aq,N); + ScaLBL_D3Q7_Unpack(6,dvcRecvDist_z,recvCount_z,recvCount_z,recvbuf_z,Bq,N); + } + if (kproc != nprocz-1){ + //...Packing for Z face(5,11,14,15,18)................................ + ScaLBL_D3Q7_Unpack(5,dvcRecvDist_Z,0,recvCount_Z,recvbuf_Z,Aq,N); + ScaLBL_D3Q7_Unpack(5,dvcRecvDist_Z,recvCount_Z,recvCount_Z,recvbuf_Z,Bq,N); + } } else { //...Packing for z face(6,12,13,16,17)................................ @@ -1343,19 +1383,19 @@ void ScaLBL_Communicator::TriRecvD3Q7AA(double *Aq, double *Bq, double *Cq){ ScaLBL_D3Q7_Unpack(3,dvcRecvDist_Y,2*recvCount_Y,recvCount_Y,recvbuf_Y,Cq,N); //................................................................................... - if (BoundaryCondition > 0 && kproc == 0){ - // don't unpack little z - //...Packing for Z face(5,11,14,15,18)................................ - ScaLBL_D3Q7_Unpack(5,dvcRecvDist_Z,0,recvCount_Z,recvbuf_Z,Aq,N); - ScaLBL_D3Q7_Unpack(5,dvcRecvDist_Z,recvCount_Z,recvCount_Z,recvbuf_Z,Bq,N); - ScaLBL_D3Q7_Unpack(5,dvcRecvDist_Z,2*recvCount_Z,recvCount_Z,recvbuf_Z,Cq,N); - } - else if (BoundaryCondition > 0 && kproc == nprocz-1){ - // don't unpack big z - //...Packing for z face(6,12,13,16,17)................................ - ScaLBL_D3Q7_Unpack(6,dvcRecvDist_z,0,recvCount_z,recvbuf_z,Aq,N); - ScaLBL_D3Q7_Unpack(6,dvcRecvDist_z,recvCount_z,recvCount_z,recvbuf_z,Bq,N); - ScaLBL_D3Q7_Unpack(6,dvcRecvDist_z,2*recvCount_z,recvCount_z,recvbuf_z,Cq,N); + if (BoundaryCondition > 0){ + if (kproc != 0){ + //...Packing for z face(6,12,13,16,17)................................ + ScaLBL_D3Q7_Unpack(6,dvcRecvDist_z,0,recvCount_z,recvbuf_z,Aq,N); + ScaLBL_D3Q7_Unpack(6,dvcRecvDist_z,recvCount_z,recvCount_z,recvbuf_z,Bq,N); + ScaLBL_D3Q7_Unpack(6,dvcRecvDist_z,2*recvCount_z,recvCount_z,recvbuf_z,Cq,N); + } + if (kproc != nprocz-1){ + //...Packing for Z face(5,11,14,15,18)................................ + ScaLBL_D3Q7_Unpack(5,dvcRecvDist_Z,0,recvCount_Z,recvbuf_Z,Aq,N); + ScaLBL_D3Q7_Unpack(5,dvcRecvDist_Z,recvCount_Z,recvCount_Z,recvbuf_Z,Bq,N); + ScaLBL_D3Q7_Unpack(5,dvcRecvDist_Z,2*recvCount_Z,recvCount_Z,recvbuf_Z,Cq,N); + } } else { //...Packing for z face(6,12,13,16,17)................................ 
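Patch 089 above changes the receive path so that a rank holding the physical inlet (kproc == 0) or outlet (kproc == nprocz-1) no longer overwrites its boundary distributions with data wrapped around from the periodic neighbor; those populations are instead set by the external boundary routines (pressure, flux, and later the reflection condition) applied after the receive. A schematic of the guard, written as a self-contained sketch in which unpack_z_face() and unpack_Z_face() are illustrative stand-ins for the ScaLBL_D3Q19_Unpack call sequences:

static void unpack_z_face(void) { /* q = 6,12,13,16,17 plus the xz, Xz, yz, Yz edges */ }
static void unpack_Z_face(void) { /* q = 5,11,14,15,18 plus the xZ, XZ, yZ, YZ edges */ }

static void recv_unpack(int BoundaryCondition, int kproc, int nprocz) {
    if (BoundaryCondition > 0) {
        if (kproc != 0)          unpack_z_face();  /* inlet rank keeps its z-face set */
        if (kproc != nprocz - 1) unpack_Z_face();  /* outlet rank keeps its Z-face set */
    } else {
        unpack_z_face();   /* fully periodic: unpack both faces on every rank */
        unpack_Z_face();
    }
}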
From e641e2e3ed01bdc9588a541efbd448804724e335 Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 3 Apr 2020 08:26:48 -0400 Subject: [PATCH 090/121] remove old comments --- common/ScaLBL.h | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/common/ScaLBL.h b/common/ScaLBL.h index 901e0e3b..3bf50f6f 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -119,11 +119,6 @@ extern "C" void ScaLBL_D3Q19_Gradient_DFH(int *NeighborList, double *Phi, double // BOUNDARY CONDITION ROUTINES -//extern "C" void ScaLBL_D3Q19_Pressure_BC_z(double *disteven, double *distodd, double din, -// int Nx, int Ny, int Nz); -//extern "C" void ScaLBL_D3Q19_Pressure_BC_Z(double *disteven, double *distodd, double dout, -// int Nx, int Ny, int Nz, int outlet); - extern "C" void ScaLBL_D3Q19_AAodd_Pressure_BC_z(int *neighborList, int *list, double *dist, double din, int count, int Np); extern "C" void ScaLBL_D3Q19_AAodd_Pressure_BC_Z(int *neighborList, int *list, double *dist, double dout, int count, int Np); @@ -178,18 +173,8 @@ public: int LastInterior(); int MemoryOptimizedLayoutAA(IntArray &Map, int *neighborList, signed char *id, int Np); -// void MemoryOptimizedLayout(IntArray &Map, int *neighborList, char *id, int Np); -// void MemoryOptimizedLayoutFull(IntArray &Map, int *neighborList, char *id, int Np); -// void MemoryDenseLayout(IntArray &Map, int *neighborList, char *id, int Np); -// void MemoryDenseLayoutFull(IntArray &Map, int *neighborList, char *id, int Np); -// void SendD3Q19(double *f_even, double *f_odd); -// void RecvD3Q19(double *f_even, double *f_odd); -// void SendD3Q19AA(double *f_even, double *f_odd); -// void RecvD3Q19AA(double *f_even, double *f_odd); void SendD3Q19AA(double *dist); void RecvD3Q19AA(double *dist); -// void BiSendD3Q7(double *A_even, double *A_odd, double *B_even, double *B_odd); -// void BiRecvD3Q7(double *A_even, double *A_odd, double *B_even, double *B_odd); void BiSendD3Q7AA(double *Aq, double *Bq); void BiRecvD3Q7AA(double *Aq, double *Bq); void TriSendD3Q7AA(double *Aq, double *Bq, double *Cq); @@ -206,9 +191,6 @@ public: void D3Q19_Pressure_BC_Z(int *neighborList, double *fq, double dout, int time); double D3Q19_Flux_BC_z(int *neighborList, double *fq, double flux, int time); -// void TestSendD3Q19(double *f_even, double *f_odd); -// void TestRecvD3Q19(double *f_even, double *f_odd); - // Debugging and unit testing functions void PrintD3Q19(); From e64d44e43835ee268d367b82522c385e80fb1e72 Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 3 Apr 2020 09:30:55 -0400 Subject: [PATCH 091/121] added D3Q19 reflection BVC --- common/ScaLBL.cpp | 11 +++++++++ common/ScaLBL.h | 6 +++++ cpu/D3Q19.cpp | 36 +++++++++++++++++++++++++++++ gpu/D3Q19.cu | 59 +++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 107 insertions(+), 5 deletions(-) diff --git a/common/ScaLBL.cpp b/common/ScaLBL.cpp index 3e2d0f07..8f2aacee 100644 --- a/common/ScaLBL.cpp +++ b/common/ScaLBL.cpp @@ -1633,6 +1633,17 @@ double ScaLBL_Communicator::D3Q19_Flux_BC_z(int *neighborList, double *fq, doubl return din; } +void ScaLBL_Communicator::D3Q19_Reflection_BC_z(int *neighborList, double *fq){ + if (kproc == 0) + ScaLBL_D3Q19_AAeven_Reflection_BC_z(dvcSendList_z, fq, sendCount_z, N); + +} + +void ScaLBL_Communicator::D3Q19_Reflection_BC_Z(int *neighborList, double *fq){ + if (kproc == nprocz-1) + ScaLBL_D3Q19_AAeven_Reflection_BC_Z(dvcSendList_Z, fq, sendCount_Z, N); +} + void ScaLBL_Communicator::PrintD3Q19(){ printf("Printing D3Q19 communication buffer contents \n"); diff 
--git a/common/ScaLBL.h b/common/ScaLBL.h index 3bf50f6f..51ee66f4 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -137,6 +137,10 @@ extern "C" void ScaLBL_Color_BC_z(int *list, int *Map, double *Phi, double *Den, extern "C" void ScaLBL_Color_BC_Z(int *list, int *Map, double *Phi, double *Den, double vA, double vB, int count, int Np); +extern "C" void ScaLBL_D3Q19_AAeven_Reflection_BC_z(int *list, double *dist, int count, int Np); + +extern "C" void ScaLBL_D3Q19_AAeven_Reflection_BC_Z(int *list, double *dist, int count, int Np); + extern "C" void ScaLBL_SetSlice_z(double *Phi, double value, int Nx, int Ny, int Nz, int Slice); class ScaLBL_Communicator{ @@ -189,6 +193,8 @@ public: void Color_BC_Z(int *Map, double *Phi, double *Den, double vA, double vB); void D3Q19_Pressure_BC_z(int *neighborList, double *fq, double din, int time); void D3Q19_Pressure_BC_Z(int *neighborList, double *fq, double dout, int time); + void D3Q19_Reflection_BC_z(int *neighborList, double *fq); + void D3Q19_Reflection_BC_Z(int *neighborList, double *fq); double D3Q19_Flux_BC_z(int *neighborList, double *fq, double flux, int time); // Debugging and unit testing functions diff --git a/cpu/D3Q19.cpp b/cpu/D3Q19.cpp index 2c0e686d..2c67501c 100644 --- a/cpu/D3Q19.cpp +++ b/cpu/D3Q19.cpp @@ -448,6 +448,42 @@ extern "C" double ScaLBL_D3Q19_Flux_BC_Z(double *disteven, double *distodd, doub return dout; } +extern "C" void ScaLBL_D3Q19_AAeven_Reflection_BC_z(int *list, double *dist, int count, int Np){ + for (int idx=0; idx>>(disteven, distodd, dout, Nx, Ny, Nz, outlet); -//} +extern "C" void ScaLBL_D3Q19_Reflection_BC_z(int *list, double *dist, int count, int Np){ + int GRID = count / 512 + 1; + dvc_ScaLBL_D3Q19_Reflection_BC_z<<>>(neighborList, list, dist, count, N); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_Reflection_BC_z (kernel): %s \n",cudaGetErrorString(err)); + } +} + +extern "C" void ScaLBL_D3Q19_Reflection_BC_Z(int *list, double *dist, int count, int Np){ + int GRID = count / 512 + 1; + dvc_ScaLBL_D3Q19_Reflection_BC_Z<<>>(neighborList, list, dist, count, N); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q19_Reflection_BC_Z (kernel): %s \n",cudaGetErrorString(err)); + } +} extern "C" void ScaLBL_D3Q19_AAeven_MRT(double *dist, int start, int finish, int Np, double rlx_setA, double rlx_setB, double Fx, double Fy, double Fz){ From 81f25486330fbe833bbaea5bb23d932a815f8e30 Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 3 Apr 2020 09:34:35 -0400 Subject: [PATCH 092/121] fix a few warnings --- common/ScaLBL.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/common/ScaLBL.cpp b/common/ScaLBL.cpp index 8f2aacee..71beb152 100644 --- a/common/ScaLBL.cpp +++ b/common/ScaLBL.cpp @@ -1520,7 +1520,7 @@ void ScaLBL_Communicator::RecvHalo(double *data){ void ScaLBL_Communicator::RegularLayout(IntArray map, const double *data, DoubleArray ®data){ // Gets data from the device and stores in regular layout - int i,j,k,n,idx; + int i,j,k,idx; int Nx = map.size(0); int Ny = map.size(1); int Nz = map.size(2); @@ -1551,7 +1551,6 @@ void ScaLBL_Communicator::RegularLayout(IntArray map, const double *data, Double void ScaLBL_Communicator::Color_BC_z(int *Map, double *Phi, double *Den, double vA, double vB){ - double Value=(vA-vB)/(vA+vB); if (kproc == 0) { // Set the phase indicator field and density on the z inlet ScaLBL_Color_BC_z(dvcSendList_z, Map, Phi, Den, vA, vB, sendCount_z, N); @@ 
-1560,7 +1559,6 @@ void ScaLBL_Communicator::Color_BC_z(int *Map, double *Phi, double *Den, double } void ScaLBL_Communicator::Color_BC_Z(int *Map, double *Phi, double *Den, double vA, double vB){ - double Value=(vA-vB)/(vA+vB); if (kproc == nprocz-1){ // Set the phase indicator field and density on the Z outlet ScaLBL_Color_BC_Z(dvcSendList_Z, Map, Phi, Den, vA, vB, sendCount_Z, N); From e62208caaabec617c84e8b514e15b4b7f7da450e Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 3 Apr 2020 09:52:23 -0400 Subject: [PATCH 093/121] add reflection BC to MRT / Color --- common/ScaLBL.cpp | 4 ++-- common/ScaLBL.h | 4 ++-- models/ColorModel.cpp | 12 ++++++++++-- models/MRTModel.cpp | 26 ++++++++++++++++++++++++++ 4 files changed, 40 insertions(+), 6 deletions(-) diff --git a/common/ScaLBL.cpp b/common/ScaLBL.cpp index 71beb152..a612bc73 100644 --- a/common/ScaLBL.cpp +++ b/common/ScaLBL.cpp @@ -1631,13 +1631,13 @@ double ScaLBL_Communicator::D3Q19_Flux_BC_z(int *neighborList, double *fq, doubl return din; } -void ScaLBL_Communicator::D3Q19_Reflection_BC_z(int *neighborList, double *fq){ +void ScaLBL_Communicator::D3Q19_Reflection_BC_z(double *fq){ if (kproc == 0) ScaLBL_D3Q19_AAeven_Reflection_BC_z(dvcSendList_z, fq, sendCount_z, N); } -void ScaLBL_Communicator::D3Q19_Reflection_BC_Z(int *neighborList, double *fq){ +void ScaLBL_Communicator::D3Q19_Reflection_BC_Z(double *fq){ if (kproc == nprocz-1) ScaLBL_D3Q19_AAeven_Reflection_BC_Z(dvcSendList_Z, fq, sendCount_Z, N); } diff --git a/common/ScaLBL.h b/common/ScaLBL.h index 51ee66f4..bac60b0d 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -193,8 +193,8 @@ public: void Color_BC_Z(int *Map, double *Phi, double *Den, double vA, double vB); void D3Q19_Pressure_BC_z(int *neighborList, double *fq, double din, int time); void D3Q19_Pressure_BC_Z(int *neighborList, double *fq, double dout, int time); - void D3Q19_Reflection_BC_z(int *neighborList, double *fq); - void D3Q19_Reflection_BC_Z(int *neighborList, double *fq); + void D3Q19_Reflection_BC_z(double *fq); + void D3Q19_Reflection_BC_Z(double *fq); double D3Q19_Flux_BC_z(int *neighborList, double *fq, double flux, int time); // Debugging and unit testing functions diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 954bca7e..4e9720ed 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -673,7 +673,7 @@ void ScaLBL_ColorModel::Run(){ // Perform the collision operation ScaLBL_Comm->SendD3Q19AA(fq); //READ FROM NORMAL - if (BoundaryCondition > 0){ + if (BoundaryCondition > 0 && BoundaryCondition < 5){ ScaLBL_Comm->Color_BC_z(dvcMap, Phi, Den, inletA, inletB); ScaLBL_Comm->Color_BC_Z(dvcMap, Phi, Den, outletA, outletB); } @@ -694,6 +694,10 @@ void ScaLBL_ColorModel::Run(){ din = ScaLBL_Comm->D3Q19_Flux_BC_z(NeighborList, fq, flux, timestep); ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); } + else if (BoundaryCondition == 5){ + ScaLBL_Comm->D3Q19_Reflection_BC_z(fq); + ScaLBL_Comm->D3Q19_Reflection_BC_Z(fq); + } ScaLBL_D3Q19_AAodd_Color(NeighborList, dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm->LastExterior(), Np); ScaLBL_DeviceBarrier(); @@ -711,7 +715,7 @@ void ScaLBL_ColorModel::Run(){ // Perform the collision operation ScaLBL_Comm->SendD3Q19AA(fq); //READ FORM NORMAL // Halo exchange for phase field - if (BoundaryCondition > 0){ + if (BoundaryCondition > 0 && BoundaryCondition < 5){ ScaLBL_Comm->Color_BC_z(dvcMap, Phi, Den, inletA, inletB); ScaLBL_Comm->Color_BC_Z(dvcMap, 
Phi, Den, outletA, outletB); } @@ -730,6 +734,10 @@ void ScaLBL_ColorModel::Run(){ din = ScaLBL_Comm->D3Q19_Flux_BC_z(NeighborList, fq, flux, timestep); ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); } + else if (BoundaryCondition == 5){ + ScaLBL_Comm->D3Q19_Reflection_BC_z(fq); + ScaLBL_Comm->D3Q19_Reflection_BC_Z(fq); + } ScaLBL_D3Q19_AAeven_Color(dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB, alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, 0, ScaLBL_Comm->LastExterior(), Np); ScaLBL_DeviceBarrier(); diff --git a/models/MRTModel.cpp b/models/MRTModel.cpp index c1db7c1c..acfb8821 100644 --- a/models/MRTModel.cpp +++ b/models/MRTModel.cpp @@ -238,12 +238,38 @@ void ScaLBL_MRTModel::Run(){ ScaLBL_Comm->SendD3Q19AA(fq); //READ FROM NORMAL ScaLBL_D3Q19_AAodd_MRT(NeighborList, fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE + // Set boundary conditions + if (BoundaryCondition == 3){ + ScaLBL_Comm->D3Q19_Pressure_BC_z(NeighborList, fq, din, timestep); + ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); + } + else if (BoundaryCondition == 4){ + din = ScaLBL_Comm->D3Q19_Flux_BC_z(NeighborList, fq, flux, timestep); + ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); + } + else if (BoundaryCondition == 5){ + ScaLBL_Comm->D3Q19_Reflection_BC_z(fq); + ScaLBL_Comm->D3Q19_Reflection_BC_Z(fq); + } ScaLBL_D3Q19_AAodd_MRT(NeighborList, fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_DeviceBarrier(); MPI_Barrier(comm); timestep++; ScaLBL_Comm->SendD3Q19AA(fq); //READ FORM NORMAL ScaLBL_D3Q19_AAeven_MRT(fq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE + // Set boundary conditions + if (BoundaryCondition == 3){ + ScaLBL_Comm->D3Q19_Pressure_BC_z(NeighborList, fq, din, timestep); + ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); + } + else if (BoundaryCondition == 4){ + din = ScaLBL_Comm->D3Q19_Flux_BC_z(NeighborList, fq, flux, timestep); + ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep); + } + else if (BoundaryCondition == 5){ + ScaLBL_Comm->D3Q19_Reflection_BC_z(fq); + ScaLBL_Comm->D3Q19_Reflection_BC_Z(fq); + } ScaLBL_D3Q19_AAeven_MRT(fq, 0, ScaLBL_Comm->LastExterior(), Np, rlx_setA, rlx_setB, Fx, Fy, Fz); ScaLBL_DeviceBarrier(); MPI_Barrier(comm); //************************************************************************/ From f72d401be6c88ed60ed3c71609a9c163f2a687bb Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 3 Apr 2020 09:56:56 -0400 Subject: [PATCH 094/121] fix bugs in cu --- gpu/D3Q19.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gpu/D3Q19.cu b/gpu/D3Q19.cu index d43c5b29..f6f396be 100644 --- a/gpu/D3Q19.cu +++ b/gpu/D3Q19.cu @@ -1758,7 +1758,7 @@ __global__ void dvc_ScaLBL_D3Q19_AAeven_Pressure_BC_Z(int *list, double *dist, //................................................... 
} } -__global__ void dvc_ScaLBL_D3Q19_Reflection_BC_z(int *d_neighborList, int *list, double *dist, int count, int Np){ +__global__ void dvc_ScaLBL_D3Q19_Reflection_BC_z(int *list, double *dist, int count, int Np){ int idx, n; idx = blockIdx.x*blockDim.x + threadIdx.x; if (idx < count){ @@ -1777,7 +1777,7 @@ __global__ void dvc_ScaLBL_D3Q19_Reflection_BC_z(int *d_neighborList, int *list } } -__global__ void dvc_ScaLBL_D3Q19_Reflection_BC_Z(int *d_neighborList, int *list, double *dist, int count, int Np){ +__global__ void dvc_ScaLBL_D3Q19_Reflection_BC_Z(int *list, double *dist, int count, int Np){ int idx, n; idx = blockIdx.x*blockDim.x + threadIdx.x; if (idx < count){ @@ -2691,7 +2691,7 @@ extern "C" double deviceReduce(double *in, double* out, int N) { extern "C" void ScaLBL_D3Q19_Reflection_BC_z(int *list, double *dist, int count, int Np){ int GRID = count / 512 + 1; - dvc_ScaLBL_D3Q19_Reflection_BC_z<<<GRID,512>>>(neighborList, list, dist, count, N); + dvc_ScaLBL_D3Q19_Reflection_BC_z<<<GRID,512>>>(list, dist, count, Np); cudaError_t err = cudaGetLastError(); if (cudaSuccess != err){ printf("CUDA error in ScaLBL_D3Q19_Reflection_BC_z (kernel): %s \n",cudaGetErrorString(err)); @@ -2700,7 +2700,7 @@ extern "C" void ScaLBL_D3Q19_Reflection_BC_Z(int *list, double *dist, int count, extern "C" void ScaLBL_D3Q19_Reflection_BC_Z(int *list, double *dist, int count, int Np){ int GRID = count / 512 + 1; - dvc_ScaLBL_D3Q19_Reflection_BC_Z<<<GRID,512>>>(neighborList, list, dist, count, N); + dvc_ScaLBL_D3Q19_Reflection_BC_Z<<<GRID,512>>>(list, dist, count, Np); cudaError_t err = cudaGetLastError(); if (cudaSuccess != err){ printf("CUDA error in ScaLBL_D3Q19_Reflection_BC_Z (kernel): %s \n",cudaGetErrorString(err)); From 10b630662a8f5016458ca71b1fc0150765aa9373 Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 3 Apr 2020 10:00:17 -0400 Subject: [PATCH 095/121] fix reflection name --- common/ScaLBL.cpp | 4 ++-- common/ScaLBL.h | 4 ++-- cpu/D3Q19.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/common/ScaLBL.cpp b/common/ScaLBL.cpp index a612bc73..0fc3d6d2 100644 --- a/common/ScaLBL.cpp +++ b/common/ScaLBL.cpp @@ -1633,13 +1633,13 @@ double ScaLBL_Communicator::D3Q19_Flux_BC_z(int *neighborList, double *fq, doubl void ScaLBL_Communicator::D3Q19_Reflection_BC_z(double *fq){ if (kproc == 0) - ScaLBL_D3Q19_AAeven_Reflection_BC_z(dvcSendList_z, fq, sendCount_z, N); + ScaLBL_D3Q19_Reflection_BC_z(dvcSendList_z, fq, sendCount_z, N); } void ScaLBL_Communicator::D3Q19_Reflection_BC_Z(double *fq){ if (kproc == nprocz-1) - ScaLBL_D3Q19_AAeven_Reflection_BC_Z(dvcSendList_Z, fq, sendCount_Z, N); + ScaLBL_D3Q19_Reflection_BC_Z(dvcSendList_Z, fq, sendCount_Z, N); } void ScaLBL_Communicator::PrintD3Q19(){ diff --git a/common/ScaLBL.h b/common/ScaLBL.h index bac60b0d..11445d2a 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -137,9 +137,9 @@ extern "C" void ScaLBL_Color_BC_z(int *list, int *Map, double *Phi, double *Den, extern "C" void ScaLBL_Color_BC_Z(int *list, int *Map, double *Phi, double *Den, double vA, double vB, int count, int Np); -extern "C" void ScaLBL_D3Q19_AAeven_Reflection_BC_z(int *list, double *dist, int count, int Np); +extern "C" void ScaLBL_D3Q19_Reflection_BC_z(int *list, double *dist, int count, int Np); -extern "C" void ScaLBL_D3Q19_AAeven_Reflection_BC_Z(int *list, double *dist, int count, int Np); +extern "C" void ScaLBL_D3Q19_Reflection_BC_Z(int *list, double *dist, int count, int Np); extern "C" void ScaLBL_SetSlice_z(double *Phi, double value, int Nx, int Ny, int Nz, int Slice); diff --git
a/cpu/D3Q19.cpp b/cpu/D3Q19.cpp index 2c67501c..b4f7c005 100644 --- a/cpu/D3Q19.cpp +++ b/cpu/D3Q19.cpp @@ -448,7 +448,7 @@ extern "C" double ScaLBL_D3Q19_Flux_BC_Z(double *disteven, double *distodd, doub return dout; } -extern "C" void ScaLBL_D3Q19_AAeven_Reflection_BC_z(int *list, double *dist, int count, int Np){ +extern "C" void ScaLBL_D3Q19_Reflection_BC_z(int *list, double *dist, int count, int Np){ for (int idx=0; idx Date: Fri, 3 Apr 2020 16:30:54 -0400 Subject: [PATCH 096/121] fix cudamemcpy bug --- models/ColorModel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 4e9720ed..9c46be83 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -1190,7 +1190,7 @@ double ScaLBL_ColorModel::SeedPhaseField(const double seed_water_in_oil){ ScaLBL_CopyToHost(Aq_tmp, Aq, 7*Np*sizeof(double)); ScaLBL_CopyToHost(Bq_tmp, Bq, 7*Np*sizeof(double)); - ScaLBL_CopyToHost(Vel_tmp, Velocity, 7*Np*sizeof(double)); + ScaLBL_CopyToHost(Vel_tmp, Velocity, 3*Np*sizeof(double)); //Extract averged velocity double vx_glb = (Averages->gnb.Px+Averages->gwb.Px)/(Averages->gnb.M+Averages->gwb.M); From e4d836e7fcf7b8e268d2ed2e26601d5cbbed23d8 Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 3 Apr 2020 20:24:29 -0400 Subject: [PATCH 097/121] add reflection condition for color grad --- common/ScaLBL.cpp | 19 ++++++++++++++----- common/ScaLBL.h | 2 ++ cpu/Color.cpp | 10 +++++++++- gpu/Color.cu | 13 +++++++++++++ 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/common/ScaLBL.cpp b/common/ScaLBL.cpp index 0fc3d6d2..fe1ce24b 100644 --- a/common/ScaLBL.cpp +++ b/common/ScaLBL.cpp @@ -1549,21 +1549,30 @@ void ScaLBL_Communicator::RegularLayout(IntArray map, const double *data, Double delete [] TmpDat; } - void ScaLBL_Communicator::Color_BC_z(int *Map, double *Phi, double *Den, double vA, double vB){ if (kproc == 0) { - // Set the phase indicator field and density on the z inlet - ScaLBL_Color_BC_z(dvcSendList_z, Map, Phi, Den, vA, vB, sendCount_z, N); + if (BoundaryCondition == 5){ + ScaLBL_CopySlice_z(Phi,Value,Nx,Ny,Nz,1,0); + } + else { + // Set the phase indicator field and density on the z inlet + ScaLBL_Color_BC_z(dvcSendList_z, Map, Phi, Den, vA, vB, sendCount_z, N); + } //ScaLBL_SetSlice_z(Phi,Value,Nx,Ny,Nz,0); } } void ScaLBL_Communicator::Color_BC_Z(int *Map, double *Phi, double *Den, double vA, double vB){ if (kproc == nprocz-1){ + if (BoundaryCondition == 5){ + ScaLBL_CopySlice_z(Phi,Value,Nx,Ny,Nz,Nz-2,Nz-1); + } + else { // Set the phase indicator field and density on the Z outlet - ScaLBL_Color_BC_Z(dvcSendList_Z, Map, Phi, Den, vA, vB, sendCount_Z, N); - //ScaLBL_SetSlice_z(Phi,Value,Nx,Ny,Nz,Nz-1); + ScaLBL_Color_BC_Z(dvcSendList_Z, Map, Phi, Den, vA, vB, sendCount_Z, N); + } } + } void ScaLBL_Communicator::D3Q19_Pressure_BC_z(int *neighborList, double *fq, double din, int time){ diff --git a/common/ScaLBL.h b/common/ScaLBL.h index 11445d2a..90209679 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -143,6 +143,8 @@ extern "C" void ScaLBL_D3Q19_Reflection_BC_Z(int *list, double *dist, int count, extern "C" void ScaLBL_SetSlice_z(double *Phi, double value, int Nx, int Ny, int Nz, int Slice); +extern "C" void ScaLBL_CopySlice_z(double *Phi, double value, int Nx, int Ny, int Nz, int Source, int Destination); + class ScaLBL_Communicator{ public: //...................................................................................... 
diff --git a/cpu/Color.cpp b/cpu/Color.cpp index 7ae84341..1b1ce0c2 100644 --- a/cpu/Color.cpp +++ b/cpu/Color.cpp @@ -1869,7 +1869,7 @@ extern "C" void ScaLBL_D3Q19_AAodd_Color(int *neighborList, int *Map, double *di const double mrt_V12=0.04166666666666666; for (int n=start; n>>(Phi,value,Nx,Ny,Nz,Slice,Dest); +} From e1e603b25f1be9598bc1d7ca710433d23a6f19d4 Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 3 Apr 2020 20:26:32 -0400 Subject: [PATCH 098/121] add reflection condition for color grad --- common/ScaLBL.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/ScaLBL.cpp b/common/ScaLBL.cpp index fe1ce24b..007a7290 100644 --- a/common/ScaLBL.cpp +++ b/common/ScaLBL.cpp @@ -1552,7 +1552,7 @@ void ScaLBL_Communicator::RegularLayout(IntArray map, const double *data, Double void ScaLBL_Communicator::Color_BC_z(int *Map, double *Phi, double *Den, double vA, double vB){ if (kproc == 0) { if (BoundaryCondition == 5){ - ScaLBL_CopySlice_z(Phi,Value,Nx,Ny,Nz,1,0); + ScaLBL_CopySlice_z(Phi,Nx,Ny,Nz,1,0); } else { // Set the phase indicator field and density on the z inlet @@ -1565,7 +1565,7 @@ void ScaLBL_Communicator::Color_BC_z(int *Map, double *Phi, double *Den, double void ScaLBL_Communicator::Color_BC_Z(int *Map, double *Phi, double *Den, double vA, double vB){ if (kproc == nprocz-1){ if (BoundaryCondition == 5){ - ScaLBL_CopySlice_z(Phi,Value,Nx,Ny,Nz,Nz-2,Nz-1); + ScaLBL_CopySlice_z(Phi,Nx,Ny,Nz,Nz-2,Nz-1); } else { // Set the phase indicator field and density on the Z outlet From 735b3f5d3ea838871aa8bc69b3dcdcc53046a52c Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 3 Apr 2020 20:29:37 -0400 Subject: [PATCH 099/121] fix argfs --- common/ScaLBL.h | 2 +- cpu/Color.cpp | 2 +- gpu/Color.cu | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/common/ScaLBL.h b/common/ScaLBL.h index 90209679..92956f1f 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -143,7 +143,7 @@ extern "C" void ScaLBL_D3Q19_Reflection_BC_Z(int *list, double *dist, int count, extern "C" void ScaLBL_SetSlice_z(double *Phi, double value, int Nx, int Ny, int Nz, int Slice); -extern "C" void ScaLBL_CopySlice_z(double *Phi, double value, int Nx, int Ny, int Nz, int Source, int Destination); +extern "C" void ScaLBL_CopySlice_z(double *Phi, int Nx, int Ny, int Nz, int Source, int Destination); class ScaLBL_Communicator{ public: diff --git a/cpu/Color.cpp b/cpu/Color.cpp index 1b1ce0c2..35cbd5fd 100644 --- a/cpu/Color.cpp +++ b/cpu/Color.cpp @@ -2810,7 +2810,7 @@ extern "C" void ScaLBL_PhaseField_Init(int *Map, double *Phi, double *Den, doubl } } -extern "C" void ScaLBL_CopySlice_z(double *Phi, double value, int Nx, int Ny, int Nz, int Source, int Dest){ +extern "C" void ScaLBL_CopySlice_z(double *Phi, int Nx, int Ny, int Nz, int Source, int Dest){ int n; double value; for (n=0; n>>(Phi,value,Nx,Ny,Nz,Slice,Dest); + dvc_ScaLBL_CopySlice_z<<>>(Phi,Nx,Ny,Nz,Slice,Dest); } From 354067e2da4d8b21e7bec7902086810bbd3958d0 Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 3 Apr 2020 20:31:13 -0400 Subject: [PATCH 100/121] fix argfs --- gpu/Color.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/Color.cu b/gpu/Color.cu index 389a8dc7..7fd87e30 100644 --- a/gpu/Color.cu +++ b/gpu/Color.cu @@ -4145,7 +4145,7 @@ extern "C" void ScaLBL_Color_BC_Z(int *list, int *Map, double *Phi, double *Den, extern "C" void ScaLBL_CopySlice_z(double *Phi, int Nx, int Ny, int Nz, int Source, int Dest){ int GRID = Nx*Ny / 512 + 1; - 
dvc_ScaLBL_CopySlice_z<<>>(Phi,Nx,Ny,Nz,Slice,Dest); + dvc_ScaLBL_CopySlice_z<<>>(Phi,Nx,Ny,Nz,Source,Dest); } From bad52221a8758291dce46a9ef8f0a5d31d1905cb Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 3 Apr 2020 20:33:03 -0400 Subject: [PATCH 101/121] fix argfs --- gpu/Color.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/Color.cu b/gpu/Color.cu index 7fd87e30..aeeb3998 100644 --- a/gpu/Color.cu +++ b/gpu/Color.cu @@ -1241,7 +1241,7 @@ __global__ void dvc_ScaLBL_SetSlice_z(double *Phi, double value, int Nx, int Ny __global__ void dvc_ScaLBL_CopySlice_z(double *Phi, int Nx, int Ny, int Nz, int Source, int Dest){ - int n; double value; + double value; int n = blockIdx.x*blockDim.x + threadIdx.x; if (n < Nx*Ny){ value = Phi[Source*Nx*Ny+n]; From b81d4199a180c776120d5852f1f3cb63e8e594bd Mon Sep 17 00:00:00 2001 From: James McClure Date: Sat, 4 Apr 2020 09:00:46 -0400 Subject: [PATCH 102/121] fix volume bug --- models/ColorModel.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 9c46be83..d8af7355 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -760,7 +760,7 @@ void ScaLBL_ColorModel::Run(){ double volA = Averages->gnb.V; volA /= Dm->Volume; volB /= Dm->Volume;; - initial_volume = volA*Dm->Volume; + //initial_volume = volA*Dm->Volume; double vA_x = Averages->gnb.Px/Averages->gnb.M; double vA_y = Averages->gnb.Py/Averages->gnb.M; double vA_z = Averages->gnb.Pz/Averages->gnb.M; @@ -1221,6 +1221,7 @@ double ScaLBL_ColorModel::SeedPhaseField(const double seed_water_in_oil){ Bq_tmp[n+4*Np] += 0.1111111111111111*random_value; Bq_tmp[n+5*Np] += 0.1111111111111111*random_value; Bq_tmp[n+6*Np] += 0.1111111111111111*random_value; + count += 1.0; } mass_loss += random_value*seed_water_in_oil; } @@ -1248,6 +1249,7 @@ double ScaLBL_ColorModel::SeedPhaseField(const double seed_water_in_oil){ Bq_tmp[n+4*Np] += 0.1111111111111111*random_value; Bq_tmp[n+5*Np] += 0.1111111111111111*random_value; Bq_tmp[n+6*Np] += 0.1111111111111111*random_value; + count += 1.0; } mass_loss += random_value*seed_water_in_oil; } From 6d4eaebf4799ce94385dfb4a2186de0534403f4a Mon Sep 17 00:00:00 2001 From: James McClure Date: Mon, 6 Apr 2020 06:07:46 -0400 Subject: [PATCH 103/121] drop velocity seeding alg --- models/ColorModel.cpp | 245 ++++++++++++------------------------------ 1 file changed, 67 insertions(+), 178 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index d8af7355..21a1f597 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -56,8 +56,6 @@ void ScaLBL_ColorModel::ReadCheckpoint(char *FILENAME, double *cPhi, double *cfq File.close(); } */ - - void ScaLBL_ColorModel::ReadParams(string filename){ // read the input database db = std::make_shared( filename ); @@ -1176,190 +1174,81 @@ double ScaLBL_ColorModel::MorphOpenConnected(double target_volume_change){ } return(volume_change); } - double ScaLBL_ColorModel::SeedPhaseField(const double seed_water_in_oil){ - srand(time(NULL)); - double mass_loss =0.f; - double count =0.f; - double *Aq_tmp, *Bq_tmp; - double *Vel_tmp; - - Aq_tmp = new double [7*Np]; - Bq_tmp = new double [7*Np]; - Vel_tmp = new double [3*Np]; + srand(time(NULL)); + double mass_loss =0.f; + double count =0.f; + double *Aq_tmp, *Bq_tmp; + + Aq_tmp = new double [7*Np]; + Bq_tmp = new double [7*Np]; - ScaLBL_CopyToHost(Aq_tmp, Aq, 7*Np*sizeof(double)); - ScaLBL_CopyToHost(Bq_tmp, Bq, 7*Np*sizeof(double)); - ScaLBL_CopyToHost(Vel_tmp, Velocity, 
3*Np*sizeof(double)); - - //Extract averged velocity - double vx_glb = (Averages->gnb.Px+Averages->gwb.Px)/(Averages->gnb.M+Averages->gwb.M); - double vy_glb = (Averages->gnb.Py+Averages->gwb.Py)/(Averages->gnb.M+Averages->gwb.M); - double vz_glb = (Averages->gnb.Pz+Averages->gwb.Pz)/(Averages->gnb.M+Averages->gwb.M); - double v_mag_glb = sqrt(vx_glb*vx_glb+vy_glb*vy_glb+vz_glb*vz_glb); + ScaLBL_CopyToHost(Aq_tmp, Aq, 7*Np*sizeof(double)); + ScaLBL_CopyToHost(Bq_tmp, Bq, 7*Np*sizeof(double)); + + + for (int n=0; n < ScaLBL_Comm->LastExterior(); n++){ + double random_value = seed_water_in_oil*double(rand())/ RAND_MAX; + double dA = Aq_tmp[n] + Aq_tmp[n+Np] + Aq_tmp[n+2*Np] + Aq_tmp[n+3*Np] + Aq_tmp[n+4*Np] + Aq_tmp[n+5*Np] + Aq_tmp[n+6*Np]; + double dB = Bq_tmp[n] + Bq_tmp[n+Np] + Bq_tmp[n+2*Np] + Bq_tmp[n+3*Np] + Bq_tmp[n+4*Np] + Bq_tmp[n+5*Np] + Bq_tmp[n+6*Np]; + double phase_id = (dA - dB) / (dA + dB); + if (phase_id > 0.0){ + Aq_tmp[n] -= 0.3333333333333333*random_value; + Aq_tmp[n+Np] -= 0.1111111111111111*random_value; + Aq_tmp[n+2*Np] -= 0.1111111111111111*random_value; + Aq_tmp[n+3*Np] -= 0.1111111111111111*random_value; + Aq_tmp[n+4*Np] -= 0.1111111111111111*random_value; + Aq_tmp[n+5*Np] -= 0.1111111111111111*random_value; + Aq_tmp[n+6*Np] -= 0.1111111111111111*random_value; + + Bq_tmp[n] += 0.3333333333333333*random_value; + Bq_tmp[n+Np] += 0.1111111111111111*random_value; + Bq_tmp[n+2*Np] += 0.1111111111111111*random_value; + Bq_tmp[n+3*Np] += 0.1111111111111111*random_value; + Bq_tmp[n+4*Np] += 0.1111111111111111*random_value; + Bq_tmp[n+5*Np] += 0.1111111111111111*random_value; + Bq_tmp[n+6*Np] += 0.1111111111111111*random_value; + } + mass_loss += random_value*seed_water_in_oil; + } - for (int n=0; n < ScaLBL_Comm->LastExterior(); n++){ - double v_mag_local = sqrt(Vel_tmp[n]*Vel_tmp[n]+Vel_tmp[n+1*Np]*Vel_tmp[n+1*Np]+Vel_tmp[n+2*Np]*Vel_tmp[n+2*Np]); - double weight = (v_mag_local 0.0){ - Aq_tmp[n] -= 0.3333333333333333*random_value; - Aq_tmp[n+Np] -= 0.1111111111111111*random_value; - Aq_tmp[n+2*Np] -= 0.1111111111111111*random_value; - Aq_tmp[n+3*Np] -= 0.1111111111111111*random_value; - Aq_tmp[n+4*Np] -= 0.1111111111111111*random_value; - Aq_tmp[n+5*Np] -= 0.1111111111111111*random_value; - Aq_tmp[n+6*Np] -= 0.1111111111111111*random_value; - - Bq_tmp[n] += 0.3333333333333333*random_value; - Bq_tmp[n+Np] += 0.1111111111111111*random_value; - Bq_tmp[n+2*Np] += 0.1111111111111111*random_value; - Bq_tmp[n+3*Np] += 0.1111111111111111*random_value; - Bq_tmp[n+4*Np] += 0.1111111111111111*random_value; - Bq_tmp[n+5*Np] += 0.1111111111111111*random_value; - Bq_tmp[n+6*Np] += 0.1111111111111111*random_value; - count += 1.0; - } - mass_loss += random_value*seed_water_in_oil; - } + for (int n=ScaLBL_Comm->FirstInterior(); n < ScaLBL_Comm->LastInterior(); n++){ + double random_value = seed_water_in_oil*double(rand())/ RAND_MAX; + double dA = Aq_tmp[n] + Aq_tmp[n+Np] + Aq_tmp[n+2*Np] + Aq_tmp[n+3*Np] + Aq_tmp[n+4*Np] + Aq_tmp[n+5*Np] + Aq_tmp[n+6*Np]; + double dB = Bq_tmp[n] + Bq_tmp[n+Np] + Bq_tmp[n+2*Np] + Bq_tmp[n+3*Np] + Bq_tmp[n+4*Np] + Bq_tmp[n+5*Np] + Bq_tmp[n+6*Np]; + double phase_id = (dA - dB) / (dA + dB); + if (phase_id > 0.0){ + Aq_tmp[n] -= 0.3333333333333333*random_value; + Aq_tmp[n+Np] -= 0.1111111111111111*random_value; + Aq_tmp[n+2*Np] -= 0.1111111111111111*random_value; + Aq_tmp[n+3*Np] -= 0.1111111111111111*random_value; + Aq_tmp[n+4*Np] -= 0.1111111111111111*random_value; + Aq_tmp[n+5*Np] -= 0.1111111111111111*random_value; + Aq_tmp[n+6*Np] -= 
0.1111111111111111*random_value; + + Bq_tmp[n] += 0.3333333333333333*random_value; + Bq_tmp[n+Np] += 0.1111111111111111*random_value; + Bq_tmp[n+2*Np] += 0.1111111111111111*random_value; + Bq_tmp[n+3*Np] += 0.1111111111111111*random_value; + Bq_tmp[n+4*Np] += 0.1111111111111111*random_value; + Bq_tmp[n+5*Np] += 0.1111111111111111*random_value; + Bq_tmp[n+6*Np] += 0.1111111111111111*random_value; + } + mass_loss += random_value*seed_water_in_oil; + } - for (int n=ScaLBL_Comm->FirstInterior(); n < ScaLBL_Comm->LastInterior(); n++){ - double v_mag_local = sqrt(Vel_tmp[n]*Vel_tmp[n]+Vel_tmp[n+1*Np]*Vel_tmp[n+1*Np]+Vel_tmp[n+2*Np]*Vel_tmp[n+2*Np]); - double weight = (v_mag_local 0.0){ - Aq_tmp[n] -= 0.3333333333333333*random_value; - Aq_tmp[n+Np] -= 0.1111111111111111*random_value; - Aq_tmp[n+2*Np] -= 0.1111111111111111*random_value; - Aq_tmp[n+3*Np] -= 0.1111111111111111*random_value; - Aq_tmp[n+4*Np] -= 0.1111111111111111*random_value; - Aq_tmp[n+5*Np] -= 0.1111111111111111*random_value; - Aq_tmp[n+6*Np] -= 0.1111111111111111*random_value; - - Bq_tmp[n] += 0.3333333333333333*random_value; - Bq_tmp[n+Np] += 0.1111111111111111*random_value; - Bq_tmp[n+2*Np] += 0.1111111111111111*random_value; - Bq_tmp[n+3*Np] += 0.1111111111111111*random_value; - Bq_tmp[n+4*Np] += 0.1111111111111111*random_value; - Bq_tmp[n+5*Np] += 0.1111111111111111*random_value; - Bq_tmp[n+6*Np] += 0.1111111111111111*random_value; - count += 1.0; - } - mass_loss += random_value*seed_water_in_oil; - } + count= sumReduce( Dm->Comm, count); + mass_loss= sumReduce( Dm->Comm, mass_loss); + if (rank == 0) printf("Remove mass %f from %f voxels \n",mass_loss,count); - count= sumReduce( Dm->Comm, count); - mass_loss= sumReduce( Dm->Comm, mass_loss); - if (rank == 0) printf("Remove mass %f from %f voxels \n",mass_loss,count); + // Need to initialize Aq, Bq, Den, Phi directly + //ScaLBL_CopyToDevice(Phi,phase.data(),7*Np*sizeof(double)); + ScaLBL_CopyToDevice(Aq, Aq_tmp, 7*Np*sizeof(double)); + ScaLBL_CopyToDevice(Bq, Bq_tmp, 7*Np*sizeof(double)); - // Need to initialize Aq, Bq, Den, Phi directly - //ScaLBL_CopyToDevice(Phi,phase.data(),7*Np*sizeof(double)); - ScaLBL_CopyToDevice(Aq, Aq_tmp, 7*Np*sizeof(double)); - ScaLBL_CopyToDevice(Bq, Bq_tmp, 7*Np*sizeof(double)); - - return(mass_loss); + return(mass_loss); } -//double ScaLBL_ColorModel::SeedPhaseField(const double seed_water_in_oil){ -// srand(time(NULL)); -// double mass_loss =0.f; -// double count =0.f; -// double *Aq_tmp, *Bq_tmp; -// -// Aq_tmp = new double [7*Np]; -// Bq_tmp = new double [7*Np]; -// -// ScaLBL_CopyToHost(Aq_tmp, Aq, 7*Np*sizeof(double)); -// ScaLBL_CopyToHost(Bq_tmp, Bq, 7*Np*sizeof(double)); -// -///* for (int k=1; kSDs(i,j,k) < 0.f){ -// // skip -// } -// else if (phase(i,j,k) > 0.f ){ -// phase(i,j,k) -= random_value*seed_water_in_oil; -// mass_loss += random_value*seed_water_in_oil; -// count++; -// } -// else { -// -// } -// } -// } -// } -// */ -// for (int n=0; n < ScaLBL_Comm->LastExterior(); n++){ -// double random_value = seed_water_in_oil*double(rand())/ RAND_MAX; -// double dA = Aq_tmp[n] + Aq_tmp[n+Np] + Aq_tmp[n+2*Np] + Aq_tmp[n+3*Np] + Aq_tmp[n+4*Np] + Aq_tmp[n+5*Np] + Aq_tmp[n+6*Np]; -// double dB = Bq_tmp[n] + Bq_tmp[n+Np] + Bq_tmp[n+2*Np] + Bq_tmp[n+3*Np] + Bq_tmp[n+4*Np] + Bq_tmp[n+5*Np] + Bq_tmp[n+6*Np]; -// double phase_id = (dA - dB) / (dA + dB); -// if (phase_id > 0.0){ -// Aq_tmp[n] -= 0.3333333333333333*random_value; -// Aq_tmp[n+Np] -= 0.1111111111111111*random_value; -// Aq_tmp[n+2*Np] -= 0.1111111111111111*random_value; -// 
Aq_tmp[n+3*Np] -= 0.1111111111111111*random_value; -// Aq_tmp[n+4*Np] -= 0.1111111111111111*random_value; -// Aq_tmp[n+5*Np] -= 0.1111111111111111*random_value; -// Aq_tmp[n+6*Np] -= 0.1111111111111111*random_value; -// -// Bq_tmp[n] += 0.3333333333333333*random_value; -// Bq_tmp[n+Np] += 0.1111111111111111*random_value; -// Bq_tmp[n+2*Np] += 0.1111111111111111*random_value; -// Bq_tmp[n+3*Np] += 0.1111111111111111*random_value; -// Bq_tmp[n+4*Np] += 0.1111111111111111*random_value; -// Bq_tmp[n+5*Np] += 0.1111111111111111*random_value; -// Bq_tmp[n+6*Np] += 0.1111111111111111*random_value; -// } -// mass_loss += random_value*seed_water_in_oil; -// } -// -// for (int n=ScaLBL_Comm->FirstInterior(); n < ScaLBL_Comm->LastInterior(); n++){ -// double random_value = seed_water_in_oil*double(rand())/ RAND_MAX; -// double dA = Aq_tmp[n] + Aq_tmp[n+Np] + Aq_tmp[n+2*Np] + Aq_tmp[n+3*Np] + Aq_tmp[n+4*Np] + Aq_tmp[n+5*Np] + Aq_tmp[n+6*Np]; -// double dB = Bq_tmp[n] + Bq_tmp[n+Np] + Bq_tmp[n+2*Np] + Bq_tmp[n+3*Np] + Bq_tmp[n+4*Np] + Bq_tmp[n+5*Np] + Bq_tmp[n+6*Np]; -// double phase_id = (dA - dB) / (dA + dB); -// if (phase_id > 0.0){ -// Aq_tmp[n] -= 0.3333333333333333*random_value; -// Aq_tmp[n+Np] -= 0.1111111111111111*random_value; -// Aq_tmp[n+2*Np] -= 0.1111111111111111*random_value; -// Aq_tmp[n+3*Np] -= 0.1111111111111111*random_value; -// Aq_tmp[n+4*Np] -= 0.1111111111111111*random_value; -// Aq_tmp[n+5*Np] -= 0.1111111111111111*random_value; -// Aq_tmp[n+6*Np] -= 0.1111111111111111*random_value; -// -// Bq_tmp[n] += 0.3333333333333333*random_value; -// Bq_tmp[n+Np] += 0.1111111111111111*random_value; -// Bq_tmp[n+2*Np] += 0.1111111111111111*random_value; -// Bq_tmp[n+3*Np] += 0.1111111111111111*random_value; -// Bq_tmp[n+4*Np] += 0.1111111111111111*random_value; -// Bq_tmp[n+5*Np] += 0.1111111111111111*random_value; -// Bq_tmp[n+6*Np] += 0.1111111111111111*random_value; -// } -// mass_loss += random_value*seed_water_in_oil; -// } -// -// count = Dm->Comm.sumReduce( count ); -// mass_loss = Dm->Comm.sumReduce( mass_loss ); -// if (rank == 0) printf("Remove mass %f from %f voxels \n",mass_loss,count); -// -// // Need to initialize Aq, Bq, Den, Phi directly -// //ScaLBL_CopyToDevice(Phi,phase.data(),7*Np*sizeof(double)); -// ScaLBL_CopyToDevice(Aq, Aq_tmp, 7*Np*sizeof(double)); -// ScaLBL_CopyToDevice(Bq, Bq_tmp, 7*Np*sizeof(double)); -// -// return(mass_loss); -//} - double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta_volume){ const RankInfoStruct rank_info(rank,nprocx,nprocy,nprocz); From 91f42ab74f14a44883d2d68e59750f93da1f7018 Mon Sep 17 00:00:00 2001 From: James McClure Date: Tue, 7 Apr 2020 08:45:06 -0400 Subject: [PATCH 104/121] condition unpack routines on BC for halo --- common/ScaLBL.cpp | 42 ++++++++++++++++++++------- example/Workflow/ComputeSaturation.py | 37 +++++++++++++++++++++++ 2 files changed, 69 insertions(+), 10 deletions(-) create mode 100755 example/Workflow/ComputeSaturation.py diff --git a/common/ScaLBL.cpp b/common/ScaLBL.cpp index 007a7290..12589ecf 100644 --- a/common/ScaLBL.cpp +++ b/common/ScaLBL.cpp @@ -1497,22 +1497,44 @@ void ScaLBL_Communicator::RecvHalo(double *data){ //................................................................................... 
ScaLBL_Scalar_Unpack(dvcRecvList_x, recvCount_x,recvbuf_x, data, N); ScaLBL_Scalar_Unpack(dvcRecvList_y, recvCount_y,recvbuf_y, data, N); - ScaLBL_Scalar_Unpack(dvcRecvList_z, recvCount_z,recvbuf_z, data, N); ScaLBL_Scalar_Unpack(dvcRecvList_X, recvCount_X,recvbuf_X, data, N); ScaLBL_Scalar_Unpack(dvcRecvList_Y, recvCount_Y,recvbuf_Y, data, N); - ScaLBL_Scalar_Unpack(dvcRecvList_Z, recvCount_Z,recvbuf_Z, data, N); ScaLBL_Scalar_Unpack(dvcRecvList_xy, recvCount_xy,recvbuf_xy, data, N); ScaLBL_Scalar_Unpack(dvcRecvList_xY, recvCount_xY,recvbuf_xY, data, N); ScaLBL_Scalar_Unpack(dvcRecvList_Xy, recvCount_Xy,recvbuf_Xy, data, N); ScaLBL_Scalar_Unpack(dvcRecvList_XY, recvCount_XY,recvbuf_XY, data, N); - ScaLBL_Scalar_Unpack(dvcRecvList_xz, recvCount_xz,recvbuf_xz, data, N); - ScaLBL_Scalar_Unpack(dvcRecvList_xZ, recvCount_xZ,recvbuf_xZ, data, N); - ScaLBL_Scalar_Unpack(dvcRecvList_Xz, recvCount_Xz,recvbuf_Xz, data, N); - ScaLBL_Scalar_Unpack(dvcRecvList_XZ, recvCount_XZ,recvbuf_XZ, data, N); - ScaLBL_Scalar_Unpack(dvcRecvList_yz, recvCount_yz,recvbuf_yz, data, N); - ScaLBL_Scalar_Unpack(dvcRecvList_yZ, recvCount_yZ,recvbuf_yZ, data, N); - ScaLBL_Scalar_Unpack(dvcRecvList_Yz, recvCount_Yz,recvbuf_Yz, data, N); - ScaLBL_Scalar_Unpack(dvcRecvList_YZ, recvCount_YZ,recvbuf_YZ, data, N); + + if (BoundaryCondition > 0){ + if (kproc != 0){ + //...Packing for z face(6,12,13,16,17)................................ + ScaLBL_Scalar_Unpack(dvcRecvList_z, recvCount_z,recvbuf_z, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_xz, recvCount_xz,recvbuf_xz, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_Xz, recvCount_Xz,recvbuf_Xz, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_yz, recvCount_yz,recvbuf_yz, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_Yz, recvCount_Yz,recvbuf_Yz, data, N); + } + if (kproc != nprocz-1){ + //...Packing for Z face(5,11,14,15,18)................................ + ScaLBL_Scalar_Unpack(dvcRecvList_Z, recvCount_Z,recvbuf_Z, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_xZ, recvCount_xZ,recvbuf_xZ, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_XZ, recvCount_XZ,recvbuf_XZ, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_yZ, recvCount_yZ,recvbuf_yZ, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_YZ, recvCount_YZ,recvbuf_YZ, data, N); + } + } + else { + ScaLBL_Scalar_Unpack(dvcRecvList_z, recvCount_z,recvbuf_z, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_xz, recvCount_xz,recvbuf_xz, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_Xz, recvCount_Xz,recvbuf_Xz, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_yz, recvCount_yz,recvbuf_yz, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_Yz, recvCount_Yz,recvbuf_Yz, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_Z, recvCount_Z,recvbuf_Z, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_xZ, recvCount_xZ,recvbuf_xZ, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_XZ, recvCount_XZ,recvbuf_XZ, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_yZ, recvCount_yZ,recvbuf_yZ, data, N); + ScaLBL_Scalar_Unpack(dvcRecvList_YZ, recvCount_YZ,recvbuf_YZ, data, N); + } + //................................................................................... Lock=false; // unlock the communicator after communications complete //................................................................................... 
diff --git a/example/Workflow/ComputeSaturation.py b/example/Workflow/ComputeSaturation.py new file mode 100755 index 00000000..56a34ece --- /dev/null +++ b/example/Workflow/ComputeSaturation.py @@ -0,0 +1,37 @@ +import sys +import numpy as np +import matplotlib.pylab as plt + +FILENAME=sys.argv[1] +Nx=int(sys.argv[2]) +Ny=int(sys.argv[3]) +Nz=int(sys.argv[4]) + +# read the input image +Output = np.fromfile(FILENAME,dtype = np.uint8) +Output.shape = (Nz,Ny,Nx) + +Oil=np.count_nonzero(Output==1) +Water=np.count_nonzero(Output==2) +Sw=Water/(Oil+Water) + +Porosity=1.0-(Oil+Water)/(Nx*Ny*Nz) + +print(FILENAME,"Porosity=", Porosity) + +SaturationProfile=np.zeros(Nz) +PorosityProfile=np.zeros(Nz) +# Compute saturation slice by slice +for idx in range(0, Nz): + Slice = Output[idx,:,:] + Oil=np.count_nonzero(Slice==1) + Water=np.count_nonzero(Slice==2) + SaturationProfile[idx]=Water/(Oil+Water) + PorosityProfile[idx]=(Oil+Water)/(Nx*Ny) + + +plt.figure() +plt.plot(SaturationProfile) +plt.xlabel('Position (z)') +plt.ylabel('Water Saturation') +plt.show() From a82c8995fe64cc9c34e39e5400a53c3e2b687a3e Mon Sep 17 00:00:00 2001 From: James McClure Date: Tue, 7 Apr 2020 09:27:32 -0400 Subject: [PATCH 105/121] make sure not to remove solid for reflection BC --- common/Domain.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/common/Domain.cpp b/common/Domain.cpp index 03d0c5ca..33d6117a 100644 --- a/common/Domain.cpp +++ b/common/Domain.cpp @@ -592,10 +592,10 @@ void Domain::Decomp( const std::string& Filename ) double sum; double sum_local=0.0; double iVol_global = 1.0/(1.0*(Nx-2)*(Ny-2)*(Nz-2)*nprocs); - if (BoundaryCondition > 0) iVol_global = 1.0/(1.0*(Nx-2)*nprocx*(Ny-2)*nprocy*((Nz-2)*nprocz-6)); + if (BoundaryCondition > 0 && BoundaryCondition !=5) iVol_global = 1.0/(1.0*(Nx-2)*nprocx*(Ny-2)*nprocy*((Nz-2)*nprocz-6)); //......................................................... // If external boundary conditions are applied remove solid - if (BoundaryCondition > 0 && kproc() == 0){ + if (BoundaryCondition > 0 && BoundaryCondition !=5 && kproc() == 0){ if (inlet_layers_z < 4){ inlet_layers_z=4; if(RANK==0){ @@ -611,7 +611,7 @@ void Domain::Decomp( const std::string& Filename ) } } } - if (BoundaryCondition > 0 && kproc() == nprocz-1){ + if (BoundaryCondition > 0 && BoundaryCondition !=5 && kproc() == nprocz-1){ if (outlet_layers_z < 4){ outlet_layers_z=4; if(RANK==nprocs-1){ @@ -1061,10 +1061,10 @@ void Domain::ReadIDs(){ double sum; double sum_local=0.0; double iVol_global = 1.0/(1.0*(Nx-2)*(Ny-2)*(Nz-2)*nprocs); - if (BoundaryCondition > 0) iVol_global = 1.0/(1.0*(Nx-2)*nprocx()*(Ny-2)*nprocy()*((Nz-2)*nprocz()-6)); + if (BoundaryCondition > 0 && BoundaryCondition !=5) iVol_global = 1.0/(1.0*(Nx-2)*nprocx()*(Ny-2)*nprocy()*((Nz-2)*nprocz()-6)); //......................................................... 
// If external boundary conditions are applied remove solid - if (BoundaryCondition > 0 && kproc() == 0){ + if (BoundaryCondition > 0 && BoundaryCondition !=5 && kproc() == 0){ if (inlet_layers_z < 4) inlet_layers_z=4; for (int k=0; k 0 && kproc() == nprocz()-1){ + if (BoundaryCondition > 0 && BoundaryCondition !=5 && kproc() == nprocz()-1){ if (outlet_layers_z < 4) outlet_layers_z=4; for (int k=Nz-outlet_layers_z; k Date: Tue, 7 Apr 2020 09:58:32 -0400 Subject: [PATCH 106/121] disable periodic BC override --- models/ColorModel.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 21a1f597..def4ab4e 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -128,21 +128,21 @@ void ScaLBL_ColorModel::ReadParams(string filename){ // Override user-specified boundary condition for specific protocols auto protocol = color_db->getWithDefault( "protocol", "none" ); if (protocol == "seed water"){ - if (BoundaryCondition != 0 ){ + if (BoundaryCondition != 0 && BoundaryCondition != 5){ BoundaryCondition = 0; if (rank==0) printf("WARNING: protocol (seed water) supports only full periodic boundary condition \n"); } domain_db->putScalar( "BC", BoundaryCondition ); } else if (protocol == "open connected oil"){ - if (BoundaryCondition != 0 ){ + if (BoundaryCondition != 0 && BoundaryCondition != 5){ BoundaryCondition = 0; if (rank==0) printf("WARNING: protocol (open connected oil) supports only full periodic boundary condition \n"); } domain_db->putScalar( "BC", BoundaryCondition ); } else if (protocol == "shell aggregation"){ - if (BoundaryCondition != 0 ){ + if (BoundaryCondition != 0 && BoundaryCondition != 5){ BoundaryCondition = 0; if (rank==0) printf("WARNING: protocol (shell aggregation) supports only full periodic boundary condition \n"); } From af8b2d799aaa27859f257ae75e52651325687814 Mon Sep 17 00:00:00 2001 From: James McClure Date: Tue, 7 Apr 2020 10:06:54 -0400 Subject: [PATCH 107/121] enable target Ca for reflection BC --- models/ColorModel.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index def4ab4e..83ddc5d4 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -567,8 +567,8 @@ void ScaLBL_ColorModel::Run(){ if (color_db->keyExists( "timestep" )){ timestep = color_db->getScalar( "timestep" ); } - if (BoundaryCondition != 0 && SET_CAPILLARY_NUMBER==true){ - if (rank == 0) printf("WARINING: capillary number target only supported for BC = 0 \n"); + if (BoundaryCondition != 0 && BoundaryCondition != 5 && SET_CAPILLARY_NUMBER==true){ + if (rank == 0) printf("WARINING: capillary number target only supported for BC = 0 or 5 \n"); SET_CAPILLARY_NUMBER=false; } if (analysis_db->keyExists( "seed_water" )){ From 3b006fbc3c8aa7ed81b0e34b6f0a67687e58881e Mon Sep 17 00:00:00 2001 From: James McClure Date: Tue, 7 Apr 2020 10:38:21 -0400 Subject: [PATCH 108/121] reflect BC for D3Q7 --- common/ScaLBL.cpp | 11 ++++++++++- common/ScaLBL.h | 4 ++++ cpu/D3Q7.cpp | 15 +++++++++++++++ gpu/D3Q7.cu | 37 +++++++++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 1 deletion(-) diff --git a/common/ScaLBL.cpp b/common/ScaLBL.cpp index 12589ecf..ef9b2341 100644 --- a/common/ScaLBL.cpp +++ b/common/ScaLBL.cpp @@ -1286,7 +1286,16 @@ void ScaLBL_Communicator::BiRecvD3Q7AA(double *Aq, double *Bq){ ScaLBL_D3Q7_Unpack(5,dvcRecvDist_Z,0,recvCount_Z,recvbuf_Z,Aq,N); ScaLBL_D3Q7_Unpack(5,dvcRecvDist_Z,recvCount_Z,recvCount_Z,recvbuf_Z,Bq,N); } - + if 
(BoundaryCondition == 5){ + if (kproc == 0){ + ScaLBL_D3Q7_Reflection_BC_z(dvcSendList_z, Aq, sendCount_z, N); + ScaLBL_D3Q7_Reflection_BC_z(dvcSendList_z, Bq, sendCount_z, N); + } + if (kproc == nprocz-1){ + ScaLBL_D3Q7_Reflection_BC_Z(dvcSendList_Z, Aq, sendCount_Z, N); + ScaLBL_D3Q7_Reflection_BC_Z(dvcSendList_Z, Bq, sendCount_Z, N); + } + } //................................................................................... Lock=false; // unlock the communicator after communications complete //................................................................................... diff --git a/common/ScaLBL.h b/common/ScaLBL.h index 92956f1f..dec8b3d1 100644 --- a/common/ScaLBL.h +++ b/common/ScaLBL.h @@ -141,6 +141,10 @@ extern "C" void ScaLBL_D3Q19_Reflection_BC_z(int *list, double *dist, int count, extern "C" void ScaLBL_D3Q19_Reflection_BC_Z(int *list, double *dist, int count, int Np); +extern "C" void ScaLBL_D3Q7_Reflection_BC_z(int *list, double *dist, int count, int Np); + +extern "C" void ScaLBL_D3Q7_Reflection_BC_Z(int *list, double *dist, int count, int Np); + extern "C" void ScaLBL_SetSlice_z(double *Phi, double value, int Nx, int Ny, int Nz, int Slice); extern "C" void ScaLBL_CopySlice_z(double *Phi, int Nx, int Ny, int Nz, int Source, int Destination); diff --git a/cpu/D3Q7.cpp b/cpu/D3Q7.cpp index 344e6851..0940b3b6 100644 --- a/cpu/D3Q7.cpp +++ b/cpu/D3Q7.cpp @@ -72,6 +72,21 @@ extern "C" void ScaLBL_UnpackDenD3Q7(int *list, int count, double *recvbuf, int } } +extern "C" void ScaLBL_D3Q7_Reflection_BC_z(int *list, double *dist, int count, int Np){ + for (int idx=0; idx>>(list, dist, count, Np); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q7_Reflection_BC_z (kernel): %s \n",cudaGetErrorString(err)); + } +} + +extern "C" void ScaLBL_D3Q7_Reflection_BC_Z(int *list, double *dist, int count, int Np){ + int GRID = count / 512 + 1; + dvc_ScaLBL_D3Q7_Reflection_BC_Z<<>>(list, dist, count, Np); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err){ + printf("CUDA error in ScaLBL_D3Q7_Reflection_BC_Z (kernel): %s \n",cudaGetErrorString(err)); + } +} + extern "C" void ScaLBL_D3Q7_Unpack(int q, int *list, int start, int count, double *recvbuf, double *dist, int N){ int GRID = count / 512 + 1; dvc_ScaLBL_D3Q7_Unpack <<>>(q, list, start, count, recvbuf, dist, N); From 7636220a4894a1a7c485e289e928408a50efd54e Mon Sep 17 00:00:00 2001 From: James McClure Date: Tue, 7 Apr 2020 10:43:01 -0400 Subject: [PATCH 109/121] add header for print --- gpu/D3Q7.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/gpu/D3Q7.cu b/gpu/D3Q7.cu index c10a865b..8a551f78 100644 --- a/gpu/D3Q7.cu +++ b/gpu/D3Q7.cu @@ -1,4 +1,5 @@ // GPU Functions for D3Q7 Lattice Boltzmann Methods +#include #define NBLOCKS 560 #define NTHREADS 128 From cfa40bdcba7fb109136dfd619da098ff8dd8596d Mon Sep 17 00:00:00 2001 From: James McClure Date: Tue, 7 Apr 2020 13:57:29 -0400 Subject: [PATCH 110/121] fix declare --- cpu/D3Q7.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpu/D3Q7.cpp b/cpu/D3Q7.cpp index 0940b3b6..48f71495 100644 --- a/cpu/D3Q7.cpp +++ b/cpu/D3Q7.cpp @@ -73,6 +73,7 @@ extern "C" void ScaLBL_UnpackDenD3Q7(int *list, int count, double *recvbuf, int } extern "C" void ScaLBL_D3Q7_Reflection_BC_z(int *list, double *dist, int count, int Np){ + int n; for (int idx=0; idx Date: Fri, 10 Apr 2020 15:03:15 -0400 Subject: [PATCH 111/121] debugging strange mass conservation issue --- analysis/SubPhase.cpp | 6 +- common/ScaLBL.cpp | 14 
+- models/ColorModel.cpp | 49 +++---- tests/TestMassConservationD3Q7.cpp | 206 +++++++++++++++-------------- tests/lbpm_color_simulator.cpp | 2 +- 5 files changed, 143 insertions(+), 134 deletions(-) diff --git a/analysis/SubPhase.cpp b/analysis/SubPhase.cpp index 76541ffd..7ef8194b 100644 --- a/analysis/SubPhase.cpp +++ b/analysis/SubPhase.cpp @@ -280,7 +280,7 @@ void SubPhase::Basic(){ dir_y = 0.0; dir_z = 1.0; } - if (Dm->BoundaryCondition > 0 ){ + if (Dm->BoundaryCondition == 1 || Dm->BoundaryCondition == 2 || Dm->BoundaryCondition == 3 || Dm->BoundaryCondition == 4 ){ // compute the pressure drop double pressure_drop = (Pressure(Nx*Ny + Nx + 1) - 1.0) / 3.0; double length = ((Nz-2)*Dm->nprocz()); @@ -376,8 +376,8 @@ void SubPhase::Full(){ // If external boundary conditions are set, do not average over the inlet kmin=1; kmax=Nz-1; - if (Dm->BoundaryCondition > 0 && Dm->kproc() == 0) kmin=4; - if (Dm->BoundaryCondition > 0 && Dm->kproc() == Dm->nprocz()-1) kmax=Nz-4; + if (Dm->BoundaryCondition > 0 && Dm->BoundaryCondition != 5 && Dm->kproc() == 0) kmin=4; + if (Dm->BoundaryCondition > 0 && Dm->BoundaryCondition != 5 && Dm->kproc() == Dm->nprocz()-1) kmax=Nz-4; imin=jmin=1; // If inlet layers exist use these as default diff --git a/common/ScaLBL.cpp b/common/ScaLBL.cpp index ef9b2341..07aa3f1d 100644 --- a/common/ScaLBL.cpp +++ b/common/ScaLBL.cpp @@ -1286,7 +1286,7 @@ void ScaLBL_Communicator::BiRecvD3Q7AA(double *Aq, double *Bq){ ScaLBL_D3Q7_Unpack(5,dvcRecvDist_Z,0,recvCount_Z,recvbuf_Z,Aq,N); ScaLBL_D3Q7_Unpack(5,dvcRecvDist_Z,recvCount_Z,recvCount_Z,recvbuf_Z,Bq,N); } - if (BoundaryCondition == 5){ +/* if (BoundaryCondition == 5){ if (kproc == 0){ ScaLBL_D3Q7_Reflection_BC_z(dvcSendList_z, Aq, sendCount_z, N); ScaLBL_D3Q7_Reflection_BC_z(dvcSendList_z, Bq, sendCount_z, N); @@ -1296,6 +1296,7 @@ void ScaLBL_Communicator::BiRecvD3Q7AA(double *Aq, double *Bq){ ScaLBL_D3Q7_Reflection_BC_Z(dvcSendList_Z, Bq, sendCount_Z, N); } } + */ //................................................................................... Lock=false; // unlock the communicator after communications complete //................................................................................... @@ -1543,10 +1544,15 @@ void ScaLBL_Communicator::RecvHalo(double *data){ ScaLBL_Scalar_Unpack(dvcRecvList_yZ, recvCount_yZ,recvbuf_yZ, data, N); ScaLBL_Scalar_Unpack(dvcRecvList_YZ, recvCount_YZ,recvbuf_YZ, data, N); } - //................................................................................... Lock=false; // unlock the communicator after communications complete //................................................................................... 
+ if (BoundaryCondition == 5 && kproc == 0){ + ScaLBL_CopySlice_z(data,Nx,Ny,Nz,1,0); + } + if (BoundaryCondition == 5 && kproc == nprocz-1){ + ScaLBL_CopySlice_z(data,Nx,Ny,Nz,Nz-2,Nz-1); + } } void ScaLBL_Communicator::RegularLayout(IntArray map, const double *data, DoubleArray ®data){ @@ -1583,7 +1589,7 @@ void ScaLBL_Communicator::RegularLayout(IntArray map, const double *data, Double void ScaLBL_Communicator::Color_BC_z(int *Map, double *Phi, double *Den, double vA, double vB){ if (kproc == 0) { if (BoundaryCondition == 5){ - ScaLBL_CopySlice_z(Phi,Nx,Ny,Nz,1,0); + //ScaLBL_CopySlice_z(Phi,Nx,Ny,Nz,1,0); } else { // Set the phase indicator field and density on the z inlet @@ -1596,7 +1602,7 @@ void ScaLBL_Communicator::Color_BC_z(int *Map, double *Phi, double *Den, double void ScaLBL_Communicator::Color_BC_Z(int *Map, double *Phi, double *Den, double vA, double vB){ if (kproc == nprocz-1){ if (BoundaryCondition == 5){ - ScaLBL_CopySlice_z(Phi,Nx,Ny,Nz,Nz-2,Nz-1); + //ScaLBL_CopySlice_z(Phi,Nx,Ny,Nz,Nz-2,Nz-1); } else { // Set the phase indicator field and density on the Z outlet diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 83ddc5d4..a0f339c6 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -470,7 +470,8 @@ void ScaLBL_ColorModel::Initialize(){ ScaLBL_PhaseField_Init(dvcMap, Phi, Den, Aq, Bq, 0, ScaLBL_Comm->LastExterior(), Np); ScaLBL_PhaseField_Init(dvcMap, Phi, Den, Aq, Bq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np); - if (BoundaryCondition >0 ){ + // establish reservoirs for external bC + if (BoundaryCondition == 1 || BoundaryCondition == 2 || BoundaryCondition == 3 || BoundaryCondition == 4 ){ if (Dm->kproc()==0){ ScaLBL_SetSlice_z(Phi,1.0,Nx,Ny,Nz,0); ScaLBL_SetSlice_z(Phi,1.0,Nx,Ny,Nz,1); @@ -743,7 +744,7 @@ void ScaLBL_ColorModel::Run(){ //************************************************************************ PROFILE_STOP("Update"); - if (rank==0 && timestep%analysis_interval == 0 && BoundaryCondition > 0){ + if (rank==0 && timestep%analysis_interval == 0 && BoundaryCondition == 4){ printf("%i %f \n",timestep,din); } // Run the analysis @@ -1159,7 +1160,7 @@ double ScaLBL_ColorModel::MorphOpenConnected(double target_volume_change){ ScaLBL_CopyToDevice(Phi,phase.data(),N*sizeof(double)); ScaLBL_PhaseField_Init(dvcMap, Phi, Den, Aq, Bq, 0, ScaLBL_Comm->LastExterior(), Np); ScaLBL_PhaseField_Init(dvcMap, Phi, Den, Aq, Bq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np); - if (BoundaryCondition >0 ){ + if (BoundaryCondition == 1 || BoundaryCondition == 2 || BoundaryCondition == 3 || BoundaryCondition == 4){ if (Dm->kproc()==0){ ScaLBL_SetSlice_z(Phi,1.0,Nx,Ny,Nz,0); ScaLBL_SetSlice_z(Phi,1.0,Nx,Ny,Nz,1); @@ -1447,7 +1448,7 @@ double ScaLBL_ColorModel::MorphInit(const double beta, const double target_delta // 7. 
Re-initialize phase field and density ScaLBL_PhaseField_Init(dvcMap, Phi, Den, Aq, Bq, 0, ScaLBL_Comm->LastExterior(), Np); ScaLBL_PhaseField_Init(dvcMap, Phi, Den, Aq, Bq, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np); - if (BoundaryCondition >0 ){ + if (BoundaryCondition == 1 || BoundaryCondition == 2 || BoundaryCondition == 3 || BoundaryCondition == 4){ if (Dm->kproc()==0){ ScaLBL_SetSlice_z(Phi,1.0,Nx,Ny,Nz,0); ScaLBL_SetSlice_z(Phi,1.0,Nx,Ny,Nz,1); @@ -1516,25 +1517,25 @@ void ScaLBL_ColorModel::WriteDebug(){ fwrite(PhaseField.data(),8,N,VELZ_FILE); fclose(VELZ_FILE); -// ScaLBL_Comm->RegularLayout(Map,&ColorGrad[0],PhaseField); -// FILE *CGX_FILE; -// sprintf(LocalRankFilename,"Gradient_X.%05i.raw",rank); -// CGX_FILE = fopen(LocalRankFilename,"wb"); -// fwrite(PhaseField.data(),8,N,CGX_FILE); -// fclose(CGX_FILE); -// -// ScaLBL_Comm->RegularLayout(Map,&ColorGrad[Np],PhaseField); -// FILE *CGY_FILE; -// sprintf(LocalRankFilename,"Gradient_Y.%05i.raw",rank); -// CGY_FILE = fopen(LocalRankFilename,"wb"); -// fwrite(PhaseField.data(),8,N,CGY_FILE); -// fclose(CGY_FILE); -// -// ScaLBL_Comm->RegularLayout(Map,&ColorGrad[2*Np],PhaseField); -// FILE *CGZ_FILE; -// sprintf(LocalRankFilename,"Gradient_Z.%05i.raw",rank); -// CGZ_FILE = fopen(LocalRankFilename,"wb"); -// fwrite(PhaseField.data(),8,N,CGZ_FILE); -// fclose(CGZ_FILE); +/* ScaLBL_Comm->RegularLayout(Map,&ColorGrad[0],PhaseField); + FILE *CGX_FILE; + sprintf(LocalRankFilename,"Gradient_X.%05i.raw",rank); + CGX_FILE = fopen(LocalRankFilename,"wb"); + fwrite(PhaseField.data(),8,N,CGX_FILE); + fclose(CGX_FILE); + ScaLBL_Comm->RegularLayout(Map,&ColorGrad[Np],PhaseField); + FILE *CGY_FILE; + sprintf(LocalRankFilename,"Gradient_Y.%05i.raw",rank); + CGY_FILE = fopen(LocalRankFilename,"wb"); + fwrite(PhaseField.data(),8,N,CGY_FILE); + fclose(CGY_FILE); + + ScaLBL_Comm->RegularLayout(Map,&ColorGrad[2*Np],PhaseField); + FILE *CGZ_FILE; + sprintf(LocalRankFilename,"Gradient_Z.%05i.raw",rank); + CGZ_FILE = fopen(LocalRankFilename,"wb"); + fwrite(PhaseField.data(),8,N,CGZ_FILE); + fclose(CGZ_FILE); +*/ } diff --git a/tests/TestMassConservationD3Q7.cpp b/tests/TestMassConservationD3Q7.cpp index bbfe8cae..35e42c1c 100644 --- a/tests/TestMassConservationD3Q7.cpp +++ b/tests/TestMassConservationD3Q7.cpp @@ -69,10 +69,11 @@ int main(int argc, char **argv) // Initialize MPI int rank,nprocs; MPI_Init(&argc,&argv); - MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm comm = MPI_COMM_WORLD; MPI_Comm_rank(comm,&rank); MPI_Comm_size(comm,&nprocs); // parallel domain size (# of sub-domains) + int CleanCheck = 0; if (rank == 0){ printf("********************************************************\n"); @@ -84,68 +85,68 @@ int main(int argc, char **argv) } } { - auto filename = argv[1]; - ScaLBL_ColorModel CM(rank,nprocs,comm); - CM.ReadParams(filename); - CM.SetDomain(); - int i,j,k,n; - int Nx,Ny,Nz,N,Np; - Nx = CM.Nx; - Ny = CM.Ny; - Nz = CM.Nz; - N = Nx*Ny*Nz; + auto filename = argv[1]; + ScaLBL_ColorModel CM(rank,nprocs,comm); + CM.ReadParams(filename); + CM.SetDomain(); + int i,j,k,n; + int Nx,Ny,Nz,N,Np; + Nx = CM.Nx; + Ny = CM.Ny; + Nz = CM.Nz; + N = Nx*Ny*Nz; - //CM.ReadInput(); - double radius=0.4*double(Nx); - InitializeBubble(CM,radius); - CM.Create(); // creating the model will create data structure to match the pore structure and allocate variables - CM.Initialize(); // initializing the model will set initial conditions for variables - //CM.Run(); - //CM.WriteDebug(); + //CM.ReadInput(); + double radius=0.4*double(Nx); + 
InitializeBubble(CM,radius); + CM.Create(); // creating the model will create data structure to match the pore structure and allocate variables + CM.Initialize(); // initializing the model will set initial conditions for variables + //CM.Run(); + //CM.WriteDebug(); - CM.timestepMax = 10; - CM.Run(); + CM.timestepMax = 10; + CM.Run(); - Np = CM.Np; - double *DenOriginal, *DenFinal; - DenOriginal = new double [2*Np]; - DenFinal = new double [2*Np]; + Np = CM.Np; + double *DenOriginal, *DenFinal; + DenOriginal = new double [2*Np]; + DenFinal = new double [2*Np]; - // Run the odd timestep - ScaLBL_CopyToHost(DenOriginal,CM.Den,2*Np*sizeof(double)); - /* + // Run the odd timestep + ScaLBL_CopyToHost(DenOriginal,CM.Den,2*Np*sizeof(double)); + /* CM.ScaLBL_Comm->BiSendD3Q7AA(CM.Aq,CM.Bq); //READ FROM NORMAL ScaLBL_D3Q7_AAodd_PhaseField(CM.NeighborList, CM.dvcMap, CM.Aq, CM.Bq, CM.Den, CM.Phi, CM.ScaLBL_Comm->FirstInterior(), CM.ScaLBL_Comm->LastInterior(), CM.Np); CM.ScaLBL_Comm->BiRecvD3Q7AA(CM.Aq,CM.Bq); //WRITE INTO OPPOSITE ScaLBL_DeviceBarrier(); ScaLBL_D3Q7_AAodd_PhaseField(CM.NeighborList, CM.dvcMap, CM.Aq, CM.Bq, CM.Den, CM.Phi, 0, CM.ScaLBL_Comm->LastExterior(), CM.Np); - */ + */ - CM.timestepMax = 2; - CM.Run(); - int D3Q7[7][3]={{0,0,0},{1,0,0},{-1,0,0},{0,1,0},{0,-1,0},{0,0,1},{0,0,-1}}; - // Compare and make sure mass is conserved at every lattice site - auto Error = new double[N]; - auto A_q = new double[7*Np]; - //auto B_q = new double[7*Np]; - bool CleanCheck = true; - double original,final, sum_q; - double total_mass_A_0 = 0.0; - double total_mass_B_0= 0.0; - double total_mass_A_1 = 0.0; - double total_mass_B_1= 0.0; - int count_negative_A = 0; - int count_negative_B = 0; - ScaLBL_CopyToHost(DenFinal,CM.Den,2*Np*sizeof(double)); - ScaLBL_CopyToHost(A_q,CM.Aq,7*Np*sizeof(double)); - for (i=0; i-1){ - //printf("idx=%i\n",idx); + CM.timestepMax = 2; + CM.timestep = 0; + CM.Run(); + int D3Q7[7][3]={{0,0,0},{1,0,0},{-1,0,0},{0,1,0},{0,-1,0},{0,0,1},{0,0,-1}}; + // Compare and make sure mass is conserved at every lattice site + auto Error = new double[N]; + auto A_q = new double[7*Np]; + //auto B_q = new double[7*Np]; + double original,final, sum_q; + double total_mass_A_0 = 0.0; + double total_mass_B_0= 0.0; + double total_mass_A_1 = 0.0; + double total_mass_B_1= 0.0; + int count_negative_A = 0; + int count_negative_B = 0; + ScaLBL_CopyToHost(DenFinal,CM.Den,2*Np*sizeof(double)); + ScaLBL_CopyToHost(A_q,CM.Aq,7*Np*sizeof(double)); + for (i=0; i-1){ + //printf("idx=%i\n",idx); final = DenFinal[idx]; if (final < 0.0) count_negative_A++; original = DenOriginal[idx]; @@ -153,60 +154,61 @@ int main(int argc, char **argv) total_mass_A_1 += final; sum_q = A_q[idx]; for (int q=1; q<7; q++){ - int Cqx = D3Q7[q][0]; - int Cqy = D3Q7[q][1]; - int Cqz = D3Q7[q][2]; - int iq = CM.Map(i-Cqx,j-Cqy,k-Cqz); - if (iq < Np && iq > -1){ - sum_q += A_q[q*Np+iq]; - } - else if (q%2==0){ - sum_q += A_q[(q-1)*Np+idx]; - } - else{ - sum_q += A_q[(q+1)*Np+idx]; - } + int Cqx = D3Q7[q][0]; + int Cqy = D3Q7[q][1]; + int Cqz = D3Q7[q][2]; + int iq = CM.Map(i-Cqx,j-Cqy,k-Cqz); + if (iq < Np && iq > -1){ + sum_q += A_q[q*Np+iq]; + } + else if (q%2==0){ + sum_q += A_q[(q-1)*Np+idx]; + } + else{ + sum_q += A_q[(q+1)*Np+idx]; + } } Error[n] = sum_q - original; - - /*if (fabs(DenFinal[idx] - DenOriginal[idx]) > 1e-15){ - //if (CM.Dm->id[n] == 0) printf("Solid phase! \n"); - //if (CM.Dm->id[n] == 1) printf("Wetting phase! \n"); - //if (CM.Dm->id[n] == 2) printf("Non-wetting phase! 
\n"); - printf("Mass not conserved: WP density, site=%i,%i,%i, original = %f, final = %f \n",i,j,k,original,final); - CleanCheck=false; - Error[n] += final-original; - }*/ + + if (fabs(DenFinal[idx] - DenOriginal[idx]) > 1e-15){ + //if (CM.Dm->id[n] == 0) printf("Solid phase! \n"); + //if (CM.Dm->id[n] == 1) printf("Wetting phase! \n"); + //if (CM.Dm->id[n] == 2) printf("Non-wetting phase! \n"); + //printf("Mass not conserved: WP density, site=%i,%i,%i, original = %f, final = %f \n",i,j,k,original,final); + CleanCheck=false; + Error[n] += final-original; + } final = DenFinal[Np+idx]; if (final < 0.0) count_negative_B++; original = DenOriginal[Np+idx]; total_mass_B_0 += original; total_mass_B_1 += final; - /*if (fabs(DenFinal[Np+idx] - DenOriginal[Np+idx]) > 1e-15){ - //if (CM.Dm->id[n] == 0) printf("Solid phase! \n"); - //if (CM.Dm->id[n] == 1) printf("Wetting phase! \n"); - //if (CM.Dm->id[n] == 2) printf("Non-wetting phase! \n"); - printf("Mass not conserved: NWP density, site=%i,%i,%i, original = %f, final = %f \n",i,j,k,original,final); - CleanCheck=false; - Error[n] += final-original; - }*/ + if (fabs(DenFinal[Np+idx] - DenOriginal[Np+idx]) > 1e-15){ + //if (CM.Dm->id[n] == 0) printf("Solid phase! \n"); + //if (CM.Dm->id[n] == 1) printf("Wetting phase! \n"); + //if (CM.Dm->id[n] == 2) printf("Non-wetting phase! \n"); + //printf("Mass not conserved: NWP density, site=%i,%i,%i, original = %f, final = %f \n",i,j,k,original,final); + CleanCheck=false; + Error[n] += final-original; + + } + } } } } - } - printf("Negative density values for A = %i \n",count_negative_A); - printf("Negative density values for B = %i \n",count_negative_B); - printf("Global mass difference A = %.5g\n",total_mass_A_1-total_mass_A_0); - printf("Global mass difference B = %.5g\n",total_mass_B_1-total_mass_B_0); + printf("Negative density values for A = %i \n",count_negative_A); + printf("Negative density values for B = %i \n",count_negative_B); + printf("Global mass difference A = %.5g\n",total_mass_A_1-total_mass_A_0); + printf("Global mass difference B = %.5g\n",total_mass_B_1-total_mass_B_0); - if (count_negative_A > 0 ||count_negative_B > 0) CleanCheck=1; - if (fabs(total_mass_A_1-total_mass_A_0) > 1.0e-15||fabs(total_mass_B_1-total_mass_B_0) > 1.0e-15 ) CleanCheck=2; + if (count_negative_A > 0 ||count_negative_B > 0) CleanCheck=1; + if (fabs(total_mass_A_1-total_mass_A_0) > 1.0e-13||fabs(total_mass_B_1-total_mass_B_0) > 1.0e-13 ) CleanCheck=2; - /* - FILE *OUTFILE; - OUTFILE = fopen("error.raw","wb"); - fwrite(Error,8,N,OUTFILE); - fclose(OUTFILE); + FILE *OUTFILE; + OUTFILE = fopen("error.raw","wb"); + fwrite(Error,8,N,OUTFILE); + fclose(OUTFILE); + /* if (rank==0) printf("Checking that the correct velocity is retained \n"); // Swap convention is observed -- velocity is negative @@ -256,15 +258,15 @@ int main(int argc, char **argv) } } } -*/ - if (CleanCheck){ - if (rank==0) printf("Test passed: mass conservation for D3Q7 \n"); - } - else { - if (rank==0) printf("Test failed!: mass conservation for D3Q7 \n"); + */ + if (CleanCheck == 0){ + if (rank==0) printf("Test passed: mass conservation for D3Q7 \n"); + } + else { + if (rank==0) printf("Test failed!: mass conservation for D3Q7 \n"); + } } -} // **************************************************** MPI_Barrier(comm); MPI_Finalize(); diff --git a/tests/lbpm_color_simulator.cpp b/tests/lbpm_color_simulator.cpp index 1f63c653..79b2a718 100644 --- a/tests/lbpm_color_simulator.cpp +++ b/tests/lbpm_color_simulator.cpp @@ -58,7 +58,7 @@ int main(int argc, char 
**argv) ColorModel.Create(); // creating the model will create data structure to match the pore structure and allocate variables ColorModel.Initialize(); // initializing the model will set initial conditions for variables ColorModel.Run(); - //ColorModel.WriteDebug(); + ColorModel.WriteDebug(); PROFILE_STOP("Main"); PROFILE_SAVE("lbpm_color_simulator",1); From d1d92ea6bbc793fe66e007fffb3f79e3aec75a74 Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 10 Apr 2020 21:31:44 -0400 Subject: [PATCH 112/121] debug mass conservation test --- tests/TestMassConservationD3Q7.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/TestMassConservationD3Q7.cpp b/tests/TestMassConservationD3Q7.cpp index 35e42c1c..6186fd60 100644 --- a/tests/TestMassConservationD3Q7.cpp +++ b/tests/TestMassConservationD3Q7.cpp @@ -41,7 +41,7 @@ inline void InitializeBubble(ScaLBL_ColorModel &ColorModel, double BubbleRadius) int jglobal= j+(Ny-2)*ColorModel.Mask->jproc(); int kglobal= k+(Nz-2)*ColorModel.Mask->kproc(); // Initialize phase position field for parallel bubble test - if (jglobal < 40){ + if (kglobal < 40){ ColorModel.Mask->id[n] = 0; } else if ((iglobal-0.5*(Nx-2)*nprocx)*(iglobal-0.5*(Nx-2)*nprocx) @@ -183,15 +183,15 @@ int main(int argc, char **argv) original = DenOriginal[Np+idx]; total_mass_B_0 += original; total_mass_B_1 += final; - if (fabs(DenFinal[Np+idx] - DenOriginal[Np+idx]) > 1e-15){ + /*if (fabs(DenFinal[Np+idx] - DenOriginal[Np+idx]) > 1e-15){ //if (CM.Dm->id[n] == 0) printf("Solid phase! \n"); //if (CM.Dm->id[n] == 1) printf("Wetting phase! \n"); //if (CM.Dm->id[n] == 2) printf("Non-wetting phase! \n"); //printf("Mass not conserved: NWP density, site=%i,%i,%i, original = %f, final = %f \n",i,j,k,original,final); CleanCheck=false; Error[n] += final-original; - - } + } + */ } } } From d6a8647ee1d0fd4f8ef30a079b7526ede60aa0c4 Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 17 Apr 2020 12:21:29 -0400 Subject: [PATCH 113/121] cannot exclude inlet / outlet without screwing up topology --- analysis/SubPhase.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/analysis/SubPhase.cpp b/analysis/SubPhase.cpp index 7ef8194b..2a5e3350 100644 --- a/analysis/SubPhase.cpp +++ b/analysis/SubPhase.cpp @@ -161,12 +161,12 @@ void SubPhase::Basic(){ // If external boundary conditions are set, do not average over the inlet kmin=1; kmax=Nz-1; imin=jmin=1; - // If inlet/outlet layers exist use these as default + /*// If inlet/outlet layers exist use these as default if (Dm->inlet_layers_x > 0) imin = Dm->inlet_layers_x; if (Dm->inlet_layers_y > 0) jmin = Dm->inlet_layers_y; if (Dm->inlet_layers_z > 0 && Dm->kproc() == 0) kmin += Dm->inlet_layers_z; if (Dm->outlet_layers_z > 0 && Dm->kproc() == Dm->nprocz()-1) kmax -= Dm->outlet_layers_z; - + */ nb.reset(); wb.reset(); double count_w = 0.0; @@ -376,16 +376,17 @@ void SubPhase::Full(){ // If external boundary conditions are set, do not average over the inlet kmin=1; kmax=Nz-1; - if (Dm->BoundaryCondition > 0 && Dm->BoundaryCondition != 5 && Dm->kproc() == 0) kmin=4; + /*if (Dm->BoundaryCondition > 0 && Dm->BoundaryCondition != 5 && Dm->kproc() == 0) kmin=4; if (Dm->BoundaryCondition > 0 && Dm->BoundaryCondition != 5 && Dm->kproc() == Dm->nprocz()-1) kmax=Nz-4; - + */ imin=jmin=1; - // If inlet layers exist use these as default + /*// If inlet layers exist use these as default + * NOTE -- excluding inlet / outlet will screw up topological averages!!! 
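
The per-site bookkeeping behind the TestMassConservationD3Q7 changes above works like this: for each fluid site the test sums the seven D3Q7 distributions that stream into it, substituting the opposite-direction population stored at the site itself whenever the upstream neighbor is solid (the half-way bounce-back rule), and records the difference from the original density in Error[n]. A minimal sketch of that sum, assuming a Map callable that returns a negative index for solid sites and the q-major layout used for A_q in the test (the helper name siteMassD3Q7 is introduced here for illustration only):

    #include <functional>

    // Post-streaming mass that should arrive at fluid site (i,j,k), reconstructed
    // from the D3Q7 distributions of component A. 'Map' gives the compressed index
    // of a site or a negative value for solid; 'A_q' is stored q-major with Np
    // entries per direction.
    double siteMassD3Q7(const std::function<int(int,int,int)> &Map,
                        const double *A_q, int Np, int i, int j, int k)
    {
        static const int D3Q7[7][3] = {{0,0,0},{1,0,0},{-1,0,0},
                                       {0,1,0},{0,-1,0},{0,0,1},{0,0,-1}};
        int idx = Map(i, j, k);
        if (idx < 0 || idx >= Np) return 0.0;      // solid site: nothing to check
        double sum_q = A_q[idx];                   // rest population, q = 0
        for (int q = 1; q < 7; q++){
            int iq = Map(i - D3Q7[q][0], j - D3Q7[q][1], k - D3Q7[q][2]);
            if (iq > -1 && iq < Np)
                sum_q += A_q[q*Np + iq];           // streamed in from a fluid neighbor
            else if (q % 2 == 0)
                sum_q += A_q[(q-1)*Np + idx];      // solid neighbor: take the opposite
            else                                   // direction at the local site
                sum_q += A_q[(q+1)*Np + idx];
        }
        return sum_q;  // equals DenOriginal[idx] to round-off when mass is conserved
    }

In the test only component A is reconstructed this way; component B is checked through the before/after density difference and the global mass totals.
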
if (Dm->inlet_layers_x > 0) imin = Dm->inlet_layers_x; if (Dm->inlet_layers_y > 0) jmin = Dm->inlet_layers_y; if (Dm->inlet_layers_z > 0 && Dm->kproc() == 0) kmin += Dm->inlet_layers_z; if (Dm->outlet_layers_z > 0 && Dm->kproc() == Dm->nprocz()-1) kmax -= Dm->outlet_layers_z; - + */ nd.reset(); nc.reset(); wd.reset(); wc.reset(); iwn.reset(); iwnc.reset(); Dm->CommunicateMeshHalo(Phi); From 8dc9aed0abc0ca518a902f61dc38d60445e93e96 Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 17 Apr 2020 17:55:00 -0400 Subject: [PATCH 114/121] fix mass conservation test --- tests/TestMassConservationD3Q7.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/TestMassConservationD3Q7.cpp b/tests/TestMassConservationD3Q7.cpp index 6186fd60..17c50d19 100644 --- a/tests/TestMassConservationD3Q7.cpp +++ b/tests/TestMassConservationD3Q7.cpp @@ -202,7 +202,7 @@ int main(int argc, char **argv) printf("Global mass difference B = %.5g\n",total_mass_B_1-total_mass_B_0); if (count_negative_A > 0 ||count_negative_B > 0) CleanCheck=1; - if (fabs(total_mass_A_1-total_mass_A_0) > 1.0e-13||fabs(total_mass_B_1-total_mass_B_0) > 1.0e-13 ) CleanCheck=2; + if (fabs(total_mass_A_1-total_mass_A_0) > 1.0e-8 || fabs(total_mass_B_1-total_mass_B_0) > 1.0e-8) CleanCheck=2; FILE *OUTFILE; OUTFILE = fopen("error.raw","wb"); From 67896fcbe291d792950545747aeeee4602856e9b Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 17 Apr 2020 18:35:47 -0400 Subject: [PATCH 115/121] remove debug dump --- tests/lbpm_color_simulator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lbpm_color_simulator.cpp b/tests/lbpm_color_simulator.cpp index 79b2a718..1f63c653 100644 --- a/tests/lbpm_color_simulator.cpp +++ b/tests/lbpm_color_simulator.cpp @@ -58,7 +58,7 @@ int main(int argc, char **argv) ColorModel.Create(); // creating the model will create data structure to match the pore structure and allocate variables ColorModel.Initialize(); // initializing the model will set initial conditions for variables ColorModel.Run(); - ColorModel.WriteDebug(); + //ColorModel.WriteDebug(); PROFILE_STOP("Main"); PROFILE_SAVE("lbpm_color_simulator",1); From b495c9916c2be425fe739c899299e5de38fc0674 Mon Sep 17 00:00:00 2001 From: James McClure Date: Fri, 17 Apr 2020 19:12:08 -0400 Subject: [PATCH 116/121] seed water morphdelta negative by default --- models/ColorModel.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index a0f339c6..189f0059 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -546,18 +546,18 @@ void ScaLBL_ColorModel::Run(){ USE_MORPH = true; } else if (protocol == "seed water"){ - morph_delta = 0.05; + morph_delta = -0.05; seed_water = 0.01; USE_SEED = true; USE_MORPH = true; } else if (protocol == "open connected oil"){ - morph_delta = 0.05; + morph_delta = -0.05; USE_MORPH = true; USE_MORPHOPEN_OIL = true; } else if (protocol == "shell aggregation"){ - morph_delta = 0.05; + morph_delta = -0.05; USE_MORPH = true; } if (color_db->keyExists( "capillary_number" )){ From 16e187e1dc0d637a28d1efce7f2c1716a3bc1017 Mon Sep 17 00:00:00 2001 From: James McClure Date: Mon, 4 May 2020 14:50:23 -0400 Subject: [PATCH 117/121] add pseudo-reflection --- common/Domain.cpp | 220 ++++++++++++++++++++++++++-------------------- 1 file changed, 127 insertions(+), 93 deletions(-) diff --git a/common/Domain.cpp b/common/Domain.cpp index 33d6117a..32e13501 100644 --- a/common/Domain.cpp +++ b/common/Domain.cpp @@ -256,6 +256,7 @@ 
void Domain::Decomp( const std::string& Filename ) int64_t i,j,k,n; int64_t xStart,yStart,zStart; int checkerSize; + bool USE_CHECKER = false; //int inlet_layers_x, inlet_layers_y, inlet_layers_z; //int outlet_layers_x, outlet_layers_y, outlet_layers_z; xStart=yStart=zStart=0; @@ -295,6 +296,7 @@ void Domain::Decomp( const std::string& Filename ) } if (database->keyExists( "checkerSize" )){ checkerSize = database->getScalar( "checkerSize" ); + USE_CHECKER = true; } else { checkerSize = SIZE[0]; @@ -367,7 +369,7 @@ void Domain::Decomp( const std::string& Filename ) } } printf("Read segmented data from %s \n",Filename.c_str()); - + // relabel the data std::vector LabelCount(ReadValues.size(),0); for (int k = 0; k 0){ - // use checkerboard pattern - printf("Checkerboard pattern at x inlet for %i layers \n",inlet_layers_x); - for (int k = 0; k 0){ + // use checkerboard pattern + printf("Checkerboard pattern at x inlet for %i layers \n",inlet_layers_x); + for (int k = 0; k 0){ + printf("Checkerboard pattern at y inlet for %i layers \n",inlet_layers_y); + // use checkerboard pattern + for (int k = 0; k 0){ + printf("Checkerboard pattern at z inlet for %i layers, saturated with phase label=%i \n",inlet_layers_z,inlet_layers_phase); + // use checkerboard pattern + for (int k = zStart; k < zStart+inlet_layers_z; k++){ + for (int j = 0; j 0){ + // use checkerboard pattern + printf("Checkerboard pattern at x outlet for %i layers \n",outlet_layers_x); + for (int k = 0; k 0){ + printf("Checkerboard pattern at y outlet for %i layers \n",outlet_layers_y); + // use checkerboard pattern + for (int k = 0; k 0){ + printf("Checkerboard pattern at z outlet for %i layers, saturated with phase label=%i \n",outlet_layers_z,outlet_layers_phase); + // use checkerboard pattern + for (int k = zStart + nz*nprocz - outlet_layers_z; k < zStart + nz*nprocz; k++){ + for (int j = 0; j 0){ - printf("Checkerboard pattern at y inlet for %i layers \n",inlet_layers_y); - // use checkerboard pattern - for (int k = 0; k 0){ - printf("Checkerboard pattern at z inlet for %i layers, saturated with phase label=%i \n",inlet_layers_z,inlet_layers_phase); - // use checkerboard pattern + printf("Mixed reflection pattern at z inlet for %i layers, saturated with phase label=%i \n",inlet_layers_z,inlet_layers_phase); for (int k = zStart; k < zStart+inlet_layers_z; k++){ for (int j = 0; j 0){ + SegData[k*global_Nx*global_Ny+j*global_Nx+i] = reflection_id; } } } } } - - if (outlet_layers_x > 0){ - // use checkerboard pattern - printf("Checkerboard pattern at x outlet for %i layers \n",outlet_layers_x); - for (int k = 0; k 0){ - printf("Checkerboard pattern at y outlet for %i layers \n",outlet_layers_y); - // use checkerboard pattern - for (int k = 0; k 0){ - printf("Checkerboard pattern at z outlet for %i layers, saturated with phase label=%i \n",outlet_layers_z,outlet_layers_phase); - // use checkerboard pattern + printf("Mixed reflection pattern at z outlet for %i layers, saturated with phase label=%i \n",outlet_layers_z,outlet_layers_phase); for (int k = zStart + nz*nprocz - outlet_layers_z; k < zStart + nz*nprocz; k++){ for (int j = 0; j 0){ + SegData[k*global_Nx*global_Ny+j*global_Nx+i] = reflection_id; } } } From d424771849c46bb15ff0831c29e7ef77afb21ff7 Mon Sep 17 00:00:00 2001 From: James McClure Date: Mon, 4 May 2020 15:12:50 -0400 Subject: [PATCH 118/121] fix scope in Domain inlet/outlet --- common/Domain.cpp | 53 ++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git 
a/common/Domain.cpp b/common/Domain.cpp index 32e13501..e355310f 100644 --- a/common/Domain.cpp +++ b/common/Domain.cpp @@ -390,14 +390,11 @@ void Domain::Decomp( const std::string& Filename ) } } } - if (RANK==0){ - for (size_t idx=0; idx 0){ // use checkerboard pattern @@ -516,31 +513,31 @@ void Domain::Decomp( const std::string& Filename ) } } } - } - else { - if (inlet_layers_z > 0){ - printf("Mixed reflection pattern at z inlet for %i layers, saturated with phase label=%i \n",inlet_layers_z,inlet_layers_phase); - for (int k = zStart; k < zStart+inlet_layers_z; k++){ - for (int j = 0; j 0){ - SegData[k*global_Nx*global_Ny+j*global_Nx+i] = reflection_id; + else { + if (inlet_layers_z > 0){ + printf("Mixed reflection pattern at z inlet for %i layers, saturated with phase label=%i \n",inlet_layers_z,inlet_layers_phase); + for (int k = zStart; k < zStart+inlet_layers_z; k++){ + for (int j = 0; j 0){ + SegData[k*global_Nx*global_Ny+j*global_Nx+i] = reflection_id; + } } } } } - } - if (outlet_layers_z > 0){ - printf("Mixed reflection pattern at z outlet for %i layers, saturated with phase label=%i \n",outlet_layers_z,outlet_layers_phase); - for (int k = zStart + nz*nprocz - outlet_layers_z; k < zStart + nz*nprocz; k++){ - for (int j = 0; j 0){ - SegData[k*global_Nx*global_Ny+j*global_Nx+i] = reflection_id; + if (outlet_layers_z > 0){ + printf("Mixed reflection pattern at z outlet for %i layers, saturated with phase label=%i \n",outlet_layers_z,outlet_layers_phase); + for (int k = zStart + nz*nprocz - outlet_layers_z; k < zStart + nz*nprocz; k++){ + for (int j = 0; j 0){ + SegData[k*global_Nx*global_Ny+j*global_Nx+i] = reflection_id; + } } } } From 7b731e327be8989fe1983e21d813627272ea05eb Mon Sep 17 00:00:00 2001 From: James McClure Date: Mon, 4 May 2020 15:26:41 -0400 Subject: [PATCH 119/121] update pseudo-reflection --- common/Domain.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/Domain.cpp b/common/Domain.cpp index e355310f..3dec0128 100644 --- a/common/Domain.cpp +++ b/common/Domain.cpp @@ -520,7 +520,7 @@ void Domain::Decomp( const std::string& Filename ) for (int j = 0; j 0){ SegData[k*global_Nx*global_Ny+j*global_Nx+i] = reflection_id; } @@ -534,7 +534,7 @@ void Domain::Decomp( const std::string& Filename ) for (int j = 0; j 0){ SegData[k*global_Nx*global_Ny+j*global_Nx+i] = reflection_id; } From 214917021aaf9d89c6efa6d2f795683df9421ff3 Mon Sep 17 00:00:00 2001 From: James McClure Date: Wed, 6 May 2020 14:10:57 -0400 Subject: [PATCH 120/121] rescale force after user time interval --- models/ColorModel.cpp | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 189f0059..49fc635c 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -494,6 +494,7 @@ void ScaLBL_ColorModel::Run(){ int IMAGE_COUNT = 0; std::vector ImageList; bool SET_CAPILLARY_NUMBER = false; + bool RESCALE_FORCE = false; bool MORPH_ADAPT = false; bool USE_MORPH = false; bool USE_SEED = false; @@ -502,6 +503,7 @@ void ScaLBL_ColorModel::Run(){ int MAX_MORPH_TIMESTEPS = 50000; // maximum number of LBM timesteps to spend in morphological adaptation routine int MIN_STEADY_TIMESTEPS = 100000; int MAX_STEADY_TIMESTEPS = 200000; + int RESCALE_FORCE_AFTER_TIMESTEP = 0; int RAMP_TIMESTEPS = 0;//50000; // number of timesteps to run initially (to get a reasonable velocity field before other pieces kick in) int CURRENT_MORPH_TIMESTEPS=0; // counter for number of timesteps spent in morphological 
adaptation routine (reset each time) int CURRENT_STEADY_TIMESTEPS=0; // counter for number of timesteps spent in morphological adaptation routine (reset each time) @@ -563,7 +565,9 @@ void ScaLBL_ColorModel::Run(){ if (color_db->keyExists( "capillary_number" )){ capillary_number = color_db->getScalar( "capillary_number" ); SET_CAPILLARY_NUMBER=true; - //RESCALE_FORCE_MAX = 1; + } + if (color_db->keyExists( "rescale_force_after_timestep" )){ + RESCALE_FORCE_AFTER_TIMESTEP = color_db->getScalar( "rescale_force_after_timestep" ); } if (color_db->keyExists( "timestep" )){ timestep = color_db->getScalar( "timestep" ); @@ -791,7 +795,20 @@ void ScaLBL_ColorModel::Run(){ isSteady = true; if (CURRENT_STEADY_TIMESTEPS > MAX_STEADY_TIMESTEPS) isSteady = true; - + if (RESCALE_FORCE == true && SET_CAPILLARY_NUMBER == true && CURRENT_STEADY_TIMESTEPS > RESCALE_FORCE_AFTER_TIMESTEP){ + RESCALE_FORCE = false; + Fx *= capillary_number / Ca; + Fy *= capillary_number / Ca; + Fz *= capillary_number / Ca; + if (force_mag > 1e-3){ + Fx *= 1e-3/force_mag; // impose ceiling for stability + Fy *= 1e-3/force_mag; + Fz *= 1e-3/force_mag; + } + if (rank == 0) printf(" -- adjust force by factor %f \n ",capillary_number / Ca); + Averages->SetParams(rhoA,rhoB,tauA,tauB,Fx,Fy,Fz,alpha,beta); + color_db->putVector("F",{Fx,Fy,Fz}); + } if ( isSteady ){ MORPH_ADAPT = true; CURRENT_MORPH_TIMESTEPS=0; @@ -952,12 +969,17 @@ void ScaLBL_ColorModel::Run(){ CURRENT_STEADY_TIMESTEPS=0; initial_volume = volA*Dm->Volume; delta_volume = 0.0; + if (RESCALE_FORCE_AFTER_TIMESTEP > 0) + RESCALE_FORCE = true; } else if (!(USE_DIRECT) && CURRENT_MORPH_TIMESTEPS > MAX_MORPH_TIMESTEPS) { MORPH_ADAPT = false; CURRENT_STEADY_TIMESTEPS=0; initial_volume = volA*Dm->Volume; delta_volume = 0.0; + RESCALE_FORCE = true; + if (RESCALE_FORCE_AFTER_TIMESTEP > 0) + RESCALE_FORCE = true; } } morph_timesteps += analysis_interval; From 09a9a05a8780941240fd790b6825cb3f771d73c3 Mon Sep 17 00:00:00 2001 From: James McClure Date: Wed, 6 May 2020 15:12:50 -0400 Subject: [PATCH 121/121] enable force adaptation --- models/ColorModel.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/models/ColorModel.cpp b/models/ColorModel.cpp index 49fc635c..7b883657 100644 --- a/models/ColorModel.cpp +++ b/models/ColorModel.cpp @@ -568,6 +568,7 @@ void ScaLBL_ColorModel::Run(){ } if (color_db->keyExists( "rescale_force_after_timestep" )){ RESCALE_FORCE_AFTER_TIMESTEP = color_db->getScalar( "rescale_force_after_timestep" ); + RESCALE_FORCE = true; } if (color_db->keyExists( "timestep" )){ timestep = color_db->getScalar( "timestep" );
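
The rescale_force_after_timestep option introduced in the last two patches lets the color model adjust its body force toward the user-specified capillary number once the flow has had time to develop: after the requested number of steady timesteps the force is multiplied by capillary_number/Ca, with a ceiling on the force magnitude for stability. A minimal sketch of that adjustment, with the arithmetic pulled out of ScaLBL_ColorModel::Run() into a standalone helper (the name rescaleForce is illustrative, and force_mag is recomputed here after the adjustment rather than reused from earlier in the loop):

    #include <cmath>

    // Scale the body force so the measured capillary number Ca moves toward the
    // target value, then cap the force magnitude to keep the LBM scheme stable.
    void rescaleForce(double &Fx, double &Fy, double &Fz,
                      double Ca, double capillary_number)
    {
        double scale = capillary_number / Ca;      // proportional adjustment
        Fx *= scale;  Fy *= scale;  Fz *= scale;
        double force_mag = std::sqrt(Fx*Fx + Fy*Fy + Fz*Fz);
        if (force_mag > 1.0e-3){                   // impose ceiling for stability
            Fx *= 1.0e-3 / force_mag;
            Fy *= 1.0e-3 / force_mag;
            Fz *= 1.0e-3 / force_mag;
        }
    }

With both capillary_number and rescale_force_after_timestep set in the input database, the adjusted force is pushed back into the analysis parameters and the database (Averages->SetParams and color_db->putVector("F", ...)) so that subsequent analysis and restarts see the updated value.
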