Merge branch 'master' of github.com:JamesEMcClure/LBPM-WIA

2018-09-23 00:00:14 -04:00 · 2018-09-23 00:00:14 -04:00 · bc1e23adf3
commit bc1e23adf3
parent 433b30ada1 3c55966984
26 changed files with 1553 additions and 1204 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -31,14 +31,10 @@ IF ( NOT CMAKE_CXX_STANDARD )
        MESSAGE( WARNING "CXX_STD is obsolete, please set CMAKE_CXX_STANDARD" )
        SET( CMAKE_CXX_STANDARD ${CXX_STD} )
    ENDIF()
-    SET( CMAKE_CXX_STANDARD 11 )
+    SET( CMAKE_CXX_STANDARD 14 )
 ENDIF()


-# Disable Link Time Optimization until we have a chance to test
-SET( DISABLE_LTO )
-
-
 # Set source/install paths
 SET( ${PROJ}_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}" )
 SET( ${PROJ}_BUILD_DIR  "${CMAKE_CURRENT_BINARY_DIR}" )
--- a/analysis/Minkowski.cpp
+++ b/analysis/Minkowski.cpp
@ -26,9 +26,19 @@
 #include "IO/Reader.h"
 #include "IO/Writer.h"

+#include "ProfilerApp.h"
+
+
 #define PI 3.14159265359

 // Constructor
+
+Minkowski::Minkowski():
+	kstart(0), kfinish(0), isovalue(0), Volume(0),
+    LOGFILE(NULL), Dm(NULL), Vi(0), Vi_global(0)
+{
+}
+
 Minkowski::Minkowski(std::shared_ptr <Domain> dm):
 	kstart(0), kfinish(0), isovalue(0), Volume(0),
    LOGFILE(NULL), Dm(dm), Vi(0), Vi_global(0)
@ -51,7 +61,7 @@ Minkowski::Minkowski(std::shared_ptr <Domain> dm):
 // Destructor
 Minkowski::~Minkowski()
 {
-    if ( LOGFILE!=NULL ) { fclose(LOGFILE); }
+  //    if ( LOGFILE!=NULL ) { fclose(LOGFILE); }
 }

 double Minkowski::V(){
@ -70,11 +80,11 @@ double Minkowski::X(){
 	return Xi_global;
 }

-void Minkowski::ComputeScalar(const DoubleArray Field, const double isovalue)
+void Minkowski::ComputeScalar(const DoubleArray& Field, const double isovalue)
 {
+    PROFILE_START("ComputeScalar");
 	Xi = Ji = Ai = 0.0;
 	DECL object;
-	Point P1,P2,P3;
 	int e1,e2,e3;
 	double s,s1,s2,s3;
 	double a1,a2,a3;
@ -90,13 +100,13 @@ void Minkowski::ComputeScalar(const DoubleArray Field, const double isovalue)
 					e1 = object.Face(idx); 
 					e2 = object.halfedge.next(e1);
 					e3 = object.halfedge.next(e2);
-					P1 = object.vertex.coords(object.halfedge.v1(e1));
-					P2 = object.vertex.coords(object.halfedge.v1(e2));
-					P3 = object.vertex.coords(object.halfedge.v1(e3));
+					auto P1 = object.vertex.coords(object.halfedge.v1(e1));
+					auto P2 = object.vertex.coords(object.halfedge.v1(e2));
+					auto P3 = object.vertex.coords(object.halfedge.v1(e3));
 					// Surface area
-					s1 = sqrt((P1.x-P2.x)*(P1.x-P2.x)+(P1.y-P2.y)*(P1.y-P2.y)+(P1.z-P2.z)*(P1.z-P2.z));
-					s2 = sqrt((P2.x-P3.x)*(P2.x-P3.x)+(P2.y-P3.y)*(P2.y-P3.y)+(P2.z-P3.z)*(P2.z-P3.z));
-					s3 = sqrt((P1.x-P3.x)*(P1.x-P3.x)+(P1.y-P3.y)*(P1.y-P3.y)+(P1.z-P3.z)*(P1.z-P3.z));
+					s1 = Distance( P1, P2 );
+					s2 = Distance( P2, P3 );
+					s3 = Distance( P1, P3 );
 					s = 0.5*(s1+s2+s3);
 					Ai += sqrt(s*(s-s1)*(s-s2)*(s-s3));
 					// Mean curvature based on half edge angle
@ -137,7 +147,7 @@ void Minkowski::ComputeScalar(const DoubleArray Field, const double isovalue)
 	MPI_Allreduce(&Ai,&Ai_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm);
 	MPI_Allreduce(&Ji,&Ji_global,1,MPI_DOUBLE,MPI_SUM,Dm->Comm);
 	MPI_Barrier(Dm->Comm);
-
+    PROFILE_STOP("ComputeScalar");
 }

 void Minkowski::PrintAll()
--- a/analysis/Minkowski.h
+++ b/analysis/Minkowski.h
@ -61,9 +61,10 @@ public:
 	double X();
 	
 	//...........................................................................
+	Minkowski();
 	Minkowski(std::shared_ptr <Domain> Dm);
 	~Minkowski();
-	void ComputeScalar(const DoubleArray Field, const double isovalue);
+	void ComputeScalar(const DoubleArray& Field, const double isovalue);
 	void PrintAll();

 };
--- a/analysis/TwoPhase.cpp
+++ b/analysis/TwoPhase.cpp
@ -127,6 +127,10 @@ TwoPhase::TwoPhase(std::shared_ptr <Domain> dm):
 	Vel_x.resize(Nx,Ny,Nz);         Vel_x.fill(0);	    // Gradient of the phase indicator field
 	Vel_y.resize(Nx,Ny,Nz);         Vel_y.fill(0);
 	Vel_z.resize(Nx,Ny,Nz);         Vel_z.fill(0);
+	
+	wet_morph = Minkowski(Dm);
+	nonwet_morph = Minkowski(Dm);
+
 	//.........................................
 	// Allocate cube storage space
 	CubeValues.resize(2,2,2);
@ -180,7 +184,8 @@ TwoPhase::TwoPhase(std::shared_ptr <Domain> dm):
 			fprintf(TIMELOG,"Gnsxx Gnsyy Gnszz Gnsxy Gnsxz Gnsyz ");
 			fprintf(TIMELOG,"trawn trJwn trRwn ");			//trimmed curvature,
 			fprintf(TIMELOG,"wwndnw wwnsdnwn Jwnwwndnw "); 	//kinematic quantities,
-			fprintf(TIMELOG,"Euler Kn Jn An\n"); 			//miknowski measures,
+			fprintf(TIMELOG,"Vw Jw Aw Xw "); 			//miknowski measures,
+			fprintf(TIMELOG,"Vn Jn An Xn\n"); 			//miknowski measures,
 		}

 		NWPLOG = fopen("components.NWP.tcat","a+");
@ -209,8 +214,9 @@ TwoPhase::TwoPhase(std::shared_ptr <Domain> dm):
 		fprintf(TIMELOG,"Gnsxx Gnsyy Gnszz Gnsxy Gnsxz Gnsyz ");
 		fprintf(TIMELOG,"trawn trJwn trRwn ");			//trimmed curvature,
 		fprintf(TIMELOG,"wwndnw wwnsdnwn Jwnwwndnw "); 	//kinematic quantities,
-		fprintf(TIMELOG,"Euler Kn Jn An\n"); 			//miknowski measures,
-	}
+		fprintf(TIMELOG,"Vw Jw Aw Xw "); 			//miknowski measures,
+		fprintf(TIMELOG,"Vn Jn An Xn\n"); 			//miknowski measures,
+		}
 }


@ -572,7 +578,55 @@ void TwoPhase::ComputeLocal()
 			}
 		}
 	}
-	
+	Array <char> phase_label(Nx,Ny,Nz);
+	Array <double> phase_distance(Nx,Ny,Nz);
+	// Analyze the wetting fluid
+	for (k=0; k<Nz; k++){
+		for (j=0; j<Ny; j++){
+			for (i=0; i<Nx; i++){
+				n = k*Nx*Ny+j*Nx+i;
+				if (!(Dm->id[n] > 0)){
+					// Solid phase
+					phase_label(i,j,k) = 0;
+				}
+				else if (SDn(i,j,k) < 0.0){
+					// wetting phase
+					phase_label(i,j,k) = 1;
+				}
+				else {
+					// non-wetting phase
+					phase_label(i,j,k) = 0;
+				}
+				phase_distance(i,j,k) =2.0*double(phase_label(i,j,k))-1.0;
+			}
+		}
+	}	
+	CalcDist(phase_distance,phase_label,*Dm);
+	wet_morph.ComputeScalar(phase_distance,0.f);
+
+	// Analyze the wetting fluid
+	for (k=0; k<Nz; k++){
+		for (j=0; j<Ny; j++){
+			for (i=0; i<Nx; i++){
+				n = k*Nx*Ny+j*Nx+i;
+				if (!(Dm->id[n] > 0)){
+					// Solid phase
+					phase_label(i,j,k) = 0;
+				}
+				else if (SDn(i,j,k) < 0.0){
+					// wetting phase
+					phase_label(i,j,k) = 0;
+				}
+				else {
+					// non-wetting phase
+					phase_label(i,j,k) = 1;
+				}
+				phase_distance(i,j,k) =2.0*double(phase_label(i,j,k))-1.0;
+			}
+		}
+	}	
+	CalcDist(phase_distance,phase_label,*Dm);
+	nonwet_morph.ComputeScalar(phase_distance,0.f);
 }


@ -1274,7 +1328,10 @@ void TwoPhase::PrintAll(int timestep)
 				Gws_global(0),Gws_global(1),Gws_global(2),Gws_global(3),Gws_global(4),Gws_global(5));	// orientation of ws interface
 		fprintf(TIMELOG,"%.5g %.5g %.5g ",trawn_global, trJwn_global, trRwn_global);		// Trimmed curvature
 		fprintf(TIMELOG,"%.5g %.5g %.5g ",wwndnw_global, wwnsdnwn_global, Jwnwwndnw_global);		// kinematic quantities
-		fprintf(TIMELOG,"%.5g %.5g %.5g %.5g\n",euler_global, Kn_global, Jn_global, An_global);			// minkowski measures
+		fprintf(TIMELOG,"%.5g %.5g %.5g %.5g ", wet_morph.Vi_global, wet_morph.Ji_global, wet_morph.Ai_global, wet_morph.Xi_global);			// minkowski measures
+		fprintf(TIMELOG,"%.5g %.5g %.5g %.5g\n", nonwet_morph.Vi_global, nonwet_morph.Ji_global, nonwet_morph.Ai_global, nonwet_morph.Xi_global);			// minkowski measures
+
+
 		fflush(TIMELOG);
 	}
 	else{
@ -1299,7 +1356,9 @@ void TwoPhase::PrintAll(int timestep)
 				Gws(0),Gws(1),Gws(2),Gws(3),Gws(4),Gws(5));	// orientation of ws interface
 		fprintf(TIMELOG,"%.5g %.5g %.5g ",trawn, trJwn, trRwn);		// Trimmed curvature
 		fprintf(TIMELOG,"%.5g %.5g %.5g ",wwndnw, wwnsdnwn, Jwnwwndnw);		// kinematic quantities
-		fprintf(TIMELOG,"%.5g %.5g %.5g %.5g\n",euler, Kn, Jn, An);			// minkowski measures
+		fprintf(TIMELOG,"%.5g %.5g %.5g %.5g ", wet_morph.Vi, wet_morph.Ji, wet_morph.Ai, wet_morph.Xi);			// minkowski measures
+		fprintf(TIMELOG,"%.5g %.5g %.5g %.5g\n", nonwet_morph.Vi, nonwet_morph.Ji, nonwet_morph.Ai, nonwet_morph.Xi);			// minkowski measures
+
 		fflush(TIMELOG);
 	}

--- a/analysis/TwoPhase.h
+++ b/analysis/TwoPhase.h
@ -23,6 +23,8 @@
 #include "common/Domain.h"
 #include "common/Communication.h"
 #include "analysis/analysis.h"
+#include "analysis/distance.h"
+#include "analysis/Minkowski.h"

 #include "shared_ptr.h"
 #include "common/Utilities.h"
@ -156,6 +158,11 @@ public:
 	DoubleArray Vel_x;		// Velocity
 	DoubleArray Vel_y;
 	DoubleArray Vel_z;
+	DoubleArray PhaseDistance;
+
+	Minkowski wet_morph;
+	Minkowski nonwet_morph;
+	
 	//	Container for averages;
 	DoubleArray ComponentAverages_WP;
 	DoubleArray ComponentAverages_NWP;
--- a/analysis/analysis.cpp
+++ b/analysis/analysis.cpp
@ -15,6 +15,8 @@
 */
 #include "analysis/analysis.h"
 #include "ProfilerApp.h"
+
+#include <algorithm>
 #include <iostream>


--- a/analysis/distance.cpp
+++ b/analysis/distance.cpp
@ -16,6 +16,7 @@
 #include "analysis/distance.h"


+
 /******************************************************************
 * A fast distance calculation                                     *
 ******************************************************************/
--- a/analysis/distance.h
+++ b/analysis/distance.h
@ -17,6 +17,7 @@
 #define Distance_H_INC

 #include "common/Domain.h"
+#include "common/Array.hpp"


 struct Vec {
--- a/common/Array.cpp
+++ b/common/Array.cpp
@ -0,0 +1,117 @@
+#include "common/Array.h"
+#include "common/Array.hpp"
+
+#include <complex>
+
+
+/********************************************************
+ *  ArraySize                                            *
+ ********************************************************/
+ArraySize::ArraySize( const std::vector<size_t>& N )
+{
+    d_ndim = N.size();
+    d_N[0] = 0;
+    d_N[1] = 1;
+    d_N[2] = 1;
+    d_N[3] = 1;
+    d_N[4] = 1;
+    for ( size_t i = 0; i < d_ndim; i++ )
+        d_N[i] = N[i];
+    d_length = 1;
+    for ( unsigned long i : d_N )
+        d_length *= i;
+    if ( d_ndim == 0 )
+        d_length = 0;
+}
+
+
+/********************************************************
+ *  Explicit instantiations of Array                     *
+ ********************************************************/
+template class Array<char, FunctionTable>;
+template class Array<uint8_t, FunctionTable>;
+template class Array<uint16_t, FunctionTable>;
+template class Array<uint32_t, FunctionTable>;
+template class Array<uint64_t, FunctionTable>;
+template class Array<int8_t, FunctionTable>;
+template class Array<int16_t, FunctionTable>;
+template class Array<int32_t, FunctionTable>;
+template class Array<int64_t, FunctionTable>;
+template class Array<float, FunctionTable>;
+template class Array<double, FunctionTable>;
+
+
+/********************************************************
+ *  Explicit instantiations of Array<bool>               *
+ ********************************************************/
+// clang-format off
+template Array<bool, FunctionTable>::Array();
+template Array<bool, FunctionTable>::~Array();
+template Array<bool, FunctionTable>::Array( size_t );
+template Array<bool, FunctionTable>::Array( size_t, size_t );
+template Array<bool, FunctionTable>::Array( size_t, size_t, size_t );
+template Array<bool, FunctionTable>::Array( size_t, size_t, size_t, size_t );
+template Array<bool, FunctionTable>::Array( size_t, size_t, size_t, size_t, size_t );
+template Array<bool, FunctionTable>::Array( const std::vector<size_t>&, const bool* );
+template Array<bool, FunctionTable>::Array( std::string );
+template Array<bool, FunctionTable>::Array( std::initializer_list<bool> );
+template Array<bool, FunctionTable>::Array( const Array<bool, FunctionTable>& );
+template Array<bool, FunctionTable>::Array( Array<bool, FunctionTable>&& );
+template Array<bool, FunctionTable>& Array<bool, FunctionTable>::operator=( const Array<bool, FunctionTable>& );
+template Array<bool, FunctionTable>& Array<bool, FunctionTable>::operator=( Array<bool, FunctionTable>&& );
+template Array<bool, FunctionTable>& Array<bool, FunctionTable>::operator=( const std::vector<bool>& );
+template void Array<bool, FunctionTable>::fill(bool const&);
+template void Array<bool, FunctionTable>::clear();
+template bool Array<bool, FunctionTable>::operator==(Array<bool, FunctionTable> const&) const;
+template void Array<bool, FunctionTable>::resize( ArraySize const& );
+// clang-format on
+
+
+/********************************************************
+ *  Explicit instantiations of Array<std::complex>       *
+ ********************************************************/
+// clang-format off
+template Array<std::complex<double>, FunctionTable>::Array();
+template Array<std::complex<double>, FunctionTable>::~Array();
+template Array<std::complex<double>, FunctionTable>::Array( size_t );
+template Array<std::complex<double>, FunctionTable>::Array( size_t, size_t );
+template Array<std::complex<double>, FunctionTable>::Array( size_t, size_t, size_t );
+template Array<std::complex<double>, FunctionTable>::Array( size_t, size_t, size_t, size_t );
+template Array<std::complex<double>, FunctionTable>::Array( size_t, size_t, size_t, size_t, size_t );
+template Array<std::complex<double>, FunctionTable>::Array( const std::vector<size_t>&, const std::complex<double>* );
+template Array<std::complex<double>, FunctionTable>::Array( std::initializer_list<std::complex<double>> );
+template Array<std::complex<double>, FunctionTable>::Array( const Range<std::complex<double>>& range );
+template Array<std::complex<double>, FunctionTable>::Array( const Array<std::complex<double>, FunctionTable>& );
+template Array<std::complex<double>, FunctionTable>::Array( Array<std::complex<double>, FunctionTable>&& );
+template Array<std::complex<double>, FunctionTable>& Array<std::complex<double>, FunctionTable>::operator=( const Array<std::complex<double>, FunctionTable>& );
+template Array<std::complex<double>, FunctionTable>& Array<std::complex<double>, FunctionTable>::operator=( Array<std::complex<double>, FunctionTable>&& );
+template Array<std::complex<double>, FunctionTable>& Array<std::complex<double>, FunctionTable>::operator=( const std::vector<std::complex<double>>& );
+template void Array<std::complex<double>, FunctionTable>::resize( ArraySize const& );
+template void Array<std::complex<double>, FunctionTable>::viewRaw( ArraySize const&, std::complex<double>*, bool, bool );
+template void Array<std::complex<double>, FunctionTable>::fill(std::complex<double> const&);
+template void Array<std::complex<double>, FunctionTable>::clear();
+template bool Array<std::complex<double>, FunctionTable>::operator==(Array<std::complex<double>, FunctionTable> const&) const;
+template Array<std::complex<double>, FunctionTable> Array<std::complex<double>, FunctionTable>::repmat(std::vector<unsigned long, std::allocator<unsigned long> > const&) const;
+// clang-format on
+
+
+/********************************************************
+ *  Explicit instantiations of Array<std::string>        *
+ ********************************************************/
+// clang-format off
+template Array<std::string, FunctionTable>::Array();
+template Array<std::string, FunctionTable>::~Array();
+template Array<std::string, FunctionTable>::Array( size_t );
+template Array<std::string, FunctionTable>::Array( size_t, size_t );
+template Array<std::string, FunctionTable>::Array( size_t, size_t, size_t );
+template Array<std::string, FunctionTable>::Array( size_t, size_t, size_t, size_t );
+template Array<std::string, FunctionTable>::Array( size_t, size_t, size_t, size_t, size_t );
+template Array<std::string, FunctionTable>::Array( const std::vector<size_t>&, const std::string* );
+template Array<std::string, FunctionTable>::Array( std::initializer_list<std::string> );
+template Array<std::string, FunctionTable>::Array( const Array<std::string, FunctionTable>& );
+template Array<std::string, FunctionTable>::Array( Array<std::string, FunctionTable>&& );
+template Array<std::string, FunctionTable>& Array<std::string, FunctionTable>::operator=( const Array<std::string, FunctionTable>& );
+template Array<std::string, FunctionTable>& Array<std::string, FunctionTable>::operator=( Array<std::string, FunctionTable>&& );
+template Array<std::string, FunctionTable>& Array<std::string, FunctionTable>::operator=( const std::vector<std::string>& );
+template void Array<std::string, FunctionTable>::resize( ArraySize const& );
+// clang-format on
--- a/common/Array.h
+++ b/common/Array.h
@ -16,247 +16,20 @@
 #ifndef included_ArrayClass
 #define included_ArrayClass

+#include "common/ArraySize.h"
+
 #include <array>
-#include <cstring>
 #include <functional>
 #include <initializer_list>
 #include <iostream>
 #include <memory>
 #include <vector>

-#include "Utilities.h"
-
-
-#if defined( __CUDA_ARCH__ )
-#include <cuda.h>
-#define HOST_DEVICE __host__ __device__
-#else
-#define HOST_DEVICE
-#endif
-#if defined( USING_GCC ) || defined( USING_CLANG )
-#define ATTRIBUTE_INLINE __attribute__( ( always_inline ) )
-#else
-#define ATTRIBUTE_INLINE
-#endif
-
-
-#if ( defined( DEBUG ) || defined( _DEBUG ) ) && !defined( NDEBUG )
-#define CHECK_ARRAY_LENGTH( i )                                      \
-    do {                                                             \
-        if ( i >= d_length )                                         \
-            throw std::length_error( "Index exceeds array bounds" ); \
-    } while ( 0 )
-#else
-#define CHECK_ARRAY_LENGTH( i ) \
-    do {                        \
-    } while ( 0 )
-#endif
-
-
-// Forward decleration
-class FunctionTable;
-
-
-//! Simple range class
-template<class TYPE = size_t>
-class Range final
-{
-public:
-    //! Empty constructor
-    Range() : i( 0 ), j( -1 ), k( 1 ) {}
-
-    /*!
-     * Create a range i:k:j (or i:j)
-     * @param i_            Starting value
-     * @param j_            Ending value
-     * @param k_            Increment value
-     */
-    Range( TYPE i_, TYPE j_, TYPE k_ = 1 ) : i( i_ ), j( j_ ), k( k_ ) {}
-
-    TYPE i, j, k;
-};
-
-
-//! Simple class to store the array dimensions
-class ArraySize final
-{
-public:
-    //! Empty constructor
-    inline ArraySize();
-
-    /*!
-     * Create the vector size
-     * @param N1            Number of elements in the first dimension
-     */
-    inline ArraySize( size_t N1 );
-
-    /*!
-     * Create the vector size
-     * @param N1            Number of elements in the first dimension
-     * @param N2            Number of elements in the second dimension
-     */
-    inline ArraySize( size_t N1, size_t N2 );
-
-    /*!
-     * Create the vector size
-     * @param N1            Number of elements in the first dimension
-     * @param N2            Number of elements in the second dimension
-     * @param N3            Number of elements in the third dimension
-     */
-    inline ArraySize( size_t N1, size_t N2, size_t N3 );
-
-    /*!
-     * Create the vector size
-     * @param N1            Number of elements in the first dimension
-     * @param N2            Number of elements in the second dimension
-     * @param N3            Number of elements in the third dimension
-     * @param N4            Number of elements in the fourth dimension
-     */
-    inline ArraySize( size_t N1, size_t N2, size_t N3, size_t N4 );
-
-    /*!
-     * Create the vector size
-     * @param N1            Number of elements in the first dimension
-     * @param N2            Number of elements in the second dimension
-     * @param N3            Number of elements in the third dimension
-     * @param N4            Number of elements in the fourth dimension
-     * @param N5            Number of elements in the fifth dimension
-     */
-    inline ArraySize( size_t N1, size_t N2, size_t N3, size_t N4, size_t N5 );
-
-    /*!
-     * Create from initializer list
-     * @param N             Size of the array
-     */
-    inline ArraySize( std::initializer_list<size_t> N );
-
-    /*!
-     * Create from raw pointer
-     * @param ndim          Number of dimensions
-     * @param ndim          Dimensions
-     */
-    inline ArraySize( size_t ndim, const size_t *dims );
-
-    /*!
-     * Create from std::vector
-     * @param N             Size of the array
-     */
-    inline ArraySize( const std::vector<size_t> &N );
-
-    /*!
-     * Copy constructor
-     * @param rhs           Array to copy
-     */
-    inline ArraySize( const ArraySize &rhs );
-
-    /*!
-     * Move constructor
-     * @param rhs           Array to copy
-     */
-    inline ArraySize( ArraySize &&rhs );
-
-    /*!
-     * Assignment operator
-     * @param rhs           Array to copy
-     */
-    inline ArraySize &operator=( const ArraySize &rhs );
-
-    /*!
-     * Move assignment operator
-     * @param rhs           Array to copy
-     */
-    inline ArraySize &operator=( ArraySize &&rhs );
-
-    /*!
-     * Access the ith dimension
-     * @param i             Index to access
-     */
-    inline size_t operator[]( size_t i ) const { return d_N[i]; }
-
-    //! Sum the elements
-    inline uint8_t ndim() const ATTRIBUTE_INLINE { return d_ndim; }
-
-    //! Sum the elements
-    inline size_t size() const ATTRIBUTE_INLINE { return d_ndim; }
-
-    //! Sum the elements
-    inline size_t length() const ATTRIBUTE_INLINE { return d_length; }
-
-    //! Sum the elements
-    inline void resize( uint8_t dim, size_t N );
-
-    //! Returns an iterator to the beginning
-    inline const size_t *begin() const ATTRIBUTE_INLINE { return d_N; }
-
-    //! Returns an iterator to the end
-    inline const size_t *end() const ATTRIBUTE_INLINE { return d_N + d_ndim; }
-
-    // Check if two matrices are equal
-    inline bool operator==( const ArraySize &rhs ) const ATTRIBUTE_INLINE
-    {
-        return d_ndim == rhs.d_ndim && memcmp( d_N, rhs.d_N, sizeof( d_N ) ) == 0;
-    }
-
-    //! Check if two matrices are not equal
-    inline bool operator!=( const ArraySize &rhs ) const ATTRIBUTE_INLINE
-    {
-        return d_ndim != rhs.d_ndim || memcmp( d_N, rhs.d_N, sizeof( d_N ) ) != 0;
-    }
-
-    //! Maximum supported dimension
-    constexpr static uint8_t maxDim() ATTRIBUTE_INLINE { return 5u; }
-
-    //! Get the index
-    inline size_t index( size_t i ) const ATTRIBUTE_INLINE
-    {
-        CHECK_ARRAY_LENGTH( i );
-        return i;
-    }
-
-    //! Get the index
-    inline size_t index( size_t i1, size_t i2 ) const ATTRIBUTE_INLINE
-    {
-        size_t index = i1 + i2 * d_N[0];
-        CHECK_ARRAY_LENGTH( index );
-        return index;
-    }
-
-    //! Get the index
-    inline size_t index( size_t i1, size_t i2, size_t i3 ) const ATTRIBUTE_INLINE
-    {
-        size_t index = i1 + d_N[0] * ( i2 + d_N[1] * i3 );
-        CHECK_ARRAY_LENGTH( index );
-        return index;
-    }
-
-    //! Get the index
-    inline size_t index( size_t i1, size_t i2, size_t i3, size_t i4 ) const ATTRIBUTE_INLINE
-    {
-        size_t index = i1 + d_N[0] * ( i2 + d_N[1] * ( i3 + d_N[2] * i4 ) );
-        CHECK_ARRAY_LENGTH( index );
-        return index;
-    }
-
-    //! Get the index
-    inline size_t index(
-        size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 ) const ATTRIBUTE_INLINE
-    {
-        size_t index = i1 + d_N[0] * ( i2 + d_N[1] * ( i3 + d_N[2] * ( i4 + d_N[3] * i5 ) ) );
-        CHECK_ARRAY_LENGTH( index );
-        return index;
-    }
-
-private:
-    uint8_t d_ndim;
-    size_t d_length;
-    size_t d_N[5];
-};
-

 /*!
 * Class Array is a multi-dimensional array class written by Mark Berrill
 */
-template<class TYPE, class FUN = FunctionTable>
+template<class TYPE, class FUN, class Allocator>
 class Array final
 {
 public: // Constructors / assignment operators
@ -324,6 +97,12 @@ public: // Constructors / assignment operators
     */
    explicit Array( const Range<TYPE> &range );

+    /*!
+     * Create a 1D Array using a string that mimic's MATLAB
+     * @param range         Range of the data
+     */
+    explicit Array( std::string range );
+
    /*!
     * Create a 1D Array with the given initializer list
     * @param data          Input data
@ -361,74 +140,34 @@ public: // Constructors / assignment operators
     */
    Array &operator=( const std::vector<TYPE> &rhs );

+    //! Is copyable?
+    inline bool isCopyable() const { return d_isCopyable; }
+
+    //! Set is copyable
+    inline void setCopyable( bool flag ) { d_isCopyable = flag; }
+
+    //! Is fixed size?
+    inline bool isFixedSize() const { return d_isFixedSize; }
+
+    //! Set is copyable
+    inline void setFixedSize( bool flag ) { d_isFixedSize = flag; }
+
+
 public: // Views/copies/subset
    /*!
-     * Create a 1D Array view to a raw block of data
-     * @param N             Number of elements in the array
+     * Create a multi-dimensional Array view to a raw block of data
+     * @param N             Number of elements in each dimension
     * @param data          Pointer to the data
     */
-    static std::shared_ptr<Array> view( size_t N, std::shared_ptr<TYPE> const &data );
+    static std::unique_ptr<Array> view( const ArraySize &N, std::shared_ptr<TYPE> &data );

-    /*!
-     * Create a new 2D Array with the given number of rows and columns
-     * @param N_rows        Number of rows
-     * @param N_columns     Number of columns
-     * @param data          Pointer to the data
-     */
-    static std::shared_ptr<Array> view(
-        size_t N_rows, size_t N_columns, std::shared_ptr<TYPE> const &data );
-
-    /*!
-     * Create a new 3D Array view to a raw block of data
-     * @param N1            Number of rows
-     * @param N2            Number of columns
-     * @param N3            Number of elements in the third dimension
-     * @param data          Pointer to the data
-     */
-    static std::shared_ptr<Array> view(
-        size_t N1, size_t N2, size_t N3, std::shared_ptr<TYPE> const &data );

    /*!
     * Create a multi-dimensional Array view to a raw block of data
     * @param N             Number of elements in each dimension
     * @param data          Pointer to the data
     */
-    static std::shared_ptr<Array> view( const ArraySize &N, std::shared_ptr<TYPE> const &data );
-
-
-    /*!
-     * Create a 1D Array view to a raw block of data
-     * @param N             Number of elements in the array
-     * @param data          Pointer to the data
-     */
-    static std::shared_ptr<const Array> constView(
-        size_t N, std::shared_ptr<const TYPE> const &data );
-
-    /*!
-     * Create a new 2D Array with the given number of rows and columns
-     * @param N_rows        Number of rows
-     * @param N_columns     Number of columns
-     * @param data          Pointer to the data
-     */
-    static std::shared_ptr<const Array> constView(
-        size_t N_rows, size_t N_columns, std::shared_ptr<const TYPE> const &data );
-
-    /*!
-     * Create a new 3D Array view to a raw block of data
-     * @param N1            Number of rows
-     * @param N2            Number of columns
-     * @param N3            Number of elements in the third dimension
-     * @param data          Pointer to the data
-     */
-    static std::shared_ptr<const Array> constView(
-        size_t N1, size_t N2, size_t N3, std::shared_ptr<const TYPE> const &data );
-
-    /*!
-     * Create a multi-dimensional Array view to a raw block of data
-     * @param N             Number of elements in each dimension
-     * @param data          Pointer to the data
-     */
-    static std::shared_ptr<const Array> constView(
+    static std::unique_ptr<const Array> constView(
        const ArraySize &N, std::shared_ptr<const TYPE> const &data );


@ -447,7 +186,7 @@ public: // Views/copies/subset

    /*!
     * Make this object a view of the raw data (expert use only).
-     * Use view2( N, std::shared_ptr(data,[](TYPE*){}) ) instead.
+     * Use view2( N, shared_ptr(data,[](TYPE*){}) ) instead.
     *   Note: this interface is not recommended as it does not protect from
     *   the src data being deleted while still being used by the Array.
     *   Additionally for maximum performance it does not set the internal shared_ptr
@ -455,27 +194,28 @@ public: // Views/copies/subset
     * @param ndim          Number of dimensions
     * @param dims          Number of elements in each dimension
     * @param data          Pointer to the data
+     * @param isCopyable    Once the view is created, can the array be copied
+     * @param isFixedSize   Once the view is created, is the array size fixed
     */
-    void viewRaw( int ndim, const size_t *dims, TYPE *data );
+    inline void viewRaw(
+        int ndim, const size_t *dims, TYPE *data, bool isCopyable = true, bool isFixedSize = true )
+    {
+        viewRaw( ArraySize( ndim, dims ), data, isCopyable, isFixedSize );
+    }

    /*!
     * Make this object a view of the raw data (expert use only).
-     * Use view2( N, std::shared_ptr(data,[](TYPE*){}) ) instead.
+     * Use view2( N, shared_ptr(data,[](TYPE*){}) ) instead.
     *   Note: this interface is not recommended as it does not protect from
     *   the src data being deleted while still being used by the Array.
     *   Additionally for maximum performance it does not set the internal shared_ptr
     *   so functions like getPtr and resize will not work correctly.
     * @param N             Number of elements in each dimension
     * @param data          Pointer to the data
+     * @param isCopyable    Once the view is created, can the array be copied
+     * @param isFixedSize   Once the view is created, is the array size fixed
     */
-    void viewRaw( const ArraySize &N, TYPE *data );
-
-    /*!
-     * Convert an array of one type to another.  This may or may not allocate new memory.
-     * @param array         Input array
-     */
-    template<class TYPE2>
-    static std::shared_ptr<Array<TYPE2>> convert( std::shared_ptr<Array<TYPE, FUN>> array );
+    void viewRaw( const ArraySize &N, TYPE *data, bool isCopyable = true, bool isFixedSize = true );


    /*!
@ -483,8 +223,27 @@ public: // Views/copies/subset
     * @param array         Input array
     */
    template<class TYPE2>
-    static std::shared_ptr<const Array<TYPE2>> convert(
-        std::shared_ptr<const Array<TYPE, FUN>> array );
+    static inline std::unique_ptr<Array<TYPE2, FUN, Allocator>> convert(
+        std::shared_ptr<Array<TYPE, FUN, Allocator>> array )
+    {
+        auto array2 = std::make_unique<Array<TYPE2>>( array->size() );
+        array2.copy( *array );
+        return array2;
+    }
+
+
+    /*!
+     * Convert an array of one type to another.  This may or may not allocate new memory.
+     * @param array         Input array
+     */
+    template<class TYPE2>
+    static inline std::unique_ptr<const Array<TYPE2, FUN, Allocator>> convert(
+        std::shared_ptr<const Array<TYPE, FUN, Allocator>> array )
+    {
+        auto array2 = std::make_unique<Array<TYPE2>>( array->size() );
+        array2.copy( *array );
+        return array2;
+    }


    /*!
@ -492,7 +251,11 @@ public: // Views/copies/subset
     * @param array         Source array
     */
    template<class TYPE2>
-    void copy( const Array<TYPE2> &array );
+    void inline copy( const Array<TYPE2, FUN, Allocator> &array )
+    {
+        resize( array.size() );
+        copy( array.data() );
+    }

    /*!
     * Copy and convert data from a raw vector to this array.
@ -500,22 +263,36 @@ public: // Views/copies/subset
     * @param array         Source array
     */
    template<class TYPE2>
-    void copy( const TYPE2 *array );
+    void inline copy( const TYPE2 *data )
+    {
+        for ( size_t i = 0; i < d_size.length(); i++ )
+            d_data[i] = static_cast<TYPE>( data[i] );
+    }

    /*!
     * Copy and convert data from this array to a raw vector.
     * @param array         Source array
     */
    template<class TYPE2>
-    void copyTo( TYPE2 *array ) const;
+    void inline copyTo( TYPE2 *data ) const
+    {
+        for ( size_t i = 0; i < d_size.length(); i++ )
+            data[i] = static_cast<TYPE2>( d_data[i] );
+    }

    /*!
-     * Copy and convert data from this array to a raw vector.
-     * @param array         Source array
+     * Copy and convert data from this array to a new array
     */
    template<class TYPE2>
-    Array<TYPE2, FUN> cloneTo() const;
+    Array<TYPE2, FUN, Allocator> inline cloneTo() const
+    {
+        Array<TYPE2, FUN> dst( this->size() );
+        copyTo( dst.data() );
+        return dst;
+    }

+    /*! swap the raw data pointers for the Arrays after checking for compatibility */
+    void swap( Array &other );

    /*!
     * Fill the array with the given value
@ -534,7 +311,7 @@ public: // Views/copies/subset
     * @param base        Base array
     * @param exp         Exponent value
     */
-    void pow( const Array<TYPE, FUN> &base, const TYPE &exp );
+    void pow( const Array &base, const TYPE &exp );

    //! Destructor
    ~Array();
@ -549,11 +326,7 @@ public: // Views/copies/subset


    //! Return the size of the Array
-    inline ArraySize &size() { return d_size; }
-
-
-    //! Return the size of the Array
-    inline ArraySize size() const { return d_size; }
+    inline const ArraySize &size() const { return d_size; }


    //! Return the size of the Array
@ -572,14 +345,15 @@ public: // Views/copies/subset
     * Resize the Array
     * @param N             NUmber of elements
     */
-    void resize( size_t N );
+    inline void resize( size_t N ) { resize( ArraySize( N ) ); }
+

    /*!
     * Resize the Array
-     * @param N_rows        Number of rows
-     * @param N_columns     Number of columns
+     * @param N_row         Number of rows
+     * @param N_col         Number of columns
     */
-    void resize( size_t N_rows, size_t N_columns );
+    inline void resize( size_t N_row, size_t N_col ) { resize( ArraySize( N_row, N_col ) ); }

    /*!
     * Resize the Array
@ -587,7 +361,7 @@ public: // Views/copies/subset
     * @param N2            Number of columns
     * @param N3            Number of elements in the third dimension
     */
-    void resize( size_t N1, size_t N2, size_t N3 );
+    inline void resize( size_t N1, size_t N2, size_t N3 ) { resize( ArraySize( N1, N2, N3 ) ); }

    /*!
     * Resize the Array
@ -613,19 +387,25 @@ public: // Views/copies/subset


    /*!
-     * Subset the Array (total size of array will not change)
+     * Reshape the Array so that the number of dimensions is the
+     *    max of ndim and the largest dim>1.
+     * @param ndim          Desired number of dimensions
+     */
+    inline void setNdim( int ndim ) { d_size.setNdim( ndim ); }
+
+
+    /*!
+     * Subset the Array
     * @param index         Index to subset (imin,imax,jmin,jmax,kmin,kmax,...)
     */
-    template<class TYPE2 = TYPE>
-    Array<TYPE2, FUN> subset( const std::vector<size_t> &index ) const;
+    Array subset( const std::vector<size_t> &index ) const;


    /*!
-     * Subset the Array (total size of array will not change)
+     * Subset the Array
     * @param index         Index to subset (ix:kx:jx,iy:ky:jy,...)
     */
-    template<class TYPE2 = TYPE>
-    Array<TYPE2, FUN> subset( const std::vector<Range<size_t>> &index ) const;
+    Array subset( const std::vector<Range<size_t>> &index ) const;


    /*!
@ -633,30 +413,28 @@ public: // Views/copies/subset
     * @param index         Index of the subset (imin,imax,jmin,jmax,kmin,kmax,...)
     * @param subset        The subset array to copy from
     */
-    template<class TYPE2>
-    void copySubset( const std::vector<size_t> &index, const Array<TYPE2, FUN> &subset );
+    void copySubset( const std::vector<size_t> &index, const Array &subset );

    /*!
     * Copy data from an array into a subset of this array
     * @param index         Index of the subset
     * @param subset        The subset array to copy from
     */
-    template<class TYPE2>
-    void copySubset( const std::vector<Range<size_t>> &index, const Array<TYPE2, FUN> &subset );
+    void copySubset( const std::vector<Range<size_t>> &index, const Array &subset );

    /*!
     * Add data from an array into a subset of this array
     * @param index         Index of the subset (imin,imax,jmin,jmax,kmin,kmax,...)
     * @param subset        The subset array to add from
     */
-    void addSubset( const std::vector<size_t> &index, const Array<TYPE, FUN> &subset );
+    void addSubset( const std::vector<size_t> &index, const Array &subset );

    /*!
     * Add data from an array into a subset of this array
     * @param index         Index of the subset
     * @param subset        The subset array to add from
     */
-    void addSubset( const std::vector<Range<size_t>> &index, const Array<TYPE, FUN> &subset );
+    void addSubset( const std::vector<Range<size_t>> &index, const Array &subset );


 public: // Accessors
@ -664,16 +442,13 @@ public: // Accessors
     * Access the desired element
     * @param i             The row index
     */
-    HOST_DEVICE inline TYPE &operator()( size_t i ) ATTRIBUTE_INLINE
-    {
-        return d_data[d_size.index( i )];
-    }
+    ARRAY_ATTRIBUTE inline TYPE &operator()( size_t i ) { return d_data[d_size.index( i )]; }

    /*!
     * Access the desired element
     * @param i             The row index
     */
-    HOST_DEVICE inline const TYPE &operator()( size_t i ) const ATTRIBUTE_INLINE
+    ARRAY_ATTRIBUTE inline const TYPE &operator()( size_t i ) const
    {
        return d_data[d_size.index( i )];
    }
@ -683,7 +458,7 @@ public: // Accessors
     * @param i             The row index
     * @param j             The column index
     */
-    HOST_DEVICE inline TYPE &operator()( size_t i, size_t j ) ATTRIBUTE_INLINE
+    ARRAY_ATTRIBUTE inline TYPE &operator()( size_t i, size_t j )
    {
        return d_data[d_size.index( i, j )];
    }
@ -693,7 +468,7 @@ public: // Accessors
     * @param i             The row index
     * @param j             The column index
     */
-    HOST_DEVICE inline const TYPE &operator()( size_t i, size_t j ) const ATTRIBUTE_INLINE
+    ARRAY_ATTRIBUTE inline const TYPE &operator()( size_t i, size_t j ) const
    {
        return d_data[d_size.index( i, j )];
    }
@ -704,7 +479,7 @@ public: // Accessors
     * @param j             The column index
     * @param k             The third index
     */
-    HOST_DEVICE inline TYPE &operator()( size_t i, size_t j, size_t k ) ATTRIBUTE_INLINE
+    ARRAY_ATTRIBUTE inline TYPE &operator()( size_t i, size_t j, size_t k )
    {
        return d_data[d_size.index( i, j, k )];
    }
@ -715,7 +490,7 @@ public: // Accessors
     * @param j             The column index
     * @param k             The third index
     */
-    HOST_DEVICE inline const TYPE &operator()( size_t i, size_t j, size_t k ) const ATTRIBUTE_INLINE
+    ARRAY_ATTRIBUTE inline const TYPE &operator()( size_t i, size_t j, size_t k ) const
    {
        return d_data[d_size.index( i, j, k )];
    }
@ -727,8 +502,7 @@ public: // Accessors
     * @param i3            The third index
     * @param i4            The fourth index
     */
-    HOST_DEVICE inline TYPE &operator()(
-        size_t i1, size_t i2, size_t i3, size_t i4 ) ATTRIBUTE_INLINE
+    ARRAY_ATTRIBUTE inline TYPE &operator()( size_t i1, size_t i2, size_t i3, size_t i4 )
    {
        return d_data[d_size.index( i1, i2, i3, i4 )];
    }
@ -740,8 +514,8 @@ public: // Accessors
     * @param i3            The third index
     * @param i4            The fourth index
     */
-    HOST_DEVICE inline const TYPE &operator()(
-        size_t i1, size_t i2, size_t i3, size_t i4 ) const ATTRIBUTE_INLINE
+    ARRAY_ATTRIBUTE inline const TYPE &operator()(
+        size_t i1, size_t i2, size_t i3, size_t i4 ) const
    {
        return d_data[d_size.index( i1, i2, i3, i4 )];
    }
@ -754,8 +528,7 @@ public: // Accessors
     * @param i4            The fourth index
     * @param i5            The fifth index
     */
-    HOST_DEVICE inline TYPE &operator()(
-        size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 ) ATTRIBUTE_INLINE
+    ARRAY_ATTRIBUTE inline TYPE &operator()( size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 )
    {
        return d_data[d_size.index( i1, i2, i3, i4, i5 )];
    }
@ -768,8 +541,8 @@ public: // Accessors
     * @param i4            The fourth index
     * @param i5            The fifth index
     */
-    HOST_DEVICE inline const TYPE &operator()(
-        size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 ) const ATTRIBUTE_INLINE
+    ARRAY_ATTRIBUTE inline const TYPE &operator()(
+        size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 ) const
    {
        return d_data[d_size.index( i1, i2, i3, i4, i5 )];
    }
@ -778,7 +551,7 @@ public: // Accessors
     * Access the desired element as a raw pointer
     * @param i             The global index
     */
-    HOST_DEVICE inline TYPE *ptr( size_t i ) ATTRIBUTE_INLINE
+    ARRAY_ATTRIBUTE inline TYPE *ptr( size_t i )
    {
        return i >= d_size.length() ? nullptr : &d_data[i];
    }
@ -787,34 +560,34 @@ public: // Accessors
     * Access the desired element as a raw pointer
     * @param i             The global index
     */
-    HOST_DEVICE inline const TYPE *ptr( size_t i ) const ATTRIBUTE_INLINE
+    ARRAY_ATTRIBUTE inline const TYPE *ptr( size_t i ) const
    {
        return i >= d_size.length() ? nullptr : &d_data[i];
    }

    //! Get iterator to beginning of data
-    inline TYPE *begin() ATTRIBUTE_INLINE { return d_data; }
+    inline TYPE *begin() { return d_data; }

    //! Get iterator to beginning of data
-    inline const TYPE *begin() const ATTRIBUTE_INLINE { return d_data; }
+    inline const TYPE *begin() const { return d_data; }

    //! Get iterator to beginning of data
-    inline TYPE *end() ATTRIBUTE_INLINE { return d_data + d_size.length(); }
+    inline TYPE *end() { return d_data + d_size.length(); }

    //! Get iterator to beginning of data
-    inline const TYPE *end() const ATTRIBUTE_INLINE { return d_data + d_size.length(); }
+    inline const TYPE *end() const { return d_data + d_size.length(); }

    //! Return the pointer to the raw data
-    inline std::shared_ptr<TYPE> getPtr() ATTRIBUTE_INLINE { return d_ptr; }
+    inline std::shared_ptr<TYPE> getPtr() { return d_ptr; }

    //! Return the pointer to the raw data
-    inline std::shared_ptr<const TYPE> getPtr() const ATTRIBUTE_INLINE { return d_ptr; }
+    inline std::shared_ptr<const TYPE> getPtr() const { return d_ptr; }

    //! Return the pointer to the raw data
-    HOST_DEVICE inline TYPE *data() ATTRIBUTE_INLINE { return d_data; }
+    ARRAY_ATTRIBUTE inline TYPE *data() { return d_data; }

    //! Return the pointer to the raw data
-    HOST_DEVICE inline const TYPE *data() const ATTRIBUTE_INLINE { return d_data; }
+    ARRAY_ATTRIBUTE inline const TYPE *data() const { return d_data; }


 public: // Operator overloading
@ -849,52 +622,52 @@ public: // Math operations
    void rand();

    //! Return true if NaNs are present
-    inline bool NaNs() const;
+    bool NaNs() const;

    //! Return the smallest value
-    inline TYPE min() const;
+    TYPE min() const;

    //! Return the largest value
-    inline TYPE max() const;
+    TYPE max() const;

    //! Return the sum of all elements
-    inline TYPE sum() const;
+    TYPE sum() const;

    //! Return the mean of all elements
-    inline TYPE mean() const;
+    TYPE mean() const;

    //! Return the min of all elements in a given direction
-    Array<TYPE, FUN> min( int dir ) const;
+    Array min( int dir ) const;

    //! Return the max of all elements in a given direction
-    Array<TYPE, FUN> max( int dir ) const;
+    Array max( int dir ) const;

    //! Return the sum of all elements in a given direction
-    Array<TYPE, FUN> sum( int dir ) const;
+    Array sum( int dir ) const;

    //! Return the smallest value
-    inline TYPE min( const std::vector<size_t> &index ) const;
+    TYPE min( const std::vector<size_t> &index ) const;

    //! Return the largest value
-    inline TYPE max( const std::vector<size_t> &index ) const;
+    TYPE max( const std::vector<size_t> &index ) const;

    //! Return the sum of all elements
-    inline TYPE sum( const std::vector<size_t> &index ) const;
+    TYPE sum( const std::vector<size_t> &index ) const;

    //! Return the mean of all elements
-    inline TYPE mean( const std::vector<size_t> &index ) const;
+    TYPE mean( const std::vector<size_t> &index ) const;

    //! Return the smallest value
-    inline TYPE min( const std::vector<Range<size_t>> &index ) const;
+    TYPE min( const std::vector<Range<size_t>> &index ) const;

    //! Return the largest value
-    inline TYPE max( const std::vector<Range<size_t>> &index ) const;
+    TYPE max( const std::vector<Range<size_t>> &index ) const;

    //! Return the sum of all elements
-    inline TYPE sum( const std::vector<Range<size_t>> &index ) const;
+    TYPE sum( const std::vector<Range<size_t>> &index ) const;

    //! Return the mean of all elements
-    inline TYPE mean( const std::vector<Range<size_t>> &index ) const;
+    TYPE mean( const std::vector<Range<size_t>> &index ) const;

    //! Find all elements that match the operator
    std::vector<size_t> find(
@ -909,17 +682,17 @@ public: // Math operations
    static Array multiply( const Array &a, const Array &b );

    //! Transpose an array
-    Array<TYPE, FUN> reverseDim() const;
+    Array reverseDim() const;

    //! Replicate an array a given number of times in each direction
-    Array<TYPE, FUN> repmat( const std::vector<size_t> &N ) const;
+    Array repmat( const std::vector<size_t> &N ) const;

    //! Coarsen an array using the given filter
-    Array<TYPE, FUN> coarsen( const Array<TYPE, FUN> &filter ) const;
+    Array coarsen( const Array &filter ) const;

    //! Coarsen an array using the given filter
-    Array<TYPE, FUN> coarsen( const std::vector<size_t> &ratio,
-        std::function<TYPE( const Array<TYPE, FUN> & )> filter ) const;
+    Array coarsen(
+        const std::vector<size_t> &ratio, std::function<TYPE( const Array & )> filter ) const;

    /*!
     * Perform a element-wise operation y = f(x)
@ -943,21 +716,31 @@ public: // Math operations
     * @param[in] x             x
     * @param[in] beta          beta
     */
-    void axpby( const TYPE &alpha, const Array<TYPE, FUN> &x, const TYPE &beta );
+    void axpby( const TYPE &alpha, const Array &x, const TYPE &beta );
+
+    /*!
+     * Linear interpolation
+     * @param[in] x             Position as a decimal index
+     */
+    TYPE interp( const std::vector<double> &x ) const;
+
+    /**
+     * \fn equals (Array & const rhs, TYPE tol )
+     * \brief  Determine if two Arrays are equal using an absolute tolerance
+     * \param[in] rhs Vector to compare to
+     * \param[in] tol Tolerance of comparison
+     * \return  True iff \f$||\mathit{rhs} - x||_\infty < \mathit{tol}\f$
+     */
+    bool equals( const Array &rhs, TYPE tol = 0.000001 ) const;

 private:
+    bool d_isCopyable;           // Can the array be copied
+    bool d_isFixedSize;          // Can the array be resized
    ArraySize d_size;            // Size of each dimension
    TYPE *d_data;                // Raw pointer to data in array
    std::shared_ptr<TYPE> d_ptr; // Shared pointer to data in array
    void allocate( const ArraySize &N );

-public:
-    template<class TYPE2, class FUN2>
-    inline bool sizeMatch( const Array<TYPE2, FUN2> &rhs ) const
-    {
-        return d_size == rhs.d_size;
-    }
-
 private:
    inline void checkSubsetIndex( const std::vector<Range<size_t>> &range ) const;
    inline std::vector<Range<size_t>> convert( const std::vector<size_t> &index ) const;
@ -967,11 +750,60 @@ private:
 };


+/********************************************************
+ *  ostream operator                                     *
+ ********************************************************/
+inline std::ostream &operator<<( std::ostream &out, const ArraySize &s )
+{
+    out << "[" << s[0];
+    for ( size_t i = 1; i < s.ndim(); i++ )
+        out << "," << s[i];
+    out << "]";
+    return out;
+}
+
+
+/********************************************************
+ *  Math operations                                      *
+ ********************************************************/
+template<class TYPE, class FUN, class Allocator>
+inline Array<TYPE, FUN, Allocator> operator+(
+    const Array<TYPE, FUN, Allocator> &a, const Array<TYPE, FUN, Allocator> &b )
+{
+    Array<TYPE, FUN, Allocator> c;
+    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a + b; };
+    FUN::transform( fun, a, b, c );
+    return c;
+}
+template<class TYPE, class FUN, class Allocator>
+inline Array<TYPE, FUN, Allocator> operator-(
+    const Array<TYPE, FUN, Allocator> &a, const Array<TYPE, FUN, Allocator> &b )
+{
+    Array<TYPE, FUN, Allocator> c;
+    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a - b; };
+    FUN::transform( fun, a, b, c );
+    return c;
+}
+template<class TYPE, class FUN, class Allocator>
+inline Array<TYPE, FUN, Allocator> operator*(
+    const Array<TYPE, FUN, Allocator> &a, const Array<TYPE, FUN, Allocator> &b )
+{
+    return Array<TYPE, FUN, Allocator>::multiply( a, b );
+}
+template<class TYPE, class FUN, class Allocator>
+inline Array<TYPE, FUN, Allocator> operator*(
+    const Array<TYPE, FUN, Allocator> &a, const std::vector<TYPE> &b )
+{
+    Array<TYPE, FUN, Allocator> b2;
+    b2.viewRaw( { b.size() }, const_cast<TYPE *>( b.data() ) );
+    return Array<TYPE, FUN, Allocator>::multiply( a, b2 );
+}
+
+
+/********************************************************
+ *  Convience typedefs                                   *
+ ********************************************************/
 typedef Array<double> DoubleArray;
-typedef Array<float> FloatArray;
 typedef Array<int> IntArray;

-
-#include "common/Array.hpp"
-
 #endif
--- a/common/Array.hpp
+++ b/common/Array.hpp
--- a/common/ArraySize.h
+++ b/common/ArraySize.h
@ -0,0 +1,294 @@
+#ifndef included_ArraySizeClass
+#define included_ArraySizeClass
+
+
+#include <array>
+#include <cstring>
+#include <initializer_list>
+#include <vector>
+
+
+#if defined( __CUDA_ARCH__ )
+#include <cuda.h>
+#define HOST_DEVICE __host__ __device__
+#else
+#define HOST_DEVICE
+#endif
+#if defined( USING_GCC ) || defined( USING_CLANG )
+#define ARRAY_ATTRIBUTE HOST_DEVICE __attribute__( ( always_inline ) )
+#else
+#define ARRAY_ATTRIBUTE HOST_DEVICE
+#endif
+
+
+#if ( defined( DEBUG ) || defined( _DEBUG ) ) && !defined( NDEBUG )
+#define CHECK_ARRAY_LENGTH( i )                                      \
+    do {                                                             \
+        if ( i >= d_length )                                         \
+            throw std::out_of_range( "Index exceeds array bounds" ); \
+    } while ( 0 )
+#else
+#define CHECK_ARRAY_LENGTH( i ) \
+    do {                        \
+    } while ( 0 )
+#endif
+
+
+// Forward declerations
+class FunctionTable;
+template<class TYPE, class FUN = FunctionTable, class Allocator = std::nullptr_t>
+class Array;
+
+
+//! Simple range class
+template<class TYPE = size_t>
+class Range final
+{
+public:
+    //! Empty constructor
+    constexpr Range() : i( 0 ), j( -1 ), k( 1 ) {}
+
+    /*!
+     * Create a range i:k:j (or i:j)
+     * @param i_            Starting value
+     * @param j_            Ending value
+     * @param k_            Increment value
+     */
+    constexpr Range( TYPE i_, TYPE j_, TYPE k_ = 1 ) : i( i_ ), j( j_ ), k( k_ ) {}
+
+    TYPE i, j, k;
+};
+
+
+//! Simple class to store the array dimensions
+class ArraySize final
+{
+public:
+    //! Empty constructor
+    constexpr ArraySize() : d_ndim( 1 ), d_length( 0 ), d_N{ 0, 1, 1, 1, 1 } {}
+
+    /*!
+     * Create the vector size
+     * @param N1            Number of elements in the first dimension
+     */
+    constexpr ArraySize( size_t N1 ) : d_ndim( 1 ), d_length( N1 ), d_N{ N1, 1, 1, 1, 1 } {}
+
+    /*!
+     * Create the vector size
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     */
+    constexpr ArraySize( size_t N1, size_t N2 )
+        : d_ndim( 2 ), d_length( N1 * N2 ), d_N{ N1, N2, 1, 1, 1 }
+    {
+    }
+
+    /*!
+     * Create the vector size
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     * @param N3            Number of elements in the third dimension
+     */
+    constexpr ArraySize( size_t N1, size_t N2, size_t N3 )
+        : d_ndim( 3 ), d_length( N1 * N2 * N3 ), d_N{ N1, N2, N3, 1, 1 }
+    {
+    }
+
+    /*!
+     * Create the vector size
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     * @param N3            Number of elements in the third dimension
+     * @param N4            Number of elements in the fourth dimension
+     */
+    constexpr ArraySize( size_t N1, size_t N2, size_t N3, size_t N4 )
+        : d_ndim( 4 ), d_length( N1 * N2 * N3 * N4 ), d_N{ N1, N2, N3, N4, 1 }
+    {
+    }
+
+    /*!
+     * Create the vector size
+     * @param N1            Number of elements in the first dimension
+     * @param N2            Number of elements in the second dimension
+     * @param N3            Number of elements in the third dimension
+     * @param N4            Number of elements in the fourth dimension
+     * @param N5            Number of elements in the fifth dimension
+     */
+    constexpr ArraySize( size_t N1, size_t N2, size_t N3, size_t N4, size_t N5 )
+        : d_ndim( 5 ), d_length( N1 * N2 * N3 * N4 * N5 ), d_N{ N1, N2, N3, N4, N5 }
+    {
+    }
+
+    /*!
+     * Create from initializer list
+     * @param N             Size of the array
+     */
+    constexpr ArraySize( std::initializer_list<size_t> N )
+        : d_ndim( N.size() ), d_length( 0 ), d_N{ 0, 1, 1, 1, 1 }
+    {
+        if ( d_ndim > maxDim() )
+            throw std::out_of_range( "Maximum number of dimensions exceeded" );
+        auto it = N.begin();
+        for ( size_t i = 0; i < d_ndim; i++, ++it )
+            d_N[i] = *it;
+        d_length = 1;
+        for ( unsigned long i : d_N )
+            d_length *= i;
+        if ( d_ndim == 0 )
+            d_length = 0;
+    }
+
+
+    /*!
+     * Create from raw pointer
+     * @param ndim          Number of dimensions
+     * @param dims          Dimensions
+     */
+    constexpr ArraySize( size_t ndim, const size_t *dims )
+        : d_ndim( ndim ), d_length( 0 ), d_N{ 0, 1, 1, 1, 1 }
+    {
+        if ( d_ndim > maxDim() )
+            throw std::out_of_range( "Maximum number of dimensions exceeded" );
+        for ( size_t i = 0; i < ndim; i++ )
+            d_N[i] = dims[i];
+        d_length = 1;
+        for ( unsigned long i : d_N )
+            d_length *= i;
+        if ( d_ndim == 0 )
+            d_length = 0;
+    }
+
+    /*!
+     * Create from std::vector
+     * @param N             Size of the array
+     */
+    ArraySize( const std::vector<size_t> &N );
+
+    // Copy/assignment constructors
+    constexpr ArraySize( ArraySize &&rhs )      = default;
+    constexpr ArraySize( const ArraySize &rhs ) = default;
+    constexpr ArraySize &operator=( ArraySize &&rhs ) = default;
+    constexpr ArraySize &operator=( const ArraySize &rhs ) = default;
+
+    /*!
+     * Access the ith dimension
+     * @param i             Index to access
+     */
+    constexpr ARRAY_ATTRIBUTE size_t operator[]( size_t i ) const { return d_N[i]; }
+
+    //! Return the number of dimensions
+    constexpr ARRAY_ATTRIBUTE uint8_t ndim() const { return d_ndim; }
+
+    //! Return the number of dimensions
+    constexpr ARRAY_ATTRIBUTE size_t size() const { return d_ndim; }
+
+    //! Return the total number of elements in the array
+    constexpr ARRAY_ATTRIBUTE size_t length() const { return d_length; }
+
+    //! Resize the dimension
+    constexpr void resize( uint8_t dim, size_t N )
+    {
+        if ( dim >= d_ndim )
+            throw std::out_of_range( "Invalid dimension" );
+        d_N[dim] = N;
+        d_length = 1;
+        for ( unsigned long i : d_N )
+            d_length *= i;
+    }
+
+    /*!
+     * Reshape the Array so that the number of dimensions is the
+     *    max of ndim and the largest dim>1.
+     * @param ndim          Desired number of dimensions
+     */
+    constexpr void setNdim( uint8_t ndim ) { d_ndim = std::max( ndim, d_ndim ); }
+
+    //! Returns an iterator to the beginning
+    constexpr const size_t *begin() const { return d_N; }
+
+    //! Returns an iterator to the end
+    constexpr const size_t *end() const { return d_N + d_ndim; }
+
+    // Check if two array sizes are equal
+    constexpr ARRAY_ATTRIBUTE bool operator==( const ArraySize &rhs ) const
+    {
+        return d_ndim == rhs.d_ndim && memcmp( d_N, rhs.d_N, sizeof( d_N ) ) == 0;
+    }
+
+    // Check if two array sizes are equal (ignoring the dimension)
+    constexpr ARRAY_ATTRIBUTE bool approxEqual( const ArraySize &rhs ) const
+    {
+        return ( length() == 0 && rhs.length() == 0 ) || memcmp( d_N, rhs.d_N, sizeof( d_N ) ) == 0;
+    }
+
+    //! Check if two matrices are not equal
+    constexpr ARRAY_ATTRIBUTE bool operator!=( const ArraySize &rhs ) const
+    {
+        return d_ndim != rhs.d_ndim || memcmp( d_N, rhs.d_N, sizeof( d_N ) ) != 0;
+    }
+
+    //! Maximum supported dimension
+    constexpr ARRAY_ATTRIBUTE static uint8_t maxDim() { return 5u; }
+
+    //! Get the index
+    constexpr ARRAY_ATTRIBUTE size_t index( size_t i ) const
+    {
+        CHECK_ARRAY_LENGTH( i );
+        return i;
+    }
+
+    //! Get the index
+    constexpr ARRAY_ATTRIBUTE size_t index( size_t i1, size_t i2 ) const
+    {
+        size_t index = i1 + i2 * d_N[0];
+        CHECK_ARRAY_LENGTH( index );
+        return index;
+    }
+
+    //! Get the index
+    constexpr ARRAY_ATTRIBUTE size_t index( size_t i1, size_t i2, size_t i3 ) const
+    {
+        size_t index = i1 + d_N[0] * ( i2 + d_N[1] * i3 );
+        CHECK_ARRAY_LENGTH( index );
+        return index;
+    }
+
+    //! Get the index
+    constexpr ARRAY_ATTRIBUTE size_t index( size_t i1, size_t i2, size_t i3, size_t i4 ) const
+    {
+        size_t index = i1 + d_N[0] * ( i2 + d_N[1] * ( i3 + d_N[2] * i4 ) );
+        CHECK_ARRAY_LENGTH( index );
+        return index;
+    }
+
+    //! Get the index
+    constexpr ARRAY_ATTRIBUTE size_t index(
+        size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 ) const
+    {
+        size_t index = i1 + d_N[0] * ( i2 + d_N[1] * ( i3 + d_N[2] * ( i4 + d_N[3] * i5 ) ) );
+        CHECK_ARRAY_LENGTH( index );
+        return index;
+    }
+
+private:
+    uint8_t d_ndim;
+    size_t d_length;
+    size_t d_N[5];
+};
+
+
+// Function to concatenate dimensions of two array sizes
+constexpr ArraySize cat( const ArraySize &x, const ArraySize &y )
+{
+    if ( x.ndim() + y.ndim() > ArraySize::maxDim() )
+        throw std::out_of_range( "Maximum number of dimensions exceeded" );
+    size_t N[ArraySize::maxDim()] = { 0 };
+    for ( int i = 0; i < x.ndim(); i++ )
+        N[i] = x[i];
+    for ( int i = 0; i < y.ndim(); i++ )
+        N[i + x.ndim()] = y[i];
+    return ArraySize( x.ndim() + y.ndim(), N );
+}
+
+
+#endif
--- a/common/FunctionTable.h
+++ b/common/FunctionTable.h
@ -17,7 +17,7 @@
 #define included_FunctionTable


-#include "common/Array.h"
+#include "common/ArraySize.h"

 #include <functional>

@ -80,17 +80,43 @@ public:
     * @param[out] c            The output array
     */
    template<class TYPE, class FUN>
-    static void multiply(
+    static inline void multiply(
        const Array<TYPE, FUN> &a, const Array<TYPE, FUN> &b, Array<TYPE, FUN> &c );

+    /*!
+     * Perform dgemv/dgemm equavalent operation ( C = alpha*A*B + beta*C )
+     * @param[in] alpha         The scalar value alpha
+     * @param[in] A             The first array
+     * @param[in] B             The second array
+     * @param[in] beta          The scalar value alpha
+     * @param[in,out] c         The output array C
+     */
+    template<class TYPE, class FUN>
+    static void gemm( const TYPE alpha, const Array<TYPE, FUN> &A, const Array<TYPE, FUN> &B,
+        const TYPE beta, Array<TYPE, FUN> &C );
+
+    /*!
+     * Perform axpy equavalent operation ( y = alpha*x + y )
+     * @param[in] alpha         The scalar value alpha
+     * @param[in] x             The input array x
+     * @param[in,out] y         The output array y
+     */
+    template<class TYPE, class FUN>
+    static void axpy( const TYPE alpha, const Array<TYPE, FUN> &x, Array<TYPE, FUN> &y );
+
+    /*!
+     * Check if two arrays are approximately equal
+     * @param[in] A             The first array
+     * @param[in] B             The second array
+     * @param[in] tol           The tolerance
+     */
+    template<class TYPE, class FUN>
+    static bool equals( const Array<TYPE, FUN> &A, const Array<TYPE, FUN> &B, TYPE tol );
+

 private:
    FunctionTable();
-
-    template<class T>
-    static inline void rand( size_t N, T *x );
 };

-#include "common/FunctionTable.hpp"

 #endif
--- a/common/FunctionTable.hpp
+++ b/common/FunctionTable.hpp
@ -43,37 +43,30 @@
 /********************************************************
 *  Random number initialization                         *
 ********************************************************/
+template<class TYPE>
+static inline typename std::enable_if<std::is_integral<TYPE>::value>::type genRand(
+    size_t N, TYPE* x )
+{
+    std::random_device rd;
+    std::mt19937 gen( rd() );
+    std::uniform_int_distribution<TYPE> dis;
+    for ( size_t i = 0; i < N; i++ )
+        x[i] = dis( gen );
+}
+template<class TYPE>
+static inline typename std::enable_if<std::is_floating_point<TYPE>::value>::type genRand(
+    size_t N, TYPE* x )
+{
+    std::random_device rd;
+    std::mt19937 gen( rd() );
+    std::uniform_real_distribution<TYPE> dis( 0, 1 );
+    for ( size_t i = 0; i < N; i++ )
+        x[i] = dis( gen );
+}
 template<class TYPE, class FUN>
-void FunctionTable::rand( Array<TYPE, FUN> &x )
+inline void FunctionTable::rand( Array<TYPE, FUN>& x )
 {
-    FunctionTable::rand<TYPE>( x.length(), x.data() );
-}
-template<>
-inline void FunctionTable::rand<double>( size_t N, double *x )
-{
-    std::random_device rd;
-    std::mt19937 gen( rd() );
-    std::uniform_real_distribution<> dis( 0, 1 );
-    for ( size_t i = 0; i < N; i++ )
-        x[i] = dis( gen );
-}
-template<>
-inline void FunctionTable::rand<float>( size_t N, float *x )
-{
-    std::random_device rd;
-    std::mt19937 gen( rd() );
-    std::uniform_real_distribution<> dis( 0, 1 );
-    for ( size_t i = 0; i < N; i++ )
-        x[i] = dis( gen );
-}
-template<>
-inline void FunctionTable::rand<int>( size_t N, int *x )
-{
-    std::random_device rd;
-    std::mt19937 gen( rd() );
-    std::uniform_int_distribution<> dis;
-    for ( size_t i = 0; i < N; i++ )
-        x[i] = dis( gen );
+    genRand<TYPE>( x.length(), x.data() );
 }


@ -81,11 +74,11 @@ inline void FunctionTable::rand<int>( size_t N, int *x )
 *  Reduction                                            *
 ********************************************************/
 template<class TYPE, class FUN, typename LAMBDA>
-inline TYPE FunctionTable::reduce( LAMBDA &op, const Array<TYPE, FUN> &A )
+inline TYPE FunctionTable::reduce( LAMBDA& op, const Array<TYPE, FUN>& A )
 {
    if ( A.length() == 0 )
        return TYPE();
-    const TYPE *x  = A.data();
+    const TYPE* x  = A.data();
    TYPE y         = x[0];
    const size_t N = A.length();
    for ( size_t i = 1; i < N; i++ )
@ -98,7 +91,7 @@ inline TYPE FunctionTable::reduce( LAMBDA &op, const Array<TYPE, FUN> &A )
 *  Unary transformation                                 *
 ********************************************************/
 template<class TYPE, class FUN, typename LAMBDA>
-inline void FunctionTable::transform( LAMBDA &fun, const Array<TYPE, FUN> &x, Array<TYPE, FUN> &y )
+inline void FunctionTable::transform( LAMBDA& fun, const Array<TYPE, FUN>& x, Array<TYPE, FUN>& y )
 {
    y.resize( x.size() );
    const size_t N = x.length();
@ -107,9 +100,9 @@ inline void FunctionTable::transform( LAMBDA &fun, const Array<TYPE, FUN> &x, Ar
 }
 template<class TYPE, class FUN, typename LAMBDA>
 inline void FunctionTable::transform(
-    LAMBDA &fun, const Array<TYPE, FUN> &x, const Array<TYPE, FUN> &y, Array<TYPE, FUN> &z )
+    LAMBDA& fun, const Array<TYPE, FUN>& x, const Array<TYPE, FUN>& y, Array<TYPE, FUN>& z )
 {
-    if ( !x.sizeMatch( y ) )
+    if ( x.size() != y.size() )
        throw std::logic_error( "Sizes of x and y do not match" );
    z.resize( x.size() );
    const size_t N = x.length();
@ -118,29 +111,157 @@ inline void FunctionTable::transform(
 }


+/********************************************************
+ *  axpy                                                 *
+ ********************************************************/
+template<class TYPE>
+inline void call_axpy( size_t N, const TYPE alpha, const TYPE* x, TYPE* y );
+template<>
+inline void call_axpy<float>( size_t, const float, const float*, float* )
+{
+    throw std::logic_error( "LapackWrappers not configured" );
+}
+template<>
+inline void call_axpy<double>( size_t, const double, const double*, double* )
+{
+    throw std::logic_error( "LapackWrappers not configured" );
+}
+template<class TYPE>
+inline void call_axpy( size_t N, const TYPE alpha, const TYPE* x, TYPE* y )
+{
+    for ( size_t i = 0; i < N; i++ )
+        y[i] += alpha * x[i];
+}
+template<class TYPE, class FUN>
+void FunctionTable::axpy( const TYPE alpha, const Array<TYPE, FUN>& x, Array<TYPE, FUN>& y )
+{
+    if ( x.size() != y.size() )
+        throw std::logic_error( "Array sizes do not match" );
+    call_axpy( x.length(), alpha, x.data(), y.data() );
+}
+
+
 /********************************************************
 *  Multiply two arrays                                  *
 ********************************************************/
+template<class TYPE>
+inline void call_gemv( size_t M, size_t N, TYPE alpha, TYPE beta, const TYPE* A, const TYPE* x, TYPE* y );
+template<>
+inline void call_gemv<double>(
+    size_t, size_t, double, double, const double*, const double*, double* )
+{
+    throw std::logic_error( "LapackWrappers not configured" );
+}
+template<>
+inline void call_gemv<float>( size_t, size_t, float, float, const float*, const float*, float* )
+{
+    throw std::logic_error( "LapackWrappers not configured" );
+}
+template<class TYPE>
+inline void call_gemv(
+    size_t M, size_t N, TYPE alpha, TYPE beta, const TYPE* A, const TYPE* x, TYPE* y )
+{
+    for ( size_t i = 0; i < M; i++ )
+        y[i] = beta * y[i];
+    for ( size_t j = 0; j < N; j++ ) {
+        for ( size_t i = 0; i < M; i++ )
+            y[i] += alpha * A[i + j * M] * x[j];
+    }
+}
+template<class TYPE>
+inline void call_gemm(
+    size_t M, size_t N, size_t K, TYPE alpha, TYPE beta, const TYPE* A, const TYPE* B, TYPE* C );
+template<>
+inline void call_gemm<double>( size_t, size_t, size_t, double, double, const double*, const double*, double* )
+{
+    throw std::logic_error( "LapackWrappers not configured" );
+}
+template<>
+inline void call_gemm<float>( size_t, size_t, size_t, float, float, const float*, const float*, float* )
+{
+    throw std::logic_error( "LapackWrappers not configured" );
+}
+template<class TYPE>
+inline void call_gemm(
+    size_t M, size_t N, size_t K, TYPE alpha, TYPE beta, const TYPE* A, const TYPE* B, TYPE* C )
+{
+    for ( size_t i = 0; i < K * M; i++ )
+        C[i] = beta * C[i];
+    for ( size_t k = 0; k < K; k++ ) {
+        for ( size_t j = 0; j < N; j++ ) {
+            for ( size_t i = 0; i < M; i++ )
+                C[i + k * M] += alpha * A[i + j * M] * B[j + k * N];
+        }
+    }
+}
+template<class TYPE, class FUN>
+void FunctionTable::gemm( const TYPE alpha, const Array<TYPE, FUN>& a, const Array<TYPE, FUN>& b,
+    const TYPE beta, Array<TYPE, FUN>& c )
+{
+    if ( a.ndim() == 2 && b.ndim() == 1 ) {
+        if ( a.size( 1 ) != b.size( 0 ) )
+            throw std::logic_error( "Inner dimensions must match" );
+        call_gemv<TYPE>( a.size( 0 ), a.size( 1 ), alpha, beta, a.data(), b.data(), c.data() );
+    } else if ( a.ndim() <= 2 && b.ndim() <= 2 ) {
+        if ( a.size( 1 ) != b.size( 0 ) )
+            throw std::logic_error( "Inner dimensions must match" );
+        call_gemm<TYPE>(
+            a.size( 0 ), a.size( 1 ), b.size( 1 ), alpha, beta, a.data(), b.data(), c.data() );
+    } else {
+        throw std::logic_error( "Not finished yet" );
+    }
+}
 template<class TYPE, class FUN>
 void FunctionTable::multiply(
-    const Array<TYPE, FUN> &a, const Array<TYPE, FUN> &b, Array<TYPE, FUN> &c )
+    const Array<TYPE, FUN>& a, const Array<TYPE, FUN>& b, Array<TYPE, FUN>& c )
 {
-    if ( a.ndim() <= 2 && b.ndim() <= 2 ) {
+    if ( a.ndim() == 2 && b.ndim() == 1 ) {
+        if ( a.size( 1 ) != b.size( 0 ) )
+            throw std::logic_error( "Inner dimensions must match" );
+        c.resize( a.size( 0 ) );
+        call_gemv<TYPE>( a.size( 0 ), a.size( 1 ), 1, 0, a.data(), b.data(), c.data() );
+    } else if ( a.ndim() <= 2 && b.ndim() <= 2 ) {
        if ( a.size( 1 ) != b.size( 0 ) )
            throw std::logic_error( "Inner dimensions must match" );
        c.resize( a.size( 0 ), b.size( 1 ) );
-        c.fill( 0 );
-        for ( size_t k = 0; k < b.size( 1 ); k++ ) {
-            for ( size_t j = 0; j < a.size( 1 ); j++ ) {
-                for ( size_t i = 0; i < a.size( 0 ); i++ ) {
-                    c( i, k ) += a( i, j ) * b( j, k );
-                }
-            }
-        }
+        call_gemm<TYPE>(
+            a.size( 0 ), a.size( 1 ), b.size( 1 ), 1, 0, a.data(), b.data(), c.data() );
    } else {
        throw std::logic_error( "Not finished yet" );
    }
 }


+/********************************************************
+ *  Check if two arrays are equal                        *
+ ********************************************************/
+template<class TYPE, class FUN>
+inline typename std::enable_if<!std::is_floating_point<TYPE>::value, bool>::type
+FunctionTableCompare( const Array<TYPE, FUN>& a, const Array<TYPE, FUN>& b, TYPE )
+{
+    bool pass = true;
+    if ( a.size() != b.size() )
+        throw std::logic_error( "Sizes of x and y do not match" );
+    for ( size_t i = 0; i < a.length(); i++ )
+        pass = pass && a( i ) == b( i );
+    return pass;
+}
+template<class TYPE, class FUN>
+inline typename std::enable_if<std::is_floating_point<TYPE>::value, bool>::type
+FunctionTableCompare( const Array<TYPE, FUN>& a, const Array<TYPE, FUN>& b, TYPE tol )
+{
+    bool pass = true;
+    if ( a.size() != b.size() )
+        throw std::logic_error( "Sizes of x and y do not match" );
+    for ( size_t i = 0; i < a.length(); i++ )
+        pass = pass && ( std::abs( a( i ) - b( i ) ) < tol );
+    return pass;
+}
+template<class TYPE, class FUN>
+bool FunctionTable::equals( const Array<TYPE, FUN>& a, const Array<TYPE, FUN>& b, TYPE tol )
+{
+    return FunctionTableCompare( a, b, tol );
+}
+
+
 #endif
--- a/common/ScaLBL.cpp
+++ b/common/ScaLBL.cpp
@ -428,12 +428,12 @@ int ScaLBL_Communicator::MemoryOptimizedLayoutAA(IntArray &Map, int *neighborLis
 	for (k=2; k<Nz-2; k++){
 		for (j=2; j<Ny-2; j++){
 			for (i=2; i<Nx-2; i++){
-			  // Local index (regular layout)
-			  n = k*Nx*Ny + j*Nx + i;
-			  if (id[n] > 0 ){
-				Map(n) = idx++;
-				//neighborList[idx++] = n; // index of self in regular layout
-			  }
+				// Local index (regular layout)
+				n = k*Nx*Ny + j*Nx + i;
+				if (id[n] > 0 ){
+					Map(n) = idx++;
+					//neighborList[idx++] = n; // index of self in regular layout
+				}
 			}
 		}
 	}
@ -558,14 +558,6 @@ int ScaLBL_Communicator::MemoryOptimizedLayoutAA(IntArray &Map, int *neighborLis
 	}
 	ScaLBL_CopyToDevice(dvcSendList_x,TempBuffer,sendCount_x*sizeof(int));

-
-
-
-
-
-
-
-
 	ScaLBL_CopyToHost(TempBuffer,dvcSendList_y,sendCount_y*sizeof(int));
 	for (i=0; i<sendCount_y; i++){
 		n = TempBuffer[i];
@ -583,12 +575,6 @@ int ScaLBL_Communicator::MemoryOptimizedLayoutAA(IntArray &Map, int *neighborLis
 	ScaLBL_CopyToDevice(dvcSendList_z,TempBuffer,sendCount_z*sizeof(int));


-
-
-
-
-
-
 	ScaLBL_CopyToHost(TempBuffer,dvcSendList_X,sendCount_X*sizeof(int));
 	for (i=0; i<sendCount_X; i++){
 		n = TempBuffer[i];
@ -600,11 +586,6 @@ int ScaLBL_Communicator::MemoryOptimizedLayoutAA(IntArray &Map, int *neighborLis
 	ScaLBL_CopyToDevice(dvcSendList_X,TempBuffer,sendCount_X*sizeof(int));


-
-
-
-
-
 	ScaLBL_CopyToHost(TempBuffer,dvcSendList_Y,sendCount_Y*sizeof(int));
 	for (i=0; i<sendCount_Y; i++){
 		n = TempBuffer[i];
@ -731,19 +712,6 @@ int ScaLBL_Communicator::MemoryOptimizedLayoutAA(IntArray &Map, int *neighborLis
 	ScaLBL_CopyToDevice(dvcRecvDist_x,TempBuffer,5*recvCount_x*sizeof(int));


-
-
-
-
-
-
-
-
-
-
-
-
-
 	ScaLBL_CopyToHost(TempBuffer,dvcRecvDist_y,5*recvCount_y*sizeof(int));
 	for (i=0; i<5*recvCount_y; i++){
 		n = TempBuffer[i];
@ -763,15 +731,6 @@ int ScaLBL_Communicator::MemoryOptimizedLayoutAA(IntArray &Map, int *neighborLis
 	ScaLBL_CopyToDevice(dvcRecvDist_z,TempBuffer,5*recvCount_z*sizeof(int));


-
-
-
-
-
-
-
-
-
 	ScaLBL_CopyToHost(TempBuffer,dvcRecvDist_X,5*recvCount_X*sizeof(int));
 	for (i=0; i<5*recvCount_X; i++){
 		n = TempBuffer[i];
@ -783,22 +742,6 @@ int ScaLBL_Communicator::MemoryOptimizedLayoutAA(IntArray &Map, int *neighborLis
 	ScaLBL_CopyToDevice(dvcRecvDist_X,TempBuffer,5*recvCount_X*sizeof(int));


-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 	ScaLBL_CopyToHost(TempBuffer,dvcRecvDist_Y,5*recvCount_Y*sizeof(int));
 	for (i=0; i<5*recvCount_Y; i++){
 		n = TempBuffer[i];
--- a/cpu/D3Q19.cpp
+++ b/cpu/D3Q19.cpp
@ -27,36 +27,6 @@ extern "C" void ScaLBL_D3Q19_Pack(int q, int *list, int start, int count, double
 	}
 }

-/*extern "C" void ScaLBL_D3Q19_Unpack(int q, int Cqx, int Cqy, int Cqz, int *list,  int start, int count,
-					   double *recvbuf, double *dist, int Nx, int Ny, int Nz){
-	//....................................................................................
-	// Unack distribution from the recv buffer
-	// Distribution q matche Cqx, Cqy, Cqz
-	// swap rule means that the distributions in recvbuf are OPPOSITE of q
-	// dist may be even or odd distributions stored by stream layout
-	//....................................................................................
-	int i,j,k,n,nn,idx;
-	int N = Nx*Ny*Nz;
-	for (idx=0; idx<count; idx++){
-		// Get the value from the list -- note that n is the index is from the send (non-local) process
-		n = list[idx];
-		// Get the 3-D indices
-		k = n/(Nx*Ny);
-		j = (n-Nx*Ny*k)/Nx;
-		i = n-Nx*Ny*k-Nx*j;
-		// Streaming for the non-local distribution
-		i += Cqx;
-		j += Cqy;
-		k += Cqz;
-
-		nn = k*Nx*Ny+j*Nx+i;
-
-		// unpack the distribution to the proper location
-		dist[q*N+nn] = recvbuf[start+idx];
-	}
-}*/
-
-
 extern "C" void ScaLBL_D3Q19_Unpack(int q, int *list,  int start, int count,
 		double *recvbuf, double *dist, int N){
 	//....................................................................................
@ -76,36 +46,6 @@ extern "C" void ScaLBL_D3Q19_Unpack(int q, int *list,  int start, int count,
 	}
 }

-/*
-extern "C" void ScaLBL_D3Q19_MapRecv(int q, int Cqx, int Cqy, int Cqz, int *list,  int start, int count,
-					   int *d3q19_recvlist, int Nx, int Ny, int Nz){
-	//....................................................................................
-	// Map the recieve distributions to
-	// Distribution q matche Cqx, Cqy, Cqz
-	// swap rule means that the distributions in recvbuf are OPPOSITE of q
-	// dist may be even or odd distributions stored by stream layout
-	//....................................................................................
-
-	int i,j,k,n,nn,idx;
-	int N = Nx*Ny*Nz;
-	for (idx=0; idx<count; idx++){
-		// Get the value from the list -- note that n is the index is from the send (non-local) process
-		n = list[idx];
-		// Get the 3-D indices
-		k = n/(Nx*Ny);
-		j = (n-Nx*Ny*k)/Nx;
-		i = n-Nx*Ny*k-Nx*j;
-		// Streaming for the non-local distribution
-		i += Cqx;
-		j += Cqy;
-		k += Cqz;
-		// compute 1D index for the neighbor and save
-		nn = k*Nx*Ny+j*Nx+i;
-		d3q19_recvlist[start+idx] = nn;
-	}
-}
-
- */
 extern "C" void ScaLBL_D3Q19_AA_Init(double *f_even, double *f_odd, int Np)
 {
 	int n;
--- a/example/Piston/Piston.py
+++ b/example/Piston/Piston.py
@ -0,0 +1,29 @@
+import numpy
+
+nx=96
+ny=24
+nz=24
+N=nx*ny*nz
+
+mesh=(nx,ny,nz)
+data=numpy.ones(mesh,dtype=numpy.int8)
+
+#print(data)
+print("Writing piston")
+print("Mesh size: "+repr(mesh))
+
+radius = 8
+# assign a bubble in the middle
+for x in range(0,nx):
+  for y in range(0,ny):
+    for z in range(0,nz):
+       Y = y - ny/2
+       Z = z - nz/2
+       if Y*Y+Z*Z > radius*radius:
+          data[x,y,z]=0
+       elif x < 12:
+          data[x,y,z]=1
+       else:
+          data[x,y,z]=2
+
+data.tofile("Piston.raw")
--- a/example/Piston/Piston.sh
+++ b/example/Piston/Piston.sh
@ -1,37 +1,7 @@
 #!/bin/bash

-# Lines assigning various pressure BC for Color.in
-echo "0 1 1.01 0.99" > Color.in.pressures
-echo "0 1 1.0125 0.9875" >> Color.in.pressures
-echo "0 1 1.015 0.985" >> Color.in.pressures
-echo "0 1 1.02 0.98" >> Color.in.pressures
-echo "0 1 1.025 0.975" >> Color.in.pressures
-echo "0 1 1.03 0.97" >> Color.in.pressures
+LBPM_DIR=../../tests

-for i in `seq 1 6`; do 
-    # Set up cases for each boundary pressure pair
-    dir="Case"$i
-    echo $dir
-    mkdir -p $dir
-    # copy the domain file
-    cp Domain.in $dir
-
-    # set up each case -- parameters are fixed in Color.in, with multiple cases to set the boundary pressure
-    sed -n '1p' Color.in > $dir/Color.in
-    sed -n '2p' Color.in >> $dir/Color.in
-    sed -n '3p' Color.in >> $dir/Color.in
-    sed -n '4p' Color.in >> $dir/Color.in
-#    sed -n '5p' Color.in >> $dir/Color.in
-    # print the pressure values into the input file
-    sed -n "${i}p" Color.in.pressures >> $dir/Color.in
-    sed -n '6p' Color.in >> $dir/Color.in
-
-done
-
-# simulations should be run using the following syntax
-# PRE-PROCESSOR - set the radius to 18 voxel lengths
-#mpirun -np 10 ~/install-LBPM-WIA/bin/lbpm_captube_pp 18 1
-# RUN THE SIMULAUTION 
-#mpirun -np 10 ~/install-LBPM-WIA/bin/lbpm_color_simulator
-
-exit;
+python Piston.py
+mpirun -np 1 $LBPM_DIR/lbpm_serial_decomp input.db
+mpirun -np 4 $LBPM_DIR/lbpm_color_simulator input.db
--- a/example/Piston/input.db
+++ b/example/Piston/input.db
@ -1,34 +1,42 @@
 Color {
-    tau   = 1.0;  
-    alpha = 1e-2;
+    tauA   = 0.7;  
+    tauB   = 0.7;  
+    rhoA   = 1.0;  
+    rhoB   = 1.0;  
+    alpha = 1e-3;
    beta  = 0.95;
-    phi_s = 0.8;
-    wp_saturation = 0.7
    F = 0, 0, 0
    Restart = false
    pBC = 0
    din = 1.0
    dout = 1.0
-    timestepMax = 200
+    timestepMax = 3000
    interval = 1000
    tol = 1e-5;
    das = 0.1
    dbs = 0.9
+    flux = 2.0
+    ComponentLabels = 0
+    ComponentAffinity = -1.0;
 }

 Domain {
-    nproc = 1, 1, 1     // Number of processors (Npx,Npy,Npz)
-    n = 16, 16, 16      // Size of local domain (Nx,Ny,Nz)
+    Filename = "Piston.raw"
+    nproc = 1, 1, 4     // Number of processors (Npx,Npy,Npz)
+    n = 24, 24, 24      // Size of local domain (Nx,Ny,Nz)
+    N = 24, 24, 96      // size of the input image
    n_spheres = 1       // Number of spheres
    L = 1, 1, 1         // Length of domain (x,y,z)
-    BC = 0              // Boundary condition type
+    BC = 4              // Boundary condition type
+    ReadValues = 0, 1, 2
+    WriteValues = 0, 1, 2
 }

 Analysis {
    blobid_interval = 1000      // Frequency to perform blob identification
    analysis_interval = 1000    // Frequency to perform analysis
-    restart_interval = 20000    // Frequency to write restart data
-    vis_interval = 20000        // Frequency to write visualization data
+    restart_interval = 1000    // Frequency to write restart data
+    visualization_interval = 1000        // Frequency to write visualization data
    restart_file = "Restart"    // Filename to use for restart file (will append rank)
    N_threads    = 4            // Number of threads to use
    load_balance = "independent" // Load balance method to use: "none", "default", "independent"
--- a/gpu/D3Q19.cu
+++ b/gpu/D3Q19.cu
@ -142,7 +142,7 @@ __global__ void deviceReduceKernel(double *in, double* out, int N) {
 		out[blockIdx.x]=sum;
 }

-__global__  void dvc_ScaLBL_D3Q19_Pack(int q, int *list, int start, int count, double *sendbuf, double *dist, int N){
+__global__ void dvc_ScaLBL_D3Q19_Pack(int q, int *list, int start, int count, double *sendbuf, double *dist, int N){
 	//....................................................................................
 	// Pack distribution q into the send buffer for the listed lattice sites
 	// dist may be even or odd distributions stored by stream layout
@ -176,36 +176,6 @@ __global__ void dvc_ScaLBL_D3Q19_Unpack(int q,  int *list,  int start, int count
 		}
 	}
 }
-/*
-__global__ void dvc_ScaLBL_D3Q19_MapRecv(int q, int Cqx, int Cqy, int Cqz, int *list,  int start, int count,
-						   int *d3q19_recvlist, int Nx, int Ny, int Nz){
-	//....................................................................................
-	// Unack distribution from the recv buffer
-	// Distribution q matche Cqx, Cqy, Cqz
-	// swap rule means that the distributions in recvbuf are OPPOSITE of q
-	// dist may be even or odd distributions stored by stream layout
-	//....................................................................................
-	int i,j,k,n,nn,idx;
-	int N = Nx*Ny*Nz;
-	idx = blockIdx.x*blockDim.x + threadIdx.x;
-	if (idx<count){
-		// Get the value from the list -- note that n is the index is from the send (non-local) process
-		n = list[idx];
-		// Get the 3-D indices
-		k = n/(Nx*Ny);
-		j = (n-Nx*Ny*k)/Nx;
-		i = n-Nx*Ny*k-Nx*j;
-		// Streaming for the non-local distribution
-		i += Cqx;
-		j += Cqy;
-		k += Cqz;
-		nn = k*Nx*Ny+j*Nx+i;
-		// unpack the distribution to the proper location
-		d3q19_recvlist[start+idx]=nn;
-	}
-}
- */
-

 __global__ void dvc_ScaLBL_D3Q19_Init(char *ID, double *f_even, double *f_odd, int Nx, int Ny, int Nz)
 {
--- a/gpu/Extras.cu
+++ b/gpu/Extras.cu
@ -48,8 +48,8 @@ extern "C" void ScaLBL_CopyToDevice(void* dest, const void* source, size_t size)
 }

 extern "C" void ScaLBL_AllocateZeroCopy(void** address, size_t size){
-	cudaMallocHost(address,size);
-	//cudaMalloc(address,size);
+	//cudaMallocHost(address,size);
+	cudaMalloc(address,size);
 	cudaError_t err = cudaGetLastError();
 	if (cudaSuccess != err){
 		printf("Error in cudaMallocHost: %s \n",cudaGetErrorString(err));
@ -57,9 +57,9 @@ extern "C" void ScaLBL_AllocateZeroCopy(void** address, size_t size){
 }

 extern "C" void ScaLBL_CopyToZeroCopy(void* dest, const void* source, size_t size){
-        //cudaMemcpy(dest,source,size,cudaMemcpyHostToDevice);
-        //cudaError_t err = cudaGetLastError();
-        memcpy(dest, source, size);
+        cudaMemcpy(dest,source,size,cudaMemcpyHostToDevice);
+        cudaError_t err = cudaGetLastError();
+        //memcpy(dest, source, size);

 }

--- a/models/ColorModel.cpp
+++ b/models/ColorModel.cpp
@ -280,8 +280,14 @@ void ScaLBL_ColorModel::Create(){
 		}
 	}
 	// check that TmpMap is valid
-	for (int idx=0; idx<ScaLBL_Comm->LastInterior(); idx++){
-		if (idx == ScaLBL_Comm->LastExterior()) idx = ScaLBL_Comm->FirstInterior();
+	for (int idx=0; idx<ScaLBL_Comm->LastExterior(); idx++){
+		int n = TmpMap[idx];
+		if (n > Nx*Ny*Nz){
+			printf("Bad value! idx=%i \n");
+			TmpMap[idx] = Nx*Ny*Nz-1;
+		}
+	}
+	for (int idx=ScaLBL_Comm->FirstInterior(); idx<ScaLBL_Comm->LastInterior(); idx++){
 		int n = TmpMap[idx];
 		if (n > Nx*Ny*Nz){
 			printf("Bad value! idx=%i \n");
@ -417,10 +423,15 @@ void ScaLBL_ColorModel::Run(){
 		ScaLBL_Comm->BiSendD3Q7AA(Aq,Bq); //READ FROM NORMAL
 		ScaLBL_D3Q7_AAodd_PhaseField(NeighborList, dvcMap, Aq, Bq, Den, Phi, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np);
 		ScaLBL_Comm->BiRecvD3Q7AA(Aq,Bq); //WRITE INTO OPPOSITE
+		ScaLBL_DeviceBarrier();
 		ScaLBL_D3Q7_AAodd_PhaseField(NeighborList, dvcMap, Aq, Bq, Den, Phi, 0, ScaLBL_Comm->LastExterior(), Np);
 		
 		// Perform the collision operation
 		ScaLBL_Comm->SendD3Q19AA(fq); //READ FROM NORMAL
+		if (BoundaryCondition > 0){
+			ScaLBL_Comm->Color_BC_z(dvcMap, Phi, Den, inletA, inletB);
+			ScaLBL_Comm->Color_BC_Z(dvcMap, Phi, Den, outletA, outletB);
+		}
 		// Halo exchange for phase field
 		ScaLBL_Comm_Regular->SendHalo(Phi);

@ -428,11 +439,8 @@ void ScaLBL_ColorModel::Run(){
 				alpha, beta, Fx, Fy, Fz, Nx, Nx*Ny, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np);
 		ScaLBL_Comm_Regular->RecvHalo(Phi);
 		ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE
+		ScaLBL_DeviceBarrier();
 		// Set BCs
-		if (BoundaryCondition > 0){
-			ScaLBL_Comm->Color_BC_z(dvcMap, Phi, Den, inletA, inletB);
-			ScaLBL_Comm->Color_BC_Z(dvcMap, Phi, Den, outletA, outletB);
-		}
 		if (BoundaryCondition == 3){
 			ScaLBL_Comm->D3Q19_Pressure_BC_z(NeighborList, fq, din, timestep);
 			ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep);
@ -451,21 +459,23 @@ void ScaLBL_ColorModel::Run(){
 		ScaLBL_Comm->BiSendD3Q7AA(Aq,Bq); //READ FROM NORMAL
 		ScaLBL_D3Q7_AAeven_PhaseField(dvcMap, Aq, Bq, Den, Phi, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np);
 		ScaLBL_Comm->BiRecvD3Q7AA(Aq,Bq); //WRITE INTO OPPOSITE
+		ScaLBL_DeviceBarrier();
 		ScaLBL_D3Q7_AAeven_PhaseField(dvcMap, Aq, Bq, Den, Phi, 0, ScaLBL_Comm->LastExterior(), Np);

 		// Perform the collision operation
 		ScaLBL_Comm->SendD3Q19AA(fq); //READ FORM NORMAL
 		// Halo exchange for phase field
+		if (BoundaryCondition > 0){
+			ScaLBL_Comm->Color_BC_z(dvcMap, Phi, Den, inletA, inletB);
+			ScaLBL_Comm->Color_BC_Z(dvcMap, Phi, Den, outletA, outletB);
+		}
 		ScaLBL_Comm_Regular->SendHalo(Phi);
 		ScaLBL_D3Q19_AAeven_Color(dvcMap, fq, Aq, Bq, Den, Phi, Velocity, rhoA, rhoB, tauA, tauB,
 				alpha, beta, Fx, Fy, Fz,  Nx, Nx*Ny, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np);
 		ScaLBL_Comm_Regular->RecvHalo(Phi);
 		ScaLBL_Comm->RecvD3Q19AA(fq); //WRITE INTO OPPOSITE
+		ScaLBL_DeviceBarrier();
 		// Set boundary conditions
-		if (BoundaryCondition > 0){
-			ScaLBL_Comm->Color_BC_z(dvcMap, Phi, Den, inletA, inletB);
-			ScaLBL_Comm->Color_BC_Z(dvcMap, Phi, Den, outletA, outletB);
-		}
 		if (BoundaryCondition == 3){
 			ScaLBL_Comm->D3Q19_Pressure_BC_z(NeighborList, fq, din, timestep);
 			ScaLBL_Comm->D3Q19_Pressure_BC_Z(NeighborList, fq, dout, timestep);
--- a/sample_scripts/configure_desktop
+++ b/sample_scripts/configure_desktop
@ -6,7 +6,7 @@ cmake                                    \
    -D CMAKE_CXX_COMPILER:PATH=mpicxx        \
    -D CMAKE_C_FLAGS="-g "         \
    -D CMAKE_CXX_FLAGS="-g "      \
-    -D CXX_STD=11 	   \
+    -D CMAKE_CXX_STANDARD=14 	   \
    -D MPI_COMPILER:BOOL=TRUE            \
    -D MPIEXEC=mpirun                     \
    -D USE_EXT_MPI_FOR_SERIAL_TESTS:BOOL=TRUE \
@ -14,10 +14,10 @@ cmake                                    \
    -D CUDA_FLAGS="-arch sm_35"          \
    -D CUDA_HOST_COMPILER="/usr/bin/gcc" \
    -D USE_NETCDF=0 			 \
-    -D HDF5_DIRECTORY="/home/mcclurej/TPL/hdf5/"	 \
-    -D HDF5_LIB="/home/mcclurej/TPL/hdf5/lib/libhdf5.a"	\
-    -D SILO_LIB="/home/mcclurej/TPL/silo/lib/libsiloh5.a"	 \
-    -D SILO_DIRECTORY="/home/mcclurej/TPL/silo/"	 \
+    -D HDF5_DIRECTORY="/opt/apps/hdf5/"	 \
+    -D HDF5_LIB="/opt/apps/hdf5/lib/libhdf5.a"	\
+    -D SILO_LIB="/opt/apps/silo/lib/libsiloh5.a"	 \
+    -D SILO_DIRECTORY="/opt/apps/silo/"	 \
    -D NETCDF_DIRECTORY="/apps/netcdf/"	 \
    -D USE_SILO=1			 \
    -D USE_CUDA=0                        \
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -61,7 +61,7 @@ ADD_LBPM_TEST_1_2_4( TestBlobIdentify )
 #ADD_LBPM_TEST_PARALLEL( TestTwoPhase 8 )
 #ADD_LBPM_TEST_PARALLEL( TestBlobAnalyze 8 )
 ADD_LBPM_TEST_PARALLEL( TestSegDist 8 )
-#ADD_LBPM_TEST_PARALLEL( TestCommD3Q19 8 )
+ADD_LBPM_TEST_PARALLEL( TestCommD3Q19 8 )
 #ADD_LBPM_TEST_PARALLEL( TestMassConservationD3Q7 1 )
 ADD_LBPM_TEST_1_2_4( testCommunication )
 ADD_LBPM_TEST_1_2_4( testUtilities )
--- a/tests/TestCommD3Q19.cpp
+++ b/tests/TestCommD3Q19.cpp
@ -13,8 +13,8 @@ using namespace std;

 std::shared_ptr<Database> loadInputs( int nprocs )
 {
-    auto db = std::make_shared<Database>( "Domain.in" );
-    const int dim = 50;
+    auto db = std::make_shared<Database>();
+    const int dim = 8;
    db->putScalar<int>( "BC", 0 );
    if ( nprocs == 1 ){
        db->putVector<int>( "nproc", { 1, 1, 1 } );
@ -180,17 +180,6 @@ int main(int argc, char **argv)
 	int check;

 	{
-		// parallel domain size (# of sub-domains)
-		int nprocx,nprocy,nprocz;
-		int iproc,jproc,kproc;
-		//*****************************************
-		// MPI ranks for all 18 neighbors
-		//**********************************
-		int rank_x,rank_y,rank_z,rank_X,rank_Y,rank_Z;
-		int rank_xy,rank_XY,rank_xY,rank_Xy;
-		int rank_xz,rank_XZ,rank_xZ,rank_Xz;
-		int rank_yz,rank_YZ,rank_yZ,rank_Yz;
-		//**********************************
 		MPI_Request req1[18],req2[18];
 		MPI_Status stat1[18],stat2[18];

@ -210,31 +199,15 @@ int main(int argc, char **argv)

        // Load inputs
        auto db = loadInputs( nprocs );
+	/*		auto filename = argv[1];
+		auto input_db = std::make_shared<Database>( filename );
+		auto db = input_db->getDatabase( "Domain" );
+	*/
        int Nx = db->getVector<int>( "n" )[0];
        int Ny = db->getVector<int>( "n" )[1];
        int Nz = db->getVector<int>( "n" )[2];
-
-		if (rank==0){
-			printf("********************************************************\n");
-			printf("Sub-domain size = %i x %i x %i\n",Nz,Nz,Nz);
-			printf("Parallel domain size = %i x %i x %i\n",nprocx,nprocy,nprocz);
-			printf("********************************************************\n");
-		}
-
-		MPI_Barrier(comm);
-		kproc = rank/(nprocx*nprocy);
-		jproc = (rank-nprocx*nprocy*kproc)/nprocx;
-		iproc = rank-nprocx*nprocy*kproc-nprocz*jproc;
-
-		double iVol_global = 1.0/Nx/Ny/Nz/nprocx/nprocy/nprocz;
 		
-		Domain Dm(db);
-
-
-		InitializeRanks( rank, nprocx, nprocy, nprocz, iproc, jproc, kproc,
-				rank_x, rank_y, rank_z, rank_X, rank_Y, rank_Z,
-				rank_xy, rank_XY, rank_xY, rank_Xy, rank_xz, rank_XZ, rank_xZ, rank_Xz,
-				rank_yz, rank_YZ, rank_yZ, rank_Yz );
+		auto Dm  = std::shared_ptr<Domain>(new Domain(db,comm));      // full domain for analysis

 		Nx += 2;
 		Ny += 2;
@ -253,25 +226,39 @@ int main(int argc, char **argv)
 		char *id;
 		id = new char[Nx*Ny*Nz];

-
-		if (rank==0) printf("Assigning phase ID from file \n");
+		/*		if (rank==0) printf("Assigning phase ID from file \n");
 		if (rank==0) printf("Initialize from segmented data: solid=0, NWP=1, WP=2 \n");
 		FILE *IDFILE = fopen(LocalRankFilename,"rb");
 		if (IDFILE==NULL) ERROR("Error opening file: ID.xxxxx");
 		fread(id,1,N,IDFILE);
 		fclose(IDFILE);
-		 
+		*/
 		// Setup the domain
 		for (k=0;k<Nz;k++){
 			for (j=0;j<Ny;j++){
 				for (i=0;i<Nx;i++){
 					n = k*Nx*Ny+j*Nx+i;
-					//id[n] = 1;
-					Dm.id[n] = id[n];
+					id[n] = 1;
+					Dm->id[n] = id[n];
 				}
 			}
 		}
-		Dm.CommInit();
+		Dm->CommInit();
+		int iproc,jproc,kproc;
+		int nprocx,nprocy,nprocz;
+		iproc = Dm->iproc();
+		jproc = Dm->jproc();
+		kproc = Dm->kproc();
+		nprocx = Dm->nprocx();
+		nprocy = Dm->nprocy();
+		nprocz = Dm->nprocz();
+
+		if (rank==0){
+			printf("********************************************************\n");
+			printf("Sub-domain size = %i x %i x %i\n",Nz,Nz,Nz);
+			printf("Parallel domain size = %i x %i x %i\n",Dm->nprocx(),Dm->nprocy(),Dm->nprocz());
+			printf("********************************************************\n");
+		}

 		//.......................................................................
 		// Compute the media porosity
@ -292,7 +279,8 @@ int main(int argc, char **argv)
 			}
 		}
 		MPI_Allreduce(&sum_local,&sum,1,MPI_DOUBLE,MPI_SUM,comm);
-		porosity = 1.0-sum*iVol_global;
+		double iVol_global=1.f/double((Nx-2)*(Ny-2)*(Nz-2)*nprocx*nprocy*nprocz);
+	    	porosity = 1.0-sum*iVol_global;
 		if (rank==0) printf("Media porosity = %f \n",porosity);
 		//.......................................................................

@ -301,20 +289,19 @@ int main(int argc, char **argv)
 		if (rank == 0) cout << "Domain set." << endl;
 		//...........................................................................

-		
 		//...........................................................................
 		if (rank==0)	printf ("Create ScaLBL_Communicator \n");
 		// Create a communicator for the device (will use optimized layout)
 		ScaLBL_Communicator ScaLBL_Comm(Dm);
 		
-		if (rank==0)	printf ("Set up memory efficient layout \n");
-		int neighborSize=18*Np*sizeof(int);
-		int *neighborList;
+		int Npad=(Np/16 + 2)*16;
+		if (rank==0)    printf ("Set up memory efficient layout, %i | %i | %i \n", Np, Npad, N);
+		auto neighborList= new int[18*Npad];
 		IntArray Map(Nx,Ny,Nz);
-		neighborList= new int[18*Np];
-		ScaLBL_Comm.MemoryOptimizedLayoutAA(Map,neighborList,Dm.id,Np);
+		Map.fill(-2);		
+		Np = ScaLBL_Comm.MemoryOptimizedLayoutAA(Map,neighborList,Dm->id,Np);
 		MPI_Barrier(comm);
-		
+		int neighborSize=18*Np*sizeof(int);
 		//......................device distributions.................................
 		dist_mem_size = Np*sizeof(double);
 		if (rank==0)	printf ("Allocating distributions \n");
@ -380,7 +367,7 @@ int main(int argc, char **argv)
 		 */
 		if (rank==0)	printf("Setting the distributions, size = : %i\n", Np);
 		//...........................................................................
-		GlobalFlipScaLBL_D3Q19_Init(fq_host, Map, Np, Nx-2, Ny-2, Nz-2,iproc,jproc,kproc,nprocx,nprocy,nprocz);
+		GlobalFlipScaLBL_D3Q19_Init(fq_host, Map, Np, Nx-2, Ny-2, Nz-2, iproc,jproc,kproc,nprocx,nprocy,nprocz);
 		ScaLBL_CopyToDevice(fq, fq_host, 19*dist_mem_size);
 		ScaLBL_DeviceBarrier();
 		MPI_Barrier(comm);
@ -395,7 +382,6 @@ int main(int argc, char **argv)
 		
 		ScaLBL_Comm.RecvD3Q19AA(fq); //WRITE INTO OPPOSITE
 		
-		
 		//...........................................................................
 		ScaLBL_CopyToHost(fq_host,fq,19*Np*sizeof(double));
 		check =	GlobalCheckDebugDist(fq_host, Map, Np, Nx-2, Ny-2, Nz-2,iproc,jproc,kproc,nprocx,nprocy,nprocz,0,ScaLBL_Comm.next);
--- a/tests/test_dcel_minkowski.cpp
+++ b/tests/test_dcel_minkowski.cpp
@ -4,13 +4,13 @@
 #include "common/Domain.h"
 #include "common/SpherePack.h"

-using namespace std;
+#include "ProfilerApp.h"
+

 /*
 *  Compare the measured and analytical curvature for a sphere
 *
 */
-
 std::shared_ptr<Database> loadInputs( )
 {
  //auto db = std::make_shared<Database>( "Domain.in" );
@ -38,7 +38,7 @@ int main(int argc, char **argv)
 		int Nx = db->getVector<int>( "n" )[0];
 		int Ny = db->getVector<int>( "n" )[1];
 		int Nz = db->getVector<int>( "n" )[2];
-		std::shared_ptr<Domain> Dm = std::shared_ptr<Domain>(new Domain(db,comm));
+		auto Dm = std::make_shared<Domain>( db, comm );
 		
 		Nx+=2; Ny+=2; Nz+=2;
 		DoubleArray Phase(Nx,Ny,Nz);
@ -98,6 +98,7 @@ int main(int argc, char **argv)
 		printf("   Euler characteristic  = %f (analytical = 2.0) \n",sphere.Xi);
 		
 	}
+    PROFILE_SAVE("test_dcel_minkowski");
 	MPI_Barrier(comm);
 	MPI_Finalize();
 	return toReturn;