Merge branch 'master' of github.com:JamesEMcClure/LBPM-WIA

2021-09-01 07:39:10 -04:00 · 2021-09-01 07:39:10 -04:00 · 3d4a0a1f2f
commit 3d4a0a1f2f
parent bde1e7111a b36d9dea93
43 changed files with 6852 additions and 2902 deletions
--- a/IO/HDF5Writer.cpp
+++ b/IO/HDF5Writer.cpp
@ -0,0 +1,283 @@
+#include "IO/HDF5_IO.h"
+#include "IO/IOHelpers.h"
+#include "IO/MeshDatabase.h"
+#include "IO/Writer.h"
+#include "IO/Xdmf.h"
+#include "common/MPI.h"
+#include "common/Utilities.h"
+
+#include <algorithm>
+#include <memory>
+#include <set>
+#include <sys/stat.h>
+#include <vector>
+
+
+#ifdef USE_HDF5
+
+
+std::string to_string( const ArraySize &s )
+{
+    std::string out = "[" + std::to_string( s[0] );
+    for ( size_t i = 1; i < s.ndim(); i++ )
+        out += "," + to_string( s[i] );
+    out += "]";
+    return out;
+}
+
+
+// Translate an IO variable type into the matching Xdmf centering.
+// Node variables map to Node, volume variables map to Cell; anything else is an error.
+Xdmf::Center getXdmfType( IO::VariableType type )
+{
+    switch ( type ) {
+    case IO::VariableType::NodeVariable:
+        return Xdmf::Center::Node;
+    case IO::VariableType::VolumeVariable:
+        return Xdmf::Center::Cell;
+    default:
+        ERROR( "Variable type not supported" );
+    }
+    // Only reached if ERROR returns
+    return Xdmf::Center::Null;
+}
+
+
+// Write a PointList mesh (and variables) to a file
+template<class TYPE>
+static void writeCoordinates( hid_t fid, const std::vector<Point> &points )
+{
+    // Split the points into one array per component, converted to TYPE
+    const size_t count = points.size();
+    std::vector<TYPE> xs( count ), ys( count ), zs( count );
+    size_t idx = 0;
+    for ( const auto &p : points ) {
+        xs[idx] = p.x;
+        ys[idx] = p.y;
+        zs[idx] = p.z;
+        ++idx;
+    }
+    // Store the components as the datasets "x", "y" and "z" under fid
+    IO::HDF5::writeHDF5( fid, "x", xs );
+    IO::HDF5::writeHDF5( fid, "y", ys );
+    IO::HDF5::writeHDF5( fid, "z", zs );
+}
+// Write a PointList mesh and its variables into a new group of the HDF5 file and
+// register the resulting point mesh with xmf.
+//   fid       Handle of the open HDF5 file
+//   filename  File name used to build the dataset paths stored in the Xdmf description
+//   meshData  Mesh (must be an IO::PointList) and the variables to write
+//   database  Mesh database; the name of its first domain is used as the group name
+//   xmf       Xdmf description that receives the mesh
+static void writeHDF5PointList( hid_t fid, const std::string &filename,
+    const IO::MeshDataStruct &meshData, IO::MeshDatabase database, Xdmf &xmf )
+{
+    auto meshname    = database.domains[0].name;
+    const auto &mesh = dynamic_cast<IO::PointList &>( *meshData.mesh );
+    auto gid         = IO::HDF5::createGroup( fid, meshname );
+    // Write the coordinates in the precision requested for the mesh
+    if ( meshData.precision == IO::DataType::Double ) {
+        writeCoordinates<double>( gid, mesh.getPoints() );
+    } else if ( meshData.precision == IO::DataType::Float ) {
+        writeCoordinates<float>( gid, mesh.getPoints() );
+    } else {
+        ERROR( "Unsupported format" );
+    }
+    auto path   = filename + ":/" + meshname + "/";
+    auto domain = Xdmf::createPointMesh(
+        meshname, 3, mesh.getPoints().size(), path + "x", path + "y", path + "z" );
+    // Write the variables
+    for ( const auto &varPtr : meshData.vars ) {
+        auto &var     = *varPtr;
+        auto data     = var.data;
+        auto rankType = Xdmf::RankType::Null;
+        if ( data.ndim() == 1 ) {
+            rankType = Xdmf::RankType::Scalar;
+        } else if ( data.ndim() == 2 && data.size( 1 ) == 3 ) {
+            // Vector data, need to permute for visit
+            rankType = Xdmf::RankType::Vector;
+            data     = data.permute( { 1, 0 } );
+        } else {
+            ERROR( "Unable to determine variable rank: " + to_string( var.data.size() ) );
+        }
+        // Convert to the precision requested for the variable before writing
+        if ( var.precision == IO::DataType::Double ) {
+            IO::HDF5::writeHDF5( gid, var.name, data );
+        } else if ( var.precision == IO::DataType::Float ) {
+            IO::HDF5::writeHDF5( gid, var.name, data.cloneTo<float>() );
+        } else if ( var.precision == IO::DataType::Int ) {
+            IO::HDF5::writeHDF5( gid, var.name, data.cloneTo<int>() );
+        } else {
+            ERROR( "Unsupported format" );
+        }
+        domain.addVariable(
+            meshname, var.name, data.size(), rankType, Xdmf::Center::Node, path + var.name );
+    }
+    // Release the group handle once everything is written (as writeHDF5DomainMesh does)
+    IO::HDF5::closeGroup( gid );
+    xmf.addMesh( meshData.meshName, domain );
+}
+// Write a TriMesh mesh (and variables) to a file
+// Shared writer for triangle meshes: stores the vertices, the connectivity and the
+// variables in a new group of the HDF5 file and registers the mesh with xmf.
+//   fid       Handle of the open HDF5 file
+//   filename  File name used to build the dataset paths stored in the Xdmf description
+//   meshData  Precision, mesh name and variables to write
+//   mesh      Triangle mesh providing the vertices and the A/B/C index lists
+//   database  Mesh database; the name of its first domain is used as the group name
+//   xmf       Xdmf description that receives the mesh
+static void writeHDF5TriMesh2( hid_t fid, const std::string &filename,
+    const IO::MeshDataStruct &meshData, const IO::TriMesh &mesh, IO::MeshDatabase database,
+    Xdmf &xmf )
+{
+    auto meshname = database.domains[0].name;
+    auto gid      = IO::HDF5::createGroup( fid, meshname );
+    auto path     = filename + ":/" + meshname + "/";
+    // Write the vertices
+    if ( meshData.precision == IO::DataType::Double ) {
+        writeCoordinates<double>( gid, mesh.vertices->getPoints() );
+    } else if ( meshData.precision == IO::DataType::Float ) {
+        writeCoordinates<float>( gid, mesh.vertices->getPoints() );
+    } else {
+        ERROR( "Unsupported format" );
+    }
+    // Write the connectivity as a 3 x N array (one column per triangle)
+    Array<int> tri( 3, mesh.A.size() );
+    for ( size_t i = 0; i < mesh.A.size(); i++ ) {
+        tri( 0, i ) = mesh.A[i];
+        tri( 1, i ) = mesh.B[i];
+        tri( 2, i ) = mesh.C[i];
+    }
+    IO::HDF5::writeHDF5( gid, "tri", tri );
+    auto domain =
+        Xdmf::createUnstructuredMesh( meshname, 3, Xdmf::TopologyType::Triangle, tri.size( 1 ),
+            path + "tri", mesh.vertices->getPoints().size(), path + "x", path + "y", path + "z" );
+    // Write the variables
+    for ( const auto &varPtr : meshData.vars ) {
+        auto &var     = *varPtr;
+        auto data     = var.data;
+        auto rankType = Xdmf::RankType::Null;
+        if ( data.ndim() == 1 ) {
+            rankType = Xdmf::RankType::Scalar;
+        } else if ( data.ndim() == 2 && data.size( 1 ) == 3 ) {
+            // Vector data, need to permute for visit
+            rankType = Xdmf::RankType::Vector;
+            data     = data.permute( { 1, 0 } );
+        } else {
+            ERROR( "Unable to determine variable rank: " + to_string( var.data.size() ) );
+        }
+        // Convert to the precision requested for the variable before writing
+        if ( var.precision == IO::DataType::Double ) {
+            IO::HDF5::writeHDF5( gid, var.name, data );
+        } else if ( var.precision == IO::DataType::Float ) {
+            IO::HDF5::writeHDF5( gid, var.name, data.cloneTo<float>() );
+        } else if ( var.precision == IO::DataType::Int ) {
+            IO::HDF5::writeHDF5( gid, var.name, data.cloneTo<int>() );
+        } else {
+            ERROR( "Unsupported format" );
+        }
+        domain.addVariable(
+            meshname, var.name, data.size(), rankType, getXdmfType( var.type ), path + var.name );
+    }
+    // Release the group handle once everything is written (as writeHDF5DomainMesh does)
+    IO::HDF5::closeGroup( gid );
+    xmf.addMesh( meshData.meshName, domain );
+}
+// Write a TriMesh: downcast the stored mesh and forward to the shared triangle writer
+static void writeHDF5TriMesh( hid_t fid, const std::string &filename,
+    const IO::MeshDataStruct &meshData, IO::MeshDatabase database, Xdmf &xmf )
+{
+    const auto &triMesh = dynamic_cast<IO::TriMesh &>( *meshData.mesh );
+    writeHDF5TriMesh2( fid, filename, meshData, triMesh, database, xmf );
+}
+// Write a TriList: obtain a TriMesh through getTriMesh and forward it to the shared
+// triangle writer
+static void writeHDF5TriList( hid_t fid, const std::string &filename,
+    const IO::MeshDataStruct &meshData, IO::MeshDatabase database, Xdmf &xmf )
+{
+    auto triMesh = getTriMesh( meshData.mesh );
+    writeHDF5TriMesh2( fid, filename, meshData, *triMesh, database, xmf );
+}
+// Write a DomainMesh mesh (and variables) to a group of the HDF5 file and register it with xmf
+static void writeHDF5DomainMesh( hid_t fid, const std::string &filename,
+    const IO::MeshDataStruct &meshData, IO::MeshDatabase database, Xdmf &xmf )
+{
+    auto &mesh    = dynamic_cast<IO::DomainMesh &>( *meshData.mesh );
+    auto meshname = database.domains[0].name; // group name = name of the first domain
+    auto gid      = IO::HDF5::createGroup( fid, meshname );
+    auto path     = filename + ":/" + meshname + "/"; // prefix of the dataset paths given to Xdmf
+    // Write the mesh
+    RankInfoStruct info( mesh.rank, mesh.nprocx, mesh.nprocy, mesh.nprocz );
+    std::vector<double> range = { info.ix * mesh.Lx / info.nx, ( info.ix + 1 ) * mesh.Lx / info.nx,
+        info.jy * mesh.Ly / info.ny, ( info.jy + 1 ) * mesh.Ly / info.ny,
+        info.kz * mesh.Lz / info.nz, ( info.kz + 1 ) * mesh.Lz / info.nz }; // [xmin,xmax,ymin,ymax,zmin,zmax]
+    std::vector<int> N        = { mesh.nx, mesh.ny, mesh.nz };
+    std::vector<int> rankinfo = { mesh.rank, mesh.nprocx, mesh.nprocy, mesh.nprocz };
+    IO::HDF5::writeHDF5( gid, "range", range );
+    IO::HDF5::writeHDF5( gid, "N", N );
+    IO::HDF5::writeHDF5( gid, "rankinfo", rankinfo );
+    // xmf.addUniformMesh( meshname, range, ArraySize( N[0], N[1], N[2] ) );
+    // Write a curvilinear mesh due to bug with vector data on nodes loading into visit
+    Array<float> x( N[0] + 1, N[1] + 1, N[2] + 1 ); // one more point than N in each direction
+    Array<float> y( N[0] + 1, N[1] + 1, N[2] + 1 );
+    Array<float> z( N[0] + 1, N[1] + 1, N[2] + 1 );
+    double dx = ( range[1] - range[0] ) / N[0]; // uniform spacing in each direction
+    double dy = ( range[3] - range[2] ) / N[1];
+    double dz = ( range[5] - range[4] ) / N[2];
+    for ( int k = 0; k <= N[2]; k++ ) {
+        for ( int j = 0; j <= N[1]; j++ ) {
+            for ( int i = 0; i <= N[0]; i++ ) {
+                x( i, j, k ) = range[0] + dx * i;
+                y( i, j, k ) = range[2] + dy * j;
+                z( i, j, k ) = range[4] + dz * k;
+            }
+        }
+    }
+    IO::HDF5::writeHDF5( gid, "x", x );
+    IO::HDF5::writeHDF5( gid, "y", y );
+    IO::HDF5::writeHDF5( gid, "z", z );
+    auto domain = Xdmf::createCurvilinearMesh(
+        meshname, ArraySize( N[0], N[1], N[2] ), path + "x", path + "y", path + "z" );
+    // Write the variables
+    for ( size_t i = 0; i < meshData.vars.size(); i++ ) {
+        auto &var     = *meshData.vars[i];
+        auto data     = var.data; // local copy; may be replaced by a permuted array below
+        auto rankType = Xdmf::RankType::Null;
+        if ( data.ndim() == 3 ) {
+            rankType = Xdmf::RankType::Scalar;
+        } else if ( data.ndim() == 4 && data.size( 3 ) == 3 ) {
+            // Vector data, need to permute for visit
+            rankType = Xdmf::RankType::Vector;
+            data     = data.permute( { 3, 0, 1, 2 } ); // move the component index to the front
+        } else {
+            ERROR( "Unable to determine variable rank: " + to_string( var.data.size() ) );
+        }
+        if ( var.precision == IO::DataType::Double ) {
+            IO::HDF5::writeHDF5( gid, var.name, data );
+        } else if ( var.precision == IO::DataType::Float ) {
+            IO::HDF5::writeHDF5( gid, var.name, data.cloneTo<float>() );
+        } else if ( var.precision == IO::DataType::Int ) {
+            IO::HDF5::writeHDF5( gid, var.name, data.cloneTo<int>() );
+        } else {
+            ERROR( "Unsupported format" );
+        }
+        domain.addVariable(
+            meshname, var.name, data.size(), rankType, getXdmfType( var.type ), path + var.name );
+    }
+    IO::HDF5::closeGroup( gid ); // done writing into the group
+    xmf.addMesh( meshData.meshName, domain );
+}
+// Write a mesh (and variables) to a file
+// Build the database entry for one mesh and dispatch to the writer for its mesh class.
+// Returns the database describing what was written.
+static IO::MeshDatabase write_domain_hdf5( hid_t fid, const std::string &filename,
+    const IO::MeshDataStruct &mesh, IO::FileFormat format, int rank, Xdmf &xmf )
+{
+    auto database        = getDatabase( filename, mesh, format, rank );
+    const auto &meshType = database.meshClass;
+    // Select the writer from the mesh class recorded in the database
+    if ( meshType == "PointList" )
+        writeHDF5PointList( fid, filename, mesh, database, xmf );
+    else if ( meshType == "TriMesh" )
+        writeHDF5TriMesh( fid, filename, mesh, database, xmf );
+    else if ( meshType == "TriList" )
+        writeHDF5TriList( fid, filename, mesh, database, xmf );
+    else if ( meshType == "DomainMesh" )
+        writeHDF5DomainMesh( fid, filename, mesh, database, xmf );
+    else {
+        ERROR( "Unknown mesh class" );
+    }
+    return database;
+}
+// Write the mesh data to hdf5
+// Write all meshes of this rank into "<path>/<rank as %05i>.h5" and return one
+// database entry per mesh written.
+//   meshData  Meshes (and their variables) to write
+//   path      Directory that receives the file
+//   format    File format forwarded to the database creation
+//   rank      Rank of the calling process (determines the file name)
+//   xmf       Xdmf description that receives every mesh written
+std::vector<IO::MeshDatabase> writeMeshesHDF5( const std::vector<IO::MeshDataStruct> &meshData,
+    const std::string &path, IO::FileFormat format, int rank, Xdmf &xmf )
+{
+    std::vector<IO::MeshDatabase> meshes_written;
+    meshes_written.reserve( meshData.size() );
+    // Only the rank number goes through a fixed buffer (bounded by snprintf); the full
+    // path is built as a std::string so an arbitrarily long path cannot overflow
+    char rankname[32];
+    snprintf( rankname, sizeof( rankname ), "%05i.h5", rank );
+    const std::string filename = rankname;
+    const std::string fullpath = path + "/" + filename;
+    auto fid = IO::HDF5::openHDF5( fullpath, "w", IO::HDF5::Compression::GZIP );
+    for ( const auto &mesh : meshData )
+        meshes_written.push_back( write_domain_hdf5( fid, filename, mesh, format, rank, xmf ) );
+    IO::HDF5::closeHDF5( fid );
+    return meshes_written;
+}
+
+
+#else
+
+
+// Stub used when HDF5 support is disabled: nothing is written and an empty list is
+// returned. The parameter list mirrors the HDF5-enabled definition above; the stray
+// ';' that used to follow the parameter list has been removed because it turned the
+// definition into a declaration followed by an invalid block.
+std::vector<IO::MeshDatabase> writeMeshesHDF5( const std::vector<IO::MeshDataStruct> &,
+    const std::string &, IO::FileFormat, int, Xdmf & )
+{
+    return std::vector<IO::MeshDatabase>();
+}
+
+
+#endif
--- a/IO/HDF5_IO.cpp
+++ b/IO/HDF5_IO.cpp
@ -0,0 +1,585 @@
+#include "IO/HDF5_IO.h"
+#include "IO/HDF5_IO.hpp"
+#include "common/Array.h"
+#include "common/Utilities.h"
+
+#include <complex>
+#include <sstream>
+#include <string>
+#include <vector>
+
+
+namespace IO {
+namespace HDF5 {
+
+
+#ifdef USE_HDF5 // USE HDF5
+
+
+/************************************************************************
+ * HDF5 helper routines                                                  *
+ ************************************************************************/
+// Return x unchanged unless it is null, in which case a non-null dummy address is returned
+inline const void *H5Ptr( const void *x )
+{
+    if ( x != nullptr )
+        return x;
+    return (void *) 1;
+}
+// Check whether an object called name can be queried under fid.
+// The automatic HDF5 error handler is disabled while probing and restored afterwards.
+bool H5Gexists( hid_t fid, const std::string &name )
+{
+    // Remember the current handler and silence it for the probe
+    H5E_auto2_t savedHandler;
+    void *savedData;
+    H5Eget_auto2( H5E_DEFAULT, &savedHandler, &savedData );
+    H5Eset_auto2( H5E_DEFAULT, nullptr, nullptr );
+    const int rc = H5Gget_objinfo( fid, name.data(), 0, nullptr );
+    // Put the original handler back
+    H5Eset_auto2( H5E_DEFAULT, savedHandler, savedData );
+    return rc == 0;
+}
+// Check whether a dataset called name can be opened under fid.
+// The automatic HDF5 error handler is disabled while probing and restored afterwards.
+bool H5Dexists( hid_t fid, const std::string &name )
+{
+    H5E_auto2_t func;
+    void *client;
+    H5Eget_auto2( H5E_DEFAULT, &func, &client );
+    H5Eset_auto2( H5E_DEFAULT, nullptr, nullptr );
+    hid_t dataset = H5Dopen2( fid, name.data(), H5P_DEFAULT );
+    H5Eset_auto2( H5E_DEFAULT, func, client );
+    bool exists = dataset > 0;
+    // The handle was opened only for the probe: close it so that every existence check
+    // does not leave an open dataset identifier behind
+    if ( exists )
+        H5Dclose( dataset );
+    return exists;
+}
+// Create a group called name under fid using default property lists; returns its handle
+hid_t createGroup( hid_t fid, const std::string &name )
+{
+    const hid_t gid = H5Gcreate2( fid, name.data(), H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT );
+    return gid;
+}
+// Open the existing group called name under fid; insists that the group exists
+hid_t openGroup( hid_t fid, const std::string &name )
+{
+    const bool found = H5Gexists( fid, name );
+    INSIST( found, "Group " + name + " does not exist" );
+    const hid_t gid = H5Gopen2( fid, name.data(), H5P_DEFAULT );
+    return gid;
+}
+// Close a group handle obtained from createGroup/openGroup
+void closeGroup( hid_t gid )
+{
+    H5Gclose( gid );
+}
+
+
+/************************************************************************
+ * Complex struct that is compatible with HDF5                           *
+ ************************************************************************/
+typedef struct {
+    double re; // real part (registered as the compound member "real")
+    double im; // imaginary part (registered as the compound member "imag")
+} complex_t;
+// Copy N std::complex<double> values from x into the HDF5-compatible structs in y
+inline void convert( size_t N, const std::complex<double> *x, complex_t *y )
+{
+    size_t k = 0;
+    while ( k < N ) {
+        const auto &src = x[k];
+        y[k].re         = src.real();
+        y[k].im         = src.imag();
+        ++k;
+    }
+}
+// Copy N HDF5-compatible structs from x into the std::complex<double> values in y
+inline void convert( size_t N, const complex_t *x, std::complex<double> *y )
+{
+    size_t k = 0;
+    while ( k < N ) {
+        const auto &src = x[k];
+        y[k]            = std::complex<double>( src.re, src.im );
+        ++k;
+    }
+}
+
+
+/************************************************************************
+ * Get the HDF5 data type                                                *
+ ************************************************************************/
+// Each specialization returns a freshly created datatype handle (H5Tcopy/H5Tcreate)
+// --- character and boolean types ---
+template<> hid_t getHDF5datatype<bool>() { return H5Tcopy( H5T_NATIVE_UCHAR ); }
+template<> hid_t getHDF5datatype<char>() { return H5Tcopy( H5T_NATIVE_CHAR ); }
+// --- fixed-width 8/16 bit integers ---
+template<> hid_t getHDF5datatype<uint8_t>() { return H5Tcopy( H5T_NATIVE_UINT8 ); }
+template<> hid_t getHDF5datatype<int8_t>() { return H5Tcopy( H5T_NATIVE_INT8 ); }
+template<> hid_t getHDF5datatype<uint16_t>() { return H5Tcopy( H5T_NATIVE_UINT16 ); }
+template<> hid_t getHDF5datatype<int16_t>() { return H5Tcopy( H5T_NATIVE_INT16 ); }
+// --- native int / long ---
+template<> hid_t getHDF5datatype<int>() { return H5Tcopy( H5T_NATIVE_INT ); }
+template<> hid_t getHDF5datatype<unsigned int>() { return H5Tcopy( H5T_NATIVE_UINT ); }
+template<> hid_t getHDF5datatype<long int>() { return H5Tcopy( H5T_NATIVE_LONG ); }
+template<> hid_t getHDF5datatype<unsigned long int>() { return H5Tcopy( H5T_NATIVE_ULONG ); }
+// --- floating point ---
+template<> hid_t getHDF5datatype<float>() { return H5Tcopy( H5T_NATIVE_FLOAT ); }
+template<> hid_t getHDF5datatype<double>() { return H5Tcopy( H5T_NATIVE_DOUBLE ); }
+// --- complex: compound type with the members "real" and "imag" laid out as complex_t ---
+template<> hid_t getHDF5datatype<std::complex<double>>()
+{
+    const hid_t compound = H5Tcreate( H5T_COMPOUND, sizeof( complex_t ) );
+    H5Tinsert( compound, "real", HOFFSET( complex_t, re ), H5T_NATIVE_DOUBLE );
+    H5Tinsert( compound, "imag", HOFFSET( complex_t, im ), H5T_NATIVE_DOUBLE );
+    return compound;
+}
+// --- C strings: variable-length string type ---
+template<> hid_t getHDF5datatype<char *>()
+{
+    const hid_t strType = H5Tcopy( H5T_C_S1 );
+    H5Tset_size( strType, H5T_VARIABLE );
+    return strType;
+}
+
+
+/************************************************************************
+ * Read/write Array types                                                *
+ ************************************************************************/
+// Read a dataset of variable-length strings into an Array<std::string>.
+// If the dataset does not exist the array is resized to 0 and nothing is read.
+//   fid   File or group handle containing the dataset
+//   name  Name of the dataset
+//   data  Output array; resized to the dimensions stored in the file
+template<>
+void readHDF5<Array<std::string>>( hid_t fid, const std::string &name, Array<std::string> &data )
+{
+    if ( !H5Dexists( fid, name ) ) {
+        // Dataset does not exist
+        data.resize( 0 );
+        return;
+    }
+    hid_t dataset   = H5Dopen2( fid, name.data(), H5P_DEFAULT );
+    hid_t datatype  = H5Dget_type( dataset );
+    hid_t dataspace = H5Dget_space( dataset );
+    // Size the buffer for the largest rank HDF5 supports so the query cannot overrun it
+    hsize_t dims0[H5S_MAX_RANK];
+    int ndim  = H5Sget_simple_extent_dims( dataspace, dims0, nullptr );
+    auto dims = convertSize( ndim, dims0 );
+    data.resize( dims );
+    hid_t datatype2 = getHDF5datatype<char *>();
+    if ( data.empty() ) {
+        // The data is empty
+    } else if ( H5Tequal( datatype, datatype2 ) ) {
+        // The type of Array and the data in HDF5 match.
+        // One pointer per element is sufficient; the vector also releases the buffer
+        // automatically on every exit path
+        std::vector<char *> tmp( data.length(), nullptr );
+        H5Dread( dataset, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, tmp.data() );
+        for ( size_t i = 0; i < data.length(); i++ ) {
+            // An element left as a null pointer is treated as an empty string
+            data( i ) = tmp[i] != nullptr ? std::string( tmp[i] ) : std::string();
+        }
+        // Give the strings allocated by HDF5 during the read back to the library
+        H5Dvlen_reclaim( datatype, dataspace, H5P_DEFAULT, tmp.data() );
+    } else {
+        ERROR( "Unknown format for std::string" );
+    }
+    H5Dclose( dataset );
+    H5Tclose( datatype );
+    H5Tclose( datatype2 );
+    H5Sclose( dataspace );
+}
+// Read a dataset of complex values into an Array<std::complex<double>>.
+// If the dataset does not exist the array is resized to 0 and nothing is read.
+template<>
+void readHDF5<Array<std::complex<double>>>(
+    hid_t fid, const std::string &name, Array<std::complex<double>> &data )
+{
+    // Nothing to read if the dataset is missing
+    if ( !H5Dexists( fid, name ) ) {
+        data.resize( 0 );
+        return;
+    }
+    // Open the dataset and size the output from the stored extent
+    hid_t dset     = H5Dopen2( fid, name.data(), H5P_DEFAULT );
+    hid_t fileType = H5Dget_type( dset );
+    hid_t space    = H5Dget_space( dset );
+    hsize_t extent[10];
+    int rank   = H5Sget_simple_extent_dims( space, extent, nullptr );
+    auto shape = convertSize( rank, extent );
+    data.resize( shape );
+    hid_t memType = getHDF5datatype<std::complex<double>>();
+    if ( !data.empty() ) {
+        if ( H5Tequal( fileType, memType ) ) {
+            // Stored type matches the in-memory type: read straight into the array
+            H5Dread( dset, fileType, H5S_ALL, H5S_ALL, H5P_DEFAULT, data.data() );
+        } else {
+            ERROR( "We need to convert data formats" );
+        }
+    }
+    // Release the handles
+    H5Dclose( dset );
+    H5Tclose( fileType );
+    H5Tclose( memType );
+    H5Sclose( space );
+}
+// clang-format off
+// Generate the writeHDF5/readHDF5 specializations for Array<TYPE>; both forward to the
+// default Array implementations
+#define readWriteHDF5Array( TYPE ) \
+    template<> \
+    void writeHDF5<Array<TYPE>>( \
+        hid_t fid, const std::string &name, const Array<TYPE> &data ) \
+    { \
+        writeHDF5ArrayDefault<TYPE>( fid, name, data ); \
+    } \
+    template<> \
+    void readHDF5<Array<TYPE>>( \
+        hid_t fid, const std::string &name, Array<TYPE> &data ) \
+    { \
+        readHDF5ArrayDefault<TYPE>( fid, name, data ); \
+    }
+// Boolean and character arrays
+readWriteHDF5Array( bool )
+readWriteHDF5Array( char )
+// Signed integer arrays
+readWriteHDF5Array( int8_t )
+readWriteHDF5Array( int16_t )
+readWriteHDF5Array( int32_t )
+readWriteHDF5Array( int64_t )
+// Unsigned integer arrays
+readWriteHDF5Array( uint8_t )
+readWriteHDF5Array( uint16_t )
+readWriteHDF5Array( uint32_t )
+readWriteHDF5Array( uint64_t )
+// Floating point arrays
+readWriteHDF5Array( float )
+readWriteHDF5Array( double )
+// clang-format on
+
+
+    /************************************************************************
+     * Read/write scalar types                                               *
+     ************************************************************************/
+    template<>
+    void readHDF5<std::string>( hid_t fid, const std::string &name, std::string &data )
+{
+    // Read a single string; it is stored either as a variable-length string or as a
+    // character array
+    hid_t dset     = H5Dopen2( fid, name.data(), H5P_DEFAULT );
+    hid_t fileType = H5Dget_type( dset );
+    hid_t vlenType = getHDF5datatype<char *>();
+    const bool storedAsVlenString = H5Tequal( fileType, vlenType );
+    if ( !storedAsVlenString ) {
+        // Fall back to reading the characters as an array
+        Array<char> chars;
+        readHDF5( fid, name, chars );
+        data = std::string( chars.data(), chars.length() );
+    } else {
+        // Variable-length string: HDF5 allocates the buffer, which is reclaimed below
+        hid_t space     = H5Dget_space( dset );
+        char *buffer[1] = { nullptr };
+        H5Dread( dset, fileType, H5S_ALL, H5S_ALL, H5P_DEFAULT, buffer );
+        data = std::string( buffer[0] );
+        H5Dvlen_reclaim( fileType, space, H5P_DEFAULT, buffer );
+        H5Sclose( space );
+    }
+    H5Dclose( dset );
+    H5Tclose( fileType );
+    H5Tclose( vlenType );
+}
+// Write a string as an array of characters (no copy: the array only views the string data)
+template<>
+void writeHDF5<std::string>( hid_t fid, const std::string &name, const std::string &data )
+{
+    Array<char> view;
+    char *chars = const_cast<char *>( data.data() );
+    view.viewRaw( { data.length() }, chars );
+    writeHDF5( fid, name, view );
+}
+// clang-format off
+// Generate the scalar writeHDF5/readHDF5 specializations for TYPE.
+// Writing creates a dataset with a scalar dataspace; reading goes through the Array
+// reader and insists on exactly one element.
+#define readWriteHDF5Scalar( TYPE ) \
+    template<> \
+    void writeHDF5<TYPE>( hid_t fid, const std::string &name, const TYPE &data ) \
+    { \
+        hid_t dataspace = H5Screate( H5S_SCALAR ); \
+        hid_t datatype  = getHDF5datatype<TYPE>(); \
+        hid_t dataset   = H5Dcreate2( \
+            fid, name.data(), datatype, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT ); \
+        H5Dwrite( dataset, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, H5Ptr( &data ) ); \
+        H5Dclose( dataset ); \
+        H5Tclose( datatype ); \
+        H5Sclose( dataspace ); \
+    } \
+    template<> \
+    void readHDF5<TYPE>( hid_t fid, const std::string &name, TYPE &data ) \
+    { \
+        Array<TYPE> tmp; \
+        readHDF5( fid, name, tmp ); \
+        INSIST( tmp.ndim() == 1 && tmp.length() == 1, "Error loading " + std::string( name ) ); \
+        data = tmp( 0 ); \
+    }
+// Boolean and character scalars
+readWriteHDF5Scalar( bool )
+readWriteHDF5Scalar( char )
+// Signed integer scalars
+readWriteHDF5Scalar( int8_t )
+readWriteHDF5Scalar( int16_t )
+readWriteHDF5Scalar( int32_t )
+readWriteHDF5Scalar( int64_t )
+// Unsigned integer scalars
+readWriteHDF5Scalar( uint8_t )
+readWriteHDF5Scalar( uint16_t )
+readWriteHDF5Scalar( uint32_t )
+readWriteHDF5Scalar( uint64_t )
+// Floating point and complex scalars
+readWriteHDF5Scalar( float )
+readWriteHDF5Scalar( double )
+readWriteHDF5Scalar( std::complex<double> )
+// clang-format on
+
+
+    /******************************************************************
+     * Create custom error handler                                     *
+     ******************************************************************/
+    herr_t hdf5_error_handler( hid_t err_stack, void * )
+{
+    FILE *fid = tmpfile();
+    H5Eprint2( err_stack, fid );
+    H5Eclear2( err_stack );
+    rewind( fid );
+    char msg[1024];
+    size_t N = fread( msg, 1, sizeof( msg ) - 1, fid );
+    fclose( fid );
+    msg[N]           = 0;
+    std::string msg2 = "Error calling HDF5 routine:\n";
+    ERROR( msg2 + msg );
+    return 0;
+}
+bool set_hdf5_error_handler()
+{
+    hid_t error_stack = 0;
+    H5E_auto2_t fun   = hdf5_error_handler;
+    H5Eset_auto2( error_stack, fun, nullptr );
+    return true;
+}
+bool global_is_hdf5_error_handler_set = set_hdf5_error_handler();
+
+
+/******************************************************************
+ * Open/close HDF5 files                                           *
+ ******************************************************************/
+// Open (mode "r" or "rw") or create (mode "w") an HDF5 file and return its handle.
+// When a file is created, the requested default compression is recorded in the
+// dataset "DefaultCompression" (0 = None, 1 = GZIP, 2 = SZIP).
+hid_t openHDF5( const std::string &filename, const char *mode, Compression compress )
+{
+    // The default file-access properties are used; the custom cache setup is disabled
+    auto pid = H5P_DEFAULT;
+    /*auto pid = H5Pcreate( H5P_FILE_ACCESS );
+    int nelemts;
+    size_t nslots, nbytes;
+    double w0;
+    H5Pget_cache(pid,& nelemts,& nslots,& nbytes,& w0);
+    H5Pset_cache(pid, nelemts, 1999, 3*1024*1024, 1.0); */
+    const bool readOnly  = strcmp( mode, "r" ) == 0;
+    const bool create    = strcmp( mode, "w" ) == 0;
+    const bool readWrite = strcmp( mode, "rw" ) == 0;
+    // Open or create the file according to the mode
+    hid_t fid = 0;
+    if ( readOnly )
+        fid = H5Fopen( filename.data(), H5F_ACC_RDONLY, pid );
+    else if ( create )
+        fid = H5Fcreate( filename.data(), H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT );
+    else if ( readWrite )
+        fid = H5Fopen( filename.data(), H5F_ACC_RDWR, H5P_DEFAULT );
+    else {
+        ERROR( "Invalid mode for opening HDF5 file" );
+    }
+    // Record the default compression in newly created files
+    if ( create ) {
+        switch ( compress ) {
+        case Compression::None:
+            writeHDF5<int>( fid, "DefaultCompression", 0 );
+            break;
+        case Compression::GZIP:
+            writeHDF5<int>( fid, "DefaultCompression", 1 );
+            break;
+        case Compression::SZIP:
+            writeHDF5<int>( fid, "DefaultCompression", 2 );
+            break;
+        default:
+            ERROR( "Internal error" );
+        }
+    }
+    // H5Pclose( pid );
+    return fid;
+}
+// Close an HDF5 file: first close any objects that are still open in it (up to 1000
+// per object kind), flush the file if it was opened for writing, then close the handle.
+void closeHDF5( hid_t fid )
+{
+    // Try to close any remaining objects (needed to ensure we can reopen the data if desired)
+    hid_t file[1000], set[1000], group[1000], type[1000], attr[1000];
+    // Keep the counts in the signed type returned by H5Fget_obj_ids: a negative value
+    // signals failure and must not be converted into a huge unsigned loop bound
+    auto N_file  = H5Fget_obj_ids( fid, H5F_OBJ_FILE, 1000, file );
+    auto N_set   = H5Fget_obj_ids( fid, H5F_OBJ_DATASET, 1000, set );
+    auto N_group = H5Fget_obj_ids( fid, H5F_OBJ_GROUP, 1000, group );
+    auto N_type  = H5Fget_obj_ids( fid, H5F_OBJ_DATATYPE, 1000, type );
+    auto N_attr  = H5Fget_obj_ids( fid, H5F_OBJ_ATTR, 1000, attr );
+    for ( decltype( N_file ) i = 0; i < N_file; i++ ) {
+        // The file handle itself is closed last, below
+        if ( file[i] != fid )
+            H5Fclose( file[i] );
+    }
+    for ( decltype( N_set ) i = 0; i < N_set; i++ )
+        H5Dclose( set[i] );
+    for ( decltype( N_group ) i = 0; i < N_group; i++ )
+        H5Gclose( group[i] );
+    for ( decltype( N_type ) i = 0; i < N_type; i++ )
+        H5Tclose( type[i] );
+    for ( decltype( N_attr ) i = 0; i < N_attr; i++ )
+        H5Aclose( attr[i] );
+    // Flush the data (needed to ensure we can reopen the data if desired)
+    unsigned intent;
+    H5Fget_intent( fid, &intent );
+    if ( intent == H5F_ACC_RDWR || intent == H5F_ACC_TRUNC )
+        H5Fflush( fid, H5F_SCOPE_GLOBAL );
+    // Close the file
+    H5Fclose( fid );
+}
+
+
+/************************************************************************
+ * Check if we support compression                                       *
+ ************************************************************************/
+// Return the default compression recorded in the file when it was created
+// (dataset "DefaultCompression" in the root group: 0 = None, 1 = GZIP, 2 = SZIP).
+// Returns Compression::None if the dataset is not present.
+Compression defaultCompression( hid_t fid )
+{
+    hid_t root = H5Gopen2( fid, "/", H5P_DEFAULT );
+    if ( !H5Dexists( root, "DefaultCompression" ) ) {
+        // Release the root group handle on the early exit as well
+        H5Gclose( root );
+        return Compression::None;
+    }
+    int tmp = 0;
+    readHDF5( root, "DefaultCompression", tmp );
+    // The root group is no longer needed once the value has been read
+    H5Gclose( root );
+    Compression compress = Compression::None;
+    if ( tmp == 0 ) {
+        compress = Compression::None;
+    } else if ( tmp == 1 ) {
+        compress = Compression::GZIP;
+    } else if ( tmp == 2 ) {
+        compress = Compression::SZIP;
+    } else {
+        ERROR( "Internal error" );
+    }
+    return compress;
+}
+
+
+/************************************************************************
+ * Create a default chunk size                                           *
+ ************************************************************************/
+// Build a dataset-creation property list that stores the data as one chunk of size dims
+// with the requested compression filter. H5P_DEFAULT is returned when no compression
+// is requested, dims is empty, or the array has fewer than 512 elements.
+hid_t createChunk( const std::vector<hsize_t> &dims, Compression compress )
+{
+    const bool uncompressed = ( compress == Compression::None );
+    if ( uncompressed || dims.empty() )
+        return H5P_DEFAULT;
+    // Total number of elements
+    hsize_t count = 1;
+    for ( size_t i = 0; i < dims.size(); i++ )
+        count *= dims[i];
+    if ( count < 512 )
+        return H5P_DEFAULT;
+    // Chunked layout with the whole array as a single chunk
+    hid_t plist = H5Pcreate( H5P_DATASET_CREATE );
+    auto status = H5Pset_chunk( plist, dims.size(), dims.data() );
+    ASSERT( status == 0 );
+    // Attach the compression filter
+    if ( compress == Compression::GZIP ) {
+        status = H5Pset_deflate( plist, 7 );
+        ASSERT( status == 0 );
+    } else if ( compress == Compression::SZIP ) {
+        status = H5Pset_szip( plist, H5_SZIP_NN_OPTION_MASK, 16 );
+        ASSERT( status == 0 );
+    }
+    return plist;
+}
+
+
+/************************************************************************
+ * Write Array                                                           *
+ ************************************************************************/
+// Write an array of complex values as a dataset of the compound complex type
+template<>
+void writeHDF5<Array<std::complex<double>>>(
+    hid_t fid, const std::string &name, const Array<std::complex<double>> &data )
+{
+    hid_t complexType = getHDF5datatype<std::complex<double>>();
+    // Convert the values into the HDF5-compatible struct layout
+    const size_t count = data.length();
+    std::vector<complex_t> buffer( count );
+    convert( count, data.data(), buffer.data() );
+    // Create the dataset with the shape of the array and write the converted values
+    auto shape  = arraySize( data );
+    hid_t space = H5Screate_simple( shape.size(), shape.data(), nullptr );
+    hid_t dset =
+        H5Dcreate2( fid, name.data(), complexType, space, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT );
+    H5Dwrite( dset, complexType, H5S_ALL, H5S_ALL, H5P_DEFAULT, H5Ptr( buffer.data() ) );
+    // Release the handles
+    H5Dclose( dset );
+    H5Tclose( complexType );
+    H5Sclose( space );
+}
+// Write an array of strings as a dataset of variable-length strings.
+//   fid   File or group handle that receives the dataset
+//   name  Name of the dataset to create
+//   data  Strings to write; the dataset has the same dimensions as the array
+template<>
+void writeHDF5<Array<std::string>>(
+    hid_t fid, const std::string &name, const Array<std::string> &data )
+{
+    auto dim        = arraySize( data );
+    hid_t dataspace = H5Screate_simple( dim.size(), dim.data(), nullptr );
+    // Table of pointers to the characters of every string (one spare null entry keeps
+    // the table non-empty for empty arrays); the vector frees itself on every exit path
+    std::vector<char *> tmp( data.length() + 1, nullptr );
+    for ( size_t i = 0; i < data.length(); i++ ) {
+        tmp[i] = const_cast<char *>( data( i ).c_str() );
+    }
+    hid_t datatype = getHDF5datatype<char *>();
+    hid_t props    = H5Pcreate( H5P_DATASET_CREATE );
+    // Use H5Dcreate2 like the rest of this file; H5Dcreate1 is a deprecated symbol that
+    // is unavailable when HDF5 is built without its deprecated API
+    hid_t dataset =
+        H5Dcreate2( fid, name.data(), datatype, dataspace, H5P_DEFAULT, props, H5P_DEFAULT );
+    H5Dwrite( dataset, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, tmp.data() );
+    H5Pclose( props );
+    H5Dclose( dataset );
+    H5Tclose( datatype );
+    H5Sclose( dataspace );
+}
+
+
+/************************************************************************
+ * Specializations for std::vector                                       *
+ ************************************************************************/
+// Read a std::vector<bool> by going through an Array<bool> and copying element by element
+template<>
+void readHDF5<std::vector<bool>>( hid_t fid, const std::string &name, std::vector<bool> &data )
+{
+    Array<bool> flags;
+    readHDF5( fid, name, flags );
+    const size_t count = flags.length();
+    data.resize( count );
+    for ( size_t k = 0; k < data.size(); ++k )
+        data[k] = flags( k );
+}
+// Write a std::vector<bool> by copying it into an Array<bool> first
+template<>
+void writeHDF5<std::vector<bool>>( hid_t fid, const std::string &name, const std::vector<bool> &x )
+{
+    const size_t count = x.size();
+    Array<bool> flags( count );
+    for ( size_t k = 0; k < count; ++k )
+        flags( k ) = x[k];
+    writeHDF5( fid, name, flags );
+}
+
+
+/************************************************************************
+ * Explicit instantiations for std::vector                              *
+ ***********************************************************************/
+// clang-format off
+// Generate the readHDF5/writeHDF5 specializations for std::vector<TYPE>.
+// Reading copies from a temporary Array<TYPE>; writing views the vector data through
+// an Array<TYPE> without copying.
+#define INSTANTIATE_STD_VECTOR( TYPE ) \
+    template<> \
+    void readHDF5<std::vector<TYPE>>( \
+        hid_t fid, const std::string &name, std::vector<TYPE> &x ) \
+    { \
+        Array<TYPE> y; \
+        readHDF5( fid, name, y ); \
+        x.resize( y.length() ); \
+        for ( size_t i = 0; i < x.size(); i++ ) \
+            x[i] = y( i ); \
+    } \
+    template<> \
+    void writeHDF5<std::vector<TYPE>>( \
+        hid_t fid, const std::string &name, const std::vector<TYPE> &x ) \
+    { \
+        Array<TYPE> y; \
+        y.viewRaw( { x.size() }, const_cast<TYPE*>( x.data() ) ); \
+        writeHDF5( fid, name, y ); \
+    }
+// Character vectors
+INSTANTIATE_STD_VECTOR( char )
+INSTANTIATE_STD_VECTOR( unsigned char )
+// Integer vectors
+INSTANTIATE_STD_VECTOR( int )
+INSTANTIATE_STD_VECTOR( unsigned int )
+INSTANTIATE_STD_VECTOR( int16_t )
+INSTANTIATE_STD_VECTOR( uint16_t )
+INSTANTIATE_STD_VECTOR( int64_t )
+INSTANTIATE_STD_VECTOR( uint64_t )
+// Floating point vectors
+INSTANTIATE_STD_VECTOR( float )
+INSTANTIATE_STD_VECTOR( double )
+// String vectors
+INSTANTIATE_STD_VECTOR( std::string )
+// clang-format on
+
+
+#else // No HDF5
+// Dummy implementations used when HDF5 support is disabled
+hid_t openHDF5( const std::string &, const char *, Compression ) { return 0; }
+void closeHDF5( hid_t ) {}
+bool H5Gexists( hid_t, const std::string & ) { return false; }
+bool H5Dexists( hid_t, const std::string & ) { return false; }
+hid_t createGroup( hid_t, const std::string & ) { return 0; }
+hid_t openGroup( hid_t, const std::string & ) { return 0; }
+void closeGroup( hid_t ) {}
+#endif
+
+
+} // namespace HDF5
+} // namespace IO
--- a/IO/HDF5_IO.h
+++ b/IO/HDF5_IO.h
@ -0,0 +1,169 @@
+// This file contains helper functions and interfaces for reading/writing HDF5
+#ifndef included_HDF5_h
+#define included_HDF5_h
+
+#include "common/ArraySize.h"
+
+#include <cstring>
+#include <string>
+
+
+// Include the headers and define some basic types
+#ifdef USE_HDF5
+// Using HDF5
+#include "hdf5.h"
+#else
+// Not using HDF5
+typedef int hid_t;
+typedef size_t hsize_t;
+#endif
+
+
+namespace IO {
+namespace HDF5 {
+
+
+//! Compression options (used as the default for a file in openHDF5 and for a chunk in createChunk)
+enum class Compression : uint8_t { None, GZIP, SZIP };
+
+
+/**
+ * \brief Open an HDF5 file
+ * \details This function opens an HDF5 file for reading/writing.
+ *     Once complete, we must close the file using closeHDF5
+ * @param[in] filename  File to open
+ * @param[in] mode      C string containing a file access mode. It can be:
+ *                      "r"    read: Open file for input operations. The file must exist.
+ *                      "w"    write: Create an empty file for output operations.
+ *                          If a file with the same name already exists, its contents
+ *                          are discarded and the file is treated as a new empty file.
+ *                      "rw" read+write: Open file for reading and writing.  The file must exist.
+ * @param[in] compress  Default compression for the file (defaults to Compression::None)
+ * @return              Return a handle to the file.
+ */
+hid_t openHDF5(
+    const std::string &filename, const char *mode, Compression compress = Compression::None );
+
+
+/**
+ * \brief Close an HDF5 file
+ * \details This function closes an HDF5 file that was opened with openHDF5
+ * @param[in] fid       File to close
+ */
+void closeHDF5( hid_t fid );
+
+
+/**
+ * \brief Return the default compression
+ * \details This function returns the default compression used when the file was created
+ * @param[in] fid       File/Group id
+ * @return              The default compression
+ */
+Compression defaultCompression( hid_t fid );
+
+
+/**
+ * \brief Create a chunk for HDF5
+ * \details This function creates a chunk for HDF5
+ * @param[in] dims      Chunk size
+ * @param[in] compress  Compression to use
+ * @return              Return a handle for the chunk.  writeHDF5ArrayDefault passes this handle
+ *                      to H5Dcreate2 as the dataset creation property list and releases it
+ *                      with H5Pclose unless it equals H5P_DEFAULT.
+ */
+// NOTE(review): std::vector is used here but this header does not include <vector> itself;
+//    presumably it arrives through common/ArraySize.h - confirm
+hid_t createChunk( const std::vector<hsize_t> &dims, Compression compress );
+
+
+/**
+ * \brief Write a structure to HDF5
+ * \details This function writes a C++ class/struct to HDF5.
+ *    This is a templated function and users can implement their own data
+ *    types by creating explicit specializations for a given type.
+ *    There is no default implementation, except when compiled without HDF5
+ *    in which case the function is a no-op.
+ * @param[in] fid       File or group to write to
+ * @param[in] name      The name of the variable
+ * @param[in] data      The structure to write
+ */
+template<class T>
+void writeHDF5( hid_t fid, const std::string &name, const T &data );
+
+
+/**
+ * \brief Read a structure from HDF5
+ * \details This function reads a C++ class/struct from HDF5.
+ *    This is a templated function and users can implement their own data
+ *    types by creating explicit specializations for a given type.
+ *    There is no default implementation, except when compiled without HDF5
+ *    in which case the function is a no-op.
+ * @param[in] fid       File or group to read from
+ * @param[in] name      The name of the variable
+ * @param[out] data     The structure to read
+ */
+template<class T>
+void readHDF5( hid_t fid, const std::string &name, T &data );
+
+
+/**
+ * \brief Check if group exists
+ * \details This function checks if an HDF5 group exists in the file
+ * @param[in] fid       ID of group or database to read
+ * @param[in] name      The name of the group
+ * @return              True if the group exists (always false when built without HDF5)
+ */
+bool H5Gexists( hid_t fid, const std::string &name );
+
+
+/**
+ * \brief Check if dataset exists
+ * \details This function checks if an HDF5 dataset exists in the file
+ * @param[in] fid       File or group to search
+ * @param[in] name      The name of the dataset
+ * @return              True if the dataset exists (always false when built without HDF5)
+ */
+bool H5Dexists( hid_t fid, const std::string &name );
+
+
+/**
+ * \brief Create a group
+ * \details This function creates a new HDF5 group
+ * @param[in] fid       File or group to write to
+ * @param[in] name      The name of the group
+ * @return              Return a handle to the group (0 when built without HDF5)
+ */
+hid_t createGroup( hid_t fid, const std::string &name );
+
+
+/**
+ * \brief Open a group
+ * \details This function opens an HDF5 group
+ * @param[in] fid       File or group that contains the group
+ * @param[in] name      The name of the group
+ * @return              Return a handle to the group (0 when built without HDF5)
+ */
+hid_t openGroup( hid_t fid, const std::string &name );
+
+
+/**
+ * \brief Close a group
+ * \details This function closes an HDF5 group
+ * @param[in] fid       Group to close (presumably a handle from createGroup or openGroup)
+ */
+void closeGroup( hid_t fid );
+
+
+/**
+ * \brief Get HDF5 data type
+ * \details This function returns the id of the data type
+ * @return              The id of the HDF5 data type for T (0 when built without HDF5).
+ *                      The default Array reader/writer in HDF5_IO.hpp release the
+ *                      returned id with H5Tclose.
+ */
+template<class T>
+hid_t getHDF5datatype();
+
+
+// Default no-op implementations for use without HDF5
+// NOTE(review): readHDF5Array/writeHDF5Array take Array<T>, but this header only includes
+//    common/ArraySize.h - confirm that Array is declared before this point
+// clang-format off
+#ifndef USE_HDF5
+template<class T> void readHDF5( hid_t, const std::string&, T& ) {}
+template<class T> void writeHDF5( hid_t, const std::string&, const T& ) {}
+template<class T> void readHDF5Array( hid_t, const std::string&, Array<T>& ) {}
+template<class T> void writeHDF5Array( hid_t, const std::string&, const Array<T>& ) {}
+template<class T> hid_t getHDF5datatype() { return 0; }
+#endif
+// clang-format on
+
+
+} // namespace HDF5
+} // namespace IO
+
+
+#endif
--- a/IO/HDF5_IO.hpp
+++ b/IO/HDF5_IO.hpp
@ -0,0 +1,348 @@
+// This file contains helper functions and interfaces for reading/writing HDF5
+#ifndef included_HDF5_hpp
+#define included_HDF5_hpp
+#ifdef USE_HDF5
+
+#include "IO/HDF5_IO.h"
+#include "common/Array.h"
+#include "common/Array.hpp"
+#include "common/Utilities.h"
+
+#include <array>
+#include <complex>
+#include <memory>
+#include <type_traits>
+#include <vector>
+
+
+namespace IO {
+namespace HDF5 {
+
+
+/********************************************************
+ *  External instantiations (scalar)                     *
+ ********************************************************/
+// Declarations of the explicit specializations of writeHDF5/readHDF5 for scalar types,
+// std::string and std::complex (declarations only, no definitions appear in this header)
+// clang-format off
+template<> void writeHDF5<char>( hid_t, const std::string &, const char & );
+template<> void readHDF5<char>( hid_t, const std::string &, char & );
+template<> void writeHDF5<bool>( hid_t, const std::string &, const bool & );
+template<> void readHDF5<bool>( hid_t, const std::string &, bool & );
+template<> void writeHDF5<int>( hid_t, const std::string &, const int & );
+template<> void readHDF5<int>( hid_t, const std::string &, int & );
+template<> void writeHDF5<long>( hid_t, const std::string &, const long & );
+template<> void readHDF5<long>( hid_t, const std::string &, long & );
+template<> void writeHDF5<float>( hid_t, const std::string &, const float & );
+template<> void readHDF5<float>( hid_t, const std::string &, float & );
+template<> void writeHDF5<double>( hid_t, const std::string &, const double & );
+template<> void readHDF5<double>( hid_t, const std::string &, double & );
+template<> void writeHDF5<unsigned char>( hid_t, const std::string &, const unsigned char & );
+template<> void readHDF5<unsigned char>( hid_t, const std::string &, unsigned char & );
+template<> void writeHDF5<unsigned int>( hid_t, const std::string &, const unsigned int & );
+template<> void readHDF5<unsigned int>( hid_t, const std::string &, unsigned int & );
+template<> void writeHDF5<unsigned long>( hid_t, const std::string &, const unsigned long & );
+template<> void readHDF5<unsigned long>( hid_t, const std::string &, unsigned long & );
+template<> void writeHDF5<std::string>( hid_t, const std::string &, const std::string & );
+template<> void readHDF5<std::string>( hid_t, const std::string &, std::string & );
+template<> void writeHDF5<std::complex<double>>( hid_t, const std::string &, const std::complex<double> & );
+template<> void readHDF5<std::complex<double>>( hid_t, const std::string &, std::complex<double> & );
+template<> void writeHDF5<std::complex<float>>( hid_t, const std::string &, const std::complex<float> & );
+template<> void readHDF5<std::complex<float>>( hid_t, const std::string &, std::complex<float> & );
+// clang-format on
+
+
+/********************************************************
+ *  External instantiations (Array)                      *
+ ********************************************************/
+// Declarations of the explicit specializations of writeHDF5/readHDF5 for Array<T>
+// (declarations only, no definitions appear in this header).
+// Each element type is declared exactly once.
+// clang-format off
+template<> void writeHDF5<Array<char>>( hid_t, const std::string &, const Array<char> & );
+template<> void readHDF5<Array<char>>( hid_t, const std::string &, Array<char> & );
+template<> void writeHDF5<Array<bool>>( hid_t, const std::string &, const Array<bool> & );
+template<> void readHDF5<Array<bool>>( hid_t, const std::string &, Array<bool> & );
+template<> void writeHDF5<Array<int>>( hid_t, const std::string &, const Array<int> & );
+template<> void readHDF5<Array<int>>( hid_t, const std::string &, Array<int> & );
+template<> void writeHDF5<Array<long>>( hid_t, const std::string &, const Array<long> & );
+template<> void readHDF5<Array<long>>( hid_t, const std::string &, Array<long> & );
+template<> void writeHDF5<Array<float>>( hid_t, const std::string &, const Array<float> & );
+template<> void readHDF5<Array<float>>( hid_t, const std::string &, Array<float> & );
+template<> void writeHDF5<Array<double>>( hid_t, const std::string &, const Array<double> & );
+template<> void readHDF5<Array<double>>( hid_t, const std::string &, Array<double> & );
+template<> void writeHDF5<Array<unsigned char>>( hid_t, const std::string &, const Array<unsigned char> & );
+template<> void readHDF5<Array<unsigned char>>( hid_t, const std::string &, Array<unsigned char> & );
+template<> void writeHDF5<Array<unsigned int>>( hid_t, const std::string &, const Array<unsigned int> & );
+template<> void readHDF5<Array<unsigned int>>( hid_t, const std::string &, Array<unsigned int> & );
+template<> void writeHDF5<Array<unsigned long>>( hid_t, const std::string &, const Array<unsigned long> & );
+template<> void readHDF5<Array<unsigned long>>( hid_t, const std::string &, Array<unsigned long> & );
+template<> void writeHDF5<Array<std::string>>( hid_t, const std::string &, const Array<std::string> & );
+template<> void readHDF5<Array<std::string>>( hid_t, const std::string &, Array<std::string> & );
+template<> void writeHDF5<Array<std::complex<double>>>( hid_t, const std::string &, const Array<std::complex<double>> & );
+template<> void readHDF5<Array<std::complex<double>>>( hid_t, const std::string &, Array<std::complex<double>> & );
+template<> void writeHDF5<Array<std::complex<float>>>( hid_t, const std::string &, const Array<std::complex<float>> & );
+template<> void readHDF5<Array<std::complex<float>>>( hid_t, const std::string &, Array<std::complex<float>> & );
+// clang-format on
+
+
+/******************************************************************
+ * Default implementation (currently commented out)               *
+ ******************************************************************/
+/*template<class TYPE>
+void writeHDF5( hid_t fid, const std::string &name, const TYPE &x )
+{
+    NULL_USE( fid );
+    if constexpr ( is_shared_ptr<TYPE>::value ) {
+        // We are dealing with a std::shared_ptr
+        writeHDF5( fid, name, *x );
+    } else if constexpr ( is_vector<TYPE>::value ) {
+        // We are dealing with a std::vector
+        typedef decltype( *x.begin() ) TYPE2;
+        typedef typename std::remove_reference<TYPE2>::type TYPE3;
+        typedef typename std::remove_cv<TYPE3>::type TYPE4;
+        Array<TYPE4> y;
+        y.viewRaw( { x.size() }, const_cast<TYPE4 *>( x.data() ) );
+        writeHDF5( fid, name, y );
+    } else if constexpr ( std::is_array<TYPE>::value ) {
+        // We are dealing with a std::array
+        typedef decltype( *x.begin() ) TYPE2;
+        typedef typename std::remove_reference<TYPE2>::type TYPE3;
+        typedef typename std::remove_cv<TYPE3>::type TYPE4;
+        Array<TYPE4> y;
+        y.viewRaw( { x.size() }, const_cast<TYPE4 *>( x.data() ) );
+        writeHDF5( fid, name, y );
+    } else if constexpr ( is_Array<TYPE>::value ) {
+        // We are dealing with an Array
+        std::string typeName = Utilities::demangle( typeid( TYPE ).name() );
+        throw std::logic_error( "Unsupported type writeHDF5<Array<" + typeName + ">>" );
+    } else if constexpr ( std::is_same<TYPE, std::string>::value ) {
+        // We are dealing with a std::string (should be handled through specialization)
+        throw std::logic_error( "Internal error" );
+    } else if constexpr ( std::is_same<TYPE, std::string>::value ||
+                          std::is_same<TYPE, char *>::value ||
+                          std::is_same<TYPE, const char *>::value ) {
+        // We are dealing with a string or char array
+        writeHDF5( fid, name, std::string( x ) );
+    } else if constexpr ( has_size<TYPE>::value ) {
+        // We are dealing with a container
+        typedef decltype( *x.begin() ) TYPE2;
+        typedef typename std::remove_reference<TYPE2>::type TYPE3;
+        typedef typename std::remove_cv<TYPE3>::type TYPE4;
+        std::vector<TYPE4> x2( x.begin(), x.end() );
+        writeHDF5<std::vector<TYPE4>>( fid, name, x2 );
+    } else {
+        throw std::logic_error( "Unsupported type" );
+    }
+}
+template<class TYPE>
+void readHDF5( hid_t fid, const std::string &name, TYPE &x )
+{
+    NULL_USE( fid );
+    if constexpr ( is_shared_ptr<TYPE>::value ) {
+        // We are dealing with a std::shared_ptr
+        readHDF5( fid, name, *x );
+    } else if constexpr ( is_vector<TYPE>::value ) {
+        // We are dealing with a std::vector
+        typedef typename std::remove_reference<decltype( *x.begin() )>::type TYPE2;
+        Array<TYPE2> y;
+        readHDF5( fid, name, y );
+        x.resize( y.length() );
+        // Swap the elements in the arrays to use the move operator
+        for ( size_t i = 0; i < x.size(); i++ )
+            std::swap( x[i], y( i ) );
+    } else if constexpr ( std::is_array<TYPE>::value ) {
+        // We are dealing with a std::array
+        typedef typename std::remove_reference<decltype( *x.begin() )>::type TYPE2;
+        Array<TYPE2> y;
+        readHDF5( fid, name, y );
+        ASSERT( y.length() == x.size() );
+        // Swap the elements in the arrays to use the move operator
+        for ( size_t i = 0; i < x.size(); i++ )
+            std::swap( x[i], y( i ) );
+    } else if constexpr ( is_Array<TYPE>::value ) {
+        // We are dealing with an Array
+        std::string typeName = Utilities::demangle( typeid( TYPE ).name() );
+        throw std::logic_error( "Unsupported type readHDF5<Array<" + typeName + ">>" );
+    } else if constexpr ( std::is_same<TYPE, std::string>::value ) {
+        // We are dealing with a std::string (should be handled through specialization)
+        throw std::logic_error( "Internal error" );
+    } else if constexpr ( std::is_same<TYPE, std::string>::value ||
+                          std::is_same<TYPE, char *>::value ||
+                          std::is_same<TYPE, const char *>::value ) {
+        // We are dealing with a string or char array
+        throw std::logic_error(
+            "Reading data into a string, char*, const char* is not supported" );
+    } else if constexpr ( has_size<TYPE>::value ) {
+        // We are dealing with a container
+        typedef typename std::remove_reference<decltype( *x.begin() )>::type TYPE2;
+        Array<TYPE2> y;
+        readHDF5( fid, name, y );
+        if ( x.size() == y.length() ) {
+            auto it = x.begin();
+            for ( size_t i = 0; i < y.length(); i++, ++it )
+                *it = y( i );
+        } else {
+            throw std::logic_error( "Reading data into an arbitrary container is not finished" );
+        }
+    } else {
+        throw std::logic_error( "Unsupported type" );
+    }
+}*/
+
+
+/************************************************************************
+ * Helper function to get the size of an Array                           *
+ * Note that HDF5 uses C ordered arrays so we need to flip the dimensions*
+ ************************************************************************/
+// Return the dimensions of x in reversed order (the last dimension of the Array comes first).
+// The result always holds at least one entry: if the Array reports no dimensions
+// a single entry with the value 0 is returned.
+template<class T>
+inline std::vector<hsize_t> arraySize( const Array<T> &x )
+{
+    const int ndim    = x.ndim();
+    const auto extent = x.size();
+    std::vector<hsize_t> dims( ndim > 1 ? ndim : 1, 0 );
+    // Walk the Array dimensions forward while filling the result from the back
+    for ( int src = 0, dst = ndim - 1; src < ndim; src++, dst-- )
+        dims[dst] = static_cast<hsize_t>( extent[src] );
+    return dims;
+}
+// Convert N HDF5 dimensions into a size vector with the order of the dimensions reversed.
+// A rank of zero is mapped to a single dimension of length 1.
+inline std::vector<size_t> convertSize( int N, const hsize_t *dims )
+{
+    if ( N == 0 )
+        return std::vector<size_t>( 1, 1 );
+    std::vector<size_t> flipped( N, 0 );
+    // Fill the result from the back so that dims[0] ends up in the last entry
+    auto out = flipped.rbegin();
+    for ( int i = 0; i < N; ++i, ++out )
+        *out = static_cast<size_t>( dims[i] );
+    return flipped;
+}
+
+
+/************************************************************************
+ * readAndConvertHDF5Data                                                *
+ ************************************************************************/
+// Helper: read the dataset into a temporary Array<SRC> with the same size as data
+// and copy (convert) the values into data
+template<class SRC, class T>
+inline void readHDF5DataAs( hid_t dataset, hid_t datatype, Array<T> &data )
+{
+    Array<SRC> buffer( data.size() );
+    H5Dread( dataset, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, buffer.data() );
+    data.copy( buffer );
+}
+// Read a dataset whose stored type differs from T (integral or floating point T only).
+// The stored type is compared against the native HDF5 types in a fixed order; the first
+// match selects the temporary element type used for the read.  An error is raised if
+// none of the known types match.
+template<class T>
+typename std::enable_if<std::is_integral<T>::value || std::is_floating_point<T>::value, void>::type
+readAndConvertHDF5Data( hid_t dataset, hid_t datatype, Array<T> &data )
+{
+    if ( H5Tequal( datatype, H5T_NATIVE_CHAR ) )
+        return readHDF5DataAs<char>( dataset, datatype, data );
+    if ( H5Tequal( datatype, H5T_NATIVE_UCHAR ) )
+        return readHDF5DataAs<unsigned char>( dataset, datatype, data );
+    if ( H5Tequal( datatype, H5T_NATIVE_INT8 ) )
+        return readHDF5DataAs<int8_t>( dataset, datatype, data );
+    if ( H5Tequal( datatype, H5T_NATIVE_UINT8 ) )
+        return readHDF5DataAs<uint8_t>( dataset, datatype, data );
+    if ( H5Tequal( datatype, H5T_NATIVE_INT ) )
+        return readHDF5DataAs<int>( dataset, datatype, data );
+    if ( H5Tequal( datatype, H5T_NATIVE_UINT ) )
+        return readHDF5DataAs<unsigned int>( dataset, datatype, data );
+    if ( H5Tequal( datatype, H5T_NATIVE_LONG ) )
+        return readHDF5DataAs<long int>( dataset, datatype, data );
+    if ( H5Tequal( datatype, H5T_NATIVE_ULONG ) )
+        return readHDF5DataAs<unsigned long int>( dataset, datatype, data );
+    if ( H5Tequal( datatype, H5T_NATIVE_FLOAT ) )
+        return readHDF5DataAs<float>( dataset, datatype, data );
+    if ( H5Tequal( datatype, H5T_NATIVE_DOUBLE ) )
+        return readHDF5DataAs<double>( dataset, datatype, data );
+    // None of the known native types matched
+    ERROR( "We need to convert unknown data format" );
+}
+// Overload selected when T is neither an integral nor a floating point type:
+// no conversion is attempted and an error is reported
+template<class T>
+typename std::enable_if<!std::is_integral<T>::value && !std::is_floating_point<T>::value,
+    void>::type
+readAndConvertHDF5Data( hid_t, hid_t, Array<T> & )
+{
+    ERROR( "Unable to convert data" );
+}
+
+
+/************************************************************************
+ * Default writeHDF5Array                                                *
+ ************************************************************************/
+// Write an Array to a new dataset called name in the file/group fid.
+//    - Arrays smaller than 0x7500 bytes are stored with the compact layout
+//    - Larger float/double arrays use the chunk returned by createChunk with the
+//      default compression of fid
+//    - Everything else uses the default dataset creation properties
+// An error is raised if the dataset cannot be created or written; all HDF5
+// handles opened here are released before the error is reported.
+template<class T>
+void writeHDF5ArrayDefault( hid_t fid, const std::string &name, const Array<T> &data )
+{
+    size_t N_bytes = data.length() * sizeof( T );
+    auto dim       = arraySize( data );
+    hid_t plist    = H5P_DEFAULT;
+    if ( N_bytes < 0x7500 ) {
+        // Use compact storage (limited to < 30K)
+        plist       = H5Pcreate( H5P_DATASET_CREATE );
+        auto status = H5Pset_layout( plist, H5D_COMPACT );
+        ASSERT( status == 0 );
+    } else if ( std::is_same<T, double>::value || std::is_same<T, float>::value ) {
+        // Use compression if available
+        plist = createChunk( dim, defaultCompression( fid ) );
+    }
+    hid_t dataspace = H5Screate_simple( dim.size(), dim.data(), nullptr );
+    hid_t datatype  = getHDF5datatype<T>();
+    hid_t dataset =
+        H5Dcreate2( fid, name.c_str(), datatype, dataspace, H5P_DEFAULT, plist, H5P_DEFAULT );
+    // Stays negative if the dataset could not be created or the write fails
+    herr_t status = -1;
+    if ( dataset >= 0 ) {
+        // Hand HDF5 a non-null dummy address when the Array has no storage (kept from the
+        // original implementation)
+        const void *ptr = data.data();
+        if ( ptr == nullptr )
+            ptr = reinterpret_cast<const void *>( static_cast<size_t>( 1 ) );
+        status = H5Dwrite( dataset, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, ptr );
+        H5Dclose( dataset );
+    }
+    H5Tclose( datatype );
+    H5Sclose( dataspace );
+    if ( plist != H5P_DEFAULT )
+        H5Pclose( plist );
+    if ( status < 0 )
+        ERROR( "Error writing HDF5 dataset: " + name );
+}
+
+
+/************************************************************************
+ * Default readHDF5Array                                                 *
+ ************************************************************************/
+// Read the dataset called name from the file/group fid into data.
+//    - If the dataset does not exist, data is resized to 0 and the function returns
+//    - data is resized to the dimensions of the dataset (order of the dimensions reversed)
+//    - If the stored type matches T the values are read directly, otherwise
+//      readAndConvertHDF5Data is used to convert them
+// An error is raised if the dataset cannot be opened, its dimensions cannot be queried
+// or the direct read fails.
+template<class T>
+void readHDF5ArrayDefault( hid_t fid, const std::string &name, Array<T> &data )
+{
+    if ( !H5Dexists( fid, name ) ) {
+        // Dataset does not exist
+        data.resize( 0 );
+        return;
+    }
+    hid_t dataset = H5Dopen2( fid, name.c_str(), H5P_DEFAULT );
+    if ( dataset < 0 )
+        ERROR( "Unable to open HDF5 dataset: " + name );
+    hid_t datatype  = H5Dget_type( dataset );
+    hid_t dataspace = H5Dget_space( dataset );
+    // Size the buffer for the largest rank HDF5 supports so that a dataset with
+    // many dimensions cannot write past the end of the array
+    hsize_t dims0[H5S_MAX_RANK] = { 0 };
+    int ndim        = H5Sget_simple_extent_dims( dataspace, dims0, nullptr );
+    hid_t datatype2 = getHDF5datatype<T>();
+    // A negative rank signals a failure and must not be passed on to convertSize
+    bool ok = ndim >= 0;
+    if ( ok ) {
+        auto dims = convertSize( ndim, dims0 );
+        data.resize( dims );
+        if ( data.empty() ) {
+            // The data is empty
+        } else if ( H5Tequal( datatype, datatype2 ) ) {
+            // The type of Array and the data in HDF5 match
+            ok = H5Dread( dataset, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, data.data() ) >= 0;
+        } else {
+            // Try to convert the data
+            readAndConvertHDF5Data( dataset, datatype, data );
+        }
+    }
+    H5Dclose( dataset );
+    H5Tclose( datatype );
+    H5Tclose( datatype2 );
+    H5Sclose( dataspace );
+    if ( !ok )
+        ERROR( "Error reading HDF5 dataset: " + name );
+}
+
+
+} // namespace HDF5
+} // namespace IO
+
+
+#endif
+#endif
--- a/IO/Mesh.cpp
+++ b/IO/Mesh.cpp
@ -163,7 +163,7 @@ size_t PointList::numberPointsVar( VariableType type ) const
 }
 std::pair<size_t, void *> PointList::pack( int level ) const
 {
-    std::pair<size_t, void *> data_out( 0, NULL );
+    std::pair<size_t, void *> data_out( 0, nullptr );
    if ( level == 0 ) {
        data_out.first     = ( 2 + 3 * points.size() ) * sizeof( double );
        double *data_ptr   = new double[2 + 3 * points.size()];
@ -626,6 +626,8 @@ std::string getString( FileFormat type )
        return "new(single)";
    else if ( type == FileFormat::SILO )
        return "silo";
+    else if ( type == FileFormat::HDF5 )
+        return "hdf5";
    else
        ERROR( "Invalid type" );
    return "";
@ -641,6 +643,8 @@ FileFormat getFileFormat( const std::string &type_in )
        return FileFormat::NEW_SINGLE;
    else if ( type == "silo" || type == "4" )
        return FileFormat::SILO;
+    else if ( type == "hdf5" || type == "5" )
+        return FileFormat::HDF5;
    else
        ERROR( "Invalid type: " + type );
    return FileFormat::SILO;
--- a/IO/Mesh.h
+++ b/IO/Mesh.h
@ -39,7 +39,7 @@ enum class VariableType {
 };
 enum class DataType { Double, Float, Int, Null };
 enum class MeshType { PointMesh, SurfaceMesh, VolumeMesh, Unknown };
-enum class FileFormat { OLD, NEW, NEW_SINGLE, SILO };
+enum class FileFormat { OLD, NEW, NEW_SINGLE, SILO, HDF5 };


 //! Convert enums to/from strings (more future-proof than static_cast<int>)
--- a/IO/MeshDatabase.cpp
+++ b/IO/MeshDatabase.cpp
@ -396,7 +396,7 @@ std::vector<MeshDatabase> read( const std::string &filename )
    PROFILE_START( "read" );
    FILE *fid = fopen( filename.c_str(), "rb" );
    if ( fid == NULL )
-        ERROR( "Error opening file" );
+        ERROR( "Error opening file: " + filename );
    char *line = new char[10000];
    while ( std::fgets( line, 1000, fid ) != NULL ) {
        if ( line[0] < 32 ) {
--- a/IO/Reader.cpp
+++ b/IO/Reader.cpp
@ -14,15 +14,12 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include "IO/Reader.h"
+#include "IO/HDF5_IO.h"
 #include "IO/IOHelpers.h"
 #include "IO/Mesh.h"
 #include "IO/MeshDatabase.h"
-#include "common/Utilities.h"
-
-#ifdef USE_SILO
 #include "IO/silo.h"
-#endif
-
+#include "common/Utilities.h"

 #include <ProfilerApp.h>
 #include <cstdio>
@ -77,14 +74,16 @@ std::vector<std::string> IO::readTimesteps( const std::string &path, const std::
        filename += "summary.LBM";
    } else if ( format == "silo" ) {
        filename += "LBM.visit";
+    } else if ( format == "hdf5" ) {
+        filename += "LBM.visit";
    } else if ( format == "auto" ) {
-        bool test_old  = fileExists( path + "/summary.LBM" );
-        bool test_silo = fileExists( path + "/LBM.visit" );
-        if ( test_old && test_silo ) {
+        bool test_old = fileExists( path + "/summary.LBM" );
+        bool test_new = fileExists( path + "/LBM.visit" );
+        if ( test_old && test_new ) {
            ERROR( "Unable to determine format (both summary.LBM and LBM.visit exist)" );
        } else if ( test_old ) {
            filename += "summary.LBM";
-        } else if ( test_silo ) {
+        } else if ( test_new ) {
            filename += "LBM.visit";
        } else {
            ERROR( "Unable to determine format (neither summary.LBM or LBM.visit exist)" );
@ -103,6 +102,9 @@ std::vector<std::string> IO::readTimesteps( const std::string &path, const std::
        std::string line( buf );
        line.resize( line.size() - 1 );
        auto pos = line.find( "summary.silo" );
+        if ( pos != std::string::npos )
+            line.resize( pos );
+        pos = line.find( "summary.xmf" );
        if ( pos != std::string::npos )
            line.resize( pos );
        if ( line.empty() )
@ -185,8 +187,8 @@ std::shared_ptr<IO::Mesh> IO::getMesh( const std::string &path, const std::strin
        if ( count % 3 != 0 )
            ERROR( "Error reading file" );
        if ( meshDatabase.type == IO::MeshType::PointMesh ) {
-            size_t N = count / 3;
-            std::shared_ptr<PointList> pointlist( new PointList( N ) );
+            size_t N              = count / 3;
+            auto pointlist        = std::make_shared<PointList>( N );
            std::vector<Point> &P = pointlist->points;
            for ( size_t i = 0; i < N; i++ ) {
                P[i].x = data[3 * i + 0];
@ -197,8 +199,8 @@ std::shared_ptr<IO::Mesh> IO::getMesh( const std::string &path, const std::strin
        } else if ( meshDatabase.type == IO::MeshType::SurfaceMesh ) {
            if ( count % 9 != 0 )
                ERROR( "Error reading file (2)" );
-            size_t N_tri = count / 9;
-            std::shared_ptr<TriList> trilist( new TriList( N_tri ) );
+            size_t N_tri          = count / 9;
+            auto trilist          = std::make_shared<TriList>( N_tri );
            std::vector<Point> &A = trilist->A;
            std::vector<Point> &B = trilist->B;
            std::vector<Point> &C = trilist->C;
@ -216,7 +218,7 @@ std::shared_ptr<IO::Mesh> IO::getMesh( const std::string &path, const std::strin
            mesh = trilist;
        } else if ( meshDatabase.type == IO::MeshType::VolumeMesh ) {
            // this was never supported in the old format
-            mesh = std::shared_ptr<DomainMesh>( new DomainMesh() );
+            mesh = std::make_shared<DomainMesh>();
        } else {
            ERROR( "Unknown mesh type" );
        }
@ -237,13 +239,13 @@ std::shared_ptr<IO::Mesh> IO::getMesh( const std::string &path, const std::strin
        fclose( fid );
        ASSERT( count == bytes );
        if ( meshDatabase.meshClass == "PointList" ) {
-            mesh.reset( new IO::PointList() );
+            mesh = std::make_shared<IO::PointList>();
        } else if ( meshDatabase.meshClass == "TriMesh" ) {
-            mesh.reset( new IO::TriMesh() );
+            mesh = std::make_shared<IO::TriMesh>();
        } else if ( meshDatabase.meshClass == "TriList" ) {
-            mesh.reset( new IO::TriList() );
+            mesh = std::make_shared<IO::TriList>();
        } else if ( meshDatabase.meshClass == "DomainMesh" ) {
-            mesh.reset( new IO::DomainMesh() );
+            mesh = std::make_shared<IO::DomainMesh>();
        } else {
            ERROR( "Unknown mesh class" );
        }
@ -258,7 +260,7 @@ std::shared_ptr<IO::Mesh> IO::getMesh( const std::string &path, const std::strin
        if ( meshDatabase.meshClass == "PointList" ) {
            Array<double> coords = silo::readPointMesh<double>( fid, database.name );
            ASSERT( coords.size( 1 ) == 3 );
-            std::shared_ptr<IO::PointList> mesh2( new IO::PointList( coords.size( 0 ) ) );
+            auto mesh2 = std::make_shared<IO::PointList>( coords.size( 0 ) );
            for ( size_t i = 0; i < coords.size( 1 ); i++ ) {
                mesh2->points[i].x = coords( i, 0 );
                mesh2->points[i].y = coords( i, 1 );
@ -272,7 +274,7 @@ std::shared_ptr<IO::Mesh> IO::getMesh( const std::string &path, const std::strin
            ASSERT( tri.size( 1 ) == 3 && coords.size( 1 ) == 3 );
            int N_tri   = tri.size( 0 );
            int N_point = coords.size( 0 );
-            std::shared_ptr<IO::TriMesh> mesh2( new IO::TriMesh( N_tri, N_point ) );
+            auto mesh2  = std::make_shared<IO::TriMesh>( N_tri, N_point );
            for ( int i = 0; i < N_point; i++ ) {
                mesh2->vertices->points[i].x = coords( i, 0 );
                mesh2->vertices->points[i].y = coords( i, 1 );
@ -295,14 +297,81 @@ std::shared_ptr<IO::Mesh> IO::getMesh( const std::string &path, const std::strin
            silo::readUniformMesh( fid, database.name, range, N );
            auto rankinfo = silo::read<int>( fid, database.name + "_rankinfo" );
            RankInfoStruct rank_data( rankinfo[0], rankinfo[1], rankinfo[2], rankinfo[3] );
-            mesh.reset( new IO::DomainMesh( rank_data, N[0], N[1], N[2], range[1] - range[0],
-                range[3] - range[2], range[5] - range[4] ) );
+            mesh = std::make_shared<IO::DomainMesh>( rank_data, N[0], N[1], N[2],
+                range[1] - range[0], range[3] - range[2], range[5] - range[4] );
        } else {
            ERROR( "Unknown mesh class" );
        }
        silo::close( fid );
 #else
        ERROR( "Build without silo support" );
+#endif
+    } else if ( meshDatabase.format == FileFormat::HDF5 ) {
+        // Reading an hdf5 file
+        // The mesh for this domain is stored in the group database.name of the file
+        // <path>/<timestep>/<database.file>
+#ifdef USE_HDF5
+        auto &database = meshDatabase.domains[domain];
+        auto filename  = path + "/" + timestep + "/" + database.file;
+        auto fid       = IO::HDF5::openHDF5( filename, "r" );
+        auto gid       = IO::HDF5::openGroup( fid, database.name );
+        if ( meshDatabase.meshClass == "PointList" ) {
+            // Read the coordinates (one dataset per direction)
+            std::vector<double> x, y, z;
+            IO::HDF5::readHDF5( gid, "x", x );
+            IO::HDF5::readHDF5( gid, "y", y );
+            IO::HDF5::readHDF5( gid, "z", z );
+            ASSERT( y.size() == x.size() && z.size() == x.size() );
+            auto mesh2 = std::make_shared<IO::PointList>( x.size() );
+            for ( size_t i = 0; i < x.size(); i++ ) {
+                mesh2->points[i].x = x[i];
+                mesh2->points[i].y = y[i];
+                mesh2->points[i].z = z[i];
+            }
+            mesh = mesh2;
+        } else if ( meshDatabase.meshClass == "TriMesh" || meshDatabase.meshClass == "TriList" ) {
+            // Read the points
+            std::vector<double> x, y, z;
+            IO::HDF5::readHDF5( gid, "x", x );
+            IO::HDF5::readHDF5( gid, "y", y );
+            IO::HDF5::readHDF5( gid, "z", z );
+            // Same consistency check as the PointList branch: y and z are indexed with
+            // the length of x below
+            ASSERT( y.size() == x.size() && z.size() == x.size() );
+            // Read the triangles
+            Array<int> tri;
+            IO::HDF5::readHDF5( gid, "tri", tri );
+            ASSERT( tri.size( 0 ) == 3 );
+            size_t N_tri   = tri.size( 1 );
+            size_t N_point = x.size();
+            auto mesh2     = std::make_shared<IO::TriMesh>( N_tri, N_point );
+            for ( size_t i = 0; i < N_point; i++ ) {
+                mesh2->vertices->points[i].x = x[i];
+                mesh2->vertices->points[i].y = y[i];
+                mesh2->vertices->points[i].z = z[i];
+            }
+            for ( size_t i = 0; i < N_tri; i++ ) {
+                mesh2->A[i] = tri( 0, i );
+                mesh2->B[i] = tri( 1, i );
+                mesh2->C[i] = tri( 2, i );
+            }
+            // Both classes are read as a TriMesh; a TriList is converted afterwards
+            if ( meshDatabase.meshClass == "TriMesh" ) {
+                mesh = mesh2;
+            } else if ( meshDatabase.meshClass == "TriList" ) {
+                auto trilist = IO::getTriList( std::dynamic_pointer_cast<IO::Mesh>( mesh2 ) );
+                mesh         = trilist;
+            }
+        } else if ( meshDatabase.meshClass == "DomainMesh" ) {
+            std::vector<double> range;
+            std::vector<int> N;
+            std::vector<int> rankinfo;
+            IO::HDF5::readHDF5( gid, "range", range );
+            IO::HDF5::readHDF5( gid, "N", N );
+            IO::HDF5::readHDF5( gid, "rankinfo", rankinfo );
+            // Guard the fixed-index accesses below against short datasets
+            ASSERT( range.size() >= 6 );
+            ASSERT( N.size() >= 3 );
+            ASSERT( rankinfo.size() >= 4 );
+            RankInfoStruct rank_data( rankinfo[0], rankinfo[1], rankinfo[2], rankinfo[3] );
+            mesh = std::make_shared<IO::DomainMesh>( rank_data, N[0], N[1], N[2],
+                range[1] - range[0], range[3] - range[2], range[5] - range[4] );
+        } else {
+            ERROR( "Unknown mesh class" );
+        }
+        IO::HDF5::closeGroup( gid );
+        IO::HDF5::closeHDF5( fid );
+#else
+        ERROR( "Build without hdf5 support" );
 #endif
    } else {
        ERROR( "Unknown format" );
@ -337,7 +406,7 @@ std::shared_ptr<IO::Variable> IO::getVariable( const std::string &path, const st
        size_t N              = atol( values[2].c_str() );
        size_t bytes          = atol( values[3].c_str() );
        std::string precision = values[4];
-        var                   = std::shared_ptr<IO::Variable>( new IO::Variable() );
+        var                   = std::make_shared<IO::Variable>();
        var->dim              = dim;
        var->type             = getVariableType( type );
        var->name             = variable;
@ -356,7 +425,7 @@ std::shared_ptr<IO::Variable> IO::getVariable( const std::string &path, const st
        auto variableDatabase = meshDatabase.getVariableDatabase( variable );
        std::string filename  = path + "/" + timestep + "/" + database.file;
        auto fid              = silo::open( filename, silo::READ );
-        var.reset( new Variable( variableDatabase.dim, variableDatabase.type, variable ) );
+        var = std::make_shared<Variable>( variableDatabase.dim, variableDatabase.type, variable );
        if ( meshDatabase.meshClass == "PointList" ) {
            var->data = silo::readPointMeshVariable<double>( fid, variable );
        } else if ( meshDatabase.meshClass == "TriMesh" || meshDatabase.meshClass == "TriList" ) {
@ -370,7 +439,30 @@ std::shared_ptr<IO::Variable> IO::getVariable( const std::string &path, const st
 #else
        ERROR( "Build without silo support" );
 #endif
-
+    } else if ( meshDatabase.format == FileFormat::HDF5 ) {
+        // Reading an hdf5 file
+        // The variable is stored as a dataset named after the variable inside the group
+        // database.name of the file <path>/<timestep>/<database.file>
+#ifdef USE_HDF5
+        auto &database   = meshDatabase.domains[domain];
+        auto varDatabase = meshDatabase.getVariableDatabase( variable );
+        auto filename    = path + "/" + timestep + "/" + database.file;
+        var      = std::make_shared<Variable>( varDatabase.dim, varDatabase.type, variable );
+        auto fid = IO::HDF5::openHDF5( filename, "r" );
+        auto gid = IO::HDF5::openGroup( fid, database.name );
+        IO::HDF5::readHDF5( gid, var->name, var->data );
+        // Close the group before the file, as IO::getMesh does; otherwise the group
+        // handle is never released
+        IO::HDF5::closeGroup( gid );
+        IO::HDF5::closeHDF5( fid );
+        // Data read with a leading dimension of size 3 is permuted so that this
+        // dimension becomes the last one
+        if ( meshDatabase.meshClass == "PointList" || meshDatabase.meshClass == "TriMesh" ||
+             meshDatabase.meshClass == "TriList" ) {
+            if ( var->data.ndim() == 2 && var->data.size( 0 ) == 3 )
+                var->data = var->data.permute( { 1, 0 } );
+        } else if ( meshDatabase.meshClass == "DomainMesh" ) {
+            if ( var->data.ndim() == 4 && var->data.size( 0 ) == 3 )
+                var->data = var->data.permute( { 1, 2, 3, 0 } );
+        } else {
+            ERROR( "Unknown mesh class" );
+        }
+#else
+        ERROR( "Build without hdf5 support" );
+#endif
    } else {
        ERROR( "Unknown format" );
    }
--- a/IO/SiloWriter.cpp
+++ b/IO/SiloWriter.cpp
@ -0,0 +1,251 @@
+#include "IO/HDF5_IO.h"
+#include "IO/IOHelpers.h"
+#include "IO/MeshDatabase.h"
+#include "IO/Writer.h"
+#include "IO/silo.h"
+#include "common/MPI.h"
+#include "common/Utilities.h"
+
+#include <algorithm>
+#include <memory>
+#include <set>
+#include <sys/stat.h>
+#include <vector>
+
+
+#ifdef USE_SILO
+
+
+// Write the coordinates of a PointList to a silo file as a point mesh
+//   TYPE     - type used for the coordinates passed to silo (double or float in this file)
+//   fid      - silo file to write to
+//   mesh     - point list providing the coordinates
+//   meshname - name under which the mesh is written
+template<class TYPE>
+static void writeSiloPointMesh(
+    DBfile *fid, const IO::PointList &mesh, const std::string &meshname )
+{
+    const auto &pts = mesh.getPoints();
+    const auto N    = pts.size();
+    // Split the points into one array per coordinate direction
+    std::vector<TYPE> xs( N ), ys( N ), zs( N );
+    for ( size_t k = 0; k < N; k++ ) {
+        const auto &p = pts[k];
+        xs[k]         = p.x;
+        ys[k]         = p.y;
+        zs[k]         = p.z;
+    }
+    const TYPE *coords[] = { xs.data(), ys.data(), zs.data() };
+    IO::silo::writePointMesh<TYPE>( fid, meshname, 3, N, coords );
+}
+// Write a PointList mesh and its variables to a silo file
+//   fid      - silo file to write to
+//   meshData - mesh, requested mesh precision, and the variables to write
+//   database - mesh database entry; the name of its first domain is used as the mesh name
+// A mesh or variable precision that is not handled below is reported through ERROR.
+static void writeSiloPointList(
+    DBfile *fid, const IO::MeshDataStruct &meshData, IO::MeshDatabase database )
+{
+    const IO::PointList &mesh  = dynamic_cast<IO::PointList &>( *meshData.mesh );
+    const std::string meshname = database.domains[0].name;
+    // Write the mesh exactly once, in the requested precision.  Writing it again in
+    // double precision under the same name would repeat the work and discard the
+    // precision selected here (writeSiloTriMesh2 also writes its mesh only once).
+    if ( meshData.precision == IO::DataType::Double ) {
+        writeSiloPointMesh<double>( fid, mesh, meshname );
+    } else if ( meshData.precision == IO::DataType::Float ) {
+        writeSiloPointMesh<float>( fid, mesh, meshname );
+    } else {
+        ERROR( "Unsupported format" );
+    }
+    // Write the variables, converting the data to the precision requested per variable
+    for ( size_t i = 0; i < meshData.vars.size(); i++ ) {
+        const IO::Variable &var = *meshData.vars[i];
+        if ( var.precision == IO::DataType::Double ) {
+            IO::silo::writePointMeshVariable( fid, meshname, var.name, var.data );
+        } else if ( var.precision == IO::DataType::Float ) {
+            Array<float> data2( var.data.size() );
+            data2.copy( var.data );
+            IO::silo::writePointMeshVariable( fid, meshname, var.name, data2 );
+        } else if ( var.precision == IO::DataType::Int ) {
+            Array<int> data2( var.data.size() );
+            data2.copy( var.data );
+            IO::silo::writePointMeshVariable( fid, meshname, var.name, data2 );
+        } else {
+            ERROR( "Unsupported format" );
+        }
+    }
+}
+// Write the vertices and triangles of a TriMesh to a silo file
+//   TYPE     - type used for the coordinates passed to silo (double or float in this file)
+//   fid      - silo file to write to
+//   mesh     - triangle mesh to write
+//   meshname - name under which the mesh is written
+template<class TYPE>
+static void writeSiloTriMesh( DBfile *fid, const IO::TriMesh &mesh, const std::string &meshname )
+{
+    const auto &verts = mesh.vertices->getPoints();
+    const auto N_vert = verts.size();
+    // Split the vertices into one array per coordinate direction
+    std::vector<TYPE> xs( N_vert ), ys( N_vert ), zs( N_vert );
+    for ( size_t k = 0; k < N_vert; k++ ) {
+        const auto &p = verts[k];
+        xs[k]         = p.x;
+        ys[k]         = p.y;
+        zs[k]         = p.z;
+    }
+    const TYPE *coords[] = { xs.data(), ys.data(), zs.data() };
+    // The A/B/C arrays of the mesh hold the triangle data
+    const int *tri[] = { mesh.A.data(), mesh.B.data(), mesh.C.data() };
+    IO::silo::writeTriMesh<TYPE>( fid, meshname, 3, 2, N_vert, coords, mesh.A.size(), tri );
+}
+// Write a triangle mesh and the variables of meshData to a silo file
+//   fid      - silo file to write to
+//   meshData - requested mesh precision and the variables to write
+//   mesh     - triangle mesh to write
+//   database - mesh database entry; the name of its first domain is used as the mesh name
+static void writeSiloTriMesh2( DBfile *fid, const IO::MeshDataStruct &meshData,
+    const IO::TriMesh &mesh, IO::MeshDatabase database )
+{
+    const std::string name = database.domains[0].name;
+    // Write the mesh in the requested precision
+    if ( meshData.precision == IO::DataType::Double ) {
+        writeSiloTriMesh<double>( fid, mesh, name );
+    } else if ( meshData.precision == IO::DataType::Float ) {
+        writeSiloTriMesh<float>( fid, mesh, name );
+    } else {
+        ERROR( "Unsupported format" );
+    }
+    // Write the variables, converting the data to the precision requested per variable
+    for ( const auto &entry : meshData.vars ) {
+        const IO::Variable &var = *entry;
+        if ( var.precision == IO::DataType::Double ) {
+            IO::silo::writeTriMeshVariable( fid, 3, name, var.name, var.data, var.type );
+        } else if ( var.precision == IO::DataType::Float ) {
+            Array<float> converted( var.data.size() );
+            converted.copy( var.data );
+            IO::silo::writeTriMeshVariable( fid, 3, name, var.name, converted, var.type );
+        } else if ( var.precision == IO::DataType::Int ) {
+            Array<int> converted( var.data.size() );
+            converted.copy( var.data );
+            IO::silo::writeTriMeshVariable( fid, 3, name, var.name, converted, var.type );
+        } else {
+            ERROR( "Unsupported format" );
+        }
+    }
+}
+// Write a TriMesh (and variables) to a silo file; meshData.mesh must be an IO::TriMesh
+static void writeSiloTriMesh(
+    DBfile *fid, const IO::MeshDataStruct &meshData, IO::MeshDatabase database )
+{
+    // The reference dynamic_cast fails with an exception for any other mesh type
+    writeSiloTriMesh2( fid, meshData, dynamic_cast<IO::TriMesh &>( *meshData.mesh ), database );
+}
+// Write a TriList (and variables) to a silo file
+// The mesh is passed through getTriMesh and then written like a TriMesh
+static void writeSiloTriList(
+    DBfile *fid, const IO::MeshDataStruct &meshData, IO::MeshDatabase database )
+{
+    auto triMesh = getTriMesh( meshData.mesh );
+    writeSiloTriMesh2( fid, meshData, *triMesh, database );
+}
+// Write a DomainMesh (and variables) to a silo file
+//   fid      - silo file to write to
+//   meshData - mesh (must be an IO::DomainMesh) and the variables to write
+//   database - mesh database entry; the name of its first domain is used as the mesh name
+static void writeSiloDomainMesh(
+    DBfile *fid, const IO::MeshDataStruct &meshData, IO::MeshDatabase database )
+{
+    const auto &mesh = dynamic_cast<IO::DomainMesh &>( *meshData.mesh );
+    const RankInfoStruct info( mesh.rank, mesh.nprocx, mesh.nprocy, mesh.nprocz );
+    // Lower/upper bound per direction, computed from the rank info indices
+    std::array<double, 6> range = {
+        info.ix * mesh.Lx / info.nx, ( info.ix + 1 ) * mesh.Lx / info.nx,
+        info.jy * mesh.Ly / info.ny, ( info.jy + 1 ) * mesh.Ly / info.ny,
+        info.kz * mesh.Lz / info.nz, ( info.kz + 1 ) * mesh.Lz / info.nz
+    };
+    std::array<int, 3> N = { mesh.nx, mesh.ny, mesh.nz };
+    const auto name      = database.domains[0].name;
+    IO::silo::writeUniformMesh<3>( fid, name, range, N );
+    // Store the rank layout next to the mesh as "<meshname>_rankinfo"
+    IO::silo::write<int>(
+        fid, name + "_rankinfo", { mesh.rank, mesh.nprocx, mesh.nprocy, mesh.nprocz } );
+    // Write the variables, converting the data to the precision requested per variable
+    for ( const auto &entry : meshData.vars ) {
+        const auto &var = *entry;
+        if ( var.precision == IO::DataType::Double ) {
+            IO::silo::writeUniformMeshVariable<3>( fid, name, N, var.name, var.data, var.type );
+        } else if ( var.precision == IO::DataType::Float ) {
+            Array<float> converted( var.data.size() );
+            converted.copy( var.data );
+            IO::silo::writeUniformMeshVariable<3>( fid, name, N, var.name, converted, var.type );
+        } else if ( var.precision == IO::DataType::Int ) {
+            Array<int> converted( var.data.size() );
+            converted.copy( var.data );
+            IO::silo::writeUniformMeshVariable<3>( fid, name, N, var.name, converted, var.type );
+        } else {
+            ERROR( "Unsupported format" );
+        }
+    }
+}
+// Write a single mesh (and its variables) to an open silo file
+// Returns the MeshDatabase entry created for the mesh by getDatabase
+static IO::MeshDatabase write_domain_silo( DBfile *fid, const std::string &filename,
+    const IO::MeshDataStruct &mesh, IO::FileFormat format, int rank )
+{
+    auto db = getDatabase( filename, mesh, format, rank );
+    // Dispatch on the mesh class recorded in the database entry
+    const auto &meshClass = db.meshClass;
+    if ( meshClass == "PointList" ) {
+        writeSiloPointList( fid, mesh, db );
+    } else if ( meshClass == "TriMesh" ) {
+        writeSiloTriMesh( fid, mesh, db );
+    } else if ( meshClass == "TriList" ) {
+        writeSiloTriList( fid, mesh, db );
+    } else if ( meshClass == "DomainMesh" ) {
+        writeSiloDomainMesh( fid, mesh, db );
+    } else {
+        ERROR( "Unknown mesh class" );
+    }
+    return db;
+}
+// Map a mesh class name to the silo (mesh type, variable type) pair used in the summary file
+// An unknown mesh class is reported through ERROR
+std::pair<int, int> getSiloMeshType( const std::string &meshClass )
+{
+    int meshType = 0;
+    int varType  = 0;
+    if ( meshClass == "PointList" ) {
+        meshType = DB_POINTMESH;
+        varType  = DB_POINTVAR;
+    } else if ( meshClass == "TriMesh" || meshClass == "TriList" ) {
+        // Both triangle classes use the same silo types
+        meshType = DB_UCDMESH;
+        varType  = DB_UCDVAR;
+    } else if ( meshClass == "DomainMesh" ) {
+        meshType = DB_QUAD_RECT;
+        varType  = DB_QUADVAR;
+    } else {
+        ERROR( "Unknown mesh class" );
+    }
+    return std::make_pair( meshType, varType );
+}
+// Write the silo summary file
+//   meshes_written - database entries of all meshes to reference
+//   filename       - name of the summary file to create
+// Every domain is referenced as "<file>:<name>", for the meshes and for each variable
+void writeSiloSummary(
+    const std::vector<IO::MeshDatabase> &meshes_written, const std::string &filename )
+{
+    auto fid = IO::silo::open( filename, IO::silo::CREATE );
+    for ( const auto &db : meshes_written ) {
+        const auto types   = getSiloMeshType( db.meshClass );
+        const size_t N_dom = db.domains.size();
+        std::vector<int> meshTypes( N_dom, types.first );
+        std::vector<int> varTypes( N_dom, types.second );
+        // Multi-mesh entry for the mesh
+        std::vector<std::string> meshNames;
+        for ( const auto &dom : db.domains )
+            meshNames.push_back( dom.file + ":" + dom.name );
+        IO::silo::writeMultiMesh( fid, db.name, meshNames, meshTypes );
+        // One multi-var entry per variable
+        for ( const auto &var : db.variables ) {
+            std::vector<std::string> varNames;
+            for ( const auto &dom : db.domains )
+                varNames.push_back( dom.file + ":" + var.name );
+            IO::silo::writeMultiVar( fid, var.name, varNames, varTypes );
+        }
+    }
+    IO::silo::close( fid );
+}
+// Write the mesh data to silo
+//   meshData - meshes (and variables) to write
+//   path     - directory in which the file is created
+//   format   - file format recorded in the returned database entries
+//   rank     - rank of the calling process; the file is named "<rank, 5 digits>.silo"
+// Returns one MeshDatabase entry per mesh written
+std::vector<IO::MeshDatabase> writeMeshesSilo( const std::vector<IO::MeshDataStruct> &meshData,
+    const std::string &path, IO::FileFormat format, int rank )
+{
+    // Only the rank part needs a character buffer; snprintf bounds the write
+    char filename[32];
+    snprintf( filename, sizeof( filename ), "%05i.silo", rank );
+    // Build the full path as a std::string so that a long path cannot overflow a buffer
+    const std::string fullpath = path + "/" + filename;
+    std::vector<IO::MeshDatabase> meshes_written;
+    meshes_written.reserve( meshData.size() );
+    auto fid = IO::silo::open( fullpath, IO::silo::CREATE );
+    for ( const auto &data : meshData )
+        meshes_written.push_back( write_domain_silo( fid, filename, data, format, rank ) );
+    IO::silo::close( fid );
+    return meshes_written;
+}
+
+
+#else
+
+
+// Stubs used when the application is built without silo support
+// Write the mesh data to silo (no-op: nothing is written and no meshes are reported)
+std::vector<IO::MeshDatabase> writeMeshesSilo(
+    const std::vector<IO::MeshDataStruct> &, const std::string &, IO::FileFormat, int )
+{
+    return std::vector<IO::MeshDatabase>();
+}
+// Write the silo summary file (no-op)
+// This must be a definition, not just a declaration: IO/Writer.cpp references the
+// function regardless of USE_SILO, so a declaration alone leaves the symbol undefined
+void writeSiloSummary( const std::vector<IO::MeshDatabase> &, const std::string & ) {}
+
+
+#endif
--- a/IO/Writer.cpp
+++ b/IO/Writer.cpp
@ -14,12 +14,15 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include "IO/Writer.h"
+#include "IO/HDF5_IO.h"
 #include "IO/IOHelpers.h"
 #include "IO/MeshDatabase.h"
-#include "IO/silo.h"
+#include "IO/Xdmf.h"
 #include "common/MPI.h"
 #include "common/Utilities.h"

+#include "ProfilerApp.h"
+
 #include <algorithm>
 #include <memory>
 #include <set>
@ -27,7 +30,17 @@
 #include <vector>


-enum class Format { OLD, NEW, SILO, UNKNOWN };
+// Output formats; IO::initialize selects one from the format string passed to it
+enum class Format { OLD, NEW, SILO, HDF5, UNKNOWN };
+
+
+/****************************************************
+ * External declarations                             *
+ ****************************************************/
+std::vector<IO::MeshDatabase> writeMeshesSilo(
+    const std::vector<IO::MeshDataStruct> &, const std::string &, IO::FileFormat, int );
+void writeSiloSummary( const std::vector<IO::MeshDatabase> &, const std::string & );
+std::vector<IO::MeshDatabase> writeMeshesHDF5(
+    const std::vector<IO::MeshDataStruct> &, const std::string &, IO::FileFormat, int, Xdmf & );


 /****************************************************
@ -90,6 +103,8 @@ void IO::initialize( const std::string &path, const std::string &format, bool ap
        global_IO_format = Format::NEW;
    else if ( format == "silo" )
        global_IO_format = Format::SILO;
+    else if ( format == "hdf5" )
+        global_IO_format = Format::HDF5;
    else
        ERROR( "Unknown format" );
    int rank = Utilities::MPI( MPI_COMM_WORLD ).getRank();
@ -98,7 +113,7 @@ void IO::initialize( const std::string &path, const std::string &format, bool ap
        std::string filename;
        if ( global_IO_format == Format::OLD || global_IO_format == Format::NEW )
            filename = global_IO_path + "/summary.LBM";
-        else if ( global_IO_format == Format::SILO )
+        else if ( global_IO_format == Format::SILO || global_IO_format == Format::HDF5 )
            filename = global_IO_path + "/LBM.visit";
        else
            ERROR( "Unknown format" );
@ -131,10 +146,10 @@ static std::vector<IO::MeshDatabase> writeMeshesOrigFormat(
        domain.file   = filename;
        domain.offset = 0;
        mesh_entry.domains.push_back( domain );
-        if ( !meshData[i].vars.empty() ) {
+        static bool printVariableSupportMsg = true;
+        if ( !meshData[i].vars.empty() && printVariableSupportMsg ) {
+            printVariableSupportMsg = false;
            printf( "Warning: variables are not supported with this format (original)\n" );
-            // for (size_t j=0; j<meshData[i].vars.size(); j++)
-            //    mesh_entry.variables.push_back( meshData[i].vars[j]->name );
        }
        const std::string meshClass = mesh->className();
        if ( meshClass == "PointList" ) {
@ -185,7 +200,7 @@ static std::vector<IO::MeshDatabase> writeMeshesOrigFormat(


 // Create the database entry for the mesh data
-static IO::MeshDatabase getDatabase(
+IO::MeshDatabase IO::getDatabase(
    const std::string &filename, const IO::MeshDataStruct &mesh, IO::FileFormat format, int rank )
 {
    char domainname[100];
@ -227,6 +242,7 @@ static IO::MeshDatabase getDatabase(
 static IO::MeshDatabase write_domain( FILE *fid, const std::string &filename,
    const IO::MeshDataStruct &mesh, IO::FileFormat format, int rank )
 {
+    ASSERT( !mesh.meshName.empty() );
    const int level = 0;
    // Create the MeshDatabase
    IO::MeshDatabase database = getDatabase( filename, mesh, format, rank );
@ -249,6 +265,8 @@ static IO::MeshDatabase write_domain( FILE *fid, const std::string &filename,
        size_t N        = mesh.vars[i]->data.length();
        size_t N_mesh   = mesh.mesh->numberPointsVar( mesh.vars[i]->type );
        ASSERT( N == dim * N_mesh );
+        ASSERT( !type.empty() );
+        ASSERT( !variable.name.empty() );
        fprintf( fid, "Var: %s-%05i-%s: %i, %s, %lu, %lu, double\n", database.name.c_str(), rank,
            variable.name.c_str(), dim, type.data(), N_mesh, N * sizeof( double ) );
        fwrite( mesh.vars[i]->data.data(), sizeof( double ), N, fid );
@ -258,212 +276,6 @@ static IO::MeshDatabase write_domain( FILE *fid, const std::string &filename,
 }


-#ifdef USE_SILO
-// Write a PointList mesh (and variables) to a file
-template<class TYPE>
-static void writeSiloPointMesh(
-    DBfile *fid, const IO::PointList &mesh, const std::string &meshname )
-{
-    const auto &points = mesh.getPoints();
-    std::vector<TYPE> x( points.size() ), y( points.size() ), z( points.size() );
-    for ( size_t i = 0; i < x.size(); i++ ) {
-        x[i] = points[i].x;
-        y[i] = points[i].y;
-        z[i] = points[i].z;
-    }
-    const TYPE *coords[] = { x.data(), y.data(), z.data() };
-    IO::silo::writePointMesh<TYPE>( fid, meshname, 3, points.size(), coords );
-}
-static void writeSiloPointList(
-    DBfile *fid, const IO::MeshDataStruct &meshData, IO::MeshDatabase database )
-{
-    const IO::PointList &mesh  = dynamic_cast<IO::PointList &>( *meshData.mesh );
-    const std::string meshname = database.domains[0].name;
-    if ( meshData.precision == IO::DataType::Double ) {
-        writeSiloPointMesh<double>( fid, mesh, meshname );
-    } else if ( meshData.precision == IO::DataType::Float ) {
-        writeSiloPointMesh<float>( fid, mesh, meshname );
-    } else {
-        ERROR( "Unsupported format" );
-    }
-    const auto &points = mesh.getPoints();
-    std::vector<double> x( points.size() ), y( points.size() ), z( points.size() );
-    for ( size_t i = 0; i < x.size(); i++ ) {
-        x[i] = points[i].x;
-        y[i] = points[i].y;
-        z[i] = points[i].z;
-    }
-    const double *coords[] = { x.data(), y.data(), z.data() };
-    IO::silo::writePointMesh( fid, meshname, 3, points.size(), coords );
-    for ( size_t i = 0; i < meshData.vars.size(); i++ ) {
-        const IO::Variable &var = *meshData.vars[i];
-        if ( var.precision == IO::DataType::Double ) {
-            IO::silo::writePointMeshVariable( fid, meshname, var.name, var.data );
-        } else if ( var.precision == IO::DataType::Float ) {
-            Array<float> data2( var.data.size() );
-            data2.copy( var.data );
-            IO::silo::writePointMeshVariable( fid, meshname, var.name, data2 );
-        } else if ( var.precision == IO::DataType::Int ) {
-            Array<int> data2( var.data.size() );
-            data2.copy( var.data );
-            IO::silo::writePointMeshVariable( fid, meshname, var.name, data2 );
-        } else {
-            ERROR( "Unsupported format" );
-        }
-    }
-}
-// Write a TriMesh mesh (and variables) to a file
-template<class TYPE>
-static void writeSiloTriMesh( DBfile *fid, const IO::TriMesh &mesh, const std::string &meshname )
-{
-    const auto &points = mesh.vertices->getPoints();
-    std::vector<TYPE> x( points.size() ), y( points.size() ), z( points.size() );
-    for ( size_t i = 0; i < x.size(); i++ ) {
-        x[i] = points[i].x;
-        y[i] = points[i].y;
-        z[i] = points[i].z;
-    }
-    const TYPE *coords[] = { x.data(), y.data(), z.data() };
-    const int *tri[]     = { mesh.A.data(), mesh.B.data(), mesh.C.data() };
-    IO::silo::writeTriMesh<TYPE>( fid, meshname, 3, 2, points.size(), coords, mesh.A.size(), tri );
-}
-static void writeSiloTriMesh2( DBfile *fid, const IO::MeshDataStruct &meshData,
-    const IO::TriMesh &mesh, IO::MeshDatabase database )
-{
-    const std::string meshname = database.domains[0].name;
-    if ( meshData.precision == IO::DataType::Double ) {
-        writeSiloTriMesh<double>( fid, mesh, meshname );
-    } else if ( meshData.precision == IO::DataType::Float ) {
-        writeSiloTriMesh<float>( fid, mesh, meshname );
-    } else {
-        ERROR( "Unsupported format" );
-    }
-    for ( size_t i = 0; i < meshData.vars.size(); i++ ) {
-        const IO::Variable &var = *meshData.vars[i];
-        if ( var.precision == IO::DataType::Double ) {
-            IO::silo::writeTriMeshVariable( fid, 3, meshname, var.name, var.data, var.type );
-        } else if ( var.precision == IO::DataType::Float ) {
-            Array<float> data2( var.data.size() );
-            data2.copy( var.data );
-            IO::silo::writeTriMeshVariable( fid, 3, meshname, var.name, data2, var.type );
-        } else if ( var.precision == IO::DataType::Int ) {
-            Array<int> data2( var.data.size() );
-            data2.copy( var.data );
-            IO::silo::writeTriMeshVariable( fid, 3, meshname, var.name, data2, var.type );
-        } else {
-            ERROR( "Unsupported format" );
-        }
-    }
-}
-static void writeSiloTriMesh(
-    DBfile *fid, const IO::MeshDataStruct &meshData, IO::MeshDatabase database )
-{
-    const IO::TriMesh &mesh = dynamic_cast<IO::TriMesh &>( *meshData.mesh );
-    writeSiloTriMesh2( fid, meshData, mesh, database );
-}
-static void writeSiloTriList(
-    DBfile *fid, const IO::MeshDataStruct &meshData, IO::MeshDatabase database )
-{
-    auto mesh = getTriMesh( meshData.mesh );
-    writeSiloTriMesh2( fid, meshData, *mesh, database );
-}
-// Write a DomainMesh mesh (and variables) to a file
-static void writeSiloDomainMesh(
-    DBfile *fid, const IO::MeshDataStruct &meshData, IO::MeshDatabase database )
-{
-    const IO::DomainMesh &mesh = dynamic_cast<IO::DomainMesh &>( *meshData.mesh );
-    RankInfoStruct info( mesh.rank, mesh.nprocx, mesh.nprocy, mesh.nprocz );
-    std::array<double, 6> range = { info.ix * mesh.Lx / info.nx,
-        ( info.ix + 1 ) * mesh.Lx / info.nx, info.jy * mesh.Ly / info.ny,
-        ( info.jy + 1 ) * mesh.Ly / info.ny, info.kz * mesh.Lz / info.nz,
-        ( info.kz + 1 ) * mesh.Lz / info.nz };
-    std::array<int, 3> N        = { mesh.nx, mesh.ny, mesh.nz };
-    auto meshname               = database.domains[0].name;
-    IO::silo::writeUniformMesh<3>( fid, meshname, range, N );
-    IO::silo::write<int>(
-        fid, meshname + "_rankinfo", { mesh.rank, mesh.nprocx, mesh.nprocy, mesh.nprocz } );
-    for ( size_t i = 0; i < meshData.vars.size(); i++ ) {
-        const auto &var = *meshData.vars[i];
-        if ( var.precision == IO::DataType::Double ) {
-            IO::silo::writeUniformMeshVariable<3>( fid, meshname, N, var.name, var.data, var.type );
-        } else if ( var.precision == IO::DataType::Float ) {
-            Array<float> data2( var.data.size() );
-            data2.copy( var.data );
-            IO::silo::writeUniformMeshVariable<3>( fid, meshname, N, var.name, data2, var.type );
-        } else if ( var.precision == IO::DataType::Int ) {
-            Array<int> data2( var.data.size() );
-            data2.copy( var.data );
-            IO::silo::writeUniformMeshVariable<3>( fid, meshname, N, var.name, data2, var.type );
-        } else {
-            ERROR( "Unsupported format" );
-        }
-    }
-}
-// Write a mesh (and variables) to a file
-static IO::MeshDatabase write_domain_silo( DBfile *fid, const std::string &filename,
-    const IO::MeshDataStruct &mesh, IO::FileFormat format, int rank )
-{
-    // Create the MeshDatabase
-    auto database = getDatabase( filename, mesh, format, rank );
-    if ( database.meshClass == "PointList" ) {
-        writeSiloPointList( fid, mesh, database );
-    } else if ( database.meshClass == "TriMesh" ) {
-        writeSiloTriMesh( fid, mesh, database );
-    } else if ( database.meshClass == "TriList" ) {
-        writeSiloTriList( fid, mesh, database );
-    } else if ( database.meshClass == "DomainMesh" ) {
-        writeSiloDomainMesh( fid, mesh, database );
-    } else {
-        ERROR( "Unknown mesh class" );
-    }
-    return database;
-}
-// Write the summary file for silo
-std::pair<int, int> getSiloMeshType( const std::string &meshClass )
-{
-    int meshType = 0;
-    int varType  = 0;
-    if ( meshClass == "PointList" ) {
-        meshType = DB_POINTMESH;
-        varType  = DB_POINTVAR;
-    } else if ( meshClass == "TriMesh" ) {
-        meshType = DB_UCDMESH;
-        varType  = DB_UCDVAR;
-    } else if ( meshClass == "TriList" ) {
-        meshType = DB_UCDMESH;
-        varType  = DB_UCDVAR;
-    } else if ( meshClass == "DomainMesh" ) {
-        meshType = DB_QUAD_RECT;
-        varType  = DB_QUADVAR;
-    } else {
-        ERROR( "Unknown mesh class" );
-    }
-    return std::make_pair( meshType, varType );
-}
-void writeSiloSummary(
-    const std::vector<IO::MeshDatabase> &meshes_written, const std::string &filename )
-{
-    auto fid = IO::silo::open( filename, IO::silo::CREATE );
-    for ( const auto &data : meshes_written ) {
-        auto type = getSiloMeshType( data.meshClass );
-        std::vector<int> meshTypes( data.domains.size(), type.first );
-        std::vector<int> varTypes( data.domains.size(), type.second );
-        std::vector<std::string> meshNames;
-        for ( const auto &tmp : data.domains )
-            meshNames.push_back( tmp.file + ":" + tmp.name );
-        IO::silo::writeMultiMesh( fid, data.name, meshNames, meshTypes );
-        for ( const auto &variable : data.variables ) {
-            std::vector<std::string> varnames;
-            for ( const auto &tmp : data.domains )
-                varnames.push_back( tmp.file + ":" + variable.name );
-            IO::silo::writeMultiVar( fid, variable.name, varnames, varTypes );
-        }
-    }
-    IO::silo::close( fid );
-}
-#endif
-
-
 // Write the mesh data in the new format
 static std::vector<IO::MeshDatabase> writeMeshesNewFormat(
    const std::vector<IO::MeshDataStruct> &meshData, const std::string &path, IO::FileFormat format,
@ -474,43 +286,14 @@ static std::vector<IO::MeshDatabase> writeMeshesNewFormat(
    sprintf( filename, "%05i", rank );
    sprintf( fullpath, "%s/%s", path.c_str(), filename );
    FILE *fid = fopen( fullpath, "wb" );
-    for ( size_t i = 0; i < meshData.size(); i++ ) {
-        std::shared_ptr<IO::Mesh> mesh = meshData[i].mesh;
+    ASSERT( fid != nullptr );
+    for ( size_t i = 0; i < meshData.size(); i++ )
        meshes_written.push_back( write_domain( fid, filename, meshData[i], format, rank ) );
-    }
    fclose( fid );
    return meshes_written;
 }


-// Write the mesh data to silo
-static std::vector<IO::MeshDatabase> writeMeshesSilo(
-    const std::vector<IO::MeshDataStruct> &meshData, const std::string &path, IO::FileFormat format,
-    int rank )
-{
-#ifdef USE_SILO
-    std::vector<IO::MeshDatabase> meshes_written;
-    char filename[100], fullpath[200];
-    sprintf( filename, "%05i.silo", rank );
-    sprintf( fullpath, "%s/%s", path.c_str(), filename );
-    auto fid = IO::silo::open( fullpath, IO::silo::CREATE );
-    for ( size_t i = 0; i < meshData.size(); i++ ) {
-        auto mesh = meshData[i].mesh;
-        meshes_written.push_back( write_domain_silo( fid, filename, meshData[i], format, rank ) );
-    }
-    IO::silo::close( fid );
-    return meshes_written;
-#else
-    NULL_USE( meshData );
-    NULL_USE( path );
-    NULL_USE( format );
-    NULL_USE( rank );
-    ERROR( "Application built without silo support" );
-    return std::vector<IO::MeshDatabase>();
-#endif
-}
-
-
 /****************************************************
 * Write the mesh data                               *
 ****************************************************/
@ -528,6 +311,7 @@ void IO::writeData( const std::string &subdir, const std::vector<IO::MeshDataStr
    std::string path = global_IO_path + "/" + subdir;
    recursiveMkdir( path, S_IRWXU | S_IRGRP );
    // Write the mesh files
+    Xdmf xmf;
    std::vector<IO::MeshDatabase> meshes_written;
    if ( global_IO_format == Format::OLD ) {
        // Write the original triangle format
@ -538,24 +322,28 @@ void IO::writeData( const std::string &subdir, const std::vector<IO::MeshDataStr
    } else if ( global_IO_format == Format::SILO ) {
        // Write silo
        meshes_written = writeMeshesSilo( meshData, path, IO::FileFormat::SILO, rank );
+    } else if ( global_IO_format == Format::HDF5 ) {
+        // Write hdf5
+        meshes_written = writeMeshesHDF5( meshData, path, IO::FileFormat::HDF5, rank, xmf );
    } else {
        ERROR( "Unknown format" );
    }
    // Gather a complete list of files on rank 0
    meshes_written = gatherAll( meshes_written, comm );
+    // Gather xmf file (if applicable)
+    if ( global_IO_format == Format::HDF5 ) {
+        xmf.gather( comm );
+    }
    // Write the summary files
    if ( rank == 0 ) {
        // Write the summary file for the current timestep
-        char filename[200];
-        sprintf( filename, "%s/LBM.summary", path.c_str() );
-        write( meshes_written, filename );
-// Write summary silo file if needed
-#ifdef USE_SILO
+        write( meshes_written, path + "/LBM.summary" );
+        // Write summary file if needed
        if ( global_IO_format == Format::SILO ) {
-            sprintf( filename, "%s/summary.silo", path.c_str() );
-            writeSiloSummary( meshes_written, filename );
+            writeSiloSummary( meshes_written, path + "/summary.silo" );
+        } else if ( global_IO_format == Format::HDF5 ) {
+            xmf.write( path + "/summary.xmf" );
        }
-#endif
        // Add the timestep to the global summary file
        if ( global_IO_format == Format::OLD || global_IO_format == Format::NEW ) {
            auto filename = global_IO_path + "/summary.LBM";
@ -567,6 +355,11 @@ void IO::writeData( const std::string &subdir, const std::vector<IO::MeshDataStr
            FILE *fid     = fopen( filename.c_str(), "ab" );
            fprintf( fid, "%s/summary.silo\n", subdir.c_str() );
            fclose( fid );
+        } else if ( global_IO_format == Format::HDF5 ) {
+            auto filename = global_IO_path + "/LBM.visit";
+            FILE *fid     = fopen( filename.c_str(), "ab" );
+            fprintf( fid, "%s/summary.xmf\n", subdir.c_str() );
+            fclose( fid );
        } else {
            ERROR( "Unknown format" );
        }
--- a/IO/Writer.h
+++ b/IO/Writer.h
@ -36,11 +36,13 @@ namespace IO {
 * @param[in] format        The data format to use:
 *                              old - Old mesh format
 *                                    (provided for backward compatibility, cannot write variables)
- *                              new - New format, 1 file/process silo - Silo
+ *                              new - New format, 1 file/process
+ *                              silo - Silo
+ *                              hdf5 - HDF5 + XDMF
 * @param[in] append        Append any existing data (default is false)
 */
 void initialize(
-    const std::string &path = "", const std::string &format = "silo", bool append = false );
+    const std::string &path = "", const std::string &format = "hdf5", bool append = false );


 /*!
@ -70,6 +72,11 @@ inline void writeData(
 }


+// Create the database entry for the mesh data
+IO::MeshDatabase getDatabase(
+    const std::string &filename, const IO::MeshDataStruct &mesh, IO::FileFormat format, int rank );
+
+
 } // namespace IO

 #endif
--- a/IO/Xdmf.cpp
+++ b/IO/Xdmf.cpp
@ -0,0 +1,620 @@
+#include "IO/Xdmf.h"
+
+#include "common/Array.h"
+#include "common/UtilityMacros.h"
+
+
+// Remove the dimensions of length 1 from an ArraySize
+// If no dimension differs from 1, a 1D size of length 1 is returned
+ArraySize squeeze( const ArraySize &x )
+{
+    size_t dims[5] = { 1 };
+    int count      = 0;
+    for ( size_t d = 0; d < x.maxDim(); d++ ) {
+        const auto len = x[d];
+        if ( len == 1 )
+            continue;
+        dims[count] = len;
+        count++;
+    }
+    const int ndim = std::max( count, 1 );
+    return ArraySize( ndim, dims );
+}
+
+
+// Helper functions
+// Write a <DataItem> element that refers to HDF5 data
+//    xmf       - file to write to
+//    indent    - prefix written at the start of each line
+//    size      - size of the data (dimensions of length 1 are removed before writing)
+//    location  - text written as the body of the element
+// An error is raised if the squeezed size does not have 1-4 dimensions
+static void addDataItem(
+    FILE *xmf, const std::string &indent, ArraySize size, const std::string &location )
+{
+    size              = squeeze( size );
+    const size_t ndim = size.ndim();
+    if ( ndim < 1 || ndim > 4 )
+        ERROR( "Invalid number of dimensions" );
+    // The dimensions are written in reverse order (last index first)
+    // Note: %zu is the portable conversion for size_t
+    //       (%lu is only correct where size_t is unsigned long)
+    fprintf( xmf, "%s<DataItem Dimensions=\"", indent.data() );
+    for ( size_t d = ndim; d > 0; d-- ) {
+        const size_t len = size[d - 1];
+        if ( d != ndim )
+            fputc( ' ', xmf );
+        fprintf( xmf, "%zu", len );
+    }
+    fprintf( xmf, "\" Format=\"HDF\">\n" );
+    fprintf( xmf, "%s  %s\n", indent.data(), location.data() );
+    fprintf( xmf, "%s</DataItem>\n", indent.data() );
+}
+// Write an <Attribute> element containing a single data item
+// Note: the template parameter TYPE is not used by the body
+template<class TYPE>
+static void addVariable( FILE *xmf, const std::string &indent, const std::string &name,
+    const std::string &type, const std::string &center, ArraySize size,
+    const std::string &location )
+{
+    const char *pad        = indent.data();
+    const auto childIndent = indent + "  ";
+    fprintf( xmf, "%s<Attribute Name=\"%s\" AttributeType=\"%s\" Center=\"%s\">\n", pad,
+        name.data(), type.data(), center.data() );
+    addDataItem( xmf, childIndent, size, location );
+    fprintf( xmf, "%s</Attribute>\n", pad );
+}
+
+
+/****************************************************************
+ * Enum functions                                                *
+ ****************************************************************/
+/*template<class TYPE>
+static Xdmf::DataType getDataType()
+{
+    if ( std::is_same<TYPE, char>::value )
+        return Xdmf::DataType::Char;
+    else if ( std::is_same<TYPE, int32_t>::value )
+        return Xdmf::DataType::Int32;
+    else if ( std::is_same<TYPE, int64_t>::value )
+        return Xdmf::DataType::Int64;
+    else if ( std::is_same<TYPE, uint32_t>::value )
+        return Xdmf::DataType::Uint32;
+    else if ( std::is_same<TYPE, uint64_t>::value )
+        return Xdmf::DataType::Uint64;
+    else if ( std::is_same<TYPE, float>::value )
+        return Xdmf::DataType::Float;
+    else if ( std::is_same<TYPE, double>::value )
+        return Xdmf::DataType::Double;
+    else
+        ERROR( "Invalid type" );
+}*/
+// Lookup tables indexed by static_cast<int>( Xdmf::TopologyType )
+//    TopologyTypeNames - text written as the TopologyType attribute of unstructured meshes
+//    TopologyTypeDOFs  - leading dimension of the connectivity data item
+//                        (presumably the number of nodes per element, 0 where not applicable)
+static const char *TopologyTypeNames[]  = { "", "Polyvertex", "Polyline", "Polygon", "Triangle",
+    "Quadrilateral", "Tetrahedron", "Pyramid", "Wedge", "Hexahedron", "Edge_3", "Triangle_6",
+    "Quadrilateral_8", "Tetrahedron_10", "Pyramid_13", "Wedge_15", "Hexahedron_20", "Mixed",
+    "CurvilinearMesh2D", "CurvilinearMesh3D", "RectangularMesh2D", "RectangularMesh3D",
+    "UniformMesh2D", "UniformMesh3D" };
+static const uint8_t TopologyTypeDOFs[] = { 0, 1, 2, 0, 3, 4, 4, 5, 6, 8, 3, 6, 8, 10, 13, 15, 20,
+    0, 0, 0, 0, 0, 0, 0 };
+
+
+/****************************************************************
+ * Create a mesh                                                 *
+ ****************************************************************/
+// A point mesh is created as an unstructured Polyvertex mesh
+// with N elements, N nodes and no connectivity data
+Xdmf::MeshData Xdmf::createPointMesh( const std::string &name, uint8_t NDIM, size_t N,
+    const std::string &x, const std::string &y, const std::string &z )
+{
+    const auto type = TopologyType::Polyvertex;
+    auto mesh       = createUnstructuredMesh( name, NDIM, type, N, "", N, x, y, z );
+    return mesh;
+}
+// Create a uniform mesh (2D or 3D) from its range and size
+// The range must contain two values (min,max) for each dimension of size
+Xdmf::MeshData Xdmf::createUniformMesh(
+    const std::string &name, const std::vector<double> &range, ArraySize size )
+{
+    ASSERT( range.size() == 2 * size.ndim() );
+    const int ndim = size.ndim();
+    MeshData mesh;
+    mesh.name = name;
+    mesh.size = size;
+    switch ( ndim ) {
+    case 2:
+        mesh.type = TopologyType::UniformMesh2D;
+        break;
+    case 3:
+        mesh.type = TopologyType::UniformMesh3D;
+        break;
+    default:
+        ERROR( "# of dimensions != 2 or 3" );
+    }
+    // Copy the 2*ndim range values (any remaining entries keep their initial value)
+    for ( int k = 0; k < 2 * ndim; k++ )
+        mesh.range[k] = range[k];
+    return mesh;
+}
+// Create a curvilinear mesh (2D or 3D) whose coordinates are given by x, y, z
+Xdmf::MeshData Xdmf::createCurvilinearMesh( const std::string &name, ArraySize size,
+    const std::string &x, const std::string &y, const std::string &z )
+{
+    MeshData mesh;
+    mesh.name       = name;
+    const auto ndim = size.ndim();
+    if ( ndim == 3 ) {
+        mesh.type = TopologyType::CurvilinearMesh3D;
+    } else if ( ndim == 2 ) {
+        mesh.type = TopologyType::CurvilinearMesh2D;
+    } else {
+        ERROR( "Invalid size for Curvilinear mesh" );
+    }
+    mesh.size = size;
+    mesh.x    = x;
+    mesh.y    = y;
+    mesh.z    = z;
+    return mesh;
+}
+// Create an unstructured mesh of a single element type
+// The mesh size is stored as { NDIM, NumElements, NumNodes }
+Xdmf::MeshData Xdmf::createUnstructuredMesh( const std::string &name, uint8_t NDIM,
+    TopologyType type, size_t NumElements, const std::string &dofMap, size_t NumNodes,
+    const std::string &x, const std::string &y, const std::string &z )
+{
+    ASSERT( type != TopologyType::Null );
+    MeshData mesh;
+    // Coordinates and connectivity
+    mesh.x      = x;
+    mesh.y      = y;
+    mesh.z      = z;
+    mesh.dofMap = dofMap;
+    // Mesh description
+    mesh.size = { NDIM, NumElements, NumNodes };
+    mesh.type = type;
+    mesh.name = name;
+    return mesh;
+}
+
+
+/****************************************************************
+ * Add a variable                                                *
+ ****************************************************************/
+// Append a variable to the mesh (meshName is not used by the body)
+void Xdmf::MeshData::addVariable( const std::string &meshName, const std::string &varName,
+    ArraySize varSize, RankType rank, Center center, const std::string &varData )
+{
+    vars.emplace_back();
+    auto &entry    = vars.back();
+    entry.name     = varName;
+    entry.size     = varSize;
+    entry.rankType = rank;
+    entry.center   = center;
+    entry.data     = varData;
+}
+
+
+/****************************************************************
+ * Add a mesh domain                                             *
+ ****************************************************************/
+// Add a domain to the named mesh; domain names must be unique within a mesh
+void Xdmf::addMesh( const std::string &meshName, const MeshData &domain )
+{
+    auto &list = d_meshData[meshName];
+    for ( size_t i = 0; i < list.size(); i++ ) {
+        const auto &domain2 = list[i];
+        ASSERT( domain2.name != domain.name );
+    }
+    list.push_back( domain );
+}
+
+
+/****************************************************************
+ * Write a variable                                              *
+ ****************************************************************/
+// Write the <Attribute> element (name, rank type, centering and data item) for a variable
+//    fid     - file to write to
+//    var     - variable to write
+//    indent  - prefix written at the start of each line
+// An error is raised if the rank type or the centering is not recognized
+static void writeVariable( FILE *fid, const Xdmf::VarData &var, const std::string &indent )
+{
+    // Get the text for the variable type
+    const char *rank = nullptr;
+    switch ( var.rankType ) {
+    case Xdmf::RankType::Scalar:
+        rank = "Scalar";
+        break;
+    case Xdmf::RankType::Vector:
+        rank = "Vector";
+        break;
+    case Xdmf::RankType::Tensor:
+        rank = "Tensor";
+        break;
+    case Xdmf::RankType::Tensor6:
+        rank = "Tensor6";
+        break;
+    case Xdmf::RankType::Matrix:
+        rank = "Matrix";
+        break;
+    case Xdmf::RankType::GlobalID:
+        rank = "GlobalID";
+        break;
+    default:
+        ERROR( "Unknown rank type" );
+    }
+    // Get the text for the variable centering
+    const char *center = nullptr;
+    switch ( var.center ) {
+    case Xdmf::Center::Node:
+        center = "Node";
+        break;
+    case Xdmf::Center::Cell:
+        center = "Cell";
+        break;
+    case Xdmf::Center::Grid:
+        center = "Grid";
+        break;
+    case Xdmf::Center::Face:
+        center = "Face";
+        break;
+    case Xdmf::Center::Edge:
+        center = "Edge";
+        break;
+    case Xdmf::Center::Other:
+        center = "Other";
+        break;
+    default:
+        ERROR( "Unknown center type" );
+    }
+    // Write the opening tag
+    fprintf( fid, "%s<Attribute Name=\"%s\" AttributeType=\"%s\" Center=\"%s\">\n", indent.data(),
+        var.name.data(), rank, center );
+    // Write the data item
+    addDataItem( fid, indent + "  ", var.size, var.data );
+    // Finished
+    fprintf( fid, "%s</Attribute>\n", indent.data() );
+}
+
+
+/****************************************************************
+ * Write the mesh grid                                           *
+ ****************************************************************/
+// Write the <Grid> element (topology, geometry and variables) for a single mesh domain
+//    fid     - file to write to
+//    mesh    - mesh domain to write
+//    indent  - prefix written at the start of each line
+// An error is raised for mesh types that are not handled below
+static void writeMeshGrid( FILE *fid, const Xdmf::MeshData &mesh, const std::string &indent )
+{
+    const char *s = indent.data();
+    // Origin and spacing (only written for the uniform meshes)
+    double x0[3] = { mesh.range[0], mesh.range[2], mesh.range[4] };
+    double dx[3] = { ( mesh.range[1] - mesh.range[0] ) / mesh.size[0],
+        ( mesh.range[3] - mesh.range[2] ) / mesh.size[1],
+        ( mesh.range[5] - mesh.range[4] ) / mesh.size[2] };
+    switch ( mesh.type ) {
+    case Xdmf::TopologyType::UniformMesh2D:
+        // Write a uniform 2d mesh
+        fprintf( fid, "%s<Grid Name=\"%s\" GridType=\"Uniform\">\n", s, mesh.name.data() );
+        fprintf( fid,
+            "%s  <Topology TopologyType=\"2DCoRectMesh\" NumberOfElements=\"%zu %zu\"/>\n", s,
+            mesh.size[1] + 1, mesh.size[0] + 1 );
+        fprintf( fid, "%s  <Geometry GeometryType=\"ORIGIN_DXDY\">\n", s );
+        fprintf(
+            fid, "%s    <DataItem  Format=\"XML\" NumberType=\"float\" Dimensions=\"2\">\n", s );
+        fprintf( fid, "%s      %0.12e  %0.12e\n", s, x0[0], x0[1] );
+        fprintf( fid, "%s    </DataItem>\n", s );
+        fprintf(
+            fid, "%s    <DataItem  Format=\"XML\" NumberType=\"float\" Dimensions=\"2\">\n", s );
+        fprintf( fid, "%s       %0.12e  %0.12e\n", s, dx[0], dx[1] );
+        fprintf( fid, "%s    </DataItem>\n", s );
+        fprintf( fid, "%s  </Geometry>\n", s );
+        break;
+    case Xdmf::TopologyType::UniformMesh3D:
+        // Write a uniform 3d mesh (all three dimensions are written, last index first)
+        fprintf( fid, "%s<Grid Name=\"%s\" GridType=\"Uniform\">\n", s, mesh.name.data() );
+        fprintf( fid,
+            "%s  <Topology TopologyType=\"3DCoRectMesh\" NumberOfElements=\"%zu %zu %zu\"/>\n", s,
+            mesh.size[2] + 1, mesh.size[1] + 1, mesh.size[0] + 1 );
+        fprintf( fid, "%s  <Geometry GeometryType=\"ORIGIN_DXDYDZ\">\n", s );
+        fprintf(
+            fid, "%s    <DataItem  Format=\"XML\" NumberType=\"float\" Dimensions=\"3\">\n", s );
+        fprintf( fid, "%s      %0.12e  %0.12e  %0.12e\n", s, x0[0], x0[1], x0[2] );
+        fprintf( fid, "%s    </DataItem>\n", s );
+        fprintf(
+            fid, "%s    <DataItem  Format=\"XML\" NumberType=\"float\" Dimensions=\"3\">\n", s );
+        fprintf( fid, "%s       %0.12e  %0.12e  %0.12e\n", s, dx[0], dx[1], dx[2] );
+        fprintf( fid, "%s    </DataItem>\n", s );
+        fprintf( fid, "%s  </Geometry>\n", s );
+        break;
+    case Xdmf::TopologyType::CurvilinearMesh2D:
+        // Write a 2D curvilinear mesh
+        fprintf( fid, "%s<Grid Name=\"%s\" GridType=\"Uniform\">\n", s, mesh.name.data() );
+        fprintf( fid, "%s  <Topology TopologyType=\"2DSMesh\" NumberOfElements=\"%zu %zu\"/>\n", s,
+            mesh.size[1] + 1, mesh.size[0] + 1 );
+        fprintf( fid, "%s  <Geometry GeometryType=\"X_Y\">\n", s );
+        addDataItem( fid, indent + "    ", mesh.size + 1, mesh.x );
+        addDataItem( fid, indent + "    ", mesh.size + 1, mesh.y );
+        fprintf( fid, "%s  </Geometry>\n", s );
+        break;
+    case Xdmf::TopologyType::CurvilinearMesh3D:
+        // Write a 3D curvilinear mesh
+        fprintf( fid, "%s<Grid Name=\"%s\" GridType=\"Uniform\">\n", s, mesh.name.data() );
+        fprintf( fid, "%s  <Topology TopologyType=\"3DSMesh\" NumberOfElements=\"%zu %zu %zu\"/>\n",
+            s, mesh.size[2] + 1, mesh.size[1] + 1, mesh.size[0] + 1 );
+        fprintf( fid, "%s  <Geometry GeometryType=\"X_Y_Z\">\n", s );
+        addDataItem( fid, indent + "    ", mesh.size + 1, mesh.x );
+        addDataItem( fid, indent + "    ", mesh.size + 1, mesh.y );
+        addDataItem( fid, indent + "    ", mesh.size + 1, mesh.z );
+        fprintf( fid, "%s  </Geometry>\n", s );
+        break;
+    case Xdmf::TopologyType::Polyvertex:
+    case Xdmf::TopologyType::Polyline:
+    case Xdmf::TopologyType::Polygon:
+    case Xdmf::TopologyType::Triangle:
+    case Xdmf::TopologyType::Quadrilateral:
+    case Xdmf::TopologyType::Tetrahedron:
+    case Xdmf::TopologyType::Pyramid:
+    case Xdmf::TopologyType::Wedge:
+    case Xdmf::TopologyType::Hexahedron:
+    case Xdmf::TopologyType::Edge_3:
+    case Xdmf::TopologyType::Triangle_6:
+    case Xdmf::TopologyType::Quadrilateral_8:
+    case Xdmf::TopologyType::Tetrahedron_10:
+    case Xdmf::TopologyType::Pyramid_13:
+    case Xdmf::TopologyType::Wedge_15:
+    case Xdmf::TopologyType::Hexahedron_20:
+        // Write an unstructured mesh
+        {
+            // For these mesh types the size holds { NDIM, # of elements, # of nodes }
+            int NDIM      = mesh.size[0];
+            size_t Nelem  = mesh.size[1];
+            size_t Nnode  = mesh.size[2];
+            uint8_t Ndofs = TopologyTypeDOFs[static_cast<int>( mesh.type )];
+            auto type     = TopologyTypeNames[static_cast<int>( mesh.type )];
+            fprintf( fid, "%s<Grid Name=\"%s\">\n", s, mesh.name.data() );
+            fprintf( fid, "%s  <Topology TopologyType=\"%s\"", s, type );
+            fprintf( fid, " NumberOfElements=\"%zu\">\n", Nelem );
+            if ( !mesh.dofMap.empty() )
+                addDataItem( fid, indent + "    ", { Ndofs, Nelem }, mesh.dofMap );
+            fprintf( fid, "%s  </Topology>\n", s );
+            // An empty y means that x holds the interleaved coordinates of every node
+            if ( NDIM == 2 ) {
+                if ( mesh.y.empty() ) {
+                    fprintf( fid, "%s  <Geometry GeometryType=\"XY\">\n", s );
+                    addDataItem( fid, indent + "    ", { 2, Nnode }, mesh.x );
+                } else {
+                    fprintf( fid, "%s  <Geometry GeometryType=\"X_Y\">\n", s );
+                    addDataItem( fid, indent + "    ", Nnode, mesh.x );
+                    addDataItem( fid, indent + "    ", Nnode, mesh.y );
+                }
+            } else if ( NDIM == 3 ) {
+                if ( mesh.y.empty() ) {
+                    // Interleaved xyz data has three values per node
+                    fprintf( fid, "%s  <Geometry GeometryType=\"XYZ\">\n", s );
+                    addDataItem( fid, indent + "    ", { 3, Nnode }, mesh.x );
+                } else {
+                    fprintf( fid, "%s  <Geometry GeometryType=\"X_Y_Z\">\n", s );
+                    addDataItem( fid, indent + "    ", Nnode, mesh.x );
+                    addDataItem( fid, indent + "    ", Nnode, mesh.y );
+                    addDataItem( fid, indent + "    ", Nnode, mesh.z );
+                }
+            } else {
+                ERROR( "Dimensions other than 2 or 3 are not supported" );
+            }
+            fprintf( fid, "%s  </Geometry>\n", s );
+        }
+        break;
+    default: {
+        auto msg = "Invalid mesh type: " + std::to_string( static_cast<int>( mesh.type ) ) + " - " +
+                   mesh.name;
+        ERROR( msg );
+    }
+    }
+    // Write the variables
+    for ( const auto &var : mesh.vars )
+        writeVariable( fid, var, indent + "  " );
+    // Close the grid at the same indentation as the opening tag
+    fprintf( fid, "%s</Grid>\n", s );
+}
+
+
+/****************************************************************
+ * Write the XDMF xml file                                       *
+ ****************************************************************/
+// Write all registered meshes to an XDMF xml file
+//    filename  - name of the file to create (opened with mode "w")
+// A mesh with a single domain whose name matches the mesh name is written directly,
+// all other meshes are written as a collection of their domains
+void Xdmf::write( const std::string &filename ) const
+{
+    // Create XDMF file
+    auto fid = fopen( filename.data(), "w" );
+    ASSERT( fid != nullptr );
+    fprintf( fid, "<?xml version=\"1.0\" ?>\n" );
+    fprintf( fid, "<!DOCTYPE Xdmf SYSTEM \"Xdmf.dtd\" []>\n" );
+    fprintf( fid, "<Xdmf Version=\"2.0\">\n" );
+    fprintf( fid, "<Domain>\n" );
+    // Write an empty mesh to enable collections to work properly
+    fprintf( fid, "  <Grid Name=\"\" GridType=\"Uniform\"></Grid>\n\n" );
+    // Write each mesh
+    for ( const auto &data : d_meshData ) {
+        // Use references to avoid copying the mesh data
+        const auto &name    = data.first;
+        const auto &domains = data.second;
+        if ( domains.empty() )
+            continue;
+        if ( domains.size() == 1u && name == domains[0].name ) {
+            writeMeshGrid( fid, domains[0], "  " );
+        } else {
+            fprintf( fid, "  <Grid Name=\"%s\" GridType=\"Collection\">\n", name.data() );
+            for ( const auto &domain : domains )
+                writeMeshGrid( fid, domain, "    " );
+            fprintf( fid, "  </Grid>\n\n" );
+        }
+    }
+    fprintf( fid, "</Domain>\n" );
+    fprintf( fid, "</Xdmf>\n" );
+    fclose( fid );
+}
+
+
+/****************************************************************
+ * Pack/Unpack data                                              *
+ ****************************************************************/
+// Number of bytes used to pack a trivially copyable object
+template<class T>
+typename std::enable_if<std::is_trivially_copyable<T>::value, size_t>::type size( const T &x )
+{
+    return sizeof( x );
+}
+// Copy the bytes of a trivially copyable object into the buffer
+// Returns the position in the buffer just past the copied bytes
+template<class T>
+typename std::enable_if<std::is_trivially_copyable<T>::value, char *>::type pack(
+    char *ptr, const T &x )
+{
+    constexpr size_t bytes = sizeof( T );
+    memcpy( ptr, &x, bytes );
+    char *next = ptr + bytes;
+    return next;
+}
+// Copy the bytes of a trivially copyable object out of the buffer
+// Returns the position in the buffer just past the bytes that were read
+template<class T>
+typename std::enable_if<std::is_trivially_copyable<T>::value, char *>::type unpack(
+    char *ptr, T &x )
+{
+    constexpr size_t bytes = sizeof( T );
+    memcpy( &x, ptr, bytes );
+    char *next = ptr + bytes;
+    return next;
+}
+// Packed size of a string: an int length followed by the characters
+static size_t size( const std::string &str )
+{
+    const size_t header = sizeof( int );
+    return header + str.size();
+}
+// Pack a string as an int length followed by the characters
+// Returns the position in the buffer just past the packed string
+static char *pack( char *ptr, const std::string &str )
+{
+    const size_t len = str.size();
+    const int N      = len;
+    memcpy( ptr, &N, sizeof( int ) );
+    char *chars = ptr + sizeof( int );
+    memcpy( chars, str.data(), len );
+    return chars + len;
+}
+// Unpack a string stored as an int length followed by the characters
+// The length is checked to be in the range [0,1000)
+// Returns the position in the buffer just past the string
+static char *unpack( char *ptr, std::string &str )
+{
+    int N = 0;
+    memcpy( &N, ptr, sizeof( int ) );
+    ASSERT( N >= 0 && N < 1000 );
+    const char *chars = ptr + sizeof( int );
+    str.assign( chars, N );
+    return ptr + sizeof( int ) + N;
+}
+// Number of bytes needed to pack a variable
+// Fields are counted in the order: name, size, rankType, center, data
+static size_t size( const Xdmf::VarData &data )
+{
+    const size_t header  = size( data.name ) + size( data.size );
+    const size_t enums   = size( data.rankType ) + size( data.center );
+    const size_t payload = size( data.data );
+    return header + enums + payload;
+}
+// Pack a variable into the buffer
+// Fields are written in the order: name, size, rankType, center, data
+static char *pack( char *ptr, const Xdmf::VarData &data )
+{
+    char *next = pack( ptr, data.name );
+    next       = pack( next, data.size );
+    next       = pack( next, data.rankType );
+    next       = pack( next, data.center );
+    next       = pack( next, data.data );
+    return next;
+}
+// Unpack a variable from the buffer
+// Fields are read in the order: name, size, rankType, center, data
+static char *unpack( char *ptr, Xdmf::VarData &data )
+{
+    char *next = unpack( ptr, data.name );
+    next       = unpack( next, data.size );
+    // The enums are read directly (they are packed with their own size)
+    next = unpack( next, data.rankType );
+    next = unpack( next, data.center );
+    next = unpack( next, data.data );
+    return next;
+}
+// Serialization of Xdmf::MeshData
+// The three functions below must visit the fields in the same order:
+//    name, type, size, range, x, y, z, dofMap, number of variables, variables
+// Note: dofMap is included so that the connectivity of an unstructured mesh
+//    is kept when the mesh data is packed and unpacked
+
+// Number of bytes needed to pack the mesh data
+static size_t size( const Xdmf::MeshData &data )
+{
+    int N_vars   = data.vars.size();
+    size_t bytes = 0;
+    bytes += size( data.name );
+    bytes += size( data.type );
+    bytes += size( data.size );
+    bytes += size( data.range );
+    bytes += size( data.x );
+    bytes += size( data.y );
+    bytes += size( data.z );
+    bytes += size( data.dofMap );
+    bytes += size( N_vars );
+    for ( int i = 0; i < N_vars; i++ )
+        bytes += size( data.vars[i] );
+    return bytes;
+}
+// Pack the mesh data, returns the position in the buffer just past the packed data
+static char *pack( char *ptr, const Xdmf::MeshData &data )
+{
+    int N_vars = data.vars.size();
+    ptr        = pack( ptr, data.name );
+    ptr        = pack( ptr, data.type );
+    ptr        = pack( ptr, data.size );
+    ptr        = pack( ptr, data.range );
+    ptr        = pack( ptr, data.x );
+    ptr        = pack( ptr, data.y );
+    ptr        = pack( ptr, data.z );
+    ptr        = pack( ptr, data.dofMap );
+    ptr        = pack( ptr, N_vars );
+    for ( int i = 0; i < N_vars; i++ )
+        ptr = pack( ptr, data.vars[i] );
+    return ptr;
+}
+// Unpack the mesh data, returns the position in the buffer just past the data read
+static char *unpack( char *ptr, Xdmf::MeshData &data )
+{
+    int N_vars = 0;
+    ptr        = unpack( ptr, data.name );
+    ptr        = unpack( ptr, data.type );
+    ptr        = unpack( ptr, data.size );
+    ptr        = unpack( ptr, data.range );
+    ptr        = unpack( ptr, data.x );
+    ptr        = unpack( ptr, data.y );
+    ptr        = unpack( ptr, data.z );
+    ptr        = unpack( ptr, data.dofMap );
+    ptr        = unpack( ptr, N_vars );
+    data.vars.resize( N_vars );
+    for ( int i = 0; i < N_vars; i++ )
+        ptr = unpack( ptr, data.vars[i] );
+    return ptr;
+}
+// Number of bytes needed to pack a list of meshes (an int count followed by each mesh)
+static size_t size( const std::vector<Xdmf::MeshData> &data )
+{
+    const int count = data.size();
+    size_t total    = size( count );
+    for ( int k = 0; k < count; k++ ) {
+        const auto &mesh = data[k];
+        total += size( mesh );
+    }
+    return total;
+}
+// Pack a list of meshes as an int count followed by each mesh
+static char *pack( char *ptr, const std::vector<Xdmf::MeshData> &data )
+{
+    const int count = data.size();
+    char *next      = pack( ptr, count );
+    for ( int k = 0; k < count; k++ ) {
+        const auto &mesh = data[k];
+        next             = pack( next, mesh );
+    }
+    return next;
+}
+// Unpack a list of meshes (any existing contents of data are discarded)
+static char *unpack( char *ptr, std::vector<Xdmf::MeshData> &data )
+{
+    int count  = 0;
+    char *next = unpack( ptr, count );
+    data.clear();
+    data.resize( count );
+    for ( auto &mesh : data )
+        next = unpack( next, mesh );
+    return next;
+}
+// Number of bytes needed to pack the mesh map
+// (an int count followed by the name and the list of domains of each entry)
+static size_t size( const std::map<std::string, std::vector<Xdmf::MeshData>> &data )
+{
+    const int count = data.size();
+    size_t total    = size( count );
+    for ( auto it = data.begin(); it != data.end(); ++it )
+        total += size( it->first ) + size( it->second );
+    return total;
+}
+// Pack the mesh map as an int count followed by the name and domains of each entry
+static char *pack( char *ptr, const std::map<std::string, std::vector<Xdmf::MeshData>> &data )
+{
+    const int count = data.size();
+    char *next      = pack( ptr, count );
+    for ( auto it = data.begin(); it != data.end(); ++it ) {
+        next = pack( next, it->first );
+        next = pack( next, it->second );
+    }
+    return next;
+}
+// Unpack the mesh map (any existing contents of data are discarded)
+static char *unpack( char *ptr, std::map<std::string, std::vector<Xdmf::MeshData>> &data )
+{
+    data.clear();
+    int count  = 0;
+    char *next = unpack( ptr, count );
+    for ( int k = 0; k < count; k++ ) {
+        std::string key;
+        next = unpack( next, key );
+        // Unpack the domains directly into the map entry for this name
+        auto &domains = data[key];
+        next          = unpack( next, domains );
+    }
+    return next;
+}
+
+
+/****************************************************************
+ * Gather all data to rank 0                                     *
+ ****************************************************************/
+// Gather the mesh data from all ranks onto rank 0
+//    Rank 0 receives the packed data from every other rank (in rank order)
+//    and merges it into its own mesh data.
+//    All other ranks send their data to rank 0 and then clear their local data.
+// Each rank sends three messages: the number of meshes (tag 717),
+//    the number of bytes (tag 718) and the packed data (tag 719).
+void Xdmf::gather( const Utilities::MPI &comm )
+{
+    if ( comm.getRank() == 0 ) {
+        for ( int i = 1; i < comm.getSize(); i++ ) {
+            // Receive the data
+            // (the mesh count must be received to match the send, but is not needed here)
+            size_t N_meshes = 0, N_bytes = 0;
+            comm.recv( &N_meshes, 1, i, 717 );
+            comm.recv( &N_bytes, 1, i, 718 );
+            // Use a vector so the buffer is released even if an error is raised
+            std::vector<char> buf( N_bytes );
+            comm.recv( buf.data(), N_bytes, i, 719 );
+            // Unpack the data
+            std::map<std::string, std::vector<MeshData>> data;
+            unpack( buf.data(), data );
+            // Add the meshes (the received data is moved, it is not used afterwards)
+            for ( auto &entry : data ) {
+                const auto &name = entry.first;
+                auto &domains    = entry.second;
+                if ( domains.size() == 1u && domains[0].name == name ) {
+                    // We are dealing with a single mesh
+                    ASSERT( d_meshData.find( name ) == d_meshData.end() );
+                    d_meshData.emplace( name, std::move( domains ) );
+                } else {
+                    // Add the domains (domain names must be unique within a mesh)
+                    auto &meshes = d_meshData[name];
+                    for ( auto &domain : domains ) {
+                        for ( const auto &existing : meshes )
+                            ASSERT( existing.name != domain.name );
+                        meshes.push_back( std::move( domain ) );
+                    }
+                }
+            }
+        }
+    } else {
+        // Send the number of meshes
+        size_t N_meshes = d_meshData.size();
+        comm.send( &N_meshes, 1, 0, 717 );
+        // Pack the send data
+        size_t N_bytes = size( d_meshData );
+        comm.send( &N_bytes, 1, 0, 718 );
+        std::vector<char> buf( N_bytes );
+        pack( buf.data(), d_meshData );
+        // Send the data to rank 0
+        comm.send( buf.data(), N_bytes, 0, 719 );
+        // Clear the internal data
+        d_meshData.clear();
+    }
+}
--- a/IO/Xdmf.h
+++ b/IO/Xdmf.h
@ -0,0 +1,130 @@
+#include "IO/HDF5_IO.h"
+#include "common/Array.h"
+#include "common/MPI.h"
+#include "common/Utilities.h"
+
+#include <map>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+
+
+// Helper class to write/read XDMF files
+class Xdmf
+{
+public:
+    // Mesh topology types
+    // Note: the numeric values are used to index the TopologyTypeNames/TopologyTypeDOFs
+    //    tables in Xdmf.cpp, the order of the entries must match those tables
+    enum class TopologyType {
+        Null = 0,
+        Polyvertex,
+        Polyline,
+        Polygon,
+        Triangle,
+        Quadrilateral,
+        Tetrahedron,
+        Pyramid,
+        Wedge,
+        Hexahedron,
+        Edge_3,
+        Triangle_6,
+        Quadrilateral_8,
+        Tetrahedron_10,
+        Pyramid_13,
+        Wedge_15,
+        Hexahedron_20,
+        Mixed,
+        CurvilinearMesh2D,
+        CurvilinearMesh3D,
+        RectangularMesh2D,
+        RectangularMesh3D,
+        UniformMesh2D,
+        UniformMesh3D,
+    };
+    enum class DataType { Null = 0, Char, Int32, Int64, Uint32, Uint64, Float, Double };
+    enum class RankType { Null = 0, Scalar, Vector, Tensor, Tensor6, Matrix, GlobalID };
+    enum class Center { Null = 0, Node, Edge, Face, Cell, Grid, Other };
+
+    // Description of a variable (written as an Attribute of the mesh)
+    struct VarData {
+        std::string name;  // Variable name
+        ArraySize size;    // Size of variable
+        RankType rankType; // Rank order of data
+        Center center;     // Variable centering
+        std::string data;  // Variable data
+    };
+
+    // Description of a mesh domain and its variables
+    struct MeshData {
+        std::string name;          // Name of mesh
+        TopologyType type;         // Type of mesh
+        ArraySize size;            // Size of mesh (meaning depends on mesh type)
+        double range[6];           // Range of the mesh (only used for UniformMesh2D/UniformMesh3D)
+        std::string x;             // x coordinates (or xy/xyz coordinates)
+        std::string y;             // y coordinates
+        std::string z;             // z coordinates
+        std::string dofMap;        // mesh connectivity
+        std::vector<VarData> vars; // Variables
+        MeshData() : type( TopologyType::Null ), range{ 0 } {}
+        //! Add a variable
+        void addVariable( const std::string &meshName, const std::string &varName,
+            ArraySize varSize, RankType rank, Center center, const std::string &varData );
+    };
+
+
+public:
+    //! Add a Point mesh
+    static MeshData createPointMesh( const std::string &name, uint8_t NDIM, size_t N,
+        const std::string &x, const std::string &y = "", const std::string &z = "" );
+
+    /*!
+     * @brief  Add a uniform mesh
+     * @details  This function adds a uniform rectangular mesh
+     * @param[in] name          The name of the mesh
+     * @param[in] range         The range of the mesh [ x_min, x_max, y_min, y_max, z_min, z_max ]
+     * @param[in] size          The number of cells in the mesh
+     */
+    static MeshData createUniformMesh(
+        const std::string &name, const std::vector<double> &range, ArraySize size );
+
+    /*!
+     * @brief  Add a Curvilinear mesh
+     * @details  This function adds a curvilinear mesh
+     * @param[in] name          The name of the mesh
+     * @param[in] size          The number of cells in the mesh
+     * @param[in] x             The x coordinates or the xy/xyz coordinates
+     * @param[in] y             The y coordinates (may be null)
+     * @param[in] z             The z coordinates (may be null)
+     */
+    static MeshData createCurvilinearMesh( const std::string &name, ArraySize size,
+        const std::string &x, const std::string &y, const std::string &z = "" );
+
+    /*!
+     * @brief  Add an unstructured mesh
+     * @details  This function adds an unstructured mesh to the class to write.
+     *    The mesh may be one of several supported unstructured mesh types.
+     *    This function does not support mixed elements.
+     * @param[in] name          The name of the mesh
+     * @param[in] NDIM          The number of physical dimensions
+     * @param[in] type          The element type
+     * @param[in] NumElements   The number of elements
+     * @param[in] dofMap        The connectivity information (type x NumElements)
+     * @param[in] NumNodes      The number of nodes
+     * @param[in] x             The x coordinates or the xy/xyz coordinates
+     * @param[in] y             The y coordinates (may be empty)
+     * @param[in] z             The z coordinates (may be empty)
+     */
+    static MeshData createUnstructuredMesh( const std::string &name, uint8_t NDIM,
+        TopologyType type, size_t NumElements, const std::string &dofMap, size_t NumNodes,
+        const std::string &x, const std::string &y = "", const std::string &z = "" );
+
+
+public:
+    //! Add a sub-domain
+    void addMesh( const std::string &meshName, const MeshData &domain );
+
+    //! Gather all data to rank 0
+    void gather( const Utilities::MPI &comm );
+
+    //! Write the xml file
+    void write( const std::string &filename ) const;
+
+
+private:
+    // Mesh domains, stored by mesh name
+    std::map<std::string, std::vector<MeshData>> d_meshData;
+};
--- a/IO/netcdf.cpp
+++ b/IO/netcdf.cpp
@ -511,7 +511,7 @@ template void write<double>( int fid, const std::string &var, const std::vector<
    const Array<double> &data, const RankInfoStruct &info );


-}; // namespace netcdf
+} // namespace netcdf

 #else

--- a/IO/netcdf.h
+++ b/IO/netcdf.h
@ -153,5 +153,7 @@ void write( int fid, const std::string &var, const std::vector<int> &dimids,
    const Array<TYPE> &data, const RankInfoStruct &rank_info );


-}; // namespace netcdf
+} // namespace netcdf
+
+
 #endif
--- a/IO/silo.cpp
+++ b/IO/silo.cpp
@ -25,7 +25,8 @@
 #include <silo.h>


-namespace IO::silo {
+namespace IO {
+namespace silo {


 /****************************************************
@ -114,7 +115,8 @@ void writeMultiVar( DBfile *fid, const std::string &varname,
 }


-}; // namespace IO::silo
+} // namespace silo
+} // namespace IO


 #else
--- a/IO/silo.hpp
+++ b/IO/silo.hpp
@ -275,11 +275,11 @@ Array<TYPE> readUniformMeshVariable( DBfile *fid, const std::string &varname )
        copyData<TYPE>( data2, type, var->vals[i] );
        memcpy( &data( 0, i ), data2.data(), var->nels * sizeof( TYPE ) );
    }
-    DBFreeQuadvar( var );
    std::vector<size_t> dims( var->ndims + 1, var->nvals );
    for ( int d = 0; d < var->ndims; d++ )
        dims[d] = var->dims[d];
    data.reshape( dims );
+    DBFreeQuadvar( var );
    return data;
 }

--- a/StackTrace/StackTrace.cpp
+++ b/StackTrace/StackTrace.cpp
@ -856,7 +856,7 @@ static void getFileAndLineObject( staticVector<StackTrace::stack_info*,blockSize
            char *buf = tmp2;
            if ( buf[0] != '?' && buf[0] != 0 ) {
                size_t j = 0;
-                for ( j = 0; j < 4095 && buf[j] != ':'; j++ ) {
+                for ( j = 0; j < 1024 && buf[j] != ':'; j++ ) {
                }
                buf[j] = 0;
                copy( buf, info[i]->filename, info[i]->filenamePath );
--- a/common/Array.cpp
+++ b/common/Array.cpp
@ -1,117 +1,74 @@
+// clang-format off
 #include "common/Array.h"
 #include "common/Array.hpp"
+#include "common/Utilities.h"

 #include <complex>


-/********************************************************
- *  ArraySize                                            *
- ********************************************************/
-ArraySize::ArraySize( const std::vector<size_t>& N )
-{
-    d_ndim = N.size();
-    d_N[0] = 0;
-    d_N[1] = 1;
-    d_N[2] = 1;
-    d_N[3] = 1;
-    d_N[4] = 1;
-    for ( size_t i = 0; i < d_ndim; i++ )
-        d_N[i] = N[i];
-    d_length = 1;
-    for ( unsigned long i : d_N )
-        d_length *= i;
-    if ( d_ndim == 0 )
-        d_length = 0;
-}
-
-
 /********************************************************
 *  Explicit instantiations of Array                     *
 ********************************************************/
-template class Array<char, FunctionTable>;
-template class Array<uint8_t, FunctionTable>;
-template class Array<uint16_t, FunctionTable>;
-template class Array<uint32_t, FunctionTable>;
-template class Array<uint64_t, FunctionTable>;
-template class Array<int8_t, FunctionTable>;
-template class Array<int16_t, FunctionTable>;
-template class Array<int32_t, FunctionTable>;
-template class Array<int64_t, FunctionTable>;
-template class Array<float, FunctionTable>;
-template class Array<double, FunctionTable>;
+template class Array<char,FunctionTable>;
+template class Array<uint8_t,FunctionTable>;
+template class Array<uint16_t,FunctionTable>;
+template class Array<uint32_t,FunctionTable>;
+template class Array<uint64_t,FunctionTable>;
+template class Array<int8_t,FunctionTable>;
+template class Array<int16_t,FunctionTable>;
+template class Array<int32_t,FunctionTable>;
+template class Array<int64_t,FunctionTable>;
+template class Array<float,FunctionTable>;
+template class Array<double,FunctionTable>;
+template class Array<long double,FunctionTable>;


 /********************************************************
 *  Explicit instantiations of Array<bool>               *
 ********************************************************/
-// clang-format off
-template Array<bool, FunctionTable>::Array();
-template Array<bool, FunctionTable>::~Array();
-template Array<bool, FunctionTable>::Array( size_t );
-template Array<bool, FunctionTable>::Array( size_t, size_t );
-template Array<bool, FunctionTable>::Array( size_t, size_t, size_t );
-template Array<bool, FunctionTable>::Array( size_t, size_t, size_t, size_t );
-template Array<bool, FunctionTable>::Array( size_t, size_t, size_t, size_t, size_t );
-template Array<bool, FunctionTable>::Array( const std::vector<size_t>&, const bool* );
-template Array<bool, FunctionTable>::Array( std::string );
-template Array<bool, FunctionTable>::Array( std::initializer_list<bool> );
-template Array<bool, FunctionTable>::Array( const Array<bool, FunctionTable>& );
-template Array<bool, FunctionTable>::Array( Array<bool, FunctionTable>&& );
-template Array<bool, FunctionTable>& Array<bool, FunctionTable>::operator=( const Array<bool, FunctionTable>& );
-template Array<bool, FunctionTable>& Array<bool, FunctionTable>::operator=( Array<bool, FunctionTable>&& );
-template Array<bool, FunctionTable>& Array<bool, FunctionTable>::operator=( const std::vector<bool>& );
-template void Array<bool, FunctionTable>::fill(bool const&);
-template void Array<bool, FunctionTable>::clear();
-template bool Array<bool, FunctionTable>::operator==(Array<bool, FunctionTable> const&) const;
-template void Array<bool, FunctionTable>::resize( ArraySize const& );
-// clang-format on
+instantiateArrayConstructors( bool )
+template Array<bool,FunctionTable>& Array<bool,FunctionTable>::operator=( const std::vector<bool>& );
+template void Array<bool,FunctionTable>::clear();
+template bool Array<bool,FunctionTable>::operator==(Array<bool,FunctionTable> const&) const;
+template void Array<bool,FunctionTable>::resize( ArraySize const& );


 /********************************************************
 *  Explicit instantiations of Array<std::complex>       *
 ********************************************************/
-// clang-format off
-template Array<std::complex<double>, FunctionTable>::Array();
-template Array<std::complex<double>, FunctionTable>::~Array();
-template Array<std::complex<double>, FunctionTable>::Array( size_t );
-template Array<std::complex<double>, FunctionTable>::Array( size_t, size_t );
-template Array<std::complex<double>, FunctionTable>::Array( size_t, size_t, size_t );
-template Array<std::complex<double>, FunctionTable>::Array( size_t, size_t, size_t, size_t );
-template Array<std::complex<double>, FunctionTable>::Array( size_t, size_t, size_t, size_t, size_t );
-template Array<std::complex<double>, FunctionTable>::Array( const std::vector<size_t>&, const std::complex<double>* );
-template Array<std::complex<double>, FunctionTable>::Array( std::initializer_list<std::complex<double>> );
-template Array<std::complex<double>, FunctionTable>::Array( const Range<std::complex<double>>& range );
-template Array<std::complex<double>, FunctionTable>::Array( const Array<std::complex<double>, FunctionTable>& );
-template Array<std::complex<double>, FunctionTable>::Array( Array<std::complex<double>, FunctionTable>&& );
-template Array<std::complex<double>, FunctionTable>& Array<std::complex<double>, FunctionTable>::operator=( const Array<std::complex<double>, FunctionTable>& );
-template Array<std::complex<double>, FunctionTable>& Array<std::complex<double>, FunctionTable>::operator=( Array<std::complex<double>, FunctionTable>&& );
-template Array<std::complex<double>, FunctionTable>& Array<std::complex<double>, FunctionTable>::operator=( const std::vector<std::complex<double>>& );
-template void Array<std::complex<double>, FunctionTable>::resize( ArraySize const& );
-template void Array<std::complex<double>, FunctionTable>::viewRaw( ArraySize const&, std::complex<double>*, bool, bool );
-template void Array<std::complex<double>, FunctionTable>::fill(std::complex<double> const&);
-template void Array<std::complex<double>, FunctionTable>::clear();
-template bool Array<std::complex<double>, FunctionTable>::operator==(Array<std::complex<double>, FunctionTable> const&) const;
-template Array<std::complex<double>, FunctionTable> Array<std::complex<double>, FunctionTable>::repmat(std::vector<unsigned long, std::allocator<unsigned long> > const&) const;
-// clang-format on
+instantiateArrayConstructors( std::complex<float> )
+instantiateArrayConstructors( std::complex<double> )
+template void Array<std::complex<float>,FunctionTable>::resize( ArraySize const& );
+template void Array<std::complex<double>,FunctionTable>::resize( ArraySize const& );
+template Array<std::complex<double>,FunctionTable>& Array<std::complex<double>,FunctionTable>::operator=(std::vector<std::complex<double>> const&);
+template Array<std::complex<float>,FunctionTable>& Array<std::complex<float>,FunctionTable>::operator=(std::vector<std::complex<float>> const&);
+template void Array<std::complex<float>,FunctionTable>::clear();
+template void Array<std::complex<double>,FunctionTable>::clear();
+template bool Array<std::complex<float>,FunctionTable>::operator==(Array<std::complex<float>,FunctionTable> const&) const;
+template bool Array<std::complex<double>,FunctionTable>::operator==(Array<std::complex<double>,FunctionTable> const&) const;
+template Array<std::complex<float>,FunctionTable> Array<std::complex<float>,FunctionTable>::repmat(std::vector<unsigned long> const&) const;
+template Array<std::complex<double>,FunctionTable> Array<std::complex<double>,FunctionTable>::repmat(std::vector<unsigned long> const&) const;
+template void Array<std::complex<float>,FunctionTable>::copySubset(std::vector<unsigned long> const&, Array<std::complex<float>,FunctionTable> const&);
+template void Array<std::complex<double>,FunctionTable>::copySubset(std::vector<unsigned long> const&, Array<std::complex<double>,FunctionTable> const&);
+template Array<std::complex<float>,FunctionTable> Array<std::complex<float>,FunctionTable>::subset(std::vector<unsigned long> const&) const;
+template Array<std::complex<double>,FunctionTable> Array<std::complex<double>,FunctionTable>::subset(std::vector<unsigned long> const&) const;
+template bool Array<std::complex<float>,FunctionTable>::NaNs() const;
+template bool Array<std::complex<double>,FunctionTable>::NaNs() const;


 /********************************************************
 *  Explicit instantiations of Array<std::string>        *
 ********************************************************/
-// clang-format off
-template Array<std::string, FunctionTable>::Array();
-template Array<std::string, FunctionTable>::~Array();
-template Array<std::string, FunctionTable>::Array( size_t );
-template Array<std::string, FunctionTable>::Array( size_t, size_t );
-template Array<std::string, FunctionTable>::Array( size_t, size_t, size_t );
-template Array<std::string, FunctionTable>::Array( size_t, size_t, size_t, size_t );
-template Array<std::string, FunctionTable>::Array( size_t, size_t, size_t, size_t, size_t );
-template Array<std::string, FunctionTable>::Array( const std::vector<size_t>&, const std::string* );
-template Array<std::string, FunctionTable>::Array( std::initializer_list<std::string> );
-template Array<std::string, FunctionTable>::Array( const Array<std::string, FunctionTable>& );
-template Array<std::string, FunctionTable>::Array( Array<std::string, FunctionTable>&& );
-template Array<std::string, FunctionTable>& Array<std::string, FunctionTable>::operator=( const Array<std::string, FunctionTable>& );
-template Array<std::string, FunctionTable>& Array<std::string, FunctionTable>::operator=( Array<std::string, FunctionTable>&& );
-template Array<std::string, FunctionTable>& Array<std::string, FunctionTable>::operator=( const std::vector<std::string>& );
-template void Array<std::string, FunctionTable>::resize( ArraySize const& );
-// clang-format on
+instantiateArrayConstructors( std::string )
+template void Array<std::string,FunctionTable>::resize( ArraySize const& );
+template void Array<std::string,FunctionTable>::clear();
+template Array<std::string, FunctionTable> &Array<std::string, FunctionTable>::
+operator=( const std::vector<std::string> & );
+template bool Array<std::string>::operator==(Array<std::string> const&) const;
+
+
+#if defined( USING_ICC )
+ENABLE_WARNINGS
+#endif
+
+
--- a/common/Array.h
+++ b/common/Array.h
@ -24,6 +24,7 @@
 #include <initializer_list>
 #include <iostream>
 #include <memory>
+#include <string>
 #include <vector>


@ -90,13 +91,7 @@ public: // Constructors / assignment operators
     * @param N             Number of elements in each dimension
     * @param data          Optional raw array to copy the src data
     */
-    explicit Array( const std::vector<size_t> &N, const TYPE *data = NULL );
-
-    /*!
-     * Create a 1D Array with the range
-     * @param range         Range of the data
-     */
-    explicit Array( const Range<TYPE> &range );
+    explicit Array( const std::vector<size_t> &N, const TYPE *data = nullptr );

    /*!
     * Create a 1D Array using a string that mimic's MATLAB
@ -110,6 +105,12 @@ public: // Constructors / assignment operators
     */
    Array( std::initializer_list<TYPE> data );

+    /*!
+     * Create a 2D Array with the given initializer lists
+     * @param data          Input data
+     */
+    Array( std::initializer_list<std::initializer_list<TYPE>> data );
+

    /*!
     * Copy constructor
@ -160,7 +161,7 @@ public: // Views/copies/subset
     * @param N             Number of elements in each dimension
     * @param data          Pointer to the data
     */
-    static std::unique_ptr<Array> view( const ArraySize &N, std::shared_ptr<TYPE> &data );
+    static std::unique_ptr<Array> view( const ArraySize &N, std::shared_ptr<TYPE> data );


    /*!
@ -168,8 +169,8 @@ public: // Views/copies/subset
     * @param N             Number of elements in each dimension
     * @param data          Pointer to the data
     */
-    static std::unique_ptr<const Array> constView(
-        const ArraySize &N, std::shared_ptr<const TYPE> const &data );
+    static std::unique_ptr<const Array> constView( const ArraySize &N,
+                                                   std::shared_ptr<const TYPE> const &data );


    /*!
@ -183,7 +184,7 @@ public: // Views/copies/subset
     * @param N             Number of elements in each dimension
     * @param data          Pointer to the data
     */
-    void view2( const ArraySize &N, std::shared_ptr<TYPE> const &data );
+    void view2( const ArraySize &N, std::shared_ptr<TYPE> data );

    /*!
     * Make this object a view of the raw data (expert use only).
@ -218,14 +219,30 @@ public: // Views/copies/subset
     */
    void viewRaw( const ArraySize &N, TYPE *data, bool isCopyable = true, bool isFixedSize = true );

+    /*!
+     * Create an array view of the given data (expert use only).
+     * Use view2( N, shared_ptr(data,[](TYPE*){}) ) instead.
+     *   Note: this interface is not recommended as it does not protect from
+     *   the src data being deleted while still being used by the Array.
+     *   Additionally for maximum performance it does not set the internal shared_ptr
+     *   so functions like getPtr and resize will not work correctly.
+     * @param N             Number of elements in each dimension
+     * @param data          Pointer to the data
+     */
+    static inline Array staticView( const ArraySize &N, TYPE *data )
+    {
+        Array x;
+        x.viewRaw( N, data, true, true );
+        return x;
+    }

    /*!
     * Convert an array of one type to another.  This may or may not allocate new memory.
     * @param array         Input array
     */
    template<class TYPE2>
-    static inline std::unique_ptr<Array<TYPE2, FUN, Allocator>> convert(
-        std::shared_ptr<Array<TYPE, FUN, Allocator>> array )
+    static inline std::unique_ptr<Array<TYPE2, FUN, Allocator>>
+    convert( std::shared_ptr<Array<TYPE, FUN, Allocator>> array )
    {
        auto array2 = std::make_unique<Array<TYPE2>>( array->size() );
        array2.copy( *array );
@ -238,8 +255,8 @@ public: // Views/copies/subset
     * @param array         Input array
     */
    template<class TYPE2>
-    static inline std::unique_ptr<const Array<TYPE2, FUN, Allocator>> convert(
-        std::shared_ptr<const Array<TYPE, FUN, Allocator>> array )
+    static inline std::unique_ptr<const Array<TYPE2, FUN, Allocator>>
+    convert( std::shared_ptr<const Array<TYPE, FUN, Allocator>> array )
    {
        auto array2 = std::make_unique<Array<TYPE2>>( array->size() );
        array2.copy( *array );
@ -251,8 +268,8 @@ public: // Views/copies/subset
     * Copy and convert data from another array to this array
     * @param array         Source array
     */
-    template<class TYPE2>
-    void inline copy( const Array<TYPE2, FUN, Allocator> &array )
+    template<class TYPE2, class FUN2, class Allocator2>
+    void inline copy( const Array<TYPE2, FUN2, Allocator2> &array )
    {
        resize( array.size() );
        copy( array.data() );
@ -261,51 +278,55 @@ public: // Views/copies/subset
    /*!
     * Copy and convert data from a raw vector to this array.
     *    Note: The current array must be allocated to the proper size first.
-     * @param array         Source array
+     * @param data          Source data
     */
    template<class TYPE2>
-    void inline copy( const TYPE2 *data )
-    {
-        for ( size_t i = 0; i < d_size.length(); i++ )
-            d_data[i] = static_cast<TYPE>( data[i] );
-    }
+    inline void copy( const TYPE2 *data );

    /*!
     * Copy and convert data from this array to a raw vector.
-     * @param array         Source array
+     * @param data          Destination data
     */
    template<class TYPE2>
-    void inline copyTo( TYPE2 *data ) const
-    {
-        for ( size_t i = 0; i < d_size.length(); i++ )
-            data[i] = static_cast<TYPE2>( d_data[i] );
-    }
+    inline void copyTo( TYPE2 *data ) const;

    /*!
     * Copy and convert data from this array to a new array
     */
    template<class TYPE2>
-    Array<TYPE2, FUN, Allocator> inline cloneTo() const
+    Array<TYPE2, FUN, std::allocator<TYPE2>> inline cloneTo() const
    {
-        Array<TYPE2, FUN> dst( this->size() );
+        Array<TYPE2, FUN, std::allocator<TYPE2>> dst( this->size() );
        copyTo( dst.data() );
        return dst;
    }

+
    /*! swap the raw data pointers for the Arrays after checking for compatibility */
    void swap( Array &other );

+
    /*!
     * Fill the array with the given value
-     * @param value         Value to fill
+     * @param y         Value to fill
     */
-    void fill( const TYPE &value );
+    inline void fill( const TYPE &y )
+    {
+        for ( auto &x : *this )
+            x = y;
+    }

    /*!
     * Scale the array by the given value
-     * @param scale         Value to scale by
+     * @param y         Value to scale by
     */
-    void scale( const TYPE &scale );
+    template<class TYPE2>
+    inline void scale( const TYPE2 &y )
+    {
+        for ( auto &x : *this )
+            x *= y;
+    }
+

    /*!
     * Set the values of this array to pow(base, exp)
@ -314,6 +335,7 @@ public: // Views/copies/subset
     */
    void pow( const Array &base, const TYPE &exp );

+
    //! Destructor
    ~Array();

@ -342,6 +364,10 @@ public: // Views/copies/subset
    inline bool empty() const { return d_size.length() == 0; }


+    //! Return true if the Array is not empty
+    inline operator bool() const { return d_size.length() != 0; }
+
+
    /*!
     * Resize the Array
     * @param N             NUmber of elements
@ -387,6 +413,12 @@ public: // Views/copies/subset
    void reshape( const ArraySize &N );


+    /*!
+     * Remove singleton dimensions.
+     */
+    void squeeze();
+
+
    /*!
     * Reshape the Array so that the number of dimensions is the
     *    max of ndim and the largest dim>1.
@ -515,8 +547,8 @@ public: // Accessors
     * @param i3            The third index
     * @param i4            The fourth index
     */
-    ARRAY_ATTRIBUTE inline const TYPE &operator()(
-        size_t i1, size_t i2, size_t i3, size_t i4 ) const
+    ARRAY_ATTRIBUTE inline const TYPE &
+    operator()( size_t i1, size_t i2, size_t i3, size_t i4 ) const
    {
        return d_data[d_size.index( i1, i2, i3, i4 )];
    }
@ -542,8 +574,8 @@ public: // Accessors
     * @param i4            The fourth index
     * @param i5            The fifth index
     */
-    ARRAY_ATTRIBUTE inline const TYPE &operator()(
-        size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 ) const
+    ARRAY_ATTRIBUTE inline const TYPE &
+    operator()( size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 ) const
    {
        return d_data[d_size.index( i1, i2, i3, i4, i5 )];
    }
@ -616,6 +648,12 @@ public: // Math operations
    //! Concatenates the arrays along the dimension dim.
    static Array cat( const std::vector<Array> &x, int dim = 0 );

+    //! Concatenates the arrays in the initializer list along the dimension dim.
+    static Array cat( const std::initializer_list<Array> &x, int dim = 0 );
+
+    //! Concatenates the N_array arrays pointed to by x along the dimension dim.
+    static Array cat( size_t N_array, const Array *x, int dim );
+
    //! Concatenates a given array with the current array
    void cat( const Array &x, int dim = 0 );

@ -671,20 +709,37 @@ public: // Math operations
    TYPE mean( const std::vector<Range<size_t>> &index ) const;

    //! Find all elements that match the operator
-    std::vector<size_t> find(
-        const TYPE &value, std::function<bool( const TYPE &, const TYPE & )> compare ) const;
+    std::vector<size_t> find( const TYPE &value,
+                              std::function<bool( const TYPE &, const TYPE & )> compare ) const;


    //! Print an array
-    void print(
-        std::ostream &os, const std::string &name = "A", const std::string &prefix = "" ) const;
-
-    //! Multiply two arrays
-    static Array multiply( const Array &a, const Array &b );
+    void
+    print( std::ostream &os, const std::string &name = "A", const std::string &prefix = "" ) const;

    //! Transpose an array
    Array reverseDim() const;

+    /*!
+     * @brief  Shift dimensions
+     * @details  Shifts the dimensions of the array by N.  When N is positive,
+     *    shiftDim shifts the dimensions to the left and wraps the
+     *    N leading dimensions to the end.  When N is negative,
+     *    shiftDim shifts the dimensions to the right and pads with singletons.
+     * @param N             Desired shift
+     */
+    Array shiftDim( int N ) const;
+
+    /*!
+     * @brief   Permute array dimensions
+     * @details  Rearranges the dimensions of the array so that they
+     *    are in the order specified by the vector index.
+     *    The array produced has the same values as this array, but the order of the
+     *    subscripts needed to access any particular element is rearranged as specified.
+     * @param index        Desired order of the subscripts
+     */
+    Array permute( const std::vector<uint8_t> &index ) const;
+
    //! Replicate an array a given number of times in each direction
    Array repmat( const std::vector<size_t> &N ) const;

@ -692,8 +747,8 @@ public: // Math operations
    Array coarsen( const Array &filter ) const;

    //! Coarsen an array using the given filter
-    Array coarsen(
-        const std::vector<size_t> &ratio, std::function<TYPE( const Array & )> filter ) const;
+    Array coarsen( const std::vector<size_t> &ratio,
+                   std::function<TYPE( const Array & )> filter ) const;

    /*!
     * Perform a element-wise operation y = f(x)
@ -708,8 +763,9 @@ public: // Math operations
     * @param[in] x             The first array
     * @param[in] y             The second array
     */
-    static Array transform(
-        std::function<TYPE( const TYPE &, const TYPE & )> fun, const Array &x, const Array &y );
+    static Array transform( std::function<TYPE( const TYPE &, const TYPE & )> fun,
+                            const Array &x,
+                            const Array &y );

    /*!
     * axpby operation: this = alpha*x + beta*this
@ -723,7 +779,13 @@ public: // Math operations
     * Linear interpolation
     * @param[in] x             Position as a decimal index
     */
-    TYPE interp( const std::vector<double> &x ) const;
+    inline TYPE interp( const std::vector<double> &x ) const { return interp( x.data() ); }
+
+    /*!
+     * Linear interpolation
+     * @param[in] x             Position as a decimal index
+     */
+    TYPE interp( const double *x ) const;

    /**
     * \fn equals (Array & const rhs, TYPE tol )
@ -746,8 +808,10 @@ private:
    inline void checkSubsetIndex( const std::vector<Range<size_t>> &range ) const;
    inline std::vector<Range<size_t>> convert( const std::vector<size_t> &index ) const;
    static inline void getSubsetArrays( const std::vector<Range<size_t>> &range,
-        std::array<size_t, 5> &first, std::array<size_t, 5> &last, std::array<size_t, 5> &inc,
-        std::array<size_t, 5> &N );
+                                        std::array<size_t, 5> &first,
+                                        std::array<size_t, 5> &last,
+                                        std::array<size_t, 5> &inc,
+                                        std::array<size_t, 5> &N );
 };


@ -772,8 +836,8 @@ inline Array<TYPE, FUN, Allocator> operator+(
    const Array<TYPE, FUN, Allocator> &a, const Array<TYPE, FUN, Allocator> &b )
 {
    Array<TYPE, FUN, Allocator> c;
-    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a + b; };
-    FUN::transform( fun, a, b, c );
+    const auto &op = []( const TYPE &a, const TYPE &b ) { return a + b; };
+    FUN::transform( op, a, b, c );
    return c;
 }
 template<class TYPE, class FUN, class Allocator>
@ -781,30 +845,78 @@ inline Array<TYPE, FUN, Allocator> operator-(
    const Array<TYPE, FUN, Allocator> &a, const Array<TYPE, FUN, Allocator> &b )
 {
    Array<TYPE, FUN, Allocator> c;
-    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a - b; };
-    FUN::transform( fun, a, b, c );
+    const auto &op = []( const TYPE &a, const TYPE &b ) { return a - b; };
+    FUN::transform( op, a, b, c );
    return c;
 }
 template<class TYPE, class FUN, class Allocator>
 inline Array<TYPE, FUN, Allocator> operator*(
    const Array<TYPE, FUN, Allocator> &a, const Array<TYPE, FUN, Allocator> &b )
 {
-    return Array<TYPE, FUN, Allocator>::multiply( a, b );
+    Array<TYPE, FUN, Allocator> c;
+    FUN::multiply( a, b, c );
+    return c;
 }
 template<class TYPE, class FUN, class Allocator>
 inline Array<TYPE, FUN, Allocator> operator*(
    const Array<TYPE, FUN, Allocator> &a, const std::vector<TYPE> &b )
 {
-    Array<TYPE, FUN, Allocator> b2;
+    Array<TYPE, FUN, Allocator> b2, c;
    b2.viewRaw( { b.size() }, const_cast<TYPE *>( b.data() ) );
-    return Array<TYPE, FUN, Allocator>::multiply( a, b2 );
+    FUN::multiply( a, b2, c );
+    return c;
+}
+template<class TYPE, class FUN, class Allocator>
+inline Array<TYPE, FUN, Allocator> operator*( const TYPE &a,
+                                              const Array<TYPE, FUN, Allocator> &b )
+{
+    auto c = b;
+    c.scale( a );
+    return c;
+}
+template<class TYPE, class FUN, class Allocator>
+inline Array<TYPE, FUN, Allocator> operator*( const Array<TYPE, FUN, Allocator> &a,
+                                              const TYPE &b )
+{
+    auto c = a;
+    c.scale( b );
+    return c;
+}
+
+
+/********************************************************
+ *  Copy array                                           *
+ ********************************************************/
+template<class TYPE, class FUN, class Allocator>
+template<class TYPE2>
+inline void Array<TYPE, FUN, Allocator>::copy( const TYPE2 *data )
+{
+    if ( std::is_same<TYPE, TYPE2>::value ) {
+        std::copy( data, data + d_size.length(), d_data );
+    } else {
+        for ( size_t i = 0; i < d_size.length(); i++ )
+            d_data[i] = static_cast<TYPE>( data[i] );
+    }
+}
+template<class TYPE, class FUN, class Allocator>
+template<class TYPE2>
+inline void Array<TYPE, FUN, Allocator>::copyTo( TYPE2 *data ) const
+{
+    if ( std::is_same<TYPE, TYPE2>::value ) {
+        std::copy( d_data, d_data + d_size.length(), data );
+    } else {
+        for ( size_t i = 0; i < d_size.length(); i++ )
+            data[i] = static_cast<TYPE2>( d_data[i] );
+    }
 }


 /********************************************************
 *  Convience typedefs                                   *
+ *  (shorthand names for common Array types)             *
 ********************************************************/
 typedef Array<double> DoubleArray;
 typedef Array<int> IntArray;

+
 #endif
--- a/common/Array.hpp
+++ b/common/Array.hpp
@ -49,65 +49,46 @@
 /********************************************************
 *  External instantiations                              *
 ********************************************************/
-extern template class Array<char, FunctionTable>;
-extern template class Array<uint8_t, FunctionTable>;
-extern template class Array<uint16_t, FunctionTable>;
-extern template class Array<uint32_t, FunctionTable>;
-extern template class Array<uint64_t, FunctionTable>;
-extern template class Array<int8_t, FunctionTable>;
-extern template class Array<int16_t, FunctionTable>;
-extern template class Array<int32_t, FunctionTable>;
-extern template class Array<int64_t, FunctionTable>;
-extern template class Array<double, FunctionTable>;
-extern template class Array<float, FunctionTable>;
+extern template class Array<char>;
+extern template class Array<uint8_t>;
+extern template class Array<uint16_t>;
+extern template class Array<uint32_t>;
+extern template class Array<uint64_t>;
+extern template class Array<int8_t>;
+extern template class Array<int16_t>;
+extern template class Array<int32_t>;
+extern template class Array<int64_t>;
+extern template class Array<double>;
+extern template class Array<float>;


 /********************************************************
- *  Helper functions                                     *
+ *  Macros to help instantiate functions                 *
 ********************************************************/
-template<class TYPE>
-inline typename std::enable_if<std::is_integral<TYPE>::value, size_t>::type getRangeSize(
-    const Range<TYPE> &range )
-{
-    return ( static_cast<int64_t>( range.j ) - static_cast<int64_t>( range.i ) ) /
-           static_cast<int64_t>( range.k );
-}
-template<class TYPE>
-inline typename std::enable_if<std::is_floating_point<TYPE>::value, size_t>::type getRangeSize(
-    const Range<TYPE> &range )
-{
-    double tmp = static_cast<double>( ( range.j - range.i ) ) / static_cast<double>( range.k );
-    return static_cast<size_t>( floor( tmp + 1e-12 ) + 1 );
-}
-template<class TYPE>
-inline typename std::enable_if<std::is_same<TYPE, std::complex<float>>::value ||
-                                   std::is_same<TYPE, std::complex<double>>::value,
-    size_t>::type
-getRangeSize( const Range<TYPE> &range )
-{
-    double tmp = std::real( ( range.j - range.i ) / ( range.k ) );
-    return static_cast<size_t>( floor( tmp + 1e-12 ) + 1 );
-}
-template<class TYPE>
-inline typename std::enable_if<std::is_integral<TYPE>::value, TYPE>::type getRangeValue(
-    const Range<TYPE> &range, size_t index )
-{
-    return range.i + index * range.k;
-}
-template<class TYPE>
-inline typename std::enable_if<std::is_floating_point<TYPE>::value, TYPE>::type getRangeValue(
-    const Range<TYPE> &range, size_t index )
-{
-    return range.k * ( range.i / range.k + index );
-}
-template<class TYPE>
-inline typename std::enable_if<std::is_same<TYPE, std::complex<float>>::value ||
-                                   std::is_same<TYPE, std::complex<double>>::value,
-    TYPE>::type
-getRangeValue( const Range<TYPE> &range, size_t index )
-{
-    return range.k * ( range.i / range.k + static_cast<TYPE>( index ) );
-}
+// clang-format off
+#define instantiateArrayConstructors( TYPE )                                    \
+    template Array<TYPE>::Array();                                              \
+    template Array<TYPE>::~Array();                                             \
+    template Array<TYPE>::Array( const ArraySize & );                           \
+    template Array<TYPE>::Array( size_t );                                      \
+    template Array<TYPE>::Array( size_t, size_t );                              \
+    template Array<TYPE>::Array( size_t, size_t, size_t );                      \
+    template Array<TYPE>::Array( size_t, size_t, size_t, size_t );              \
+    template Array<TYPE>::Array( size_t, size_t, size_t, size_t, size_t );      \
+    template Array<TYPE>::Array( const std::vector<size_t> &, const TYPE * );   \
+    template Array<TYPE>::Array( std::initializer_list<TYPE> );                 \
+    template Array<TYPE>::Array( std::initializer_list<std::initializer_list<TYPE>> ); \
+    template Array<TYPE>::Array( const Array<TYPE> & );                         \
+    template Array<TYPE>::Array( Array<TYPE> && );                              \
+    template void Array<TYPE>::reshape( ArraySize const& );                     \
+    template void Array<TYPE>::squeeze();                                       \
+    template std::unique_ptr<const Array<TYPE>>                                 \
+        Array<TYPE>::constView(ArraySize const&, std::shared_ptr<TYPE const> const&); \
+    template void Array<TYPE>::viewRaw( ArraySize const&, TYPE*, bool, bool );  \
+    template void Array<TYPE>::view2(ArraySize const&, std::shared_ptr<TYPE> ); \
+    template Array<TYPE> &Array<TYPE>::operator=( const Array<TYPE> & );        \
+    template Array<TYPE> &Array<TYPE>::operator=( Array<TYPE> && );
+// clang-format on


 /********************************************************
@ -158,19 +139,8 @@ Array<TYPE, FUN, Allocator>::Array( const std::vector<size_t> &N, const TYPE *da
    : d_isCopyable( true ), d_isFixedSize( false )
 {
    allocate( N );
-    if ( data ) {
-        for ( size_t i = 0; i < d_size.length(); i++ )
-            d_data[i] = data[i];
-    }
-}
-template<class TYPE, class FUN, class Allocator>
-Array<TYPE, FUN, Allocator>::Array( const Range<TYPE> &range )
-    : d_isCopyable( true ), d_isFixedSize( false )
-{
-    size_t N = getRangeSize( range );
-    allocate( { N } );
-    for ( size_t i = 0; i < N; i++ )
-        d_data[i] = getRangeValue( range, i );
+    if ( data )
+        copy( data );
 }
 template<class TYPE, class FUN, class Allocator>
 Array<TYPE, FUN, Allocator>::Array( std::string str ) : d_isCopyable( true ), d_isFixedSize( false )
@ -248,8 +218,12 @@ Array<TYPE, FUN, Allocator>::Array( std::string str ) : d_isCopyable( true ), d_
            i2 = str.length();
    }
    allocate( data.size() );
-    for ( size_t i = 0; i < data.size(); i++ )
-        d_data[i] = data[i];
+    if ( std::is_same<TYPE, bool>::value ) {
+        for ( size_t i = 0; i < data.size(); i++ )
+            d_data[i] = data[i];
+    } else {
+        copy( data.data() );
+    }
 }
 template<class TYPE, class FUN, class Allocator>
 Array<TYPE, FUN, Allocator>::Array( std::initializer_list<TYPE> x )
@ -261,19 +235,38 @@ Array<TYPE, FUN, Allocator>::Array( std::initializer_list<TYPE> x )
        d_data[i] = *it;
 }
 template<class TYPE, class FUN, class Allocator>
+Array<TYPE, FUN, Allocator>::Array( std::initializer_list<std::initializer_list<TYPE>> x )
+    : d_isCopyable( true ), d_isFixedSize( false )
+{
+    size_t Nx = x.size();
+    size_t Ny = 0;
+    for ( const auto y : x )
+        Ny = std::max<size_t>( Ny, y.size() );
+    allocate( { Nx, Ny } );
+    auto itx = x.begin();
+    for ( size_t i = 0; i < x.size(); ++i, ++itx ) {
+        auto ity = itx->begin();
+        for ( size_t j = 0; j < itx->size(); ++j, ++ity ) {
+            d_data[i + j * Nx] = *ity;
+        }
+    }
+}
+template<class TYPE, class FUN, class Allocator>
 void Array<TYPE, FUN, Allocator>::allocate( const ArraySize &N )
 {
    if ( d_isFixedSize )
        throw std::logic_error( "Array cannot be resized" );
    d_size      = N;
    auto length = d_size.length();
-    if ( length == 0 )
-        d_ptr.reset();
-    else
-        d_ptr.reset( new ( std::nothrow ) TYPE[length], []( TYPE *p ) { delete[] p; } );
-    d_data = d_ptr.get();
-    if ( length > 0 && d_data == nullptr )
-        throw std::logic_error( "Failed to allocate array" );
+    d_data      = nullptr;
+    if ( length > 0 ) {
+        try {
+            d_data = new TYPE[length];
+        } catch ( ... ) {
+            throw std::logic_error( "Failed to allocate array" );
+        }
+    }
+    d_ptr.reset( d_data, []( TYPE *p ) { delete[] p; } );
 }
 template<class TYPE, class FUN, class Allocator>
 Array<TYPE, FUN, Allocator>::Array( const Array &rhs )
@ -282,18 +275,19 @@ Array<TYPE, FUN, Allocator>::Array( const Array &rhs )
    if ( !rhs.d_isCopyable )
        throw std::logic_error( "Array cannot be copied" );
    allocate( rhs.size() );
-    for ( size_t i = 0; i < d_size.length(); i++ )
-        d_data[i] = rhs.d_data[i];
+    copy( rhs.d_data );
 }
 template<class TYPE, class FUN, class Allocator>
 Array<TYPE, FUN, Allocator>::Array( Array &&rhs )
    : d_isCopyable( rhs.d_isCopyable ),
      d_isFixedSize( rhs.d_isFixedSize ),
      d_size( rhs.d_size ),
-      d_data( rhs.d_data )
+      d_data( rhs.d_data ),
+      d_ptr( std::move( rhs.d_ptr ) )
 {
+    rhs.d_size = ArraySize();
    rhs.d_data = nullptr;
-    d_ptr      = std::move( rhs.d_ptr );
+    rhs.d_ptr  = nullptr;
 }
 template<class TYPE, class FUN, class Allocator>
 Array<TYPE, FUN, Allocator> &Array<TYPE, FUN, Allocator>::operator=( const Array &rhs )
@ -302,9 +296,8 @@ Array<TYPE, FUN, Allocator> &Array<TYPE, FUN, Allocator>::operator=( const Array
        return *this;
    if ( !rhs.d_isCopyable )
        throw std::logic_error( "Array cannot be copied" );
-    this->allocate( rhs.size() );
-    for ( size_t i = 0; i < d_size.length(); i++ )
-        this->d_data[i] = rhs.d_data[i];
+    allocate( rhs.size() );
+    copy( rhs.d_data );
    return *this;
 }
 template<class TYPE, class FUN, class Allocator>
@ -317,15 +310,17 @@ Array<TYPE, FUN, Allocator> &Array<TYPE, FUN, Allocator>::operator=( Array &&rhs
    d_size        = rhs.d_size;
    d_data        = rhs.d_data;
    d_ptr         = std::move( rhs.d_ptr );
+    rhs.d_size    = ArraySize();
    rhs.d_data    = nullptr;
+    rhs.d_ptr     = nullptr;
    return *this;
 }
 template<class TYPE, class FUN, class Allocator>
 Array<TYPE, FUN, Allocator> &Array<TYPE, FUN, Allocator>::operator=( const std::vector<TYPE> &rhs )
 {
-    this->allocate( ArraySize( rhs.size() ) );
+    allocate( ArraySize( rhs.size() ) );
    for ( size_t i = 0; i < rhs.size(); i++ )
-        this->d_data[i] = rhs[i];
+        d_data[i] = rhs[i];
    return *this;
 }
 template<class TYPE, class FUN, class Allocator>
@ -363,9 +358,9 @@ static inline void moveValues( const ArraySize &N1, const ArraySize &N2, TYPE *d
        }
    }
 }
-template<bool test, class TYPE>
-static inline typename std::enable_if<test, void>::type copyValues(
-    const ArraySize &N1, const ArraySize &N2, const TYPE *data1, TYPE *data2 )
+template<class TYPE>
+static inline void
+copyValues( const ArraySize &N1, const ArraySize &N2, const TYPE *data1, TYPE *data2 )
 {
    for ( size_t i5 = 0; i5 < std::min( N1[4], N2[4] ); i5++ ) {
        for ( size_t i4 = 0; i4 < std::min( N1[3], N2[3] ); i4++ ) {
@ -381,12 +376,6 @@ static inline typename std::enable_if<test, void>::type copyValues(
        }
    }
 }
-template<bool test, class TYPE>
-static inline typename std::enable_if<!test, void>::type copyValues(
-    const ArraySize &, const ArraySize &, const TYPE *, TYPE * )
-{
-    throw std::logic_error( "No copy constructor" );
-}


 /********************************************************
@ -413,9 +402,11 @@ void Array<TYPE, FUN, Allocator>::resize( const ArraySize &N )
        if ( data0.use_count() <= 1 ) {
            // We own the data, use std:move
            moveValues( N0, N, data0.get(), d_data );
-        } else {
+        } else if ( std::is_copy_constructible<TYPE>::value ) {
            // We do not own the data, copy
-            copyValues<std::is_copy_constructible<TYPE>::value, TYPE>( N0, N, data0.get(), d_data );
+            copyValues( N0, N, data0.get(), d_data );
+        } else {
+            throw std::logic_error( "No copy constructor" );
        }
    }
 }
@ -444,7 +435,7 @@ void Array<TYPE, FUN, Allocator>::resizeDim( int dim, size_t N, const TYPE &valu


 /********************************************************
- *  Reshape the array                                     *
+ *  Reshape/squeeze the array                            *
 ********************************************************/
 template<class TYPE, class FUN, class Allocator>
 void Array<TYPE, FUN, Allocator>::reshape( const ArraySize &N )
@ -453,6 +444,85 @@ void Array<TYPE, FUN, Allocator>::reshape( const ArraySize &N )
        throw std::logic_error( "reshape is not allowed to change the array size" );
    d_size = N;
 }
+// Squeeze the array shape in place by delegating to ArraySize::squeeze().
+// Only d_size is modified; the data pointer and contents are untouched.
+// NOTE(review): presumably this drops singleton dimensions - confirm in ArraySize.
+template<class TYPE, class FUN, class Allocator>
+void Array<TYPE, FUN, Allocator>::squeeze()
+{
+    d_size.squeeze();
+}
+
+
+/********************************************************
+ *  Shift/permute the array                              *
+ ********************************************************/
+// Return a copy of the array with its dimensions shifted by N.
+//   N == 0: returns an unmodified copy.
+//   N >  0: shifts the dimensions to the left (N is first reduced modulo the
+//           number of dimensions); leading dimensions wrap around to the end.
+//           Implemented as a permutation, see permute().
+//   N <  0: shifts the dimensions to the right by |N|, padding the leading
+//           dimensions with singletons.  The data is not reordered, only the
+//           shape changes.  The result may have at most ArraySize::maxDim()
+//           dimensions (checked with ASSERT).
+template<class TYPE, class FUN, class Allocator>
+Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::shiftDim( int N ) const
+{
+    if ( N > 0 )
+        N = N % d_size.ndim();
+    if ( N == 0 ) {
+        // No shift required
+        return *this;
+    } else if ( N > 0 ) {
+        // Shift to the left and wrap
+        std::vector<uint8_t> index( d_size.ndim() );
+        size_t i = 0;
+        for ( size_t j = N; j < index.size(); j++, i++ )
+            index[i] = j;
+        for ( size_t j = 0; i < index.size(); j++, i++ )
+            index[i] = j;
+        return permute( index );
+    } else {
+        // Shift to the right (padding with singletons)
+        N = -N;
+        // A result with exactly maxDim() dimensions is still a valid shape
+        ASSERT( d_size.ndim() + N <= (int) ArraySize::maxDim() );
+        // Every padding entry must be 1: "= { 1 }" would only set dims[0]
+        // and zero the rest, producing a zero-length shape for N >= 2
+        size_t dims[10] = { 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u };
+        for ( int i = 0; i < ndim(); i++ )
+            dims[N + i] = d_size[i];
+        auto y = *this;
+        y.reshape( ArraySize( ndim() + N, dims ) );
+        return y;
+    }
+}
+// Return a copy of the array with its dimensions reordered.
+// Dimension i of the result corresponds to dimension index[i] of this array,
+// i.e. result.size( i ) == size( index[i] ).
+// index must contain exactly ndim() entries, each less than ndim() and with no
+// repeats (checked with ASSERT).
+template<class TYPE, class FUN, class Allocator>
+Array<TYPE, FUN, Allocator>
+Array<TYPE, FUN, Allocator>::permute( const std::vector<uint8_t> &index ) const
+{
+    // Check the permutation
+    ASSERT( (int) index.size() == ndim() );
+    for ( int i = 0; i < ndim(); i++ ) {
+        ASSERT( index[i] < ndim() );
+        for ( int j = 0; j < i; j++ )
+            ASSERT( index[i] != index[j] );
+    }
+    // Create the new Array (dimensions beyond ndim() stay at 1)
+    size_t dims[5] = { 1u, 1u, 1u, 1u, 1u };
+    for ( size_t i = 0; i < index.size(); i++ )
+        dims[i] = d_size[index[i]];
+    // No pre-fill needed: the loops below assign every element of y
+    Array y( ArraySize( ndim(), dims ) );
+    ASSERT( y.length() == this->length() );
+    // N[i] is the offset in the source data of a unit step along the source
+    // dimension that becomes dimension i of the result
+    size_t N[5] = { 1u, 1u, 1u, 1u, 1u };
+    for ( int i = 0; i < ndim(); i++ ) {
+        std::array<size_t, 5> ijk = { 0, 0, 0, 0, 0 };
+        ijk[index[i]]             = 1;
+        N[i]                      = d_size.index( ijk );
+    }
+    // Sanity check: the last element of the result maps to the last source element
+    size_t tmp = ( dims[0] - 1 ) * N[0] + ( dims[1] - 1 ) * N[1] + ( dims[2] - 1 ) * N[2] +
+                 ( dims[3] - 1 ) * N[3] + ( dims[4] - 1 ) * N[4] + 1;
+    ASSERT( tmp == length() );
+    // Fill the data
+    for ( size_t i4 = 0; i4 < dims[4]; i4++ ) {
+        for ( size_t i3 = 0; i3 < dims[3]; i3++ ) {
+            for ( size_t i2 = 0; i2 < dims[2]; i2++ ) {
+                for ( size_t i1 = 0; i1 < dims[1]; i1++ ) {
+                    for ( size_t i0 = 0; i0 < dims[0]; i0++ ) {
+                        size_t index2 =
+                            i0 * N[0] + i1 * N[1] + i2 * N[2] + i3 * N[3] + i4 * N[4];
+                        y( i0, i1, i2, i3, i4 ) = d_data[index2];
+                    }
+                }
+            }
+        }
+    }
+    return y;
+}


 /********************************************************
@ -460,8 +530,8 @@ void Array<TYPE, FUN, Allocator>::reshape( const ArraySize &N )
 ********************************************************/
 // Helper function to check subset indices
 template<class TYPE, class FUN, class Allocator>
-inline void Array<TYPE, FUN, Allocator>::checkSubsetIndex(
-    const std::vector<Range<size_t>> &range ) const
+inline void
+Array<TYPE, FUN, Allocator>::checkSubsetIndex( const std::vector<Range<size_t>> &range ) const
 {
    bool test = (int) range.size() == d_size.ndim();
    for ( size_t d = 0; d < range.size(); d++ )
@ -470,8 +540,8 @@ inline void Array<TYPE, FUN, Allocator>::checkSubsetIndex(
        throw std::logic_error( "indices for subset are invalid" );
 }
 template<class TYPE, class FUN, class Allocator>
-std::vector<Range<size_t>> Array<TYPE, FUN, Allocator>::convert(
-    const std::vector<size_t> &index ) const
+std::vector<Range<size_t>>
+Array<TYPE, FUN, Allocator>::convert( const std::vector<size_t> &index ) const
 {
    std::vector<Range<size_t>> range( d_size.ndim() );
    if ( index.size() % 2 != 0 || static_cast<int>( index.size() / 2 ) < d_size.ndim() )
@ -483,8 +553,10 @@ std::vector<Range<size_t>> Array<TYPE, FUN, Allocator>::convert(
 // Helper function to return dimensions for the subset array
 template<class TYPE, class FUN, class Allocator>
 void Array<TYPE, FUN, Allocator>::getSubsetArrays( const std::vector<Range<size_t>> &index,
-    std::array<size_t, 5> &first, std::array<size_t, 5> &last, std::array<size_t, 5> &inc,
-    std::array<size_t, 5> &N )
+                                                   std::array<size_t, 5> &first,
+                                                   std::array<size_t, 5> &last,
+                                                   std::array<size_t, 5> &inc,
+                                                   std::array<size_t, 5> &N )
 {
    first.fill( 0 );
    last.fill( 0 );
@ -499,8 +571,8 @@ void Array<TYPE, FUN, Allocator>::getSubsetArrays( const std::vector<Range<size_
    }
 }
 template<class TYPE, class FUN, class Allocator>
-Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::subset(
-    const std::vector<Range<size_t>> &index ) const
+Array<TYPE, FUN, Allocator>
+Array<TYPE, FUN, Allocator>::subset( const std::vector<Range<size_t>> &index ) const
 {
    // Get the subset indicies
    checkSubsetIndex( index );
@ -508,9 +580,8 @@ Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::subset(
    getSubsetArrays( index, first, last, inc, N1 );
    ArraySize S1( d_size.ndim(), N1.data() );
    // Create the new array
-    Array<TYPE> subset_array( S1 );
+    Array<TYPE, FUN, Allocator> subset_array( S1 );
    // Fill the new array
-    static_assert( ArraySize::maxDim() == 5, "Not programmed for more than 5 dimensions" );
    TYPE *subset_data = subset_array.data();
    for ( size_t i4 = first[4], k1 = 0; i4 <= last[4]; i4 += inc[4] ) {
        for ( size_t i3 = first[3]; i3 <= last[3]; i3 += inc[3] ) {
@ -527,22 +598,21 @@ Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::subset(
    return subset_array;
 }
 template<class TYPE, class FUN, class Allocator>
-Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::subset(
-    const std::vector<size_t> &index ) const
+Array<TYPE, FUN, Allocator>
+Array<TYPE, FUN, Allocator>::subset( const std::vector<size_t> &index ) const
 {
    auto range = convert( index );
    return subset( range );
 }
 template<class TYPE, class FUN, class Allocator>
-void Array<TYPE, FUN, Allocator>::copySubset(
-    const std::vector<Range<size_t>> &index, const Array<TYPE, FUN, Allocator> &subset )
+void Array<TYPE, FUN, Allocator>::copySubset( const std::vector<Range<size_t>> &index,
+                                              const Array<TYPE, FUN, Allocator> &subset )
 {
    // Get the subset indices
    checkSubsetIndex( index );
    std::array<size_t, 5> first, last, inc, N1;
    getSubsetArrays( index, first, last, inc, N1 );
    // Copy the sub-array
-    static_assert( ArraySize::maxDim() == 5, "Not programmed for more than 5 dimensions" );
    const TYPE *src_data = subset.data();
    for ( size_t i4 = first[4], k1 = 0; i4 <= last[4]; i4 += inc[4] ) {
        for ( size_t i3 = first[3]; i3 <= last[3]; i3 += inc[3] ) {
@ -558,15 +628,14 @@ void Array<TYPE, FUN, Allocator>::copySubset(
    }
 }
 template<class TYPE, class FUN, class Allocator>
-void Array<TYPE, FUN, Allocator>::addSubset(
-    const std::vector<Range<size_t>> &index, const Array<TYPE, FUN, Allocator> &subset )
+void Array<TYPE, FUN, Allocator>::addSubset( const std::vector<Range<size_t>> &index,
+                                             const Array<TYPE, FUN, Allocator> &subset )
 {
    // Get the subset indices
    checkSubsetIndex( index );
    std::array<size_t, 5> first, last, inc, N1;
    getSubsetArrays( index, first, last, inc, N1 );
    // add the sub-array
-    static_assert( ArraySize::maxDim() == 5, "Not programmed for more than 5 dimensions" );
    for ( size_t i4 = first[4], k1 = 0; i4 <= last[4]; i4 += inc[4] ) {
        for ( size_t i3 = first[3]; i3 <= last[3]; i3 += inc[3] ) {
            for ( size_t i2 = first[2]; i2 <= last[2]; i2 += inc[2] ) {
@ -581,16 +650,16 @@ void Array<TYPE, FUN, Allocator>::addSubset(
    }
 }
 template<class TYPE, class FUN, class Allocator>
-void Array<TYPE, FUN, Allocator>::copySubset(
-    const std::vector<size_t> &index, const Array<TYPE, FUN, Allocator> &subset )
+void Array<TYPE, FUN, Allocator>::copySubset( const std::vector<size_t> &index,
+                                              const Array<TYPE, FUN, Allocator> &subset )
 {
    auto range = convert( index );
    copySubset( range, subset );
 }

 template<class TYPE, class FUN, class Allocator>
-void Array<TYPE, FUN, Allocator>::addSubset(
-    const std::vector<size_t> &index, const Array<TYPE, FUN, Allocator> &subset )
+void Array<TYPE, FUN, Allocator>::addSubset( const std::vector<size_t> &index,
+                                             const Array<TYPE, FUN, Allocator> &subset )
 {
    auto range = convert( index );
    addSubset( range, subset );
@ -618,8 +687,8 @@ bool Array<TYPE, FUN, Allocator>::operator==( const Array &rhs ) const
 *  Get a view of a C array                              *
 ********************************************************/
 template<class TYPE, class FUN, class Allocator>
-std::unique_ptr<Array<TYPE, FUN, Allocator>> Array<TYPE, FUN, Allocator>::view(
-    const ArraySize &N, std::shared_ptr<TYPE> &data )
+std::unique_ptr<Array<TYPE, FUN, Allocator>>
+Array<TYPE, FUN, Allocator>::view( const ArraySize &N, std::shared_ptr<TYPE> data )
 {
    auto array    = std::make_unique<Array<TYPE, FUN, Allocator>>();
    array->d_size = N;
@ -628,8 +697,9 @@ std::unique_ptr<Array<TYPE, FUN, Allocator>> Array<TYPE, FUN, Allocator>::view(
    return array;
 }
 template<class TYPE, class FUN, class Allocator>
-std::unique_ptr<const Array<TYPE, FUN, Allocator>> Array<TYPE, FUN, Allocator>::constView(
-    const ArraySize &N, std::shared_ptr<const TYPE> const &data )
+std::unique_ptr<const Array<TYPE, FUN, Allocator>>
+Array<TYPE, FUN, Allocator>::constView( const ArraySize &N,
+                                        std::shared_ptr<const TYPE> const &data )
 {
    auto array    = std::make_unique<Array<TYPE, FUN, Allocator>>();
    array->d_size = N;
@ -644,15 +714,17 @@ void Array<TYPE, FUN, Allocator>::view2( Array<TYPE, FUN, Allocator> &src )
    d_data = src.d_data;
 }
 template<class TYPE, class FUN, class Allocator>
-void Array<TYPE, FUN, Allocator>::view2( const ArraySize &N, std::shared_ptr<TYPE> const &data )
+void Array<TYPE, FUN, Allocator>::view2( const ArraySize &N, std::shared_ptr<TYPE> data )
 {
    d_size = N;
    d_ptr  = data;
    d_data = d_ptr.get();
 }
 template<class TYPE, class FUN, class Allocator>
-void Array<TYPE, FUN, Allocator>::viewRaw(
-    const ArraySize &N, TYPE *data, bool isCopyable, bool isFixedSize )
+void Array<TYPE, FUN, Allocator>::viewRaw( const ArraySize &N,
+                                           TYPE *data,
+                                           bool isCopyable,
+                                           bool isFixedSize )
 {
    d_isCopyable  = isCopyable;
    d_isFixedSize = isFixedSize;
@ -676,20 +748,8 @@ void Array<TYPE, FUN, Allocator>::swap( Array &other )
    std::swap( d_ptr, other.d_ptr );
 }
 template<class TYPE, class FUN, class Allocator>
-void Array<TYPE, FUN, Allocator>::fill( const TYPE &value )
-{
-    for ( size_t i = 0; i < d_size.length(); i++ )
-        d_data[i] = value;
-}
-template<class TYPE, class FUN, class Allocator>
-void Array<TYPE, FUN, Allocator>::scale( const TYPE &value )
-{
-    for ( size_t i = 0; i < d_size.length(); i++ )
-        d_data[i] *= value;
-}
-template<class TYPE, class FUN, class Allocator>
-void Array<TYPE, FUN, Allocator>::pow(
-    const Array<TYPE, FUN, Allocator> &baseArray, const TYPE &exp )
+void Array<TYPE, FUN, Allocator>::pow( const Array<TYPE, FUN, Allocator> &baseArray,
+                                       const TYPE &exp )
 {
    // not insisting on the shapes being the same
    // but insisting on the total size being the same
@ -706,8 +766,8 @@ void Array<TYPE, FUN, Allocator>::pow(
 *  Replicate the array                                  *
 ********************************************************/
 template<class TYPE, class FUN, class Allocator>
-Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::repmat(
-    const std::vector<size_t> &N_rep ) const
+Array<TYPE, FUN, Allocator>
+Array<TYPE, FUN, Allocator>::repmat( const std::vector<size_t> &N_rep ) const
 {
    std::vector<size_t> N2( d_size.begin(), d_size.end() );
    if ( N2.size() < N_rep.size() )
@ -721,7 +781,6 @@ Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::repmat(
        N2[d] *= N_rep[d];
    }
    Array<TYPE, FUN, Allocator> y( N2 );
-    static_assert( ArraySize::maxDim() <= 5, "Not programmed for dimensions > 5" );
    TYPE *y2 = y.data();
    for ( size_t i4 = 0, index = 0; i4 < N1[4]; i4++ ) {
        for ( size_t j4 = 0; j4 < Nr[4]; j4++ ) {
@ -763,7 +822,7 @@ bool Array<TYPE, FUN, Allocator>::NaNs() const
 template<class TYPE, class FUN, class Allocator>
 TYPE Array<TYPE, FUN, Allocator>::mean( void ) const
 {
-    TYPE x = this->sum() / d_size.length();
+    TYPE x = sum() / d_size.length();
    return x;
 }
 template<class TYPE, class FUN, class Allocator>
@ -845,7 +904,6 @@ TYPE Array<TYPE, FUN, Allocator>::min( const std::vector<Range<size_t>> &range )
    checkSubsetIndex( range );
    std::array<size_t, 5> first, last, inc, N1;
    getSubsetArrays( range, first, last, inc, N1 );
-    static_assert( ArraySize::maxDim() <= 5, "Function programmed for more than 5 dimensions" );
    TYPE x = std::numeric_limits<TYPE>::max();
    for ( size_t i4 = first[4]; i4 <= last[4]; i4 += inc[4] ) {
        for ( size_t i3 = first[3]; i3 <= last[3]; i3 += inc[3] ) {
@ -868,7 +926,6 @@ TYPE Array<TYPE, FUN, Allocator>::max( const std::vector<Range<size_t>> &range )
    checkSubsetIndex( range );
    std::array<size_t, 5> first, last, inc, N1;
    getSubsetArrays( range, first, last, inc, N1 );
-    static_assert( ArraySize::maxDim() <= 5, "Function programmed for more than 5 dimensions" );
    TYPE x = std::numeric_limits<TYPE>::min();
    for ( size_t i4 = first[4]; i4 <= last[4]; i4 += inc[4] ) {
        for ( size_t i3 = first[3]; i3 <= last[3]; i3 += inc[3] ) {
@ -891,7 +948,6 @@ TYPE Array<TYPE, FUN, Allocator>::sum( const std::vector<Range<size_t>> &range )
    checkSubsetIndex( range );
    std::array<size_t, 5> first, last, inc, N1;
    getSubsetArrays( range, first, last, inc, N1 );
-    static_assert( ArraySize::maxDim() <= 5, "Function programmed for more than 5 dimensions" );
    TYPE x = 0;
    for ( size_t i4 = first[4]; i4 <= last[4]; i4 += inc[4] ) {
        for ( size_t i3 = first[3]; i3 <= last[3]; i3 += inc[3] ) {
@ -914,7 +970,6 @@ TYPE Array<TYPE, FUN, Allocator>::mean( const std::vector<Range<size_t>> &range
    checkSubsetIndex( range );
    std::array<size_t, 5> first, last, inc, N1;
    getSubsetArrays( range, first, last, inc, N1 );
-    static_assert( ArraySize::maxDim() <= 5, "Function programmed for more than 5 dimensions" );
    size_t n = 1;
    for ( auto &d : N1 )
        n *= d;
@ -951,8 +1006,9 @@ TYPE Array<TYPE, FUN, Allocator>::mean( const std::vector<size_t> &index ) const
 *  Find all elements that match the given operation     *
 ********************************************************/
 template<class TYPE, class FUN, class Allocator>
-std::vector<size_t> Array<TYPE, FUN, Allocator>::find(
-    const TYPE &value, std::function<bool( const TYPE &, const TYPE & )> compare ) const
+std::vector<size_t>
+Array<TYPE, FUN, Allocator>::find( const TYPE &value,
+                                   std::function<bool( const TYPE &, const TYPE & )> compare ) const
 {
    std::vector<size_t> result;
    result.reserve( d_size.length() );
@ -968,8 +1024,9 @@ std::vector<size_t> Array<TYPE, FUN, Allocator>::find(
 *  Print an array to an output stream                   *
 ********************************************************/
 template<class TYPE, class FUN, class Allocator>
-void Array<TYPE, FUN, Allocator>::print(
-    std::ostream &os, const std::string &name, const std::string &prefix ) const
+void Array<TYPE, FUN, Allocator>::print( std::ostream &os,
+                                         const std::string &name,
+                                         const std::string &prefix ) const
 {
    if ( d_size.ndim() == 1 ) {
        for ( size_t i = 0; i < d_size[0]; i++ )
@ -993,12 +1050,11 @@ void Array<TYPE, FUN, Allocator>::print(
 template<class TYPE, class FUN, class Allocator>
 Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::reverseDim() const
 {
-    size_t N2[ArraySize::maxDim()];
+    size_t N2[5];
    for ( int d = 0; d < ArraySize::maxDim(); d++ )
        N2[d] = d_size[ArraySize::maxDim() - d - 1];
    ArraySize S2( ArraySize::maxDim(), N2 );
    Array<TYPE, FUN, Allocator> y( S2 );
-    static_assert( ArraySize::maxDim() == 5, "Not programmed for dimensions other than 5" );
    TYPE *y2 = y.data();
    for ( size_t i0 = 0; i0 < d_size[0]; i0++ ) {
        for ( size_t i1 = 0; i1 < d_size[1]; i1++ ) {
@ -1023,8 +1079,8 @@ Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::reverseDim() const
 *  Coarsen the array                                    *
 ********************************************************/
 template<class TYPE, class FUN, class Allocator>
-Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::coarsen(
-    const Array<TYPE, FUN, Allocator> &filter ) const
+Array<TYPE, FUN, Allocator>
+Array<TYPE, FUN, Allocator>::coarsen( const Array<TYPE, FUN, Allocator> &filter ) const
 {
    auto S2 = size();
    for ( size_t i = 0; i < S2.size(); i++ ) {
@ -1044,8 +1100,9 @@ Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::coarsen(
                for ( size_t k2 = 0; k2 < Nh[2]; k2++ ) {
                    for ( size_t j2 = 0; j2 < Nh[1]; j2++ ) {
                        for ( size_t i2 = 0; i2 < Nh[0]; i2++ ) {
-                            tmp += filter( i2, j2, k2 ) * this->operator()( i1 *Nh[0] + i2,
-                                                              j1 * Nh[1] + j2, k1 * Nh[2] + k2 );
+                            tmp += filter( i2, j2, k2 ) * operator()( i1 *Nh[0] + i2,
+                                                                      j1 * Nh[1] + j2,
+                                                                      k1 * Nh[2] + k2 );
                        }
                    }
                }
@ -1056,7 +1113,8 @@ Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::coarsen(
    return y;
 }
 template<class TYPE, class FUN, class Allocator>
-Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::coarsen( const std::vector<size_t> &ratio,
+Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::coarsen(
+    const std::vector<size_t> &ratio,
    std::function<TYPE( const Array<TYPE, FUN, Allocator> & )> filter ) const
 {
    if ( ratio.size() != d_size.ndim() )
@ -1077,7 +1135,7 @@ Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::coarsen( const std::vec
                for ( size_t k2 = 0; k2 < ratio[2]; k2++ ) {
                    for ( size_t j2 = 0; j2 < ratio[1]; j2++ ) {
                        for ( size_t i2 = 0; i2 < ratio[0]; i2++ ) {
-                            tmp( i2, j2, k2 ) = this->operator()(
+                            tmp( i2, j2, k2 ) = operator()(
                                i1 *ratio[0] + i2, j1 * ratio[1] + j2, k1 * ratio[2] + k2 );
                        }
                    }
@ -1102,13 +1160,25 @@ void Array<TYPE, FUN, Allocator>::cat( const Array<TYPE, FUN, Allocator> &x, int
    *this = cat( tmp, dim );
 }
 template<class TYPE, class FUN, class Allocator>
+Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::cat( const std::initializer_list<Array> &x,
+                                                              int dim )
+{
+    return cat( x.size(), x.begin(), dim );
+}
+template<class TYPE, class FUN, class Allocator>
 Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::cat( const std::vector<Array> &x, int dim )
 {
-    if ( x.empty() )
+    return cat( x.size(), x.data(), dim );
+}
+template<class TYPE, class FUN, class Allocator>
+Array<TYPE, FUN, Allocator>
+Array<TYPE, FUN, Allocator>::cat( size_t N_array, const Array *x, int dim )
+{
+    if ( N_array == 0 )
        return Array<TYPE, FUN, Allocator>();
    // Check that the dimensions match
    bool check = true;
-    for ( size_t i = 1; i < x.size(); i++ ) {
+    for ( size_t i = 1; i < N_array; i++ ) {
        check = check && x[i].ndim() == x[0].ndim();
        for ( int d = 0; d < x[0].ndim(); d++ )
            if ( d != dim )
@ -1118,7 +1188,7 @@ Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::cat( const std::vector<
        throw std::logic_error( "Array dimensions do not match for concatenation" );
    // Create the output array
    auto size = x[0].d_size;
-    for ( size_t i = 1; i < x.size(); i++ )
+    for ( size_t i = 1; i < N_array; i++ )
        size.resize( dim, size[dim] + x[i].size( dim ) );
    Array<TYPE, FUN, Allocator> out( size );
    size_t N1 = 1;
@ -1129,7 +1199,7 @@ Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::cat( const std::vector<
    for ( size_t d = dim + 1; d < size.ndim(); d++ )
        N3 *= size[d];
    TYPE *data = out.data();
-    for ( size_t i = 0, i0 = 0; i < x.size(); i++ ) {
+    for ( size_t i = 0, i0 = 0; i < N_array; i++ ) {
        const TYPE *src = x[i].data();
        size_t N22      = x[i].size( dim );
        for ( size_t j2 = 0; j2 < N3; j2++ ) {
@ -1149,87 +1219,82 @@ Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::cat( const std::vector<
 *  Interpolate                                          *
 ********************************************************/
 template<class T>
-struct is_compatible_double
-    : std::integral_constant<bool, std::is_floating_point<T>::value || std::is_integral<T>::value> {
-};
-template<class TYPE>
-inline typename std::enable_if<is_compatible_double<TYPE>::value, TYPE>::type Array_interp_1D(
-    double x, int N, const TYPE *data )
+constexpr bool is_compatible_double()
 {
-    int i = floor( x );
-    i     = std::max( i, 0 );
-    i     = std::min( i, N - 2 );
-    return ( i + 1 - x ) * data[i] + ( x - i ) * data[i + 1];
+    return std::is_floating_point<T>::value || std::is_integral<T>::value;
 }
 template<class TYPE>
-inline typename std::enable_if<is_compatible_double<TYPE>::value, TYPE>::type Array_interp_2D(
-    double x, double y, int Nx, int Ny, const TYPE *data )
+inline TYPE Array_interp_1D( double x, int N, const TYPE *data )
 {
-    int i       = floor( x );
-    i           = std::max( i, 0 );
-    i           = std::min( i, Nx - 2 );
-    double dx   = x - i;
-    double dx2  = 1.0 - dx;
-    int j       = floor( y );
-    j           = std::max( j, 0 );
-    j           = std::min( j, Ny - 2 );
-    double dy   = y - j;
-    double dy2  = 1.0 - dy;
-    double f[4] = { (double) data[i + j * Nx], (double) data[i + 1 + j * Nx],
-        (double) data[i + ( j + 1 ) * Nx], (double) data[i + 1 + ( j + 1 ) * Nx] };
-    return ( dx * f[1] + dx2 * f[0] ) * dy2 + ( dx * f[3] + dx2 * f[2] ) * dy;
+    if ( is_compatible_double<TYPE>() ) {
+        int i = floor( x );
+        i     = std::max( i, 0 );
+        i     = std::min( i, N - 2 );
+        return ( i + 1 - x ) * data[i] + ( x - i ) * data[i + 1];
+    } else {
+        throw std::logic_error( "Invalid conversion" );
+    }
 }
 template<class TYPE>
-inline typename std::enable_if<is_compatible_double<TYPE>::value, TYPE>::type Array_interp_3D(
-    double x, double y, double z, int Nx, int Ny, int Nz, const TYPE *data )
+inline TYPE Array_interp_2D( double x, double y, int Nx, int Ny, const TYPE *data )
 {
-    int i       = floor( x );
-    i           = std::max( i, 0 );
-    i           = std::min( i, Nx - 2 );
-    double dx   = x - i;
-    double dx2  = 1.0 - dx;
-    int j       = floor( y );
-    j           = std::max( j, 0 );
-    j           = std::min( j, Ny - 2 );
-    double dy   = y - j;
-    double dy2  = 1.0 - dy;
-    int k       = floor( z );
-    k           = std::max( k, 0 );
-    k           = std::min( k, Nz - 2 );
-    double dz   = z - k;
-    double dz2  = 1.0 - dz;
-    double f[8] = { (double) data[i + j * Nx + k * Nx * Ny],
-        (double) data[i + 1 + j * Nx + k * Nx * Ny],
-        (double) data[i + ( j + 1 ) * Nx + k * Nx * Ny],
-        (double) data[i + 1 + ( j + 1 ) * Nx + k * Nx * Ny],
-        (double) data[i + j * Nx + ( k + 1 ) * Nx * Ny],
-        (double) data[i + 1 + j * Nx + ( k + 1 ) * Nx * Ny],
-        (double) data[i + ( j + 1 ) * Nx + ( k + 1 ) * Nx * Ny],
-        (double) data[i + 1 + ( j + 1 ) * Nx + ( k + 1 ) * Nx * Ny] };
-    double h0   = ( dx * f[1] + dx2 * f[0] ) * dy2 + ( dx * f[3] + dx2 * f[2] ) * dy;
-    double h1   = ( dx * f[5] + dx2 * f[4] ) * dy2 + ( dx * f[7] + dx2 * f[6] ) * dy;
-    return h0 * dz2 + h1 * dz;
+    if ( is_compatible_double<TYPE>() ) {
+        int i       = floor( x );
+        i           = std::max( i, 0 );
+        i           = std::min( i, Nx - 2 );
+        double dx   = x - i;
+        double dx2  = 1.0 - dx;
+        int j       = floor( y );
+        j           = std::max( j, 0 );
+        j           = std::min( j, Ny - 2 );
+        double dy   = y - j;
+        double dy2  = 1.0 - dy;
+        double f[4] = { (double) data[i + j * Nx],
+                        (double) data[i + 1 + j * Nx],
+                        (double) data[i + ( j + 1 ) * Nx],
+                        (double) data[i + 1 + ( j + 1 ) * Nx] };
+        return ( dx * f[1] + dx2 * f[0] ) * dy2 + ( dx * f[3] + dx2 * f[2] ) * dy;
+    } else {
+        throw std::logic_error( "Invalid conversion" );
+    }
 }
 template<class TYPE>
-inline typename std::enable_if<!is_compatible_double<TYPE>::value, TYPE>::type Array_interp_1D(
-    double, int, const TYPE * )
+inline TYPE
+Array_interp_3D( double x, double y, double z, int Nx, int Ny, int Nz, const TYPE *data )
 {
-    throw std::logic_error( "Invalid conversion" );
-}
-template<class TYPE>
-inline typename std::enable_if<!is_compatible_double<TYPE>::value, TYPE>::type Array_interp_2D(
-    double, double, int, int, const TYPE * )
-{
-    throw std::logic_error( "Invalid conversion" );
-}
-template<class TYPE>
-inline typename std::enable_if<!is_compatible_double<TYPE>::value, TYPE>::type Array_interp_3D(
-    double, double, double, int, int, int, const TYPE * )
-{
-    throw std::logic_error( "Invalid conversion" );
+    if ( is_compatible_double<TYPE>() ) {
+        int i       = floor( x );
+        i           = std::max( i, 0 );
+        i           = std::min( i, Nx - 2 );
+        double dx   = x - i;
+        double dx2  = 1.0 - dx;
+        int j       = floor( y );
+        j           = std::max( j, 0 );
+        j           = std::min( j, Ny - 2 );
+        double dy   = y - j;
+        double dy2  = 1.0 - dy;
+        int k       = floor( z );
+        k           = std::max( k, 0 );
+        k           = std::min( k, Nz - 2 );
+        double dz   = z - k;
+        double dz2  = 1.0 - dz;
+        double f[8] = { (double) data[i + j * Nx + k * Nx * Ny],
+                        (double) data[i + 1 + j * Nx + k * Nx * Ny],
+                        (double) data[i + ( j + 1 ) * Nx + k * Nx * Ny],
+                        (double) data[i + 1 + ( j + 1 ) * Nx + k * Nx * Ny],
+                        (double) data[i + j * Nx + ( k + 1 ) * Nx * Ny],
+                        (double) data[i + 1 + j * Nx + ( k + 1 ) * Nx * Ny],
+                        (double) data[i + ( j + 1 ) * Nx + ( k + 1 ) * Nx * Ny],
+                        (double) data[i + 1 + ( j + 1 ) * Nx + ( k + 1 ) * Nx * Ny] };
+        double h0   = ( dx * f[1] + dx2 * f[0] ) * dy2 + ( dx * f[3] + dx2 * f[2] ) * dy;
+        double h1   = ( dx * f[5] + dx2 * f[4] ) * dy2 + ( dx * f[7] + dx2 * f[6] ) * dy;
+        return h0 * dz2 + h1 * dz;
+    } else {
+        throw std::logic_error( "Invalid conversion" );
+    }
 }
 template<class TYPE, class FUN, class Allocator>
-TYPE Array<TYPE, FUN, Allocator>::interp( const std::vector<double> &x ) const
+TYPE Array<TYPE, FUN, Allocator>::interp( const double *x ) const
 {
    int ndim = 0, dim[5];
    double x2[5];
@ -1265,81 +1330,75 @@ void Array<TYPE, FUN, Allocator>::rand()
    FUN::rand( *this );
 }
 template<class TYPE, class FUN, class Allocator>
-Array<TYPE, FUN, Allocator> &Array<TYPE, FUN, Allocator>::operator+=(
-    const Array<TYPE, FUN, Allocator> &rhs )
+Array<TYPE, FUN, Allocator> &
+Array<TYPE, FUN, Allocator>::operator+=( const Array<TYPE, FUN, Allocator> &rhs )
 {
-    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a + b; };
-    FUN::transform( fun, *this, rhs, *this );
+    auto op = []( const TYPE &a, const TYPE &b ) { return a + b; };
+    FUN::transform( op, *this, rhs, *this );
    return *this;
 }
 template<class TYPE, class FUN, class Allocator>
-Array<TYPE, FUN, Allocator> &Array<TYPE, FUN, Allocator>::operator-=(
-    const Array<TYPE, FUN, Allocator> &rhs )
+Array<TYPE, FUN, Allocator> &
+Array<TYPE, FUN, Allocator>::operator-=( const Array<TYPE, FUN, Allocator> &rhs )
 {
-    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a - b; };
-    FUN::transform( fun, *this, rhs, *this );
+    auto op = []( const TYPE &a, const TYPE &b ) { return a - b; };
+    FUN::transform( op, *this, rhs, *this );
    return *this;
 }
 template<class TYPE, class FUN, class Allocator>
 Array<TYPE, FUN, Allocator> &Array<TYPE, FUN, Allocator>::operator+=( const TYPE &rhs )
 {
-    const auto &fun = [rhs]( const TYPE &x ) { return x + rhs; };
-    FUN::transform( fun, *this, *this );
+    auto op = [rhs]( const TYPE &x ) { return x + rhs; };
+    FUN::transform( op, *this, *this );
    return *this;
 }
 template<class TYPE, class FUN, class Allocator>
 Array<TYPE, FUN, Allocator> &Array<TYPE, FUN, Allocator>::operator-=( const TYPE &rhs )
 {
-    const auto &fun = [rhs]( const TYPE &x ) { return x - rhs; };
-    FUN::transform( fun, *this, *this );
+    auto op = [rhs]( const TYPE &x ) { return x - rhs; };
+    FUN::transform( op, *this, *this );
    return *this;
 }
 template<class TYPE, class FUN, class Allocator>
 TYPE Array<TYPE, FUN, Allocator>::min() const
 {
-    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a < b ? a : b; };
-    return FUN::reduce( fun, *this );
+    const auto &op = []( const TYPE &a, const TYPE &b ) { return a < b ? a : b; };
+    return FUN::reduce( op, *this, d_data[0] );
 }
 template<class TYPE, class FUN, class Allocator>
 TYPE Array<TYPE, FUN, Allocator>::max() const
 {
-    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a > b ? a : b; };
-    return FUN::reduce( fun, *this );
+    const auto &op = []( const TYPE &a, const TYPE &b ) { return a > b ? a : b; };
+    return FUN::reduce( op, *this, d_data[0] );
 }
 template<class TYPE, class FUN, class Allocator>
 TYPE Array<TYPE, FUN, Allocator>::sum() const
 {
-    const auto &fun = []( const TYPE &a, const TYPE &b ) { return a + b; };
-    return FUN::reduce( fun, *this );
+    const auto &op = []( const TYPE &a, const TYPE &b ) { return a + b; };
+    return FUN::reduce( op, *this, static_cast<TYPE>( 0 ) );
 }
 template<class TYPE, class FUN, class Allocator>
-Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::multiply(
-    const Array<TYPE, FUN, Allocator> &a, const Array<TYPE, FUN, Allocator> &b )
+void Array<TYPE, FUN, Allocator>::axpby( const TYPE &alpha,
+                                         const Array<TYPE, FUN, Allocator> &x,
+                                         const TYPE &beta )
 {
-    Array<TYPE, FUN, Allocator> c;
-    FUN::multiply( a, b, c );
-    return c;
+    const auto &op = [alpha, beta]( const TYPE &x, const TYPE &y ) { return alpha * x + beta * y; };
+    return FUN::transform( op, x, *this, *this );
 }
 template<class TYPE, class FUN, class Allocator>
-void Array<TYPE, FUN, Allocator>::axpby(
-    const TYPE &alpha, const Array<TYPE, FUN, Allocator> &x, const TYPE &beta )
-{
-    const auto &fun = [alpha, beta](
-                          const TYPE &x, const TYPE &y ) { return alpha * x + beta * y; };
-    return FUN::transform( fun, x, *this, *this );
-}
-template<class TYPE, class FUN, class Allocator>
-Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::transform(
-    std::function<TYPE( const TYPE & )> fun, const Array<TYPE, FUN, Allocator> &x )
+Array<TYPE, FUN, Allocator>
+Array<TYPE, FUN, Allocator>::transform( std::function<TYPE( const TYPE & )> fun,
+                                        const Array<TYPE, FUN, Allocator> &x )
 {
    Array<TYPE, FUN, Allocator> y;
    FUN::transform( fun, x, y );
    return y;
 }
 template<class TYPE, class FUN, class Allocator>
-Array<TYPE, FUN, Allocator> Array<TYPE, FUN, Allocator>::transform(
-    std::function<TYPE( const TYPE &, const TYPE & )> fun, const Array<TYPE, FUN, Allocator> &x,
-    const Array<TYPE, FUN, Allocator> &y )
+Array<TYPE, FUN, Allocator>
+Array<TYPE, FUN, Allocator>::transform( std::function<TYPE( const TYPE &, const TYPE & )> fun,
+                                        const Array<TYPE, FUN, Allocator> &x,
+                                        const Array<TYPE, FUN, Allocator> &y )
 {
    Array<TYPE, FUN, Allocator> z;
    FUN::transform( fun, x, y, z );
@ -1351,4 +1410,5 @@ bool Array<TYPE, FUN, Allocator>::equals( const Array &rhs, TYPE tol ) const
    return FUN::equals( *this, rhs, tol );
 }

+
 #endif
--- a/common/ArraySize.h
+++ b/common/ArraySize.h
@ -1,8 +1,12 @@
 #ifndef included_ArraySizeClass
 #define included_ArraySizeClass

+#include "common/Utilities.h"

 #include <array>
+#include <cmath>
+#include <complex>
+#include <cstdlib>
 #include <cstring>
 #include <initializer_list>
 #include <vector>
@ -22,21 +26,22 @@


 #if ( defined( DEBUG ) || defined( _DEBUG ) ) && !defined( NDEBUG )
-#define CHECK_ARRAY_LENGTH( i )                                      \
+#define CHECK_ARRAY_LENGTH( i, length )                              \
    do {                                                             \
-        if ( i >= d_length )                                         \
+        if ( i >= length )                                           \
            throw std::out_of_range( "Index exceeds array bounds" ); \
    } while ( 0 )
 #else
-#define CHECK_ARRAY_LENGTH( i ) \
-    do {                        \
+#define CHECK_ARRAY_LENGTH( i, length ) \
+    do {                                \
    } while ( 0 )
 #endif


+
 // Forward declerations
 class FunctionTable;
-template<class TYPE, class FUN = FunctionTable, class Allocator = std::nullptr_t>
+template<class TYPE, class FUN = FunctionTable, class Allocator = std::allocator<TYPE>>
 class Array;


@ -46,7 +51,7 @@ class Range final
 {
 public:
    //! Empty constructor
-    constexpr Range() : i( 0 ), j( -1 ), k( 1 ) {}
+    Range() : i( 0 ), j( -1 ), k( 1 ) {}

    /*!
     * Create a range i:k:j (or i:j)
@ -54,8 +59,30 @@ public:
     * @param j_            Ending value
     * @param k_            Increment value
     */
-    constexpr Range( TYPE i_, TYPE j_, TYPE k_ = 1 ) : i( i_ ), j( j_ ), k( k_ ) {}
+    Range( const TYPE &i_, const TYPE &j_, const TYPE &k_ = 1 )
+        : i( i_ ), j( j_ ), k( k_ )
+    {
+    }

+    /*!
+     * Get the number of values in the range i:k:j
+     * @return              Number of values (i, i+k, ... up to j); 0 if an integer range is empty
+     */
+    size_t size() const
+    {
+        if ( std::is_integral<TYPE>::value ) {
+            const int64_t span = static_cast<int64_t>( j ) - static_cast<int64_t>( i );
+            const int64_t step = static_cast<int64_t>( k );
+            // Nothing to visit when the step points away from j, e.g. the default range (0,-1,1)
+            if ( ( step > 0 && span < 0 ) || ( step < 0 && span > 0 ) )
+                return 0;
+            // Count the starting value as well, matching the floating-point branch below
+            return static_cast<size_t>( span / step + 1 );
+        } else if ( std::is_floating_point<TYPE>::value ) {
+            double tmp = static_cast<double>( ( j - i ) ) / static_cast<double>( k );
+            return static_cast<size_t>( floor( tmp + 1e-12 ) + 1 );
+        } else if ( std::is_same<TYPE, std::complex<float>>::value ||
+                                 std::is_same<TYPE, std::complex<double>>::value ) {
+            double tmp = std::real( ( j - i ) / ( k ) );
+            return static_cast<size_t>( floor( tmp + 1e-12 ) + 1 );
+        } else {
+            ERROR( "Unsupported type for range" );
+        }
+    }
+
+public:
    TYPE i, j, k;
 };

@ -65,20 +92,20 @@ class ArraySize final
 {
 public:
    //! Empty constructor
-    constexpr ArraySize() : d_ndim( 1 ), d_length( 0 ), d_N{ 0, 1, 1, 1, 1 } {}
+    ArraySize() : d_ndim( 1 ), d_length( 0 ), d_N{ 0, 1, 1, 1, 1 } {}

    /*!
     * Create the vector size
     * @param N1            Number of elements in the first dimension
     */
-    constexpr ArraySize( size_t N1 ) : d_ndim( 1 ), d_length( N1 ), d_N{ N1, 1, 1, 1, 1 } {}
+    ArraySize( size_t N1 ) : d_ndim( 1 ), d_length( N1 ), d_N{ N1, 1, 1, 1, 1 } {}

    /*!
     * Create the vector size
     * @param N1            Number of elements in the first dimension
     * @param N2            Number of elements in the second dimension
     */
-    constexpr ArraySize( size_t N1, size_t N2 )
+    ArraySize( size_t N1, size_t N2 )
        : d_ndim( 2 ), d_length( N1 * N2 ), d_N{ N1, N2, 1, 1, 1 }
    {
    }
@ -89,7 +116,7 @@ public:
     * @param N2            Number of elements in the second dimension
     * @param N3            Number of elements in the third dimension
     */
-    constexpr ArraySize( size_t N1, size_t N2, size_t N3 )
+    ArraySize( size_t N1, size_t N2, size_t N3 )
        : d_ndim( 3 ), d_length( N1 * N2 * N3 ), d_N{ N1, N2, N3, 1, 1 }
    {
    }
@ -101,7 +128,7 @@ public:
     * @param N3            Number of elements in the third dimension
     * @param N4            Number of elements in the fourth dimension
     */
-    constexpr ArraySize( size_t N1, size_t N2, size_t N3, size_t N4 )
+    ArraySize( size_t N1, size_t N2, size_t N3, size_t N4 )
        : d_ndim( 4 ), d_length( N1 * N2 * N3 * N4 ), d_N{ N1, N2, N3, N4, 1 }
    {
    }
@ -114,7 +141,7 @@ public:
     * @param N4            Number of elements in the fourth dimension
     * @param N5            Number of elements in the fifth dimension
     */
-    constexpr ArraySize( size_t N1, size_t N2, size_t N3, size_t N4, size_t N5 )
+    ArraySize( size_t N1, size_t N2, size_t N3, size_t N4, size_t N5 )
        : d_ndim( 5 ), d_length( N1 * N2 * N3 * N4 * N5 ), d_N{ N1, N2, N3, N4, N5 }
    {
    }
@ -122,11 +149,14 @@ public:
    /*!
     * Create from initializer list
     * @param N             Size of the array
+     * @param ndim          Number of dimensions
     */
-    constexpr ArraySize( std::initializer_list<size_t> N )
+    ArraySize( std::initializer_list<size_t> N, int ndim = -1 )
        : d_ndim( N.size() ), d_length( 0 ), d_N{ 0, 1, 1, 1, 1 }
    {
-        if ( d_ndim > maxDim() )
+        if ( ndim >= 0 )
+            d_ndim = ndim;
+        if ( d_ndim > 5 )
            throw std::out_of_range( "Maximum number of dimensions exceeded" );
        auto it = N.begin();
        for ( size_t i = 0; i < d_ndim; i++, ++it )
@ -144,10 +174,10 @@ public:
     * @param ndim          Number of dimensions
     * @param dims          Dimensions
     */
-    constexpr ArraySize( size_t ndim, const size_t *dims )
+    ArraySize( size_t ndim, const size_t *dims )
        : d_ndim( ndim ), d_length( 0 ), d_N{ 0, 1, 1, 1, 1 }
    {
-        if ( d_ndim > maxDim() )
+        if ( d_ndim > 5 )
            throw std::out_of_range( "Maximum number of dimensions exceeded" );
        for ( size_t i = 0; i < ndim; i++ )
            d_N[i] = dims[i];
@ -158,35 +188,44 @@ public:
            d_length = 0;
    }

+    /*!
+     * Create from std::array
+     * @param N             Size of the array
+     */
+    template<std::size_t NDIM>
+    ArraySize( const std::array<size_t, NDIM> &N ) : ArraySize( NDIM, N.data() )
+    {
+    }
+
    /*!
     * Create from std::vector
     * @param N             Size of the array
     */
-    ArraySize( const std::vector<size_t> &N );
+    inline ArraySize( const std::vector<size_t> &N ) : ArraySize( N.size(), N.data() ) {}

    // Copy/assignment constructors
-    constexpr ArraySize( ArraySize &&rhs )      = default;
-    constexpr ArraySize( const ArraySize &rhs ) = default;
-    constexpr ArraySize &operator=( ArraySize &&rhs ) = default;
-    constexpr ArraySize &operator=( const ArraySize &rhs ) = default;
+    ArraySize( ArraySize &&rhs )      = default;
+    ArraySize( const ArraySize &rhs ) = default;
+    ArraySize &operator=( ArraySize &&rhs ) = default;
+    ArraySize &operator=( const ArraySize &rhs ) = default;

    /*!
     * Access the ith dimension
     * @param i             Index to access
     */
-    constexpr ARRAY_ATTRIBUTE size_t operator[]( size_t i ) const { return d_N[i]; }
+    ARRAY_ATTRIBUTE size_t operator[]( size_t i ) const { return d_N[i]; }

    //! Return the number of dimensions
-    constexpr ARRAY_ATTRIBUTE uint8_t ndim() const { return d_ndim; }
+    ARRAY_ATTRIBUTE uint8_t ndim() const { return d_ndim; }

    //! Return the number of dimensions
-    constexpr ARRAY_ATTRIBUTE size_t size() const { return d_ndim; }
+    ARRAY_ATTRIBUTE size_t size() const { return d_ndim; }

    //! Return the total number of elements in the array
-    constexpr ARRAY_ATTRIBUTE size_t length() const { return d_length; }
+    ARRAY_ATTRIBUTE size_t length() const { return d_length; }

    //! Resize the dimension
-    constexpr void resize( uint8_t dim, size_t N )
+    void resize( uint8_t dim, size_t N )
    {
        if ( dim >= d_ndim )
            throw std::out_of_range( "Invalid dimension" );
@ -201,75 +240,141 @@ public:
     *    max of ndim and the largest dim>1.
     * @param ndim          Desired number of dimensions
     */
-    constexpr void setNdim( uint8_t ndim ) { d_ndim = std::max( ndim, d_ndim ); }
+    void setNdim( uint8_t ndim ) { d_ndim = std::max( ndim, d_ndim ); }
+
+    /*!
+     * Remove singleton dimensions
+     * Extents different from 1 are packed to the front (order preserved) and the
+     *    remaining slots are reset to 1.
+     * NOTE(review): if every extent is 1 this leaves d_ndim == 0 - confirm that is intended.
+     */
+    void squeeze()
+    {
+        d_ndim = 0;
+        for ( uint8_t i = 0; i < maxDim(); i++ ) {
+            if ( d_N[i] != 1 )
+                d_N[d_ndim++] = d_N[i];
+        }
+        // Clear the stale tail: operator== memcmp's all of d_N and index()/ijk() use d_N[i]
+        // beyond d_ndim, so leftover extents would corrupt comparisons and indexing
+        for ( uint8_t i = d_ndim; i < maxDim(); i++ )
+            d_N[i] = 1;
+    }

    //! Returns an iterator to the beginning
-    constexpr const size_t *begin() const { return d_N; }
+    const size_t *begin() const { return d_N; }

    //! Returns an iterator to the end
-    constexpr const size_t *end() const { return d_N + d_ndim; }
+    const size_t *end() const { return d_N + d_ndim; }

    // Check if two array sizes are equal
-    constexpr ARRAY_ATTRIBUTE bool operator==( const ArraySize &rhs ) const
+    ARRAY_ATTRIBUTE bool operator==( const ArraySize &rhs ) const
    {
        return d_ndim == rhs.d_ndim && memcmp( d_N, rhs.d_N, sizeof( d_N ) ) == 0;
    }

    // Check if two array sizes are equal (ignoring the dimension)
-    constexpr ARRAY_ATTRIBUTE bool approxEqual( const ArraySize &rhs ) const
+    ARRAY_ATTRIBUTE bool approxEqual( const ArraySize &rhs ) const
    {
        return ( length() == 0 && rhs.length() == 0 ) || memcmp( d_N, rhs.d_N, sizeof( d_N ) ) == 0;
    }

    //! Check if two matrices are not equal
-    constexpr ARRAY_ATTRIBUTE bool operator!=( const ArraySize &rhs ) const
+    ARRAY_ATTRIBUTE bool operator!=( const ArraySize &rhs ) const
    {
        return d_ndim != rhs.d_ndim || memcmp( d_N, rhs.d_N, sizeof( d_N ) ) != 0;
    }

    //! Maximum supported dimension
-    constexpr ARRAY_ATTRIBUTE static uint8_t maxDim() { return 5u; }
+    ARRAY_ATTRIBUTE static uint8_t maxDim() { return 5; }

    //! Get the index
-    constexpr ARRAY_ATTRIBUTE size_t index( size_t i ) const
+    ARRAY_ATTRIBUTE size_t index( size_t i ) const
    {
-        CHECK_ARRAY_LENGTH( i );
+        CHECK_ARRAY_LENGTH( i, d_length );
        return i;
    }

    //! Get the index
-    constexpr ARRAY_ATTRIBUTE size_t index( size_t i1, size_t i2 ) const
+    ARRAY_ATTRIBUTE size_t index( size_t i1, size_t i2 ) const
    {
        size_t index = i1 + i2 * d_N[0];
-        CHECK_ARRAY_LENGTH( index );
+        CHECK_ARRAY_LENGTH( index, d_length );
        return index;
    }

    //! Get the index
-    constexpr ARRAY_ATTRIBUTE size_t index( size_t i1, size_t i2, size_t i3 ) const
+    ARRAY_ATTRIBUTE size_t index( size_t i1, size_t i2, size_t i3 ) const
    {
        size_t index = i1 + d_N[0] * ( i2 + d_N[1] * i3 );
-        CHECK_ARRAY_LENGTH( index );
+        CHECK_ARRAY_LENGTH( index, d_length );
        return index;
    }

    //! Get the index
-    constexpr ARRAY_ATTRIBUTE size_t index( size_t i1, size_t i2, size_t i3, size_t i4 ) const
+    ARRAY_ATTRIBUTE size_t index( size_t i1, size_t i2, size_t i3, size_t i4 ) const
    {
        size_t index = i1 + d_N[0] * ( i2 + d_N[1] * ( i3 + d_N[2] * i4 ) );
-        CHECK_ARRAY_LENGTH( index );
+        CHECK_ARRAY_LENGTH( index, d_length );
        return index;
    }

    //! Get the index
-    constexpr ARRAY_ATTRIBUTE size_t index(
-        size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 ) const
+    ARRAY_ATTRIBUTE size_t
+    index( size_t i1, size_t i2, size_t i3, size_t i4, size_t i5 ) const
    {
        size_t index = i1 + d_N[0] * ( i2 + d_N[1] * ( i3 + d_N[2] * ( i4 + d_N[3] * i5 ) ) );
-        CHECK_ARRAY_LENGTH( index );
+        CHECK_ARRAY_LENGTH( index, d_length );
        return index;
    }

+    //! Get the linear index from a full set of five per-dimension indices
+    size_t index( const std::array<size_t, 5> &i ) const
+    {
+        size_t offset = 0;
+        size_t stride = 1;
+        for ( size_t d = 0; d < 5; ++d ) {
+            // stride is the product of the extents of all lower dimensions
+            offset += stride * i[d];
+            stride *= d_N[d];
+        }
+        return offset;
+    }
+
+    /*!
+     * Get the linear index from a list of per-dimension indices
+     * @param i             Indices, first dimension first (at most 5 entries;
+     *                      dimensions not listed contribute nothing to the index)
+     */
+    size_t index( std::initializer_list<size_t> i ) const
+    {
+        // d_N only holds 5 extents; a longer list would read past its end
+        if ( i.size() > 5 )
+            throw std::out_of_range( "Maximum number of dimensions exceeded" );
+        size_t N = 1;
+        size_t j = 0;
+        size_t m = 0;
+        for ( size_t k : i ) {
+            j += k * N;
+            N *= d_N[m++];
+        }
+        return j;
+    }
+
+    //! Split a linear index into its five per-dimension indices
+    std::array<size_t, 5> ijk( size_t index ) const
+    {
+        CHECK_ARRAY_LENGTH( index, d_length );
+        std::array<size_t, 5> sub;
+        // Peel off one dimension at a time, fastest-varying first
+        for ( size_t d = 0; d < 4; ++d ) {
+            sub[d] = index % d_N[d];
+            index /= d_N[d];
+        }
+        // Whatever remains belongs to the last dimension
+        sub[4] = index;
+        return sub;
+    }
+
+    //! Split a linear index into its five per-dimension indices, written to x[0..4]
+    void ijk( size_t index, size_t *x ) const
+    {
+        CHECK_ARRAY_LENGTH( index, d_length );
+        // Peel off one dimension at a time, fastest-varying first
+        for ( size_t d = 0; d < 4; ++d ) {
+            x[d] = index % d_N[d];
+            index /= d_N[d];
+        }
+        // Whatever remains belongs to the last dimension
+        x[4] = index;
+    }
+
 private:
    uint8_t d_ndim;
    size_t d_length;
@ -278,11 +383,11 @@ private:


 // Function to concatenate dimensions of two array sizes
-constexpr ArraySize cat( const ArraySize &x, const ArraySize &y )
+inline ArraySize cat( const ArraySize &x, const ArraySize &y )
 {
-    if ( x.ndim() + y.ndim() > ArraySize::maxDim() )
+    if ( x.ndim() + y.ndim() > 5 )
        throw std::out_of_range( "Maximum number of dimensions exceeded" );
-    size_t N[ArraySize::maxDim()] = { 0 };
+    size_t N[5] = { 0 };
    for ( int i = 0; i < x.ndim(); i++ )
        N[i] = x[i];
    for ( int i = 0; i < y.ndim(); i++ )
@ -291,4 +396,36 @@ constexpr ArraySize cat( const ArraySize &x, const ArraySize &y )
 }


+// Operator overloads
+//! Multiply the extents by v (scalar on the left); the number of dimensions is kept
+inline ArraySize operator*( size_t v, const ArraySize &x )
+{
+    size_t scaled[5];
+    for ( size_t d = 0; d < 5; ++d )
+        scaled[d] = v * x[d];
+    return ArraySize( x.ndim(), scaled );
+}
+//! Multiply the extents by v (scalar on the right); the number of dimensions is kept
+inline ArraySize operator*( const ArraySize &x, size_t v )
+{
+    size_t scaled[5];
+    for ( size_t d = 0; d < 5; ++d )
+        scaled[d] = v * x[d];
+    return ArraySize( x.ndim(), scaled );
+}
+//! Subtract v from the extents; the number of dimensions is kept
+inline ArraySize operator-( const ArraySize &x, size_t v )
+{
+    size_t shrunk[5];
+    for ( size_t d = 0; d < 5; ++d )
+        shrunk[d] = x[d] - v;
+    return ArraySize( x.ndim(), shrunk );
+}
+//! Add v to the extents (scalar on the right); the number of dimensions is kept
+inline ArraySize operator+( const ArraySize &x, size_t v )
+{
+    size_t grown[5];
+    for ( size_t d = 0; d < 5; ++d )
+        grown[d] = x[d] + v;
+    return ArraySize( x.ndim(), grown );
+}
+//! Add v to the extents (scalar on the left); the number of dimensions is kept
+inline ArraySize operator+( size_t v, const ArraySize &x )
+{
+    size_t grown[5];
+    for ( size_t d = 0; d < 5; ++d )
+        grown[d] = x[d] + v;
+    return ArraySize( x.ndim(), grown );
+}
+
+#if defined( USING_ICC )
+ENABLE_WARNINGS
+#endif
+
+
 #endif
--- a/common/FunctionTable.cpp
+++ b/common/FunctionTable.cpp
@ -0,0 +1,147 @@
+#include "FunctionTable.hpp"
+
+#include <limits>
+
+
+/********************************************************
+ *  Random number generation                             *
+ ********************************************************/
+template<> char genRand<char>()
+{
+    // std::uniform_int_distribution is only specified for short/int/long/long long and their
+    // unsigned forms; character types are undefined behavior and rejected by some standard
+    // libraries. Draw an int over the same default range [0, max] and narrow the result.
+    static std::random_device rd;
+    static std::mt19937 gen( rd() );
+    static std::uniform_int_distribution<int> dis( 0, std::numeric_limits<char>::max() );
+    return static_cast<char>( dis( gen ) );
+}
+template<> int8_t genRand<int8_t>()
+{
+    // std::uniform_int_distribution is not specified for 8-bit types (undefined behavior,
+    // rejected by some standard libraries). Draw an int over the same default range
+    // [0, max] and narrow the result.
+    static std::random_device rd;
+    static std::mt19937 gen( rd() );
+    static std::uniform_int_distribution<int> dis( 0, std::numeric_limits<int8_t>::max() );
+    return static_cast<int8_t>( dis( gen ) );
+}
+template<> uint8_t genRand<uint8_t>()
+{
+    // std::uniform_int_distribution is not specified for 8-bit types (undefined behavior,
+    // rejected by some standard libraries). Draw an int over the same default range
+    // [0, max] and narrow the result.
+    static std::random_device rd;
+    static std::mt19937 gen( rd() );
+    static std::uniform_int_distribution<int> dis( 0, std::numeric_limits<uint8_t>::max() );
+    return static_cast<uint8_t>( dis( gen ) );
+}
+template<> int16_t genRand<int16_t>()
+{
+    // Seed source, engine and distribution are function-local statics shared by every call
+    static std::random_device seedSource;
+    static std::mt19937 engine( seedSource() );
+    static std::uniform_int_distribution<int16_t> uniform;
+    return uniform( engine );
+}
+template<> uint16_t genRand<uint16_t>()
+{
+    // Seed source, engine and distribution are function-local statics shared by every call
+    static std::random_device seedSource;
+    static std::mt19937 engine( seedSource() );
+    static std::uniform_int_distribution<uint16_t> uniform;
+    return uniform( engine );
+}
+template<> int32_t genRand<int32_t>()
+{
+    // Seed source, engine and distribution are function-local statics shared by every call
+    static std::random_device seedSource;
+    static std::mt19937 engine( seedSource() );
+    static std::uniform_int_distribution<int32_t> uniform;
+    return uniform( engine );
+}
+template<> uint32_t genRand<uint32_t>()
+{
+    // Seed source, engine and distribution are function-local statics shared by every call
+    static std::random_device seedSource;
+    static std::mt19937 engine( seedSource() );
+    static std::uniform_int_distribution<uint32_t> uniform;
+    return uniform( engine );
+}
+template<> int64_t genRand<int64_t>()
+{
+    // Seed source, engine and distribution are function-local statics shared by every call
+    static std::random_device seedSource;
+    static std::mt19937 engine( seedSource() );
+    static std::uniform_int_distribution<int64_t> uniform;
+    return uniform( engine );
+}
+template<> uint64_t genRand<uint64_t>()
+{
+    // Seed source, engine and distribution are function-local statics shared by every call
+    static std::random_device seedSource;
+    static std::mt19937 engine( seedSource() );
+    static std::uniform_int_distribution<uint64_t> uniform;
+    return uniform( engine );
+}
+template<> float genRand<float>()
+{
+    // Seed source, engine and distribution are function-local statics shared by every call
+    static std::random_device seedSource;
+    static std::mt19937 engine( seedSource() );
+    static std::uniform_real_distribution<float> uniform;
+    return uniform( engine );
+}
+template<> double genRand<double>()
+{
+    // Seed source, engine and distribution are function-local statics shared by every call
+    static std::random_device seedSource;
+    static std::mt19937 engine( seedSource() );
+    static std::uniform_real_distribution<double> uniform;
+    return uniform( engine );
+}
+template<> long double genRand<long double>()
+{
+    // NOTE(review): values are drawn from a double distribution and widened on return;
+    // confirm that double resolution is intended for long double
+    static std::random_device seedSource;
+    static std::mt19937 engine( seedSource() );
+    static std::uniform_real_distribution<double> uniform;
+    return uniform( engine );
+}
+
+
+/********************************************************
+ *  axpy                                                 *
+ ********************************************************/
+template<>
+void call_axpy<float>( size_t N, const float alpha, const float *x, float *y )
+{
+    ERROR("Not finished"); // placeholder: no axpy is computed and the arguments are unused
+}
+template<>
+void call_axpy<double>( size_t N, const double alpha, const double *x, double *y )
+{
+    ERROR("Not finished"); // placeholder: no axpy is computed and the arguments are unused
+}
+
+
+/********************************************************
+ *  Multiply two arrays                                  *
+ ********************************************************/
+template<>
+void call_gemv<double>(
+    size_t M, size_t N, double alpha, double beta, const double *A, const double *x, double *y )
+{
+    ERROR("Not finished"); // placeholder: no product is computed and the arguments are unused
+}
+template<>
+void call_gemv<float>(
+    size_t M, size_t N, float alpha, float beta, const float *A, const float *x, float *y )
+{
+    ERROR("Not finished"); // placeholder: no product is computed and the arguments are unused
+}
+template<>
+void call_gemm<double>( size_t M,
+                        size_t N,
+                        size_t K,
+                        double alpha,
+                        double beta,
+                        const double *A,
+                        const double *B,
+                        double *C )
+{
+    ERROR("Not finished"); // placeholder: no product is computed and the arguments are unused
+}
+template<>
+void call_gemm<float>( size_t M,
+                       size_t N,
+                       size_t K,
+                       float alpha,
+                       float beta,
+                       const float *A,
+                       const float *B,
+                       float *C )
+{
+    ERROR("Not finished"); // placeholder: no product is computed and the arguments are unused
+}
+
--- a/common/FunctionTable.h
+++ b/common/FunctionTable.h
@ -23,6 +23,7 @@
 #include <functional>


+
 /*!
 * Class FunctionTable is a serial function table class that defines
 *   a series of operations that can be performed on the Array class.
@ -41,38 +42,55 @@ public:

    /*!
     * Perform a reduce operator y = f(x)
-     * @param[in] op        The function operation
-     *                      Note: the operator is a template parameter
-     *                      (compared to a std::function to improve performance)
-     * @param[in] A         The array to operate on
-     * @return              The reduction
+     * @param[in] op            The function operation
+     *                          Note: the operator is a template parameter to improve performance
+     * @param[in] A             The array to operate on
+     * @param[in] initialValue  The initial value for the reduction (0 for sum, +/- inf for min/max,
+     * ...)
+     * @return                  The reduction
     */
    template<class TYPE, class FUN, typename LAMBDA>
-    static inline TYPE reduce( LAMBDA &op, const Array<TYPE, FUN> &A );
+    static inline TYPE reduce( LAMBDA &op, const Array<TYPE, FUN> &A, const TYPE &initialValue );
+
+    /*!
+     * Perform a reduce operator z = f(x,y)
+     * @param[in] op            The function operation
+     *                          Note: the operator is a template parameter to improve performance
+     * @param[in] A             The first array to operate on
+     * @param[in] B             The second array to operate on
+     * @param[in] initialValue  The initial value for the reduction (0 for sum, +/- inf for min/max,
+     * ...)
+     * @return                  The reduction
+     */
+    template<class TYPE, class FUN, typename LAMBDA>
+    static inline TYPE reduce( LAMBDA &op,
+                               const Array<TYPE, FUN> &A,
+                               const Array<TYPE, FUN> &B,
+                               const TYPE &initialValue );

    /*!
     * Perform a element-wise operation y = f(x)
-     * @param[in] fun       The function operation
-     *                      Note: the operator is a template parameter
-     *                      (compared to a std::function to improve performance)
-     * @param[in] x         The input array to operate on
-     * @param[out] y        The output array
+     * @param[in] fun           The function operation
+     *                          Note: the function is a template parameter to improve performance
+     * @param[in] x             The input array to operate on
+     * @param[out] y            The output array
     */
    template<class TYPE, class FUN, typename LAMBDA>
    static inline void transform( LAMBDA &fun, const Array<TYPE, FUN> &x, Array<TYPE, FUN> &y );

    /*!
     * Perform a element-wise operation z = f(x,y)
-     * @param[in] fun       The function operation
-     *                      Note: the operator is a template parameter
-     *                      (compared to a std::function to improve performance)
-     * @param[in] x         The first array
-     * @param[in] y         The second array
-     * @param[out] z        The result
+     * @param[in] fun           The function operation
+     *                          Note: the function is a template parameter to improve performance
+     * @param[in] x             The first array
+     * @param[in] y             The second array
+     * @param[out] z            The output array
     */
    template<class TYPE, class FUN, typename LAMBDA>
-    static inline void transform(
-        LAMBDA &fun, const Array<TYPE, FUN> &x, const Array<TYPE, FUN> &y, Array<TYPE, FUN> &z );
+    static inline void transform( LAMBDA &fun,
+                                  const Array<TYPE, FUN> &x,
+                                  const Array<TYPE, FUN> &y,
+                                  Array<TYPE, FUN> &z );

    /*!
     * Multiply two arrays
@ -81,8 +99,8 @@ public:
     * @param[out] c            The output array
     */
    template<class TYPE, class FUN>
-    static inline void multiply(
-        const Array<TYPE, FUN> &a, const Array<TYPE, FUN> &b, Array<TYPE, FUN> &c );
+    static void
+    multiply( const Array<TYPE, FUN> &a, const Array<TYPE, FUN> &b, Array<TYPE, FUN> &c );

    /*!
-     * Perform dgemv/dgemm equavalent operation ( C = alpha*A*B + beta*C )
+     * Perform dgemv/dgemm equivalent operation ( C = alpha*A*B + beta*C )
@ -90,11 +108,14 @@ public:
     * @param[in] A             The first array
     * @param[in] B             The second array
-     * @param[in] beta          The scalar value alpha
+     * @param[in] beta          The scalar value beta
-     * @param[in,out] c         The output array C
+     * @param[in,out] C         The output array C
     */
    template<class TYPE, class FUN>
-    static void gemm( const TYPE alpha, const Array<TYPE, FUN> &A, const Array<TYPE, FUN> &B,
-        const TYPE beta, Array<TYPE, FUN> &C );
+    static void gemm( const TYPE alpha,
+                      const Array<TYPE, FUN> &A,
+                      const Array<TYPE, FUN> &B,
+                      const TYPE beta,
+                      Array<TYPE, FUN> &C );

    /*!
-     * Perform axpy equavalent operation ( y = alpha*x + y )
+     * Perform axpy equivalent operation ( y = alpha*x + y )
@ -114,9 +135,84 @@ public:
    template<class TYPE, class FUN>
    static bool equals( const Array<TYPE, FUN> &A, const Array<TYPE, FUN> &B, TYPE tol );

+    template<class TYPE>
+    static inline void gemmWrapper( char TRANSA,
+                                    char TRANSB,
+                                    int M,
+                                    int N,
+                                    int K,
+                                    TYPE alpha,
+                                    const TYPE *A,
+                                    int LDA,
+                                    const TYPE *B,
+                                    int LDB,
+                                    TYPE beta,
+                                    TYPE *C,
+                                    int LDC );
+
+
+    /* Specialized Functions */
+
+    /*!
+     * Perform a element-wise operation B = max(A , 0)
+     * @param[in] A             The input array
+     * @param[out] B            The output array
+     */
+    template<class TYPE, class FUN, class ALLOC>
+    static void transformReLU( const Array<TYPE, FUN, ALLOC> &A, Array<TYPE, FUN, ALLOC> &B );
+
+    /*!
+     * Perform a element-wise operation B = |A|
+     * @param[in] A             The array to operate on
+     * @param[out] B            The output array
+     */
+    template<class TYPE, class FUN, class ALLOC>
+    static void transformAbs( const Array<TYPE, FUN, ALLOC> &A, Array<TYPE, FUN, ALLOC> &B );
+
+    /*!
+     * Perform a element-wise operation B = tanh(A)
+     * @param[in] A             The array to operate on
+     * @param[out] B            The output array
+     */
+    template<class TYPE, class FUN, class ALLOC>
+    static void transformTanh( const Array<TYPE, FUN, ALLOC> &A, Array<TYPE, FUN, ALLOC> &B );
+
+    /*!
+     * Perform a element-wise operation B = max(-1 , min(1 , A) )
+     * @param[in] A             The array to operate on
+     * @param[out] B            The output array
+     */
+    template<class TYPE, class FUN, class ALLOC>
+    static void transformHardTanh( const Array<TYPE, FUN, ALLOC> &A, Array<TYPE, FUN, ALLOC> &B );
+
+    /*!
+     * Perform a element-wise operation B = 1 / (1 + exp(-A))
+     * @param[in] A             The array to operate on
+     * @param[out] B            The output array
+     */
+    template<class TYPE, class FUN, class ALLOC>
+    static void transformSigmoid( const Array<TYPE, FUN, ALLOC> &A, Array<TYPE, FUN, ALLOC> &B );
+
+    /*!
+     * Perform a element-wise operation B = log(exp(A) + 1)
+     * @param[in] A             The array to operate on
+     * @param[out] B            The output array
+     */
+    template<class TYPE, class FUN, class ALLOC>
+    static void transformSoftPlus( const Array<TYPE, FUN, ALLOC> &A, Array<TYPE, FUN, ALLOC> &B );
+
+    /*!
+     * Sum the elements of the Array
+     * @param[in] A             The array to sum
+     */
+    template<class TYPE, class FUN, class ALLOC>
+    static TYPE sum( const Array<TYPE, FUN, ALLOC> &A );

 private:
    FunctionTable();
+
+    template<class T>
+    static inline void rand( size_t N, T *x );
 };


--- a/common/FunctionTable.hpp
+++ b/common/FunctionTable.hpp
@ -34,7 +34,7 @@
 #define included_FunctionTable_hpp

 #include "common/FunctionTable.h"
-#include "common/Utilities.h"
+#include "common/UtilityMacros.h"

 #include <algorithm>
 #include <cstring>
@ -42,33 +42,16 @@
 #include <random>


+
 /********************************************************
 *  Random number initialization                         *
 ********************************************************/
-template<class TYPE>
-static inline typename std::enable_if<std::is_integral<TYPE>::value>::type genRand(
-    size_t N, TYPE* x )
-{
-    std::random_device rd;
-    std::mt19937 gen( rd() );
-    std::uniform_int_distribution<TYPE> dis;
-    for ( size_t i = 0; i < N; i++ )
-        x[i] = dis( gen );
-}
-template<class TYPE>
-static inline typename std::enable_if<std::is_floating_point<TYPE>::value>::type genRand(
-    size_t N, TYPE* x )
-{
-    std::random_device rd;
-    std::mt19937 gen( rd() );
-    std::uniform_real_distribution<TYPE> dis( 0, 1 );
-    for ( size_t i = 0; i < N; i++ )
-        x[i] = dis( gen );
-}
+template<class TYPE> TYPE genRand();
 template<class TYPE, class FUN>
-inline void FunctionTable::rand( Array<TYPE, FUN>& x )
+inline void FunctionTable::rand( Array<TYPE, FUN> &x )
 {
-    genRand<TYPE>( x.length(), x.data() );
+    for ( size_t i = 0; i < x.length(); i++ )
+        x( i ) = genRand<TYPE>();
 }


@ -76,24 +59,39 @@ inline void FunctionTable::rand( Array<TYPE, FUN>& x )
 *  Reduction                                            *
 ********************************************************/
 template<class TYPE, class FUN, typename LAMBDA>
-inline TYPE FunctionTable::reduce( LAMBDA& op, const Array<TYPE, FUN>& A )
+inline TYPE FunctionTable::reduce( LAMBDA &op, const Array<TYPE, FUN> &A, const TYPE &initialValue )
 {
    if ( A.length() == 0 )
-        return TYPE();
-    const TYPE* x  = A.data();
-    TYPE y         = x[0];
-    const size_t N = A.length();
-    for ( size_t i = 1; i < N; i++ )
+        return initialValue; // empty input: the fold of nothing is its seed, not TYPE()
+    const TYPE *x = A.data();
+    TYPE y        = initialValue; // running result, seeded by the caller
+    for ( size_t i = 0; i < A.length(); i++ )
        y = op( x[i], y );
    return y;
 }
+template<class TYPE, class FUN, typename LAMBDA>
+inline TYPE FunctionTable::reduce( LAMBDA &op,
+                                   const Array<TYPE, FUN> &A,
+                                   const Array<TYPE, FUN> &B,
+                                   const TYPE &initialValue )
+{
+    ARRAY_ASSERT( A.length() == B.length() );
+    if ( A.length() == 0 )
+        return initialValue; // empty input: the fold of nothing is its seed, not TYPE()
+    const TYPE *x = A.data();
+    const TYPE *y = B.data();
+    TYPE z        = initialValue; // running result, seeded by the caller
+    for ( size_t i = 0; i < A.length(); i++ )
+        z = op( x[i], y[i], z ); // op receives ( A element, B element, running result )
+    return z;
+}


 /********************************************************
 *  Unary transformation                                 *
 ********************************************************/
 template<class TYPE, class FUN, typename LAMBDA>
-inline void FunctionTable::transform( LAMBDA& fun, const Array<TYPE, FUN>& x, Array<TYPE, FUN>& y )
+inline void FunctionTable::transform( LAMBDA &fun, const Array<TYPE, FUN> &x, Array<TYPE, FUN> &y )
 {
    y.resize( x.size() );
    const size_t N = x.length();
@ -101,8 +99,10 @@ inline void FunctionTable::transform( LAMBDA& fun, const Array<TYPE, FUN>& x, Ar
        y( i ) = fun( x( i ) );
 }
 template<class TYPE, class FUN, typename LAMBDA>
-inline void FunctionTable::transform(
-    LAMBDA& fun, const Array<TYPE, FUN>& x, const Array<TYPE, FUN>& y, Array<TYPE, FUN>& z )
+inline void FunctionTable::transform( LAMBDA &fun,
+                                      const Array<TYPE, FUN> &x,
+                                      const Array<TYPE, FUN> &y,
+                                      Array<TYPE, FUN> &z )
 {
    if ( x.size() != y.size() )
        throw std::logic_error( "Sizes of x and y do not match" );
@ -117,25 +117,19 @@ inline void FunctionTable::transform(
 *  axpy                                                 *
 ********************************************************/
 template<class TYPE>
-inline void call_axpy( size_t N, const TYPE alpha, const TYPE* x, TYPE* y );
+void call_axpy( size_t N, const TYPE alpha, const TYPE *x, TYPE *y );
 template<>
-inline void call_axpy<float>( size_t, const float, const float*, float* )
-{
-    throw std::logic_error( "LapackWrappers not configured" );
-}
+void call_axpy<float>( size_t N, const float alpha, const float *x, float *y );
 template<>
-inline void call_axpy<double>( size_t, const double, const double*, double* )
-{
-    throw std::logic_error( "LapackWrappers not configured" );
-}
+void call_axpy<double>( size_t N, const double alpha, const double *x, double *y );
 template<class TYPE>
-inline void call_axpy( size_t N, const TYPE alpha, const TYPE* x, TYPE* y )
+void call_axpy( size_t N, const TYPE alpha, const TYPE *x, TYPE *y )
 {
    for ( size_t i = 0; i < N; i++ )
        y[i] += alpha * x[i];
 }
 template<class TYPE, class FUN>
-void FunctionTable::axpy( const TYPE alpha, const Array<TYPE, FUN>& x, Array<TYPE, FUN>& y )
+void FunctionTable::axpy( const TYPE alpha, const Array<TYPE, FUN> &x, Array<TYPE, FUN> &y )
 {
    if ( x.size() != y.size() )
        throw std::logic_error( "Array sizes do not match" );
@ -147,21 +141,15 @@ void FunctionTable::axpy( const TYPE alpha, const Array<TYPE, FUN>& x, Array<TYP
 *  Multiply two arrays                                  *
 ********************************************************/
 template<class TYPE>
-inline void call_gemv( size_t M, size_t N, TYPE alpha, TYPE beta, const TYPE* A, const TYPE* x, TYPE* y );
+void call_gemv( size_t M, size_t N, TYPE alpha, TYPE beta, const TYPE *A, const TYPE *x, TYPE *y );
 template<>
-inline void call_gemv<double>(
-    size_t, size_t, double, double, const double*, const double*, double* )
-{
-    throw std::logic_error( "LapackWrappers not configured" );
-}
+void call_gemv<double>(
+    size_t M, size_t N, double alpha, double beta, const double *A, const double *x, double *y );
 template<>
-inline void call_gemv<float>( size_t, size_t, float, float, const float*, const float*, float* )
-{
-    throw std::logic_error( "LapackWrappers not configured" );
-}
+void call_gemv<float>(
+    size_t M, size_t N, float alpha, float beta, const float *A, const float *x, float *y );
 template<class TYPE>
-inline void call_gemv(
-    size_t M, size_t N, TYPE alpha, TYPE beta, const TYPE* A, const TYPE* x, TYPE* y )
+void call_gemv( size_t M, size_t N, TYPE alpha, TYPE beta, const TYPE *A, const TYPE *x, TYPE *y )
 {
    for ( size_t i = 0; i < M; i++ )
        y[i] = beta * y[i];
@ -171,21 +159,29 @@ inline void call_gemv(
    }
 }
 template<class TYPE>
-inline void call_gemm(
-    size_t M, size_t N, size_t K, TYPE alpha, TYPE beta, const TYPE* A, const TYPE* B, TYPE* C );
+void call_gemm(
+    size_t M, size_t N, size_t K, TYPE alpha, TYPE beta, const TYPE *A, const TYPE *B, TYPE *C );
 template<>
-inline void call_gemm<double>( size_t, size_t, size_t, double, double, const double*, const double*, double* )
-{
-    throw std::logic_error( "LapackWrappers not configured" );
-}
+void call_gemm<double>( size_t M,
+                        size_t N,
+                        size_t K,
+                        double alpha,
+                        double beta,
+                        const double *A,
+                        const double *B,
+                        double *C );
 template<>
-inline void call_gemm<float>( size_t, size_t, size_t, float, float, const float*, const float*, float* )
-{
-    throw std::logic_error( "LapackWrappers not configured" );
-}
+void call_gemm<float>( size_t M,
+                       size_t N,
+                       size_t K,
+                       float alpha,
+                       float beta,
+                       const float *A,
+                       const float *B,
+                       float *C );
 template<class TYPE>
-inline void call_gemm(
-    size_t M, size_t N, size_t K, TYPE alpha, TYPE beta, const TYPE* A, const TYPE* B, TYPE* C )
+void call_gemm(
+    size_t M, size_t N, size_t K, TYPE alpha, TYPE beta, const TYPE *A, const TYPE *B, TYPE *C )
 {
    for ( size_t i = 0; i < K * M; i++ )
        C[i] = beta * C[i];
@ -197,16 +193,17 @@ inline void call_gemm(
    }
 }
 template<class TYPE, class FUN>
-void FunctionTable::gemm( const TYPE alpha, const Array<TYPE, FUN>& a, const Array<TYPE, FUN>& b,
-    const TYPE beta, Array<TYPE, FUN>& c )
+void FunctionTable::gemm( const TYPE alpha,
+                          const Array<TYPE, FUN> &a,
+                          const Array<TYPE, FUN> &b,
+                          const TYPE beta,
+                          Array<TYPE, FUN> &c )
 {
+    if ( a.size( 1 ) != b.size( 0 ) )
+        throw std::logic_error( "Inner dimensions must match" );
    if ( a.ndim() == 2 && b.ndim() == 1 ) {
-        if ( a.size( 1 ) != b.size( 0 ) )
-            throw std::logic_error( "Inner dimensions must match" );
        call_gemv<TYPE>( a.size( 0 ), a.size( 1 ), alpha, beta, a.data(), b.data(), c.data() );
    } else if ( a.ndim() <= 2 && b.ndim() <= 2 ) {
-        if ( a.size( 1 ) != b.size( 0 ) )
-            throw std::logic_error( "Inner dimensions must match" );
        call_gemm<TYPE>(
            a.size( 0 ), a.size( 1 ), b.size( 1 ), alpha, beta, a.data(), b.data(), c.data() );
    } else {
@ -214,17 +211,16 @@ void FunctionTable::gemm( const TYPE alpha, const Array<TYPE, FUN>& a, const Arr
    }
 }
 template<class TYPE, class FUN>
-void FunctionTable::multiply(
-    const Array<TYPE, FUN>& a, const Array<TYPE, FUN>& b, Array<TYPE, FUN>& c )
+void FunctionTable::multiply( const Array<TYPE, FUN> &a,
+                              const Array<TYPE, FUN> &b,
+                              Array<TYPE, FUN> &c )
 {
+    if ( a.size( 1 ) != b.size( 0 ) )
+        throw std::logic_error( "Inner dimensions must match" );
    if ( a.ndim() == 2 && b.ndim() == 1 ) {
-        if ( a.size( 1 ) != b.size( 0 ) )
-            throw std::logic_error( "Inner dimensions must match" );
        c.resize( a.size( 0 ) );
        call_gemv<TYPE>( a.size( 0 ), a.size( 1 ), 1, 0, a.data(), b.data(), c.data() );
    } else if ( a.ndim() <= 2 && b.ndim() <= 2 ) {
-        if ( a.size( 1 ) != b.size( 0 ) )
-            throw std::logic_error( "Inner dimensions must match" );
        c.resize( a.size( 0 ), b.size( 1 ) );
        call_gemm<TYPE>(
            a.size( 0 ), a.size( 1 ), b.size( 1 ), 1, 0, a.data(), b.data(), c.data() );
@ -238,8 +234,8 @@ void FunctionTable::multiply(
 *  Check if two arrays are equal                        *
 ********************************************************/
 template<class TYPE, class FUN>
-inline typename std::enable_if<!std::is_floating_point<TYPE>::value, bool>::type
-FunctionTableCompare( const Array<TYPE, FUN>& a, const Array<TYPE, FUN>& b, TYPE )
+inline typename std::enable_if<std::is_integral<TYPE>::value, bool>::type
+FunctionTableCompare( const Array<TYPE, FUN> &a, const Array<TYPE, FUN> &b, TYPE )
 {
    bool pass = true;
    if ( a.size() != b.size() )
@ -250,7 +246,7 @@ FunctionTableCompare( const Array<TYPE, FUN>& a, const Array<TYPE, FUN>& b, TYPE
 }
 template<class TYPE, class FUN>
 inline typename std::enable_if<std::is_floating_point<TYPE>::value, bool>::type
-FunctionTableCompare( const Array<TYPE, FUN>& a, const Array<TYPE, FUN>& b, TYPE tol )
+FunctionTableCompare( const Array<TYPE, FUN> &a, const Array<TYPE, FUN> &b, TYPE tol )
 {
    bool pass = true;
    if ( a.size() != b.size() )
@ -260,10 +256,89 @@ FunctionTableCompare( const Array<TYPE, FUN>& a, const Array<TYPE, FUN>& b, TYPE
    return pass;
 }
 template<class TYPE, class FUN>
-bool FunctionTable::equals( const Array<TYPE, FUN>& a, const Array<TYPE, FUN>& b, TYPE tol )
+bool FunctionTable::equals( const Array<TYPE, FUN> &a, const Array<TYPE, FUN> &b, TYPE tol )
 {
    return FunctionTableCompare( a, b, tol );
 }


+/********************************************************
+ *  Specialized Functions                                *
+ ********************************************************/
+template<class TYPE, class FUN, class ALLOC>
+void FunctionTable::transformReLU( const Array<TYPE, FUN, ALLOC> &A, Array<TYPE, FUN, ALLOC> &B )
+{
+    const auto &rectify = []( const TYPE &v ) { return std::max( v, static_cast<TYPE>( 0 ) ); };
+    transform( rectify, A, B ); // B( i ) = max( A( i ), 0 ) for every element
+}
+
+template<class TYPE, class FUN, class ALLOC>
+void FunctionTable::transformAbs( const Array<TYPE, FUN, ALLOC> &A, Array<TYPE, FUN, ALLOC> &B )
+{
+    const auto &magnitude = []( const TYPE &v ) { return std::abs( v ); };
+    B.resize( A.size() ); // give the output the input's dimensions
+    transform( magnitude, A, B ); // B( i ) = |A( i )|
+}
+template<class TYPE, class FUN, class ALLOC>
+void FunctionTable::transformTanh( const Array<TYPE, FUN, ALLOC> &A, Array<TYPE, FUN, ALLOC> &B )
+{
+    const auto &hyperbolicTangent = []( const TYPE &v ) { return tanh( v ); };
+    B.resize( A.size() ); // give the output the input's dimensions
+    transform( hyperbolicTangent, A, B ); // B( i ) = tanh( A( i ) )
+}
+
+template<class TYPE, class FUN, class ALLOC>
+void FunctionTable::transformHardTanh( const Array<TYPE, FUN, ALLOC> &A,
+                                       Array<TYPE, FUN, ALLOC> &B )
+{
+    const auto &clampToUnit = []( const TYPE &v ) {
+        return std::max( -static_cast<TYPE>( 1.0 ), std::min( static_cast<TYPE>( 1.0 ), v ) );
+    };
+    B.resize( A.size() ); // give the output the input's dimensions
+    transform( clampToUnit, A, B ); // B( i ) = max( -1, min( 1, A( i ) ) )
+}
+
+template<class TYPE, class FUN, class ALLOC>
+void FunctionTable::transformSigmoid( const Array<TYPE, FUN, ALLOC> &A, Array<TYPE, FUN, ALLOC> &B )
+{
+    const auto &logistic = []( const TYPE &v ) { return 1.0 / ( 1.0 + exp( -v ) ); };
+    B.resize( A.size() ); // give the output the input's dimensions
+    transform( logistic, A, B ); // B( i ) = 1 / ( 1 + exp( -A( i ) ) )
+}
+
+template<class TYPE, class FUN, class ALLOC>
+void FunctionTable::transformSoftPlus( const Array<TYPE, FUN, ALLOC> &A,
+                                       Array<TYPE, FUN, ALLOC> &B )
+{
+    B.resize( A.size() ); // next line: log( 1 + e^a ) == a + log1p( e^-a ), used for a > 0 so exp cannot overflow
+    const auto &fun = []( const TYPE &a ) { return a > 0 ? a + log1p( exp( -a ) ) : log1p( exp( a ) ); };
+    transform( fun, A, B ); // B( i ) = log( exp( A( i ) ) + 1 )
+}
+
+template<class TYPE, class FUN, class ALLOC>
+TYPE FunctionTable::sum( const Array<TYPE, FUN, ALLOC> &A )
+{
+    const auto &add = []( const TYPE &lhs, const TYPE &rhs ) { return lhs + rhs; };
+    return reduce( add, A, static_cast<TYPE>( 0 ) ); // fold with +, starting from zero
+}
+
+template<class TYPE>
+inline void FunctionTable::gemmWrapper( char TRANSA,
+                                        char TRANSB,
+                                        int M,
+                                        int N,
+                                        int K,
+                                        TYPE alpha,
+                                        const TYPE *A,
+                                        int LDA,
+                                        const TYPE *B,
+                                        int LDB,
+                                        TYPE beta,
+                                        TYPE *C,
+                                        int LDC )
+{
+    ERROR("Not finished"); // placeholder: the wrapper is not implemented; no argument is used
+}
+
+
 #endif
--- a/common/ScaLBL.cpp
+++ b/common/ScaLBL.cpp
@ -2313,24 +2313,70 @@ void ScaLBL_Communicator::D3Q7_Ion_Concentration_BC_Z(int *neighborList, double
 	}
 }

-void ScaLBL_Communicator::D3Q7_Ion_Flux_BC_z(int *neighborList, double *fq, double Cin, double tau, double *VelocityZ, int time){
+void ScaLBL_Communicator::D3Q7_Ion_Flux_Diff_BC_z(int *neighborList, double *fq, double Cin, double tau, double *VelocityZ, int time){
 	if (kproc == 0) {
 		if (time%2==0){
-			ScaLBL_D3Q7_AAeven_Ion_Flux_BC_z(dvcSendList_z, fq, Cin, tau, VelocityZ, sendCount_z, N);
+			ScaLBL_D3Q7_AAeven_Ion_Flux_Diff_BC_z(dvcSendList_z, fq, Cin, tau, VelocityZ, sendCount_z, N);
 		}
 		else{
-			ScaLBL_D3Q7_AAodd_Ion_Flux_BC_z(neighborList, dvcSendList_z, fq, Cin, tau, VelocityZ, sendCount_z, N);
+			ScaLBL_D3Q7_AAodd_Ion_Flux_Diff_BC_z(neighborList, dvcSendList_z, fq, Cin, tau, VelocityZ, sendCount_z, N);
 		}
 	}
 }

-void ScaLBL_Communicator::D3Q7_Ion_Flux_BC_Z(int *neighborList, double *fq, double Cout, double tau, double *VelocityZ, int time){
+void ScaLBL_Communicator::D3Q7_Ion_Flux_Diff_BC_Z(int *neighborList, double *fq, double Cout, double tau, double *VelocityZ, int time){
 	if (kproc == nprocz-1){
 		if (time%2==0){
-			ScaLBL_D3Q7_AAeven_Ion_Flux_BC_Z(dvcSendList_Z, fq, Cout, tau, VelocityZ, sendCount_Z, N);
+			ScaLBL_D3Q7_AAeven_Ion_Flux_Diff_BC_Z(dvcSendList_Z, fq, Cout, tau, VelocityZ, sendCount_Z, N);
 		}
 		else{
-			ScaLBL_D3Q7_AAodd_Ion_Flux_BC_Z(neighborList, dvcSendList_Z, fq, Cout, tau, VelocityZ, sendCount_Z, N);
+			ScaLBL_D3Q7_AAodd_Ion_Flux_Diff_BC_Z(neighborList, dvcSendList_Z, fq, Cout, tau, VelocityZ, sendCount_Z, N);
+		}
+	}
+}
+
+void ScaLBL_Communicator::D3Q7_Ion_Flux_DiffAdvc_BC_z(int *neighborList, double *fq, double Cin, double tau, double *VelocityZ, int time){
+	if (kproc != 0) return; // nothing to do unless kproc is 0
+	const bool evenStep = (time%2 == 0); // parity of time picks the AAeven or the AAodd routine
+	if (evenStep){
+		ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvc_BC_z(dvcSendList_z, fq, Cin, tau, VelocityZ, sendCount_z, N);
+	}
+	else{
+		ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvc_BC_z(neighborList, dvcSendList_z, fq, Cin, tau, VelocityZ, sendCount_z, N);
+	}
+}
+
+void ScaLBL_Communicator::D3Q7_Ion_Flux_DiffAdvc_BC_Z(int *neighborList, double *fq, double Cout, double tau, double *VelocityZ, int time){
+	if (kproc != nprocz-1) return; // nothing to do unless kproc is nprocz-1
+	const bool evenStep = (time%2 == 0); // parity of time picks the AAeven or the AAodd routine
+	if (evenStep){
+		ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvc_BC_Z(dvcSendList_Z, fq, Cout, tau, VelocityZ, sendCount_Z, N);
+	}
+	else{
+		ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvc_BC_Z(neighborList, dvcSendList_Z, fq, Cout, tau, VelocityZ, sendCount_Z, N);
+	}
+}
+
+void ScaLBL_Communicator::D3Q7_Ion_Flux_DiffAdvcElec_BC_z(int *neighborList, double *fq, double Cin, double tau, double *VelocityZ, double *ElectricField_Z,
+                                                          double Di, double zi, double Vt, int time){
+	if (kproc != 0) return; // nothing to do unless kproc is 0
+	const bool evenStep = (time%2 == 0); // parity of time picks the AAeven or the AAodd routine
+	if (evenStep){
+		ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvcElec_BC_z(dvcSendList_z, fq, Cin, tau, VelocityZ, ElectricField_Z, Di, zi, Vt, sendCount_z, N);
+	}
+	else{
+		ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvcElec_BC_z(neighborList, dvcSendList_z, fq, Cin, tau, VelocityZ, ElectricField_Z, Di, zi, Vt, sendCount_z, N);
+	}
+}
+
+void ScaLBL_Communicator::D3Q7_Ion_Flux_DiffAdvcElec_BC_Z(int *neighborList, double *fq, double Cout, double tau, double *VelocityZ, double *ElectricField_Z,
+                                                          double Di, double zi, double Vt, int time){
+	if (kproc == nprocz-1){ // acts only when kproc equals nprocz-1; otherwise the call does nothing
+		if (time%2==0){ // even time: AAeven routine (no neighbor list); otherwise the AAodd routine below
+			ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvcElec_BC_Z(dvcSendList_Z, fq, Cout, tau, VelocityZ, ElectricField_Z, Di, zi, Vt, sendCount_Z, N);
+		}
+		else{
+			ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvcElec_BC_Z(neighborList, dvcSendList_Z, fq, Cout, tau, VelocityZ, ElectricField_Z, Di, zi, Vt, sendCount_Z, N);
 		}
 	}
 }
--- a/common/ScaLBL.h
+++ b/common/ScaLBL.h
@ -104,12 +104,12 @@ extern "C" void ScaLBL_D3Q19_AAodd_GreyscaleColor(int *d_neighborList, int *Map,
 		double Fx, double Fy, double Fz, int strideY, int strideZ, int start, int finish, int Np);

 extern "C" void ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dist, double *Aq, double *Bq, double *Den, 
-        double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *Poros,double *Perm,double *Vel, double *Pressure,
+        double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *GreyKn, double *GreyKw, double *Poros,double *Perm,double *Vel, double *Pressure,
        double rhoA, double rhoB, double tauA, double tauB,double tauA_eff,double tauB_eff, double alpha, double beta,
 		double Fx, double Fy, double Fz, bool RecoloringOff, int strideY, int strideZ, int start, int finish, int Np);

 extern "C" void ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *d_neighborList, int *Map, double *dist, double *Aq, double *Bq, double *Den, 
-		double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *Poros,double *Perm,double *Vel,double *Pressure, 
+		double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *GreyKn, double *GreyKw, double *Poros, double *Perm,double *Vel,double *Pressure, 
        double rhoA, double rhoB, double tauA, double tauB, double tauA_eff,double tauB_eff, double alpha, double beta,
 		double Fx, double Fy, double Fz, bool RecoloringOff, int strideY, int strideZ, int start, int finish, int Np);

@ -330,13 +330,20 @@ extern "C" void ScaLBL_D3Q7_AAodd_Ion_Concentration_BC_z(int *d_neighborList, in

 extern "C" void ScaLBL_D3Q7_AAodd_Ion_Concentration_BC_Z(int *d_neighborList, int *list, double *dist, double Cout, int count, int Np);

-extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_BC_z(int *list, double *dist, double Cin, double tau, double *VelocityZ, int count, int Np);
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_Diff_BC_z(int *list, double *dist, double Cin, double tau, double *VelocityZ, int count, int Np);
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_Diff_BC_Z(int *list, double *dist, double Cout, double tau, double *VelocityZ, int count, int Np);
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_Diff_BC_z(int *d_neighborList, int *list, double *dist, double Cin, double tau, double *VelocityZ, int count, int Np);
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_Diff_BC_Z(int *d_neighborList, int *list, double *dist, double Cout, double tau, double *VelocityZ, int count, int Np);

-extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_BC_Z(int *list, double *dist, double Cout, double tau, double *VelocityZ, int count, int Np);
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvc_BC_z(int *list, double *dist, double Cin, double tau, double *VelocityZ, int count, int Np);
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvc_BC_Z(int *list, double *dist, double Cout, double tau, double *VelocityZ, int count, int Np);
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvc_BC_z(int *d_neighborList, int *list, double *dist, double Cin, double tau, double *VelocityZ, int count, int Np);
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvc_BC_Z(int *d_neighborList, int *list, double *dist, double Cout, double tau, double *VelocityZ, int count, int Np);

-extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_BC_z(int *d_neighborList, int *list, double *dist, double Cin, double tau, double *VelocityZ, int count, int Np);
-
-extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_BC_Z(int *d_neighborList, int *list, double *dist, double Cout, double tau, double *VelocityZ, int count, int Np);
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvcElec_BC_z(int *list, double *dist, double Cin, double tau, double *VelocityZ,double *ElectricField,double Di,double zi,double Vt,int count,int Np);
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvcElec_BC_Z(int *list, double *dist, double Cout, double tau, double *VelocityZ,double *ElectricField,double Di,double zi,double Vt,int count,int Np);
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvcElec_BC_z(int *d_neighborList, int *list, double *dist, double Cin, double tau, double *VelocityZ,double *ElectricField,double Di,double zi,double Vt, int count, int Np);
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvcElec_BC_Z(int *d_neighborList, int *list, double *dist, double Cout, double tau, double *VelocityZ,double *ElectricField,double Di,double zi,double Vt, int count, int Np);

 class ScaLBL_Communicator{
 public:
@ -409,8 +416,12 @@ public:
    void Poisson_D3Q7_BC_Z(int *Map, double *Psi, double Vout);
    void D3Q7_Ion_Concentration_BC_z(int *neighborList, double *fq, double Cin, int time);
    void D3Q7_Ion_Concentration_BC_Z(int *neighborList, double *fq, double Cout, int time);
-    void D3Q7_Ion_Flux_BC_z(int *neighborList, double *fq, double Cin, double tau, double *VelocityZ, int time);
-    void D3Q7_Ion_Flux_BC_Z(int *neighborList, double *fq, double Cout, double tau, double *VelocityZ, int time);
+    void D3Q7_Ion_Flux_Diff_BC_z(int *neighborList, double *fq, double Cin, double tau, double *VelocityZ, int time);
+    void D3Q7_Ion_Flux_Diff_BC_Z(int *neighborList, double *fq, double Cout, double tau, double *VelocityZ, int time);
+    void D3Q7_Ion_Flux_DiffAdvc_BC_z(int *neighborList, double *fq, double Cin, double tau, double *VelocityZ, int time);
+    void D3Q7_Ion_Flux_DiffAdvc_BC_Z(int *neighborList, double *fq, double Cout, double tau, double *VelocityZ, int time);
+    void D3Q7_Ion_Flux_DiffAdvcElec_BC_z(int *neighborList,double *fq,double Cin,double tau,double *VelocityZ,double *ElectricField_Z,double Di,double zi,double Vt, int time);
+    void D3Q7_Ion_Flux_DiffAdvcElec_BC_Z(int *neighborList,double *fq,double Cout,double tau,double *VelocityZ,double *ElectricField_Z,double Di,double zi,double Vt, int time);
    void GreyscaleSC_BC_z(int *Map, double *DenA, double *DenB, double vA, double vB);
    void GreyscaleSC_BC_Z(int *Map, double *DenA, double *DenB, double vA, double vB);
    void GreyscaleSC_Pressure_BC_z(int *neighborList, double *fqA, double *fqB, double dinA, double dinB, int time);
--- a/cpu/D3Q7BC.cpp
+++ b/cpu/D3Q7BC.cpp
@ -396,3 +396,406 @@ extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_BC_Z(int *d_neighborList, int *list,
 		dist[nr6] = f6;
 	}
 }
+
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_Diff_BC_z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+{
+    // Flux boundary condition (even step): for each of the count sites in list, rebuild the unknown f5 from the inward flux FluxIn and store it in dist[6*Np+n].
+    int idx,n; // idx: position in list; n: site index list[idx]
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f5)
+    double uz; // VelocityZ[n]
+	for (idx=0; idx<count; idx++){
+		n = list[idx];
+		f0 = dist[n];
+		f1 = dist[2*Np+n]; // in this kernel each fq is taken from the slot of its paired index (f1<-2, f2<-1, f3<-4, f4<-3, f6<-5)
+		f2 = dist[1*Np+n];
+		f3 = dist[4*Np+n];
+		f4 = dist[3*Np+n];
+		f6 = dist[5*Np+n];
+        fsum_partial = f0+f1+f2+f3+f4+f6;
+        uz = VelocityZ[n];
+		// Closed-form value of f5; no guard is applied, so tau==0.5 or uz==1.0 gives a zero divisor.
+        f5 =(FluxIn+(1.0-0.5/tau)*(f6+uz*fsum_partial))/(1.0-0.5/tau)/(1.0-uz); 
+		dist[6*Np+n] = f5; // only this slot is modified
+	}
+}
+
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_Diff_BC_Z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+{
+    // Flux boundary condition (even step), mirror of the _z kernel: rebuild the unknown f6 from the inward flux FluxIn and store it in dist[5*Np+n].
+    int idx,n; // idx: position in list; n: site index list[idx]
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f6)
+    double uz; // VelocityZ[n]
+	for (idx=0; idx<count; idx++){
+		n = list[idx];
+		f0 = dist[n];
+		f1 = dist[2*Np+n]; // in this kernel each fq is taken from the slot of its paired index (f1<-2, f2<-1, f3<-4, f4<-3, f5<-6)
+		f2 = dist[1*Np+n];
+		f3 = dist[4*Np+n];
+		f4 = dist[3*Np+n];
+		f5 = dist[6*Np+n];
+        fsum_partial = f0+f1+f2+f3+f4+f5;
+        uz = VelocityZ[n];
+		// Closed-form value of f6 (signs of the uz terms are flipped relative to _z); tau==0.5 or uz==-1.0 gives a zero divisor.
+        f6 =(FluxIn+(1.0-0.5/tau)*(f5-uz*fsum_partial))/(1.0-0.5/tau)/(1.0+uz); 
+		dist[5*Np+n] = f6; // only this slot is modified
+	}
+}
+
+
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_Diff_BC_z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+{
+    // Flux boundary condition (odd step): same f5 formula as the even-step _z kernel, but every access except f0 goes through d_neighborList.
+	int n; // site index list[idx]
+    int nread,nr5; // nread: address of the distribution being read; nr5: address where f5 is written
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f5)
+    double uz; // VelocityZ[n]
+
+	for (int idx=0; idx<count; idx++){
+
+		n = list[idx];
+		f0 = dist[n];
+
+		nread = d_neighborList[n]; // fq is located at the address stored in d_neighborList[n+(q-1)*Np]
+		f1 = dist[nread];
+
+		nread = d_neighborList[n+2*Np];
+		f3 = dist[nread];
+
+		nread = d_neighborList[n+Np];
+		f2 = dist[nread];
+
+		nread = d_neighborList[n+3*Np];
+		f4 = dist[nread];
+
+		nread = d_neighborList[n+5*Np];
+		f6 = dist[nread];
+
+        fsum_partial = f0+f1+f2+f3+f4+f6;
+        uz = VelocityZ[n];
+		// Closed-form value of f5; no guard is applied, so tau==0.5 or uz==1.0 gives a zero divisor.
+        f5 =(FluxIn+(1.0-0.5/tau)*(f6+uz*fsum_partial))/(1.0-0.5/tau)/(1.0-uz); 
+
+		// Unknown distribution f5 is stored at the address given by the q=5 neighbor entry
+		nr5 = d_neighborList[n+4*Np];
+		dist[nr5] = f5;
+	}
+}
+
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_Diff_BC_Z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+{
+    // Flux boundary condition (odd step), _Z side: rebuild the unknown f6 from the inward flux FluxIn. Matches the even-step _Z kernel and dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_Diff_BC_Z.
+	int n; // site index list[idx]
+    int nread,nr6; // nread: address of the distribution being read; nr6: address where f6 is written
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f6)
+    double uz; // VelocityZ[n]
+	for (int idx=0; idx<count; idx++){
+		n = list[idx];
+		f0 = dist[n];
+
+		nread = d_neighborList[n]; // fq is located at the address stored in d_neighborList[n+(q-1)*Np]
+		f1 = dist[nread];
+
+		nread = d_neighborList[n+2*Np];
+		f3 = dist[nread];
+
+		nread = d_neighborList[n+4*Np];
+		f5 = dist[nread];
+
+		nread = d_neighborList[n+Np];
+		f2 = dist[nread];
+
+		nread = d_neighborList[n+3*Np];
+		f4 = dist[nread];
+
+        fsum_partial = f0+f1+f2+f3+f4+f5;
+        uz = VelocityZ[n];
+		// Closed-form value of f6 (uz signs flipped relative to _z); tau==0.5 or uz==-1.0 gives a zero divisor.
+        f6 =(FluxIn+(1.0-0.5/tau)*(f5-uz*fsum_partial))/(1.0-0.5/tau)/(1.0+uz); 
+
+		// Unknown distribution f6 is stored at the address given by the q=6 neighbor entry
+		nr6 = d_neighborList[n+5*Np];
+		dist[nr6] = f6;
+	}
+}
+
+
+
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvc_BC_z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+{
+    // Flux boundary condition (even step, DiffAdvc variant): rebuild the unknown f5 from the inward flux FluxIn and store it in dist[6*Np+n].
+    int idx,n; // idx: position in list; n: site index list[idx]
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f5)
+    double uz; // VelocityZ[n]
+	for (idx=0; idx<count; idx++){
+
+		n = list[idx];
+		f0 = dist[n];
+		f1 = dist[2*Np+n]; // in this kernel each fq is taken from the slot of its paired index (f1<-2, f2<-1, f3<-4, f4<-3, f6<-5)
+		f2 = dist[1*Np+n];
+		f3 = dist[4*Np+n];
+		f4 = dist[3*Np+n];
+		f6 = dist[5*Np+n];
+        fsum_partial = f0+f1+f2+f3+f4+f6;
+        uz = VelocityZ[n];
+		// Closed-form value of f5; the uz terms are weighted by 0.5/tau here. No guard against (1.0-0.5/tau+0.5*uz/tau)==0.
+        f5 =(FluxIn+(1.0-0.5/tau)*f6-0.5*uz*fsum_partial/tau)/(1.0-0.5/tau+0.5*uz/tau); 
+		dist[6*Np+n] = f5; // only this slot is modified
+	}
+}
+
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvc_BC_Z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+{
+    // Flux boundary condition (even step, DiffAdvc variant), mirror of the _z kernel: rebuild the unknown f6 and store it in dist[5*Np+n].
+    int idx,n; // idx: position in list; n: site index list[idx]
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f6)
+    double uz; // VelocityZ[n]
+	for (idx=0; idx<count; idx++){
+		n = list[idx];
+		f0 = dist[n];
+		f1 = dist[2*Np+n]; // in this kernel each fq is taken from the slot of its paired index (f1<-2, f2<-1, f3<-4, f4<-3, f5<-6)
+		f2 = dist[1*Np+n];
+		f3 = dist[4*Np+n];
+		f4 = dist[3*Np+n];
+		f5 = dist[6*Np+n];
+        fsum_partial = f0+f1+f2+f3+f4+f5;
+        uz = VelocityZ[n];
+		// Closed-form value of f6 (uz signs flipped relative to _z). No guard against (1.0-0.5/tau-0.5*uz/tau)==0.
+        f6 =(FluxIn+(1.0-0.5/tau)*f5+0.5*uz*fsum_partial/tau)/(1.0-0.5/tau-0.5*uz/tau); 
+		dist[5*Np+n] = f6; // only this slot is modified
+	}
+}
+
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvc_BC_z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+{
+    // Flux boundary condition (odd step, DiffAdvc variant): same f5 formula as the even-step _z kernel, with accesses routed through d_neighborList.
+	int idx, n; // idx: position in list; n: site index list[idx]
+    int nread,nr5; // nread: address of the distribution being read; nr5: address where f5 is written
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f5)
+    double uz; // VelocityZ[n]
+	for (idx=0; idx<count; idx++){
+		n = list[idx];
+		f0 = dist[n];
+
+		nread = d_neighborList[n]; // fq is located at the address stored in d_neighborList[n+(q-1)*Np]
+		f1 = dist[nread];
+
+		nread = d_neighborList[n+2*Np];
+		f3 = dist[nread];
+
+		nread = d_neighborList[n+Np];
+		f2 = dist[nread];
+
+		nread = d_neighborList[n+3*Np];
+		f4 = dist[nread];
+
+		nread = d_neighborList[n+5*Np];
+		f6 = dist[nread];
+
+        fsum_partial = f0+f1+f2+f3+f4+f6;
+        uz = VelocityZ[n];
+		// Closed-form value of f5; no guard against (1.0-0.5/tau+0.5*uz/tau)==0.
+        f5 =(FluxIn+(1.0-0.5/tau)*f6-0.5*uz*fsum_partial/tau)/(1.0-0.5/tau+0.5*uz/tau); 
+
+		// Unknown distribution f5 is stored at the address given by the q=5 neighbor entry
+		nr5 = d_neighborList[n+4*Np];
+		dist[nr5] = f5;
+	}
+}
+
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvc_BC_Z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+{
+    // Flux boundary condition (odd step, DiffAdvc variant), _Z side: same f6 formula as the even-step _Z kernel, with accesses routed through d_neighborList.
+	int idx, n; // idx: position in list; n: site index list[idx]
+    int nread,nr6; // nread: address of the distribution being read; nr6: address where f6 is written
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f6)
+    double uz; // VelocityZ[n]
+	for (idx=0; idx<count; idx++){
+		n = list[idx];
+		f0 = dist[n];
+
+		nread = d_neighborList[n]; // fq is located at the address stored in d_neighborList[n+(q-1)*Np]
+		f1 = dist[nread];
+
+		nread = d_neighborList[n+2*Np];
+		f3 = dist[nread];
+
+		nread = d_neighborList[n+4*Np];
+		f5 = dist[nread];
+
+		nread = d_neighborList[n+Np];
+		f2 = dist[nread];
+
+		nread = d_neighborList[n+3*Np];
+		f4 = dist[nread];
+
+        fsum_partial = f0+f1+f2+f3+f4+f5;
+        uz = VelocityZ[n];
+		// Closed-form value of f6; no guard against (1.0-0.5/tau-0.5*uz/tau)==0.
+        f6 =(FluxIn+(1.0-0.5/tau)*f5+0.5*uz*fsum_partial/tau)/(1.0-0.5/tau-0.5*uz/tau); 
+
+		// unknown distribution f6 is stored at the address given by the q=6 neighbor entry
+		nr6 = d_neighborList[n+5*Np];
+		dist[nr6] = f6;
+	}
+}
+
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvcElec_BC_z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, double *ElectricField_Z,
+                                                              double Di, double zi, double Vt, int count, int Np)
+{
+    // Flux boundary condition (even step, DiffAdvcElec variant), _z side: rebuild the unknown f5 from the inward flux FluxIn and store it in dist[6*Np+n]; same formula as the odd-step _z kernel.
+    int idx,n; // idx: position in list; n: site index list[idx]
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f5)
+    double uz; // VelocityZ[n]
+    double uEPz;//electrochemical induced velocity, zi*Di/Vt*Ez (Vt must be non-zero)
+    double Ez;//electrical field, ElectricField_Z[n]
+	for (idx=0; idx<count; idx++){
+		n = list[idx];
+		f0 = dist[n];
+		f1 = dist[2*Np+n]; // in this kernel each fq is taken from the slot of its paired index (f1<-2, f2<-1, f3<-4, f4<-3, f6<-5)
+		f2 = dist[1*Np+n];
+		f3 = dist[4*Np+n];
+		f4 = dist[3*Np+n];
+		f6 = dist[5*Np+n];
+        fsum_partial = f0+f1+f2+f3+f4+f6;
+        uz = VelocityZ[n];
+        Ez = ElectricField_Z[n];
+        uEPz=zi*Di/Vt*Ez;
+		// Closed-form value of f5; no guard against (1.0-0.5/tau+0.5*uz/tau+uEPz)==0.
+        f5 =(FluxIn+(1.0-0.5/tau)*f6-(0.5*uz/tau+uEPz)*fsum_partial)/(1.0-0.5/tau+0.5*uz/tau+uEPz); 
+		dist[6*Np+n] = f5; // only this slot is modified
+	}
+}
+ 
+
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvcElec_BC_z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, double *ElectricField_Z,
+                                                             double Di, double zi, double Vt, int count, int Np)
+{
+    // Flux boundary condition (odd step, DiffAdvcElec variant), _z side: rebuild the unknown f5 from the inward flux FluxIn, with accesses routed through d_neighborList.
+	int idx, n; // idx: position in list; n: site index list[idx]
+    int nread,nr5; // nread: address of the distribution being read; nr5: address where f5 is written
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f5)
+    double uz; // VelocityZ[n]
+    double uEPz;//electrochemical induced velocity, zi*Di/Vt*Ez (Vt must be non-zero)
+    double Ez;//electrical field, ElectricField_Z[n]
+	for (idx=0; idx<count; idx++){
+		n = list[idx];
+		f0 = dist[n];
+
+		nread = d_neighborList[n]; // fq is located at the address stored in d_neighborList[n+(q-1)*Np]
+		f1 = dist[nread];
+
+		nread = d_neighborList[n+2*Np];
+		f3 = dist[nread];
+
+		nread = d_neighborList[n+Np];
+		f2 = dist[nread];
+
+		nread = d_neighborList[n+3*Np];
+		f4 = dist[nread];
+
+		nread = d_neighborList[n+5*Np];
+		f6 = dist[nread];
+
+        fsum_partial = f0+f1+f2+f3+f4+f6;
+        uz = VelocityZ[n];
+        Ez = ElectricField_Z[n];
+        uEPz=zi*Di/Vt*Ez;
+		// Closed-form value of f5; no guard against (1.0-0.5/tau+0.5*uz/tau+uEPz)==0.
+        f5 =(FluxIn+(1.0-0.5/tau)*f6-(0.5*uz/tau+uEPz)*fsum_partial)/(1.0-0.5/tau+0.5*uz/tau+uEPz); 
+
+		// Unknown distribution f5 is stored at the address given by the q=5 neighbor entry
+		nr5 = d_neighborList[n+4*Np];
+		dist[nr5] = f5;
+	}
+}
+
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvcElec_BC_Z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, double *ElectricField_Z,
+                                                              double Di, double zi, double Vt, int count, int Np)
+{
+    // Flux boundary condition (even step, DiffAdvcElec variant), _Z side: rebuild the unknown f6 from the inward flux FluxIn and store it in dist[5*Np+n].
+    int idx,n; // idx: position in list; n: site index list[idx]
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f6)
+    double uz; // VelocityZ[n]
+    double uEPz;//electrochemical induced velocity, zi*Di/Vt*Ez (Vt must be non-zero)
+    double Ez;//electrical field, ElectricField_Z[n]
+	for (idx=0; idx<count; idx++){
+		n = list[idx];
+		f0 = dist[n];
+		f1 = dist[2*Np+n]; // in this kernel each fq is taken from the slot of its paired index (f1<-2, f2<-1, f3<-4, f4<-3, f5<-6)
+		f2 = dist[1*Np+n];
+		f3 = dist[4*Np+n];
+		f4 = dist[3*Np+n];
+		f5 = dist[6*Np+n];
+        fsum_partial = f0+f1+f2+f3+f4+f5;
+        uz = VelocityZ[n];
+        Ez = ElectricField_Z[n];
+        uEPz=zi*Di/Vt*Ez;
+		// Closed-form value of f6; no guard against (1.0-0.5/tau-0.5*uz/tau-uEPz)==0.
+        f6 =(FluxIn+(1.0-0.5/tau)*f5+(0.5*uz/tau+uEPz)*fsum_partial)/(1.0-0.5/tau-0.5*uz/tau-uEPz); 
+		dist[5*Np+n] = f6; // only this slot is modified
+	}
+}
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvcElec_BC_Z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, double *ElectricField_Z,
+                                                             double Di, double zi, double Vt, int count, int Np)
+{
+    // Flux boundary condition (odd step, DiffAdvcElec variant), _Z side: rebuild the unknown f6 from the inward flux FluxIn, with accesses routed through d_neighborList.
+	int idx, n; // idx: position in list; n: site index list[idx]
+    int nread,nr6; // nread: address of the distribution being read; nr6: address where f6 is written
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f6)
+    double uz; // VelocityZ[n]
+    double uEPz;//electrochemical induced velocity, zi*Di/Vt*Ez (Vt must be non-zero)
+    double Ez;//electrical field, ElectricField_Z[n]
+	for (idx=0; idx<count; idx++){
+		n = list[idx];
+		f0 = dist[n];
+
+		nread = d_neighborList[n]; // fq is located at the address stored in d_neighborList[n+(q-1)*Np]
+		f1 = dist[nread];
+
+		nread = d_neighborList[n+2*Np];
+		f3 = dist[nread];
+
+		nread = d_neighborList[n+4*Np];
+		f5 = dist[nread];
+
+		nread = d_neighborList[n+Np];
+		f2 = dist[nread];
+
+		nread = d_neighborList[n+3*Np];
+		f4 = dist[nread];
+
+        fsum_partial = f0+f1+f2+f3+f4+f5;
+        uz = VelocityZ[n];
+        Ez = ElectricField_Z[n];
+        uEPz=zi*Di/Vt*Ez;
+		// Closed-form value of f6; no guard against (1.0-0.5/tau-0.5*uz/tau-uEPz)==0.
+        f6 =(FluxIn+(1.0-0.5/tau)*f5+(0.5*uz/tau+uEPz)*fsum_partial)/(1.0-0.5/tau-0.5*uz/tau-uEPz); 
+
+		// unknown distribution f6 is stored at the address given by the q=6 neighbor entry
+		nr6 = d_neighborList[n+5*Np];
+		dist[nr6] = f6;
+	}
+}
+
+
+
+
+
+
+
+
+
+
+
--- a/cpu/FreeLee.cpp
+++ b/cpu/FreeLee.cpp
--- a/cpu/GreyscaleColor.cpp
+++ b/cpu/GreyscaleColor.cpp
@ -1341,7 +1341,7 @@ extern "C" void ScaLBL_D3Q19_AAeven_GreyscaleColor(int *Map, double *dist, doubl
 //CP: capillary penalty
 // also turn off recoloring for grey nodes
 extern "C" void ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int *Map, double *dist, double *Aq, double *Bq, double *Den,
-		 double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *Poros,double *Perm, double *Velocity, double *Pressure,
+		 double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *GreyKn, double *GreyKw, double *Poros,double *Perm, double *Velocity, double *Pressure,
        double rhoA, double rhoB, double tauA, double tauB,double tauA_eff,double tauB_eff,double alpha, double beta,
 		double Gx, double Gy, double Gz, bool RecoloringOff, int strideY, int strideZ, int start, int finish, int Np){

@ -1375,6 +1375,11 @@ extern "C" void ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int *Map
    double W;//greyscale wetting strength
    double Sn_grey,Sw_grey;

+    /* Corey model parameters */
+    double Kn_grey,Kw_grey;    
+    double Swn,Krn_grey,Krw_grey,mobility_ratio,jA,jB;
+    double GreyDiff; // grey diffusion
+    
 	const double mrt_V1=0.05263157894736842;
 	const double mrt_V2=0.012531328320802;
 	const double mrt_V3=0.04761904761904762;
@ -1394,14 +1399,16 @@ extern "C" void ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int *Map
 		nB = Den[Np + n];

        porosity = Poros[n];
-        perm = Perm[n];
+        GreyDiff = Perm[n];
+        perm = 1.0;
        W = GreySolidW[n];
        Sn_grey = GreySn[n];
        Sw_grey = GreySw[n];
-
+        Kn_grey = GreyKn[n];
+        Kw_grey = GreyKw[n];
+        
 		// compute phase indicator field
 		phi=(nA-nB)/(nA+nB);
-
 		// local density
 		rho0=rhoA + 0.5*(1.0-phi)*(rhoB-rhoA);
 		// local relaxation time
@ -1411,6 +1418,27 @@ extern "C" void ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int *Map
 		rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA);
        mu_eff = (tau_eff-0.5)/3.0;//kinematic viscosity
 		
+        mobility_ratio = 1.0;
+        Krn_grey = 0.0;
+        Krw_grey = 0.0;
+        if (nA/(nA+nB)<Sn_grey && porosity !=1.0){
+        	perm = Kw_grey;
+        	Swn = 0.0;
+        }
+        else if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){ 
+        	Swn = (nA/(nA+nB) - Sn_grey) /(Sw_grey - Sn_grey);
+        	Krn_grey = Kn_grey*Swn*Swn; // Corey model with exponent = 2, make sure that W cannot shift to zero
+        	Krw_grey = Kw_grey*(1.0-Swn)*(1.0-Swn); // Corey model with exponent = 2, make sure that W cannot shift to zero
+        	// recompute the effective permeability
+        	perm = mu_eff*(Krn_grey*3.0/(tauA-0.5) + Krw_grey*3.0/(tauA-0.5));
+        	//mobility_ratio =(nA*Krn_grey*3.0/(tauA-0.5) - nB*Krw_grey*3.0/(tauB-0.5))/(nA*Krn_grey*3.0/(tauA-0.5) + nB*Krw_grey*3.0/(tauB-0.5));
+        }
+        else if (nA/(nA+nB)>Sw_grey && porosity !=1.0){
+        	perm = Kn_grey;
+        	Swn = 1.0;
+        }	        		
+    	mobility_ratio =(nA*Krn_grey*3.0/(tauA-0.5) - nB*Krw_grey*3.0/(tauB-0.5))/(nA*Krn_grey*3.0/(tauA-0.5) + nB*Krw_grey*3.0/(tauB-0.5));
+
 		// Get the 1D index based on regular data layout
 		ijk = Map[n];
 		//					COMPUTE THE COLOR GRADIENT
@ -2053,21 +2081,28 @@ extern "C" void ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int *Map
 		nAB = 1.0/(nA+nB);
 		Aq[n] = 0.3333333333333333*nA;
 		Bq[n] = 0.3333333333333333*nB;
-
+		
 		//...............................................
 		// q = 0,2,4
 		// Cq = {1,0,0}, {0,1,0}, {0,0,1}
+		jA = nA*ux;
+		jB = nB*ux;		
 		delta = beta*nA*nB*nAB*0.1111111111111111*nx;
 		if (!(nA*nB*nAB>0)) delta=0;
        //----------------newly added for better control of recoloring---------------//
-        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0) delta = 0.0; 
+        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){
+        	//delta = 0.0; 
+        	delta = 0.111111111111111*C*W*GreyDiff*nA*nB*nAB*nx;
+        	jA = 0.5*ux*(nA+nB)*(1.0+mobility_ratio);
+    		jB = 0.5*ux*(nA+nB)*(1.0-mobility_ratio);
+        }
        if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
        //---------------------------------------------------------------------------//
        if (RecoloringOff==true && porosity !=1.0) delta=0;
-		a1 = nA*(0.1111111111111111*(1+4.5*ux))+delta;
-		b1 = nB*(0.1111111111111111*(1+4.5*ux))-delta;
-		a2 = nA*(0.1111111111111111*(1-4.5*ux))-delta;
-		b2 = nB*(0.1111111111111111*(1-4.5*ux))+delta;
+		a1 = (0.1111111111111111*(nA+4.5*jA))+delta;
+		b1 = (0.1111111111111111*(nB+4.5*jB))-delta;
+		a2 = (0.1111111111111111*(nA-4.5*jA))-delta;
+		b2 = (0.1111111111111111*(nB-4.5*jB))+delta;

 		// q = 1
 		//nread = neighborList[n+Np];
@ -2080,17 +2115,24 @@ extern "C" void ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int *Map

 		//...............................................
 		// Cq = {0,1,0}
+		jA = nA*uy;
+		jB = nB*uy;		
 		delta = beta*nA*nB*nAB*0.1111111111111111*ny;
 		if (!(nA*nB*nAB>0)) delta=0;
        //----------------newly added for better control of recoloring---------------//
-        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0) delta = 0.0; 
+        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){
+        	//delta = 0.0; 
+        	delta = 0.111111111111111*C*W*GreyDiff*nA*nB*nAB*ny;
+    		jA = 0.5*uy*(nA+nB)*(1.0+mobility_ratio);
+    		jB = 0.5*uy*(nA+nB)*(1.0-mobility_ratio);
+        }
        if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
        //---------------------------------------------------------------------------//
        if (RecoloringOff==true && porosity !=1.0) delta=0;
-		a1 = nA*(0.1111111111111111*(1+4.5*uy))+delta;
-		b1 = nB*(0.1111111111111111*(1+4.5*uy))-delta;
-		a2 = nA*(0.1111111111111111*(1-4.5*uy))-delta;
-		b2 = nB*(0.1111111111111111*(1-4.5*uy))+delta;
+		a1 = (0.1111111111111111*(nA+4.5*jA))+delta;
+		b1 = (0.1111111111111111*(nB+4.5*jB))-delta;
+		a2 = (0.1111111111111111*(nA-4.5*jA))-delta;
+		b2 = (0.1111111111111111*(nB-4.5*jB))+delta;

 		// q = 3
 		//nread = neighborList[n+3*Np];
@ -2104,17 +2146,25 @@ extern "C" void ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int *Map
 		//...............................................
 		// q = 4
 		// Cq = {0,0,1}
+		jA = nA*uz;
+		jB = nB*uz;		
 		delta = beta*nA*nB*nAB*0.1111111111111111*nz;
 		if (!(nA*nB*nAB>0)) delta=0;
        //----------------newly added for better control of recoloring---------------//
-        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0) delta = 0.0; 
+        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){
+        	//delta = 0.0; 
+        	delta = 0.111111111111111*C*W*GreyDiff*nA*nB*nAB*nz;
+    		jA = 0.5*uz*(nA+nB)*(1.0+mobility_ratio);
+    		jB = 0.5*uz*(nA+nB)*(1.0-mobility_ratio);
+        }
        if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
        //---------------------------------------------------------------------------//
        if (RecoloringOff==true && porosity !=1.0) delta=0;
-		a1 = nA*(0.1111111111111111*(1+4.5*uz))+delta;
-		b1 = nB*(0.1111111111111111*(1+4.5*uz))-delta;
-		a2 = nA*(0.1111111111111111*(1-4.5*uz))-delta;
-		b2 = nB*(0.1111111111111111*(1-4.5*uz))+delta;
+
+		a1 = (0.1111111111111111*(nA+4.5*jA))+delta;
+		b1 = (0.1111111111111111*(nB+4.5*jB))-delta;
+		a2 = (0.1111111111111111*(nA-4.5*jA))-delta;
+		b2 = (0.1111111111111111*(nB-4.5*jB))+delta;

 		// q = 5
 		//nread = neighborList[n+5*Np];
@ -2131,7 +2181,7 @@ extern "C" void ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int *Map
 //CP: capillary penalty
 // also turn off recoloring for grey nodes
 extern "C" void ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dist, double *Aq, double *Bq, double *Den, 
-        double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *Poros,double *Perm, double *Velocity, double *Pressure, 
+        double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *GreyKn, double *GreyKw, double *Poros,double *Perm, double *Velocity, double *Pressure, 
        double rhoA, double rhoB, double tauA, double tauB,double tauA_eff,double tauB_eff, double alpha, double beta,
 		double Gx, double Gy, double Gz, bool RecoloringOff, int strideY, int strideZ, int start, int finish, int Np){

@ -2152,6 +2202,11 @@ extern "C" void ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dist, do
    double W;//greyscale wetting strength
    double Sn_grey,Sw_grey;
    
+    /* Corey model parameters */
+    double Kn_grey,Kw_grey;
+    double Swn,Krn_grey,Krw_grey,mobility_ratio,jA,jB;
+    double GreyDiff; // grey diffusion
+
    //double GeoFun=0.0;//geometric function from Guo's PRE 66, 036304 (2002)
    double porosity;
    double perm;//voxel permeability
@ -2180,11 +2235,14 @@ extern "C" void ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dist, do
 		nB = Den[Np + n];

        porosity = Poros[n];
-        perm = Perm[n];
+        GreyDiff = Perm[n];
+        perm = 1.0;
        W = GreySolidW[n];
        Sn_grey = GreySn[n];
        Sw_grey = GreySw[n];
-
+        Kn_grey = GreyKn[n];
+        Kw_grey = GreyKw[n];
+        
 		// compute phase indicator field
 		phi=(nA-nB)/(nA+nB);

@ -2196,6 +2254,26 @@ extern "C" void ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dist, do
 		rlx_setA = 1.f/tau;
 		rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA);
        mu_eff = (tau_eff-0.5)/3.0;//kinematic viscosity
+	
+        Krn_grey = 0.0;
+        Krw_grey = 0.0;
+        if (nA/(nA+nB)<Sn_grey && porosity !=1.0){
+        	perm = Kw_grey;
+        	Swn = 0.0;
+        }
+        else if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){ 
+        	Swn = (nA/(nA+nB) - Sn_grey) /(Sw_grey - Sn_grey);
+        	Krn_grey = Kn_grey*Swn*Swn; // Corey model with exponent = 2, make sure that W cannot shift to zero
+        	Krw_grey = Kw_grey*(1.0-Swn)*(1.0-Swn); // Corey model with exponent = 2, make sure that W cannot shift to zero
+        	// recompute the effective permeability
+        	perm = mu_eff*(Krn_grey*3.0/(tauA-0.5) + Krw_grey*3.0/(tauA-0.5));
+        	//mobility_ratio =(nA*Krn_grey*3.0/(tauA-0.5) - nB*Krw_grey*3.0/(tauB-0.5))/(nA*Krn_grey*3.0/(tauA-0.5) + nB*Krw_grey*3.0/(tauB-0.5));
+        }
+        else if (nA/(nA+nB)>Sw_grey && porosity !=1.0){
+        	perm = Kn_grey;
+        	Swn = 1.0;
+        }	
+    	mobility_ratio =(nA*Krn_grey*3.0/(tauA-0.5) - nB*Krw_grey*3.0/(tauB-0.5))/(nA*Krn_grey*3.0/(tauA-0.5) + nB*Krw_grey*3.0/(tauB-0.5));

 		// Get the 1D index based on regular data layout
 		ijk = Map[n];
@ -2772,21 +2850,28 @@ extern "C" void ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dist, do
 		nAB = 1.0/(nA+nB);
 		Aq[n] = 0.3333333333333333*nA;
 		Bq[n] = 0.3333333333333333*nB;
-
+		
 		//...............................................
 		// q = 0,2,4
 		// Cq = {1,0,0}, {0,1,0}, {0,0,1}
+		jA = nA*ux;
+		jB = nB*ux;		
 		delta = beta*nA*nB*nAB*0.1111111111111111*nx;
 		if (!(nA*nB*nAB>0)) delta=0;
        //----------------newly added for better control of recoloring---------------//
-        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0) delta = 0.0; 
+        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){
+        	//delta = 0.0; 
+        	delta = 0.111111111111111*C*W*GreyDiff*nA*nB*nAB*nx;
+    		jA = 0.5*ux*(nA+nB)*(1.0+mobility_ratio);
+    		jB = 0.5*ux*(nA+nB)*(1.0-mobility_ratio);
+        }
        if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
        //---------------------------------------------------------------------------//
        if (RecoloringOff==true && porosity !=1.0) delta=0;
-		a1 = nA*(0.1111111111111111*(1+4.5*ux))+delta;
-		b1 = nB*(0.1111111111111111*(1+4.5*ux))-delta;
-		a2 = nA*(0.1111111111111111*(1-4.5*ux))-delta;
-		b2 = nB*(0.1111111111111111*(1-4.5*ux))+delta;
+		a1 = (0.1111111111111111*(nA+4.5*jA))+delta;
+		b1 = (0.1111111111111111*(nB+4.5*jB))-delta;
+		a2 = (0.1111111111111111*(nA-4.5*jA))-delta;
+		b2 = (0.1111111111111111*(nB-4.5*jB))+delta;

 		Aq[1*Np+n] = a1;
 		Bq[1*Np+n] = b1;
@ -2794,38 +2879,53 @@ extern "C" void ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dist, do
 		Bq[2*Np+n] = b2;

 		//...............................................
-		// q = 2
 		// Cq = {0,1,0}
+		jA = nA*uy;
+		jB = nB*uy;		
 		delta = beta*nA*nB*nAB*0.1111111111111111*ny;
 		if (!(nA*nB*nAB>0)) delta=0;
        //----------------newly added for better control of recoloring---------------//
-        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0) delta = 0.0; 
+        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){
+        	//delta = 0.0; 
+        	delta = 0.111111111111111*C*W*GreyDiff*nA*nB*nAB*ny;
+    		jA = 0.5*uy*(nA+nB)*(1.0+mobility_ratio);
+    		jB = 0.5*uy*(nA+nB)*(1.0-mobility_ratio);
+        }
        if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
        //---------------------------------------------------------------------------//
        if (RecoloringOff==true && porosity !=1.0) delta=0;
-		a1 = nA*(0.1111111111111111*(1+4.5*uy))+delta;
-		b1 = nB*(0.1111111111111111*(1+4.5*uy))-delta;
-		a2 = nA*(0.1111111111111111*(1-4.5*uy))-delta;
-		b2 = nB*(0.1111111111111111*(1-4.5*uy))+delta;
+		a1 = (0.1111111111111111*(nA+4.5*jA))+delta;
+		b1 = (0.1111111111111111*(nB+4.5*jB))-delta;
+		a2 = (0.1111111111111111*(nA-4.5*jA))-delta;
+		b2 = (0.1111111111111111*(nB-4.5*jB))+delta;

 		Aq[3*Np+n] = a1;
 		Bq[3*Np+n] = b1;
 		Aq[4*Np+n] = a2;
 		Bq[4*Np+n] = b2;
+
 		//...............................................
 		// q = 4
 		// Cq = {0,0,1}
+		jA = nA*uz;
+		jB = nB*uz;		
 		delta = beta*nA*nB*nAB*0.1111111111111111*nz;
 		if (!(nA*nB*nAB>0)) delta=0;
        //----------------newly added for better control of recoloring---------------//
-        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0) delta = 0.0; 
+        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){
+        	//delta = 0.0; 
+        	delta = 0.111111111111111*C*W*GreyDiff*nA*nB*nAB*nz;
+    		jA = 0.5*uz*(nA+nB)*(1.0+mobility_ratio);
+    		jB = 0.5*uz*(nA+nB)*(1.0-mobility_ratio);
+        }
        if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
        //---------------------------------------------------------------------------//
        if (RecoloringOff==true && porosity !=1.0) delta=0;
-		a1 = nA*(0.1111111111111111*(1+4.5*uz))+delta;
-		b1 = nB*(0.1111111111111111*(1+4.5*uz))-delta;
-		a2 = nA*(0.1111111111111111*(1-4.5*uz))-delta;
-		b2 = nB*(0.1111111111111111*(1-4.5*uz))+delta;
+
+		a1 = (0.1111111111111111*(nA+4.5*jA))+delta;
+		b1 = (0.1111111111111111*(nB+4.5*jB))-delta;
+		a2 = (0.1111111111111111*(nA-4.5*jA))-delta;
+		b2 = (0.1111111111111111*(nB-4.5*jB))+delta;

 		Aq[5*Np+n] = a1;
 		Bq[5*Np+n] = b1;
--- a/cuda/D3Q7BC.cu
+++ b/cuda/D3Q7BC.cu
@ -316,7 +316,133 @@ __global__ void dvc_ScaLBL_D3Q7_AAodd_Ion_Concentration_BC_Z(int *d_neighborList
 	}
 }

-__global__ void dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_BC_z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+__global__ void dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_Diff_BC_z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+{
+    // CUDA kernel, flux boundary condition (even step): each thread rebuilds the unknown f5 for one entry of list from the inward flux FluxIn.
+    int idx,n; // idx: global thread index, used as position in list; n: site index list[idx]
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f5)
+    double uz; // VelocityZ[n]
+	idx = blockIdx.x*blockDim.x + threadIdx.x;
+	if (idx < count){ // threads beyond the end of list do nothing
+		n = list[idx];
+		f0 = dist[n];
+		f1 = dist[2*Np+n]; // in this kernel each fq is taken from the slot of its paired index (f1<-2, f2<-1, f3<-4, f4<-3, f6<-5)
+		f2 = dist[1*Np+n];
+		f3 = dist[4*Np+n];
+		f4 = dist[3*Np+n];
+		f6 = dist[5*Np+n];
+        fsum_partial = f0+f1+f2+f3+f4+f6;
+        uz = VelocityZ[n];
+		// Closed-form value of f5; no guard is applied, so tau==0.5 or uz==1.0 gives a zero divisor.
+        f5 =(FluxIn+(1.0-0.5/tau)*(f6+uz*fsum_partial))/(1.0-0.5/tau)/(1.0-uz); 
+		dist[6*Np+n] = f5; // only this slot is modified
+	}
+}
+
+__global__ void dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_Diff_BC_Z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+{
+    // CUDA kernel, flux boundary condition (even step), mirror of the _z kernel: each thread rebuilds the unknown f6 for one entry of list.
+    int idx,n; // idx: global thread index, used as position in list; n: site index list[idx]
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f6)
+    double uz; // VelocityZ[n]
+	idx = blockIdx.x*blockDim.x + threadIdx.x;
+	if (idx < count){ // threads beyond the end of list do nothing
+		n = list[idx];
+		f0 = dist[n];
+		f1 = dist[2*Np+n]; // in this kernel each fq is taken from the slot of its paired index (f1<-2, f2<-1, f3<-4, f4<-3, f5<-6)
+		f2 = dist[1*Np+n];
+		f3 = dist[4*Np+n];
+		f4 = dist[3*Np+n];
+		f5 = dist[6*Np+n];
+        fsum_partial = f0+f1+f2+f3+f4+f5;
+        uz = VelocityZ[n];
+		// Closed-form value of f6 (uz signs flipped relative to _z); tau==0.5 or uz==-1.0 gives a zero divisor.
+        f6 =(FluxIn+(1.0-0.5/tau)*(f5-uz*fsum_partial))/(1.0-0.5/tau)/(1.0+uz); 
+		dist[5*Np+n] = f6; // only this slot is modified
+	}
+}
+
+__global__ void dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_Diff_BC_z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+{
+    // CUDA kernel, flux boundary condition (odd step): same f5 formula as the even-step _z kernel, with accesses routed through d_neighborList.
+	int idx, n; // idx: global thread index, used as position in list; n: site index list[idx]
+    int nread,nr5; // nread: address of the distribution being read; nr5: address where f5 is written
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f5)
+    double uz; // VelocityZ[n]
+	idx = blockIdx.x*blockDim.x + threadIdx.x;
+	if (idx < count){ // threads beyond the end of list do nothing
+		n = list[idx];
+		f0 = dist[n];
+
+		nread = d_neighborList[n]; // fq is located at the address stored in d_neighborList[n+(q-1)*Np]
+		f1 = dist[nread];
+
+		nread = d_neighborList[n+2*Np];
+		f3 = dist[nread];
+
+		nread = d_neighborList[n+Np];
+		f2 = dist[nread];
+
+		nread = d_neighborList[n+3*Np];
+		f4 = dist[nread];
+
+		nread = d_neighborList[n+5*Np];
+		f6 = dist[nread];
+
+        fsum_partial = f0+f1+f2+f3+f4+f6;
+        uz = VelocityZ[n];
+		// Closed-form value of f5; no guard is applied, so tau==0.5 or uz==1.0 gives a zero divisor.
+        f5 =(FluxIn+(1.0-0.5/tau)*(f6+uz*fsum_partial))/(1.0-0.5/tau)/(1.0-uz); 
+
+		// Unknown distribution f5 is stored at the address given by the q=5 neighbor entry
+		nr5 = d_neighborList[n+4*Np];
+		dist[nr5] = f5;
+	}
+}
+
+__global__ void dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_Diff_BC_Z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+{
+    // CUDA kernel, flux boundary condition (odd step), _Z side: same f6 formula as the even-step _Z kernel, with accesses routed through d_neighborList.
+	int idx, n; // idx: global thread index, used as position in list; n: site index list[idx]
+    int nread,nr6; // nread: address of the distribution being read; nr6: address where f6 is written
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six distributions that are read (all except the unknown f6)
+    double uz; // VelocityZ[n]
+	idx = blockIdx.x*blockDim.x + threadIdx.x;
+	if (idx < count){ // threads beyond the end of list do nothing
+		n = list[idx];
+		f0 = dist[n];
+
+		nread = d_neighborList[n]; // fq is located at the address stored in d_neighborList[n+(q-1)*Np]
+		f1 = dist[nread];
+
+		nread = d_neighborList[n+2*Np];
+		f3 = dist[nread];
+
+		nread = d_neighborList[n+4*Np];
+		f5 = dist[nread];
+
+		nread = d_neighborList[n+Np];
+		f2 = dist[nread];
+
+		nread = d_neighborList[n+3*Np];
+		f4 = dist[nread];
+
+        fsum_partial = f0+f1+f2+f3+f4+f5;
+        uz = VelocityZ[n];
+		// Closed-form value of f6 (uz signs flipped relative to _z); tau==0.5 or uz==-1.0 gives a zero divisor.
+        f6 =(FluxIn+(1.0-0.5/tau)*(f5-uz*fsum_partial))/(1.0-0.5/tau)/(1.0+uz); 
+
+		// unknown distribution f6 is stored at the address given by the q=6 neighbor entry
+		nr6 = d_neighborList[n+5*Np];
+		dist[nr6] = f6;
+	}
+}
+
+__global__ void dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvc_BC_z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
 {
    //NOTE: FluxIn is the inward flux
    int idx,n;
@ -340,7 +466,7 @@ __global__ void dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_BC_z(int *list, double *dist, do
 	}
 }

-__global__ void dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_BC_Z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+__global__ void dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvc_BC_Z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
 {
    //NOTE: FluxIn is the inward flux
    int idx,n;
@ -364,7 +490,7 @@ __global__ void dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_BC_Z(int *list, double *dist, do
 	}
 }

-__global__ void dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_BC_z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+__global__ void dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvc_BC_z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
 {
    //NOTE: FluxIn is the inward flux
 	int idx, n;
@ -403,7 +529,7 @@ __global__ void dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_BC_z(int *d_neighborList, int *li
 	}
 }

-__global__ void dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_BC_Z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
+__global__ void dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvc_BC_Z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np)
 {
    //NOTE: FluxIn is the inward flux
 	int idx, n;
@ -441,6 +567,152 @@ __global__ void dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_BC_Z(int *d_neighborList, int *li
 		dist[nr6] = f6;
 	}
 }
+
+__global__ void dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvcElec_BC_z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, double *ElectricField_Z,
+                                                                  double Di, double zi, double Vt, int count, int Np)
+{ // Flux boundary kernel: for every site index in list[0..count), computes f5 from FluxIn, tau, uz, uEPz and the other six values, then stores it at dist[6*Np+n].
+    //NOTE: FluxIn is the inward flux
+    int idx,n; // idx: global thread index; n: site index taken from list
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six values that are read (everything except f5)
+    double uz; // VelocityZ value at site n
+    double uEPz;//electrochemical induced velocity
+    double Ez;//electrical field
+	idx = blockIdx.x*blockDim.x + threadIdx.x;
+	if (idx < count){ // threads with idx >= count do nothing
+		n = list[idx];
+		f0 = dist[n]; // all reads are direct: dist is addressed as k*Np+n (no neighbor table in this variant)
+		f1 = dist[2*Np+n];
+		f2 = dist[1*Np+n];
+		f3 = dist[4*Np+n];
+		f4 = dist[3*Np+n];
+		f6 = dist[5*Np+n];
+        fsum_partial = f0+f1+f2+f3+f4+f6;
+        uz = VelocityZ[n];
+        Ez = ElectricField_Z[n];
+        uEPz=zi*Di/Vt*Ez; // presumably zi = ion valence, Di = diffusivity, Vt = thermal voltage - TODO confirm against caller
+		// Solve for f5. No guard: the divisor (1.0-0.5/tau+0.5*uz/tau+uEPz) is not checked against zero.
+        f5 =(FluxIn+(1.0-0.5/tau)*f6-(0.5*uz/tau+uEPz)*fsum_partial)/(1.0-0.5/tau+0.5*uz/tau+uEPz); 
+		dist[6*Np+n] = f5; // f5 is the only value this kernel writes
+	}
+}
+
+__global__ void dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvcElec_BC_Z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, double *ElectricField_Z, 
+                                                                  double Di, double zi, double Vt, int count, int Np)
+{ // Flux boundary kernel: for every site index in list[0..count), computes f6 from FluxIn, tau, uz, uEPz and the other six values, then stores it at dist[5*Np+n].
+    //NOTE: FluxIn is the inward flux
+    int idx,n; // idx: global thread index; n: site index taken from list
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six values that are read (everything except f6)
+    double uz; // VelocityZ value at site n
+    double uEPz;//electrochemical induced velocity
+    double Ez;//electrical field
+	idx = blockIdx.x*blockDim.x + threadIdx.x;
+	if (idx < count){ // threads with idx >= count do nothing
+		n = list[idx];
+		f0 = dist[n]; // all reads are direct: dist is addressed as k*Np+n (no neighbor table in this variant)
+		f1 = dist[2*Np+n];
+		f2 = dist[1*Np+n];
+		f3 = dist[4*Np+n];
+		f4 = dist[3*Np+n];
+		f5 = dist[6*Np+n];
+        fsum_partial = f0+f1+f2+f3+f4+f5;
+        uz = VelocityZ[n];
+        Ez = ElectricField_Z[n];
+        uEPz=zi*Di/Vt*Ez; // presumably zi = ion valence, Di = diffusivity, Vt = thermal voltage - TODO confirm against caller
+		// Solve for f6; velocity terms carry the opposite sign to the _BC_z variant. No guard: the divisor (1.0-0.5/tau-0.5*uz/tau-uEPz) is not checked against zero.
+        f6 =(FluxIn+(1.0-0.5/tau)*f5+(0.5*uz/tau+uEPz)*fsum_partial)/(1.0-0.5/tau-0.5*uz/tau-uEPz); 
+		dist[5*Np+n] = f6; // f6 is the only value this kernel writes
+	}
+}
+
+__global__ void dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvcElec_BC_z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, double *ElectricField_Z,
+                                                                 double Di, double zi, double Vt, int count, int Np)
+{ // Flux boundary kernel: for every site index in list[0..count), computes f5 from FluxIn, tau, uz, uEPz and the other six values, then stores it in dist through d_neighborList.
+    //NOTE: FluxIn is the inward flux
+	int idx, n; // idx: global thread index; n: site index taken from list
+    int nread,nr5; // nread: index looked up in d_neighborList for a read; nr5: index where f5 is written
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six values that are read (everything except f5)
+    double uz; // VelocityZ value at site n
+    double uEPz;//electrochemical induced velocity
+    double Ez;//electrical field
+	idx = blockIdx.x*blockDim.x + threadIdx.x;
+	if (idx < count){ // threads with idx >= count do nothing
+		n = list[idx];
+		f0 = dist[n]; // f0 is read directly; f1..f4 and f6 are read through d_neighborList
+
+		nread = d_neighborList[n]; // table entries are addressed as n + k*Np, k = 0..5
+		f1 = dist[nread];
+
+		nread = d_neighborList[n+2*Np];
+		f3 = dist[nread];
+
+		nread = d_neighborList[n+Np];
+		f2 = dist[nread];
+
+		nread = d_neighborList[n+3*Np];
+		f4 = dist[nread];
+
+		nread = d_neighborList[n+5*Np];
+		f6 = dist[nread];
+
+        fsum_partial = f0+f1+f2+f3+f4+f6;
+        uz = VelocityZ[n];
+        Ez = ElectricField_Z[n];
+        uEPz=zi*Di/Vt*Ez; // presumably zi = ion valence, Di = diffusivity, Vt = thermal voltage - TODO confirm against caller
+		// Solve for f5. No guard: the divisor (1.0-0.5/tau+0.5*uz/tau+uEPz) is not checked against zero.
+        f5 =(FluxIn+(1.0-0.5/tau)*f6-(0.5*uz/tau+uEPz)*fsum_partial)/(1.0-0.5/tau+0.5*uz/tau+uEPz); 
+
+		// Unknown distribution: f5 is the only value this kernel writes
+		nr5 = d_neighborList[n+4*Np];
+		dist[nr5] = f5;
+	}
+}
+
+__global__ void dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvcElec_BC_Z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, double *ElectricField_Z,
+                                                                 double Di, double zi, double Vt, int count, int Np)
+{ // Flux boundary kernel: for every site index in list[0..count), computes f6 from FluxIn, tau, uz, uEPz and the other six values, then stores it in dist through d_neighborList.
+    //NOTE: FluxIn is the inward flux
+	int idx, n; // idx: global thread index; n: site index taken from list
+    int nread,nr6; // nread: index looked up in d_neighborList for a read; nr6: index where f6 is written
+	double f0,f1,f2,f3,f4,f5,f6;
+    double fsum_partial; // sum of the six values that are read (everything except f6)
+    double uz; // VelocityZ value at site n
+    double uEPz;//electrochemical induced velocity
+    double Ez;//electrical field
+	idx = blockIdx.x*blockDim.x + threadIdx.x;
+	if (idx < count){ // threads with idx >= count do nothing
+		n = list[idx];
+		f0 = dist[n]; // f0 is read directly; f1..f5 are read through d_neighborList
+
+		nread = d_neighborList[n]; // table entries are addressed as n + k*Np, k = 0..5
+		f1 = dist[nread];
+
+		nread = d_neighborList[n+2*Np];
+		f3 = dist[nread];
+
+		nread = d_neighborList[n+4*Np];
+		f5 = dist[nread];
+
+		nread = d_neighborList[n+Np];
+		f2 = dist[nread];
+
+		nread = d_neighborList[n+3*Np];
+		f4 = dist[nread];
+
+        fsum_partial = f0+f1+f2+f3+f4+f5;
+        uz = VelocityZ[n];
+        Ez = ElectricField_Z[n];
+        uEPz=zi*Di/Vt*Ez; // presumably zi = ion valence, Di = diffusivity, Vt = thermal voltage - TODO confirm against caller
+		// Solve for f6; velocity terms carry the opposite sign to the _BC_z variant. No guard: the divisor (1.0-0.5/tau-0.5*uz/tau-uEPz) is not checked against zero.
+        f6 =(FluxIn+(1.0-0.5/tau)*f5+(0.5*uz/tau+uEPz)*fsum_partial)/(1.0-0.5/tau-0.5*uz/tau-uEPz); 
+
+		// unknown distribution: f6 is the only value this kernel writes
+		nr6 = d_neighborList[n+5*Np];
+		dist[nr6] = f6;
+	}
+}
 //*************************************************************************

 extern "C" void ScaLBL_Solid_Dirichlet_D3Q7(double *dist, double *BoundaryValue, int *BounceBackDist_list, int *BounceBackSolid_list, int count){
@ -567,39 +839,116 @@ extern "C" void ScaLBL_D3Q7_AAodd_Ion_Concentration_BC_Z(int *d_neighborList, in
 		printf("CUDA error in ScaLBL_D3Q7_AAodd_Ion_Concentration_BC_Z (kernel): %s \n",cudaGetErrorString(err));
 	}
 }
-
-extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_BC_z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np){
+//------------Diff-----------------
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_Diff_BC_z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np){ // host-side launcher: forwards its arguments unchanged to the dvc_-prefixed kernel of the same name
 	int GRID = count / 512 + 1;
-	dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_BC_z<<<GRID,512>>>(list, dist, FluxIn, tau, VelocityZ, count, Np);
+	dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_Diff_BC_z<<<GRID,512>>>(list, dist, FluxIn, tau, VelocityZ, count, Np); // GRID blocks of 512 threads each
 	cudaError_t err = cudaGetLastError();
 	if (cudaSuccess != err){
-		printf("CUDA error in ScaLBL_D3Q7_AAeven_Ion_Flux_BC_z (kernel): %s \n",cudaGetErrorString(err));
+		printf("CUDA error in ScaLBL_D3Q7_AAeven_Ion_Flux_Diff_BC_z (kernel): %s \n",cudaGetErrorString(err)); // the error is only printed; nothing is returned to the caller
 	}
 }

-extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_BC_Z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np){
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_Diff_BC_Z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np){ // host-side launcher: forwards its arguments unchanged to the dvc_-prefixed kernel of the same name
 	int GRID = count / 512 + 1;
-	dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_BC_Z<<<GRID,512>>>(list, dist, FluxIn, tau, VelocityZ, count, Np);
+	dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_Diff_BC_Z<<<GRID,512>>>(list, dist, FluxIn, tau, VelocityZ, count, Np); // GRID blocks of 512 threads each
 	cudaError_t err = cudaGetLastError();
 	if (cudaSuccess != err){
-		printf("CUDA error in ScaLBL_D3Q7_AAeven_Ion_Flux_BC_Z (kernel): %s \n",cudaGetErrorString(err));
+		printf("CUDA error in ScaLBL_D3Q7_AAeven_Ion_Flux_Diff_BC_Z (kernel): %s \n",cudaGetErrorString(err)); // the error is only printed; nothing is returned to the caller
 	}
 }

-extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_BC_z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np){
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_Diff_BC_z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np){ // host-side launcher: forwards its arguments unchanged to the dvc_-prefixed kernel of the same name
 	int GRID = count / 512 + 1;
-	dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_BC_z<<<GRID,512>>>(d_neighborList, list, dist, FluxIn, tau, VelocityZ, count, Np);
+	dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_Diff_BC_z<<<GRID,512>>>(d_neighborList, list, dist, FluxIn, tau, VelocityZ, count, Np); // GRID blocks of 512 threads each
 	cudaError_t err = cudaGetLastError();
 	if (cudaSuccess != err){
-		printf("CUDA error in ScaLBL_D3Q7_AAodd_Ion_Flux_BC_z (kernel): %s \n",cudaGetErrorString(err));
+		printf("CUDA error in ScaLBL_D3Q7_AAodd_Ion_Flux_Diff_BC_z (kernel): %s \n",cudaGetErrorString(err)); // the error is only printed; nothing is returned to the caller
 	}
 }

-extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_BC_Z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np){
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_Diff_BC_Z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np){ // host-side launcher: forwards its arguments unchanged to the dvc_-prefixed kernel of the same name
 	int GRID = count / 512 + 1;
-	dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_BC_Z<<<GRID,512>>>(d_neighborList, list, dist, FluxIn, tau, VelocityZ, count, Np);
+	dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_Diff_BC_Z<<<GRID,512>>>(d_neighborList, list, dist, FluxIn, tau, VelocityZ, count, Np); // GRID blocks of 512 threads each
 	cudaError_t err = cudaGetLastError();
 	if (cudaSuccess != err){
-		printf("CUDA error in ScaLBL_D3Q7_AAodd_Ion_Flux_BC_Z (kernel): %s \n",cudaGetErrorString(err));
+		printf("CUDA error in ScaLBL_D3Q7_AAodd_Ion_Flux_Diff_BC_Z (kernel): %s \n",cudaGetErrorString(err)); // the error is only printed; nothing is returned to the caller
 	}
 }
+//----------DiffAdvc-------------
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvc_BC_z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np){ // host-side launcher: forwards its arguments unchanged to the dvc_-prefixed kernel of the same name
+	int GRID = count / 512 + 1; // number of 512-thread blocks; always at least 1, and one more than strictly needed when count is a multiple of 512
+	dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvc_BC_z<<<GRID,512>>>(list, dist, FluxIn, tau, VelocityZ, count, Np);
+	cudaError_t err = cudaGetLastError(); // queried right after the launch; this function makes no synchronization call
+	if (cudaSuccess != err){
+		printf("CUDA error in ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvc_BC_z (kernel): %s \n",cudaGetErrorString(err)); // the error is only printed; nothing is returned to the caller
+	}
+}
+
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvc_BC_Z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np){ // host-side launcher: forwards its arguments unchanged to the dvc_-prefixed kernel of the same name
+	int GRID = count / 512 + 1; // number of 512-thread blocks; always at least 1, and one more than strictly needed when count is a multiple of 512
+	dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvc_BC_Z<<<GRID,512>>>(list, dist, FluxIn, tau, VelocityZ, count, Np);
+	cudaError_t err = cudaGetLastError(); // queried right after the launch; this function makes no synchronization call
+	if (cudaSuccess != err){
+		printf("CUDA error in ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvc_BC_Z (kernel): %s \n",cudaGetErrorString(err)); // the error is only printed; nothing is returned to the caller
+	}
+}
+
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvc_BC_z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np){ // host-side launcher: forwards its arguments unchanged to the dvc_-prefixed kernel of the same name
+	int GRID = count / 512 + 1; // number of 512-thread blocks; always at least 1, and one more than strictly needed when count is a multiple of 512
+	dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvc_BC_z<<<GRID,512>>>(d_neighborList, list, dist, FluxIn, tau, VelocityZ, count, Np);
+	cudaError_t err = cudaGetLastError(); // queried right after the launch; this function makes no synchronization call
+	if (cudaSuccess != err){
+		printf("CUDA error in ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvc_BC_z (kernel): %s \n",cudaGetErrorString(err)); // the error is only printed; nothing is returned to the caller
+	}
+}
+
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvc_BC_Z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, int count, int Np){ // host-side launcher: forwards its arguments unchanged to the dvc_-prefixed kernel of the same name
+	int GRID = count / 512 + 1; // number of 512-thread blocks; always at least 1, and one more than strictly needed when count is a multiple of 512
+	dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvc_BC_Z<<<GRID,512>>>(d_neighborList, list, dist, FluxIn, tau, VelocityZ, count, Np);
+	cudaError_t err = cudaGetLastError(); // queried right after the launch; this function makes no synchronization call
+	if (cudaSuccess != err){
+		printf("CUDA error in ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvc_BC_Z (kernel): %s \n",cudaGetErrorString(err)); // the error is only printed; nothing is returned to the caller
+	}
+}
+//----------DiffAdvcElec-------------
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvcElec_BC_z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, double *ElectricField_Z,
+                                                              double Di, double zi, double Vt, int count, int Np){ // host-side launcher: forwards its arguments unchanged to the dvc_-prefixed kernel of the same name
+	int GRID = count / 512 + 1; // number of 512-thread blocks; always at least 1, and one more than strictly needed when count is a multiple of 512
+	dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvcElec_BC_z<<<GRID,512>>>(list, dist, FluxIn, tau, VelocityZ, ElectricField_Z, Di, zi, Vt, count, Np);
+	cudaError_t err = cudaGetLastError(); // queried right after the launch; this function makes no synchronization call
+	if (cudaSuccess != err){
+		printf("CUDA error in ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvcElec_BC_z (kernel): %s \n",cudaGetErrorString(err)); // the error is only printed; nothing is returned to the caller
+	}
+}
+
+extern "C" void ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvcElec_BC_Z(int *list, double *dist, double FluxIn, double tau, double *VelocityZ, double *ElectricField_Z,
+                                                              double Di, double zi, double Vt, int count, int Np){ // host-side launcher: forwards its arguments unchanged to the dvc_-prefixed kernel of the same name
+	int GRID = count / 512 + 1; // number of 512-thread blocks; always at least 1, and one more than strictly needed when count is a multiple of 512
+	dvc_ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvcElec_BC_Z<<<GRID,512>>>(list, dist, FluxIn, tau, VelocityZ, ElectricField_Z, Di, zi, Vt, count, Np);
+	cudaError_t err = cudaGetLastError(); // queried right after the launch; this function makes no synchronization call
+	if (cudaSuccess != err){
+		printf("CUDA error in ScaLBL_D3Q7_AAeven_Ion_Flux_DiffAdvcElec_BC_Z (kernel): %s \n",cudaGetErrorString(err)); // the error is only printed; nothing is returned to the caller
+	}
+}
+
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvcElec_BC_z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, double *ElectricField_Z,
+                                                             double Di, double zi, double Vt, int count, int Np){ // host-side launcher: forwards its arguments unchanged to the dvc_-prefixed kernel of the same name
+	int GRID = count / 512 + 1; // number of 512-thread blocks; always at least 1, and one more than strictly needed when count is a multiple of 512
+	dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvcElec_BC_z<<<GRID,512>>>(d_neighborList, list, dist, FluxIn, tau, VelocityZ, ElectricField_Z, Di, zi, Vt, count, Np);
+	cudaError_t err = cudaGetLastError(); // queried right after the launch; this function makes no synchronization call
+	if (cudaSuccess != err){
+		printf("CUDA error in ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvcElec_BC_z (kernel): %s \n",cudaGetErrorString(err)); // the error is only printed; nothing is returned to the caller
+	}
+}
+
+extern "C" void ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvcElec_BC_Z(int *d_neighborList, int *list, double *dist, double FluxIn, double tau, double *VelocityZ, double *ElectricField_Z,
+                                                             double Di, double zi, double Vt, int count, int Np){ // host-side launcher: forwards its arguments unchanged to the dvc_-prefixed kernel of the same name
+	int GRID = count / 512 + 1; // number of 512-thread blocks; always at least 1, and one more than strictly needed when count is a multiple of 512
+	dvc_ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvcElec_BC_Z<<<GRID,512>>>(d_neighborList, list, dist, FluxIn, tau, VelocityZ, ElectricField_Z, Di, zi, Vt, count, Np);
+	cudaError_t err = cudaGetLastError(); // queried right after the launch; this function makes no synchronization call
+	if (cudaSuccess != err){
+		printf("CUDA error in ScaLBL_D3Q7_AAodd_Ion_Flux_DiffAdvcElec_BC_Z (kernel): %s \n",cudaGetErrorString(err)); // the error is only printed; nothing is returned to the caller
+	}
+}
+//-------------------------------
--- a/cuda/GreyscaleColor.cu
+++ b/cuda/GreyscaleColor.cu
@ -1450,7 +1450,7 @@ __global__  void dvc_ScaLBL_D3Q19_AAeven_GreyscaleColor(int *Map, double *dist,
 //CP: capillary penalty
 // also turn off recoloring for grey nodes
 __global__ void dvc_ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int *Map, double *dist, double *Aq, double *Bq, double *Den,
-		 double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *Poros,double *Perm, double *Velocity, double *Pressure,
+		 double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *GreyKn, double *GreyKw, double *Poros,double *Perm, double *Velocity, double *Pressure,
         double rhoA, double rhoB, double tauA, double tauB,double tauA_eff,double tauB_eff,double alpha, double beta,
 		double Gx, double Gy, double Gz, bool RecoloringOff, int strideY, int strideZ, int start, int finish, int Np){

@ -1479,7 +1479,11 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int
    double Fcpx,Fcpy,Fcpz;//capillary penalty force
    double W;//greyscale wetting strength
    double Sn_grey,Sw_grey;
-
+    
+    /* Corey model parameters */
+    double Kn_grey,Kw_grey;    
+    double Swn,Krn_grey,Krw_grey,mobility_ratio,jA,jB;
+    
 	const double mrt_V1=0.05263157894736842;
 	const double mrt_V2=0.012531328320802;
 	const double mrt_V3=0.04761904761904762;
@ -1502,15 +1506,17 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int
 			nA = Den[n];
 			nB = Den[Np + n];

-            porosity = Poros[n];
-            perm = Perm[n];
-            W = GreySolidW[n];
-            Sn_grey = GreySn[n];
-            Sw_grey = GreySw[n];
-
+	        porosity = Poros[n];
+	        //perm = Perm[n];
+	        perm = 1.0;
+	        W = GreySolidW[n];
+	        Sn_grey = GreySn[n];
+	        Sw_grey = GreySw[n];
+	        Kn_grey = GreyKn[n];
+	        Kw_grey = GreyKw[n];
+	        
 			// compute phase indicator field
 			phi=(nA-nB)/(nA+nB);
-
 			// local density
 			rho0=rhoA + 0.5*(1.0-phi)*(rhoB-rhoA);
 			// local relaxation time
@ -1518,8 +1524,24 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int
 			tau_eff=tauA_eff + 0.5*(1.0-phi)*(tauB_eff-tauA_eff);
 			rlx_setA = 1.f/tau;
 			rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA);
-            mu_eff = (tau_eff-0.5)/3.0;//kinematic viscosity
+	        mu_eff = (tau_eff-0.5)/3.0;//kinematic viscosity
 			
+	        if (nA/(nA+nB)<Sn_grey && porosity !=1.0){
+	        	perm = Kw_grey;
+	        	Swn = 0.0;
+	        }
+	        else if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){ 
+	        	Swn = (nA/(nA+nB) - Sn_grey) /(Sw_grey - Sn_grey);
+	        	Krn_grey = Kn_grey*Swn*Swn; // Corey model with exponent = 2, make sure that W cannot shift to zero
+	        	Krw_grey = Kw_grey*(1.0-Swn)*(1.0-Swn); // Corey model with exponent = 2, make sure that W cannot shift to zero
+	        	// recompute the effective permeability
+	        	perm = mu_eff*(Krn_grey*3.0/(tauA-0.5) + Krw_grey*3.0/(tauA-0.5));
+	        	mobility_ratio =(nA*Krn_grey*3.0/(tauA-0.5) - nB*Krw_grey*3.0/(tauB-0.5))/(nA*Krn_grey*3.0/(tauA-0.5) + nB*Krw_grey*3.0/(tauB-0.5));
+	        }
+	        else if (nA/(nA+nB)>Sw_grey && porosity !=1.0){
+	        	perm = Kn_grey;
+	        	Swn = 1.0;
+	        }	        		
 			// Get the 1D index based on regular data layout
 			ijk = Map[n];
 			//					COMPUTE THE COLOR GRADIENT
@ -1585,35 +1607,35 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int
 			nz = -3.0/18.0*(m5-m6+0.5*(m11-m12-m13+m14+m15-m16-m17+m18));

 			//............Compute the Greyscale Potential Gradient.....................
-//			Fcpx = 0.0;
-//			Fcpy = 0.0;
-//			Fcpz = 0.0;
-//            if (porosity!=1.0){
-//                //Fcpx = -3.0/18.0*(gp1-gp2+0.5*(gp7-gp8+gp9-gp10+gp11-gp12+gp13-gp14));
-//                //Fcpy = -3.0/18.0*(gp3-gp4+0.5*(gp7-gp8-gp9+gp10+gp15-gp16+gp17-gp18));
-//                //Fcpz = -3.0/18.0*(gp5-gp6+0.5*(gp11-gp12-gp13+gp14+gp15-gp16-gp17+gp18));
-//                Fcpx = -3.0/18.0*(m1-m2+0.5*(m7-m8+m9-m10+m11-m12+m13-m14));
-//                Fcpy = -3.0/18.0*(m3-m4+0.5*(m7-m8-m9+m10+m15-m16+m17-m18));
-//                Fcpz = -3.0/18.0*(m5-m6+0.5*(m11-m12-m13+m14+m15-m16-m17+m18));
-//                Fcpx *= alpha*W/sqrt(perm); 
-//                Fcpy *= alpha*W/sqrt(perm); 
-//                Fcpz *= alpha*W/sqrt(perm); 
-//                //double Fcp_mag_temp = sqrt(Fcpx*Fcpx+Fcpy*Fcpy+Fcpz*Fcpz);
-//                //double Fcp_mag = Fcp_mag_temp;
-//                //if (Fcp_mag_temp==0.0) Fcp_mag=1.0;
-//                //nx = Fcpx/Fcp_mag;
-//                //ny = Fcpy/Fcp_mag;
-//                //nz = Fcpz/Fcp_mag;
-//            }
-            Fcpx = nx;
-            Fcpy = ny;
-            Fcpz = nz;
-            double Fcp_mag=sqrt(Fcpx*Fcpx+Fcpy*Fcpy+Fcpz*Fcpz);
-            if (Fcp_mag==0.0); Fcpx=Fcpy=Fcpz=0.0;
-            //NOTE for open node (porosity=1.0),Fcp=0.0
-            Fcpx *= alpha*W*(1.0-porosity)/sqrt(perm); 
-            Fcpy *= alpha*W*(1.0-porosity)/sqrt(perm); 
-            Fcpz *= alpha*W*(1.0-porosity)/sqrt(perm); 
+	//			Fcpx = 0.0;
+	//			Fcpy = 0.0;
+	//			Fcpz = 0.0;
+	//            if (porosity!=1.0){
+	//                //Fcpx = -3.0/18.0*(gp1-gp2+0.5*(gp7-gp8+gp9-gp10+gp11-gp12+gp13-gp14));
+	//                //Fcpy = -3.0/18.0*(gp3-gp4+0.5*(gp7-gp8-gp9+gp10+gp15-gp16+gp17-gp18));
+	//                //Fcpz = -3.0/18.0*(gp5-gp6+0.5*(gp11-gp12-gp13+gp14+gp15-gp16-gp17+gp18));
+	//                Fcpx = -3.0/18.0*(m1-m2+0.5*(m7-m8+m9-m10+m11-m12+m13-m14));
+	//                Fcpy = -3.0/18.0*(m3-m4+0.5*(m7-m8-m9+m10+m15-m16+m17-m18));
+	//                Fcpz = -3.0/18.0*(m5-m6+0.5*(m11-m12-m13+m14+m15-m16-m17+m18));
+	//                Fcpx *= alpha*W/sqrt(perm); 
+	//                Fcpy *= alpha*W/sqrt(perm); 
+	//                Fcpz *= alpha*W/sqrt(perm); 
+	//                //double Fcp_mag_temp = sqrt(Fcpx*Fcpx+Fcpy*Fcpy+Fcpz*Fcpz);
+	//                //double Fcp_mag = Fcp_mag_temp;
+	//                //if (Fcp_mag_temp==0.0) Fcp_mag=1.0;
+	//                //nx = Fcpx/Fcp_mag;
+	//                //ny = Fcpy/Fcp_mag;
+	//                //nz = Fcpz/Fcp_mag;
+	//            }
+	        Fcpx = nx;
+	        Fcpy = ny;
+	        Fcpz = nz;
+	        double Fcp_mag=sqrt(Fcpx*Fcpx+Fcpy*Fcpy+Fcpz*Fcpz);
+	        if (Fcp_mag==0.0) Fcpx=Fcpy=Fcpz=0.0;
+	        //NOTE for open node (porosity=1.0),Fcp=0.0
+	        Fcpx *= alpha*W*(1.0-porosity)/sqrt(perm); 
+	        Fcpy *= alpha*W*(1.0-porosity)/sqrt(perm); 
+	        Fcpz *= alpha*W*(1.0-porosity)/sqrt(perm); 

 			//...........Normalize the Color Gradient.................................
 			C = sqrt(nx*nx+ny*ny+nz*nz);
@ -1944,98 +1966,98 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int
 			m17 -= fq;
 			m18 -= fq;
 			
-            // Compute greyscale related parameters
-            ux = (jx/rho0+0.5*porosity*Gx+0.5*Fcpx/rho0)/(1.0+0.5*porosity*mu_eff/perm);
-            uy = (jy/rho0+0.5*porosity*Gy+0.5*Fcpy/rho0)/(1.0+0.5*porosity*mu_eff/perm);
-            uz = (jz/rho0+0.5*porosity*Gz+0.5*Fcpz/rho0)/(1.0+0.5*porosity*mu_eff/perm);
-            if (porosity==1.0){//i.e. open nodes
-                ux = (jx/rho0+0.5*porosity*Gx);
-                uy = (jy/rho0+0.5*porosity*Gy);
-                uz = (jz/rho0+0.5*porosity*Gz);
-            }
+	        // Compute greyscale related parameters
+	        ux = (jx/rho0+0.5*porosity*Gx+0.5*Fcpx/rho0)/(1.0+0.5*porosity*mu_eff/perm);
+	        uy = (jy/rho0+0.5*porosity*Gy+0.5*Fcpy/rho0)/(1.0+0.5*porosity*mu_eff/perm);
+	        uz = (jz/rho0+0.5*porosity*Gz+0.5*Fcpz/rho0)/(1.0+0.5*porosity*mu_eff/perm);
+	        if (porosity==1.0){//i.e. open nodes
+	            ux = (jx/rho0+0.5*porosity*Gx);
+	            uy = (jy/rho0+0.5*porosity*Gy);
+	            uz = (jz/rho0+0.5*porosity*Gz);
+	        }

-            //Update the total force to include linear (Darcy) and nonlinear (Forchheimer) drags due to the porous medium
-            Fx = rho0*(-porosity*mu_eff/perm*ux + porosity*Gx)+Fcpx;
-            Fy = rho0*(-porosity*mu_eff/perm*uy + porosity*Gy)+Fcpy;
-            Fz = rho0*(-porosity*mu_eff/perm*uz + porosity*Gz)+Fcpz;
-            if (porosity==1.0){
-                Fx=rho0*(porosity*Gx);
-                Fy=rho0*(porosity*Gy);
-                Fz=rho0*(porosity*Gz);
-            }
+	        //Update the total force to include linear (Darcy) and nonlinear (Forchheimer) drags due to the porous medium
+	        Fx = rho0*(-porosity*mu_eff/perm*ux + porosity*Gx)+Fcpx;
+	        Fy = rho0*(-porosity*mu_eff/perm*uy + porosity*Gy)+Fcpy;
+	        Fz = rho0*(-porosity*mu_eff/perm*uz + porosity*Gz)+Fcpz;
+	        if (porosity==1.0){
+	            Fx=rho0*(porosity*Gx);
+	            Fy=rho0*(porosity*Gy);
+	            Fz=rho0*(porosity*Gz);
+	        }

 			// write the velocity 
 			Velocity[n] = ux;
 			Velocity[Np+n] = uy;
 			Velocity[2*Np+n] = uz;
-            //Pressure[n] = rho/3.f/porosity;
-            Pressure[n] = rho/3.f;
+	        //Pressure[n] = rho/3.f/porosity;
+	        Pressure[n] = rho/3.f;

 			//........................................................................
 			//..............carry out relaxation process..............................
 			//..........Toelke, Fruediger et. al. 2006................................
-            //---------------- NO higher-order force -------------------------------//
+	        //---------------- NO higher-order force -------------------------------//
 			if (C == 0.0)	nx = ny = nz = 0.0;
 			m1 = m1 + rlx_setA*((19*(ux*ux+uy*uy+uz*uz)*rho0/porosity - 11*rho) -19*alpha*C - m1);
 			m2 = m2 + rlx_setA*((3*rho - 5.5*(ux*ux+uy*uy+uz*uz)*rho0/porosity)- m2);
-            jx = jx + Fx;
+	        jx = jx + Fx;
 			m4 = m4 + rlx_setB*((-0.6666666666666666*ux*rho0)- m4)
-                    + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx);
-            jy = jy + Fy;
+	                + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx);
+	        jy = jy + Fy;
 			m6 = m6 + rlx_setB*((-0.6666666666666666*uy*rho0)- m6)
-                    + (1-0.5*rlx_setB)*(-0.6666666666666666*Fy);
-            jz = jz + Fz;
+	                + (1-0.5*rlx_setB)*(-0.6666666666666666*Fy);
+	        jz = jz + Fz;
 			m8 = m8 + rlx_setB*((-0.6666666666666666*uz*rho0)- m8)
-                    + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz);
+	                + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz);
 			m9 = m9 + rlx_setA*(((2*ux*ux-uy*uy-uz*uz)*rho0/porosity) + 0.5*alpha*C*(2*nx*nx-ny*ny-nz*nz) - m9);
 			m10 = m10 + rlx_setA*( - m10);
-            //m10 = m10 + rlx_setA*(-0.5*rho0*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10);
+	        //m10 = m10 + rlx_setA*(-0.5*rho0*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10);
 			m11 = m11 + rlx_setA*(((uy*uy-uz*uz)*rho0/porosity) + 0.5*alpha*C*(ny*ny-nz*nz)- m11);
 			m12 = m12 + rlx_setA*( - m12);
-            //m12 = m12 + rlx_setA*(-0.5*(rho0*(uy*uy-uz*uz)/porosity)- m12);
+	        //m12 = m12 + rlx_setA*(-0.5*(rho0*(uy*uy-uz*uz)/porosity)- m12);
 			m13 = m13 + rlx_setA*( (ux*uy*rho0/porosity) + 0.5*alpha*C*nx*ny - m13);
 			m14 = m14 + rlx_setA*( (uy*uz*rho0/porosity) + 0.5*alpha*C*ny*nz - m14);
 			m15 = m15 + rlx_setA*( (ux*uz*rho0/porosity) + 0.5*alpha*C*nx*nz - m15);
 			m16 = m16 + rlx_setB*( - m16);
 			m17 = m17 + rlx_setB*( - m17);
 			m18 = m18 + rlx_setB*( - m18);
-            //----------------------------------------------------------------------//
+	        //----------------------------------------------------------------------//

-            //----------------With higher-order force ------------------------------//
+	        //----------------With higher-order force ------------------------------//
 			//if (C == 0.0)	nx = ny = nz = 0.0;
 			//m1 = m1 + rlx_setA*((19*(ux*ux+uy*uy+uz*uz)*rho0/porosity - 11*rho) -19*alpha*C - m1)
-            //        + (1-0.5*rlx_setA)*38*(Fx*ux+Fy*uy+Fz*uz)/porosity;
+	        //        + (1-0.5*rlx_setA)*38*(Fx*ux+Fy*uy+Fz*uz)/porosity;
 			//m2 = m2 + rlx_setA*((3*rho - 5.5*(ux*ux+uy*uy+uz*uz)*rho0/porosity)- m2)
-            //        + (1-0.5*rlx_setA)*11*(-Fx*ux-Fy*uy-Fz*uz)/porosity;
-            //jx = jx + Fx;
+	        //        + (1-0.5*rlx_setA)*11*(-Fx*ux-Fy*uy-Fz*uz)/porosity;
+	        //jx = jx + Fx;
 			//m4 = m4 + rlx_setB*((-0.6666666666666666*ux*rho0)- m4)
-            //        + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx);
-            //jy = jy + Fy;
+	        //        + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx);
+	        //jy = jy + Fy;
 			//m6 = m6 + rlx_setB*((-0.6666666666666666*uy*rho0)- m6)
-            //        + (1-0.5*rlx_setB)*(-0.6666666666666666*Fy);
-            //jz = jz + Fz;
+	        //        + (1-0.5*rlx_setB)*(-0.6666666666666666*Fy);
+	        //jz = jz + Fz;
 			//m8 = m8 + rlx_setB*((-0.6666666666666666*uz*rho0)- m8)
-            //        + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz);
+	        //        + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz);
 			//m9 = m9 + rlx_setA*(((2*ux*ux-uy*uy-uz*uz)*rho0/porosity) + 0.5*alpha*C*(2*nx*nx-ny*ny-nz*nz) - m9)
-            //        + (1-0.5*rlx_setA)*(4*Fx*ux-2*Fy*uy-2*Fz*uz)/porosity;
+	        //        + (1-0.5*rlx_setA)*(4*Fx*ux-2*Fy*uy-2*Fz*uz)/porosity;
 			////m10 = m10 + rlx_setA*( - m10);
-            //m10 = m10 + rlx_setA*(-0.5*rho0*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10)
-            //          + (1-0.5*rlx_setA)*(-2*Fx*ux+Fy*uy+Fz*uz)/porosity;
+	        //m10 = m10 + rlx_setA*(-0.5*rho0*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10)
+	        //          + (1-0.5*rlx_setA)*(-2*Fx*ux+Fy*uy+Fz*uz)/porosity;
 			//m11 = m11 + rlx_setA*(((uy*uy-uz*uz)*rho0/porosity) + 0.5*alpha*C*(ny*ny-nz*nz)- m11)
-            //          + (1-0.5*rlx_setA)*(2*Fy*uy-2*Fz*uz)/porosity;
+	        //          + (1-0.5*rlx_setA)*(2*Fy*uy-2*Fz*uz)/porosity;
 			////m12 = m12 + rlx_setA*( - m12);
-            //m12 = m12 + rlx_setA*(-0.5*(rho0*(uy*uy-uz*uz)/porosity)- m12)
-            //          + (1-0.5*rlx_setA)*(-Fy*uy+Fz*uz)/porosity;
+	        //m12 = m12 + rlx_setA*(-0.5*(rho0*(uy*uy-uz*uz)/porosity)- m12)
+	        //          + (1-0.5*rlx_setA)*(-Fy*uy+Fz*uz)/porosity;
 			//m13 = m13 + rlx_setA*( (ux*uy*rho0/porosity) + 0.5*alpha*C*nx*ny - m13);
-            //          + (1-0.5*rlx_setA)*(Fy*ux+Fx*uy)/porosity;
+	        //          + (1-0.5*rlx_setA)*(Fy*ux+Fx*uy)/porosity;
 			//m14 = m14 + rlx_setA*( (uy*uz*rho0/porosity) + 0.5*alpha*C*ny*nz - m14);
-            //          + (1-0.5*rlx_setA)*(Fz*uy+Fy*uz)/porosity;
+	        //          + (1-0.5*rlx_setA)*(Fz*uy+Fy*uz)/porosity;
 			//m15 = m15 + rlx_setA*( (ux*uz*rho0/porosity) + 0.5*alpha*C*nx*nz - m15);
-            //          + (1-0.5*rlx_setA)*(Fz*ux+Fx*uz)/porosity;
+	        //          + (1-0.5*rlx_setA)*(Fz*ux+Fx*uz)/porosity;
 			//m16 = m16 + rlx_setB*( - m16);
 			//m17 = m17 + rlx_setB*( - m17);
 			//m18 = m18 + rlx_setB*( - m18);
-            //----------------------------------------------------------------------//
+	        //----------------------------------------------------------------------//

 			//.................inverse transformation......................................................
 			// q=0
@ -2162,21 +2184,27 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int
 			nAB = 1.0/(nA+nB);
 			Aq[n] = 0.3333333333333333*nA;
 			Bq[n] = 0.3333333333333333*nB;
-
+			
 			//...............................................
 			// q = 0,2,4
 			// Cq = {1,0,0}, {0,1,0}, {0,0,1}
+			jA = nA*ux;
+			jB = nB*ux;		
 			delta = beta*nA*nB*nAB*0.1111111111111111*nx;
 			if (!(nA*nB*nAB>0)) delta=0;
-            //----------------newly added for better control of recoloring---------------//
-            if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0) delta = 0.0; 
-            if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
-            //---------------------------------------------------------------------------//
-            if (RecoloringOff==true && porosity !=1.0) delta=0;
-			a1 = nA*(0.1111111111111111*(1+4.5*ux))+delta;
-			b1 = nB*(0.1111111111111111*(1+4.5*ux))-delta;
-			a2 = nA*(0.1111111111111111*(1-4.5*ux))-delta;
-			b2 = nB*(0.1111111111111111*(1-4.5*ux))+delta;
+	        //----------------newly added for better control of recoloring---------------//
+	        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){
+	        	delta = 0.0; 
+	    		jA = 0.5*ux*(nA+nB)*(1.0+mobility_ratio);
+	    		jB = 0.5*ux*(nA+nB)*(1.0-mobility_ratio);
+	        }
+	        if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
+	        //---------------------------------------------------------------------------//
+	        if (RecoloringOff==true && porosity !=1.0) delta=0;
+			a1 = (0.1111111111111111*(nA+4.5*jA))+delta;
+			b1 = (0.1111111111111111*(nB+4.5*jB))-delta;
+			a2 = (0.1111111111111111*(nA-4.5*jA))-delta;
+			b2 = (0.1111111111111111*(nB-4.5*jB))+delta;

 			// q = 1
 			//nread = neighborList[n+Np];
@ -2189,17 +2217,23 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int

 			//...............................................
 			// Cq = {0,1,0}
+			jA = nA*uy;
+			jB = nB*uy;		
 			delta = beta*nA*nB*nAB*0.1111111111111111*ny;
 			if (!(nA*nB*nAB>0)) delta=0;
-            //----------------newly added for better control of recoloring---------------//
-            if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0) delta = 0.0; 
-            if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
-            //---------------------------------------------------------------------------//
-            if (RecoloringOff==true && porosity !=1.0) delta=0;
-			a1 = nA*(0.1111111111111111*(1+4.5*uy))+delta;
-			b1 = nB*(0.1111111111111111*(1+4.5*uy))-delta;
-			a2 = nA*(0.1111111111111111*(1-4.5*uy))-delta;
-			b2 = nB*(0.1111111111111111*(1-4.5*uy))+delta;
+	        //----------------newly added for better control of recoloring---------------//
+	        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){
+	        	delta = 0.0; 
+	    		jA = 0.5*uy*(nA+nB)*(1.0+mobility_ratio);
+	    		jB = 0.5*uy*(nA+nB)*(1.0-mobility_ratio);
+	        }
+	        if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
+	        //---------------------------------------------------------------------------//
+	        if (RecoloringOff==true && porosity !=1.0) delta=0;
+			a1 = (0.1111111111111111*(nA+4.5*jA))+delta;
+			b1 = (0.1111111111111111*(nB+4.5*jB))-delta;
+			a2 = (0.1111111111111111*(nA-4.5*jA))-delta;
+			b2 = (0.1111111111111111*(nB-4.5*jB))+delta;

 			// q = 3
 			//nread = neighborList[n+3*Np];
@ -2213,17 +2247,24 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int
 			//...............................................
 			// q = 4
 			// Cq = {0,0,1}
+			jA = nA*uz;
+			jB = nB*uz;		
 			delta = beta*nA*nB*nAB*0.1111111111111111*nz;
 			if (!(nA*nB*nAB>0)) delta=0;
-            //----------------newly added for better control of recoloring---------------//
-            if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0) delta = 0.0; 
-            if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
-            //---------------------------------------------------------------------------//
-            if (RecoloringOff==true && porosity !=1.0) delta=0;
-			a1 = nA*(0.1111111111111111*(1+4.5*uz))+delta;
-			b1 = nB*(0.1111111111111111*(1+4.5*uz))-delta;
-			a2 = nA*(0.1111111111111111*(1-4.5*uz))-delta;
-			b2 = nB*(0.1111111111111111*(1-4.5*uz))+delta;
+	        //----------------newly added for better control of recoloring---------------//
+	        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){
+	        	delta = 0.0; 
+	    		jA = 0.5*uz*(nA+nB)*(1.0+mobility_ratio);
+	    		jB = 0.5*uz*(nA+nB)*(1.0-mobility_ratio);
+	        }
+	        if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
+	        //---------------------------------------------------------------------------//
+	        if (RecoloringOff==true && porosity !=1.0) delta=0;
+
+			a1 = (0.1111111111111111*(nA+4.5*jA))+delta;
+			b1 = (0.1111111111111111*(nB+4.5*jB))-delta;
+			a2 = (0.1111111111111111*(nA-4.5*jA))-delta;
+			b2 = (0.1111111111111111*(nB-4.5*jB))+delta;

 			// q = 5
 			//nread = neighborList[n+5*Np];
@ -2241,7 +2282,7 @@ __global__ void dvc_ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *neighborList, int
 //CP: capillary penalty
 // also turn off recoloring for grey nodes
 __global__  void dvc_ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dist, double *Aq, double *Bq, double *Den, 
-        double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *Poros,double *Perm, double *Velocity, double *Pressure, 
+        double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *GreyKn, double *GreyKw, double *Poros,double *Perm, double *Velocity, double *Pressure, 
        double rhoA, double rhoB, double tauA, double tauB,double tauA_eff,double tauB_eff, double alpha, double beta,
 		double Gx, double Gy, double Gz, bool RecoloringOff, int strideY, int strideZ, int start, int finish, int Np){
 	int ijk,nn,n;
@ -2265,6 +2306,10 @@ __global__  void dvc_ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dis
    double Fcpx,Fcpy,Fcpz;//capillary penalty force
    double W;//greyscale wetting strength
    double Sn_grey,Sw_grey;
+    
+    /* Corey model parameters */
+    double Kn_grey,Kw_grey;    
+    double Swn,Krn_grey,Krw_grey,mobility_ratio,jA,jB;

 	const double mrt_V1=0.05263157894736842;
 	const double mrt_V2=0.012531328320802;
@ -2284,17 +2329,18 @@ __global__  void dvc_ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dis
 		//........Get 1-D index for this thread....................
 		n =  S*blockIdx.x*blockDim.x + s*blockDim.x + threadIdx.x + start;
 		if (n<finish) {
-
-			// read the component number densities
 			nA = Den[n];
 			nB = Den[Np + n];

-            porosity = Poros[n];
-            perm = Perm[n];
-            W = GreySolidW[n];
-            Sn_grey = GreySn[n];
-            Sw_grey = GreySw[n];
-
+	        porosity = Poros[n];
+	        //perm = Perm[n];
+	        perm = 1.0;
+	        W = GreySolidW[n];
+	        Sn_grey = GreySn[n];
+	        Sw_grey = GreySw[n];
+	        Kn_grey = GreyKn[n];
+	        Kw_grey = GreyKw[n];
+	        
 			// compute phase indicator field
 			phi=(nA-nB)/(nA+nB);

@ -2305,8 +2351,24 @@ __global__  void dvc_ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dis
 			tau_eff=tauA_eff + 0.5*(1.0-phi)*(tauB_eff-tauA_eff);
 			rlx_setA = 1.f/tau;
 			rlx_setB = 8.f*(2.f-rlx_setA)/(8.f-rlx_setA);
-            mu_eff = (tau_eff-0.5)/3.0;//kinematic viscosity
-
+	        mu_eff = (tau_eff-0.5)/3.0;//kinematic viscosity
+		
+	        if (nA/(nA+nB)<Sn_grey && porosity !=1.0){ // grey node with A-fraction below the lower endpoint
+	        	perm = Kw_grey; // use the B-side endpoint permeability
+	        	Swn = 0.0;
+	        }
+	        else if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){ // grey node between the two endpoints
+	        	Swn = (nA/(nA+nB) - Sn_grey) /(Sw_grey - Sn_grey); // A-fraction rescaled to [0,1] between the endpoints (NOTE(review): undefined if Sw_grey == Sn_grey)
+	        	Krn_grey = Kn_grey*Swn*Swn; // Corey model with exponent = 2, make sure that W cannot shift to zero
+	        	Krw_grey = Kw_grey*(1.0-Swn)*(1.0-Swn); // Corey model with exponent = 2, make sure that W cannot shift to zero
+	        	// recompute the effective permeability; each term uses its own fluid's relaxation time (Krn with tauA, Krw with tauB), matching mobility_ratio below
+	        	perm = mu_eff*(Krn_grey*3.0/(tauA-0.5) + Krw_grey*3.0/(tauB-0.5));
+	        	mobility_ratio =(nA*Krn_grey*3.0/(tauA-0.5) - nB*Krw_grey*3.0/(tauB-0.5))/(nA*Krn_grey*3.0/(tauA-0.5) + nB*Krw_grey*3.0/(tauB-0.5)); // only consumed further down under this same condition
+	        }
+	        else if (nA/(nA+nB)>Sw_grey && porosity !=1.0){ // grey node with A-fraction above the upper endpoint
+	        	perm = Kn_grey; // use the A-side endpoint permeability
+	        	Swn = 1.0;
+	        }
 			// Get the 1D index based on regular data layout
 			ijk = Map[n];
 			//					COMPUTE THE COLOR GRADIENT
@ -2372,35 +2434,35 @@ __global__  void dvc_ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dis
 			nz = -3.0/18.0*(m5-m6+0.5*(m11-m12-m13+m14+m15-m16-m17+m18));

 			//............Compute the Greyscale Potential Gradient.....................
-//			Fcpx = 0.0;
-//			Fcpy = 0.0;
-//			Fcpz = 0.0;
-//            if (porosity!=1.0){
-//                //Fcpx = -3.0/18.0*(gp1-gp2+0.5*(gp7-gp8+gp9-gp10+gp11-gp12+gp13-gp14));
-//                //Fcpy = -3.0/18.0*(gp3-gp4+0.5*(gp7-gp8-gp9+gp10+gp15-gp16+gp17-gp18));
-//                //Fcpz = -3.0/18.0*(gp5-gp6+0.5*(gp11-gp12-gp13+gp14+gp15-gp16-gp17+gp18));
-//                Fcpx = -3.0/18.0*(m1-m2+0.5*(m7-m8+m9-m10+m11-m12+m13-m14));
-//                Fcpy = -3.0/18.0*(m3-m4+0.5*(m7-m8-m9+m10+m15-m16+m17-m18));
-//                Fcpz = -3.0/18.0*(m5-m6+0.5*(m11-m12-m13+m14+m15-m16-m17+m18));
-//                Fcpx *= alpha*W/sqrt(perm); 
-//                Fcpy *= alpha*W/sqrt(perm); 
-//                Fcpz *= alpha*W/sqrt(perm); 
-//                double Fcp_mag_temp = sqrt(Fcpx*Fcpx+Fcpy*Fcpy+Fcpz*Fcpz);
-//                double Fcp_mag = Fcp_mag_temp;
-//                if (Fcp_mag_temp==0.0) Fcp_mag=1.0;
-//                nx = Fcpx/Fcp_mag;
-//                ny = Fcpy/Fcp_mag;
-//                nz = Fcpz/Fcp_mag;
-//            }
-            Fcpx = nx; 
-            Fcpy = ny; 
-            Fcpz = nz; 
-            double Fcp_mag=sqrt(Fcpx*Fcpx+Fcpy*Fcpy+Fcpz*Fcpz);
-            if (Fcp_mag==0.0); Fcpx=Fcpy=Fcpz=0.0;
-            //NOTE for open node (porosity=1.0),Fcp=0.0
-            Fcpx *= alpha*W*(1.0-porosity)/sqrt(perm); 
-            Fcpy *= alpha*W*(1.0-porosity)/sqrt(perm); 
-            Fcpz *= alpha*W*(1.0-porosity)/sqrt(perm); 
+	//			Fcpx = 0.0;
+	//			Fcpy = 0.0;
+	//			Fcpz = 0.0;
+	//            if (porosity!=1.0){
+	//                //Fcpx = -3.0/18.0*(gp1-gp2+0.5*(gp7-gp8+gp9-gp10+gp11-gp12+gp13-gp14));
+	//                //Fcpy = -3.0/18.0*(gp3-gp4+0.5*(gp7-gp8-gp9+gp10+gp15-gp16+gp17-gp18));
+	//                //Fcpz = -3.0/18.0*(gp5-gp6+0.5*(gp11-gp12-gp13+gp14+gp15-gp16-gp17+gp18));
+	//                Fcpx = -3.0/18.0*(m1-m2+0.5*(m7-m8+m9-m10+m11-m12+m13-m14));
+	//                Fcpy = -3.0/18.0*(m3-m4+0.5*(m7-m8-m9+m10+m15-m16+m17-m18));
+	//                Fcpz = -3.0/18.0*(m5-m6+0.5*(m11-m12-m13+m14+m15-m16-m17+m18));
+	//                Fcpx *= alpha*W/sqrt(perm); 
+	//                Fcpy *= alpha*W/sqrt(perm); 
+	//                Fcpz *= alpha*W/sqrt(perm); 
+	//                double Fcp_mag_temp = sqrt(Fcpx*Fcpx+Fcpy*Fcpy+Fcpz*Fcpz);
+	//                double Fcp_mag = Fcp_mag_temp;
+	//                if (Fcp_mag_temp==0.0) Fcp_mag=1.0;
+	//                nx = Fcpx/Fcp_mag;
+	//                ny = Fcpy/Fcp_mag;
+	//                nz = Fcpz/Fcp_mag;
+	//            }
+	        Fcpx = nx; 
+	        Fcpy = ny; 
+	        Fcpz = nz; 
+	        double Fcp_mag=sqrt(Fcpx*Fcpx+Fcpy*Fcpy+Fcpz*Fcpz);
+	        if (Fcp_mag==0.0) Fcpx=Fcpy=Fcpz=0.0;
+	        //NOTE for open node (porosity=1.0),Fcp=0.0
+	        Fcpx *= alpha*W*(1.0-porosity)/sqrt(perm); 
+	        Fcpy *= alpha*W*(1.0-porosity)/sqrt(perm); 
+	        Fcpz *= alpha*W*(1.0-porosity)/sqrt(perm); 

 			//...........Normalize the Color Gradient.................................
 			C = sqrt(nx*nx+ny*ny+nz*nz);
@ -2680,98 +2742,98 @@ __global__  void dvc_ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dis
 			m17 -= fq;
 			m18 -= fq;

-            // Compute greyscale related parameters
-            ux = (jx/rho0+0.5*porosity*Gx+0.5*Fcpx/rho0)/(1.0+0.5*porosity*mu_eff/perm);
-            uy = (jy/rho0+0.5*porosity*Gy+0.5*Fcpy/rho0)/(1.0+0.5*porosity*mu_eff/perm);
-            uz = (jz/rho0+0.5*porosity*Gz+0.5*Fcpz/rho0)/(1.0+0.5*porosity*mu_eff/perm);
-            if (porosity==1.0){//i.e. open nodes
-                ux = (jx/rho0+0.5*porosity*Gx);
-                uy = (jy/rho0+0.5*porosity*Gy);
-                uz = (jz/rho0+0.5*porosity*Gz);
-            }
+	        // Compute greyscale related parameters
+	        ux = (jx/rho0+0.5*porosity*Gx+0.5*Fcpx/rho0)/(1.0+0.5*porosity*mu_eff/perm);
+	        uy = (jy/rho0+0.5*porosity*Gy+0.5*Fcpy/rho0)/(1.0+0.5*porosity*mu_eff/perm);
+	        uz = (jz/rho0+0.5*porosity*Gz+0.5*Fcpz/rho0)/(1.0+0.5*porosity*mu_eff/perm);
+	        if (porosity==1.0){//i.e. open nodes
+	            ux = (jx/rho0+0.5*porosity*Gx);
+	            uy = (jy/rho0+0.5*porosity*Gy);
+	            uz = (jz/rho0+0.5*porosity*Gz);
+	        }

-            //Update the total force to include linear (Darcy) and nonlinear (Forchheimer) drags due to the porous medium
-            Fx = rho0*(-porosity*mu_eff/perm*ux + porosity*Gx)+Fcpx;
-            Fy = rho0*(-porosity*mu_eff/perm*uy + porosity*Gy)+Fcpy;
-            Fz = rho0*(-porosity*mu_eff/perm*uz + porosity*Gz)+Fcpz;
-            if (porosity==1.0){
-                Fx=rho0*(porosity*Gx);
-                Fy=rho0*(porosity*Gy);
-                Fz=rho0*(porosity*Gz);
-            }
+	        //Update the total force to include linear (Darcy) and nonlinear (Forchheimer) drags due to the porous medium
+	        Fx = rho0*(-porosity*mu_eff/perm*ux + porosity*Gx)+Fcpx;
+	        Fy = rho0*(-porosity*mu_eff/perm*uy + porosity*Gy)+Fcpy;
+	        Fz = rho0*(-porosity*mu_eff/perm*uz + porosity*Gz)+Fcpz;
+	        if (porosity==1.0){
+	            Fx=rho0*(porosity*Gx);
+	            Fy=rho0*(porosity*Gy);
+	            Fz=rho0*(porosity*Gz);
+	        }

 			// write the velocity 
 			Velocity[n] = ux;
 			Velocity[Np+n] = uy;
 			Velocity[2*Np+n] = uz;
-            //Pressure[n] = rho/3.f/porosity;
-            Pressure[n] = rho/3.f;
+	        //Pressure[n] = rho/3.f/porosity;
+	        Pressure[n] = rho/3.f;

 			//........................................................................
 			//..............carry out relaxation process..............................
 			//..........Toelke, Fruediger et. al. 2006................................
-            //---------------- NO higher-order force -------------------------------//
+	        //---------------- NO higher-order force -------------------------------//
 			if (C == 0.0)	nx = ny = nz = 0.0;
 			m1 = m1 + rlx_setA*((19*(ux*ux+uy*uy+uz*uz)*rho0/porosity - 11*rho) -19*alpha*C - m1);
 			m2 = m2 + rlx_setA*((3*rho - 5.5*(ux*ux+uy*uy+uz*uz)*rho0/porosity)- m2);
-            jx = jx + Fx;
+	        jx = jx + Fx;
 			m4 = m4 + rlx_setB*((-0.6666666666666666*ux*rho0)- m4)
-                    + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx);
-            jy = jy + Fy;
+	                + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx);
+	        jy = jy + Fy;
 			m6 = m6 + rlx_setB*((-0.6666666666666666*uy*rho0)- m6)
-                    + (1-0.5*rlx_setB)*(-0.6666666666666666*Fy);
-            jz = jz + Fz;
+	                + (1-0.5*rlx_setB)*(-0.6666666666666666*Fy);
+	        jz = jz + Fz;
 			m8 = m8 + rlx_setB*((-0.6666666666666666*uz*rho0)- m8)
-                    + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz);
+	                + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz);
 			m9 = m9 + rlx_setA*(((2*ux*ux-uy*uy-uz*uz)*rho0/porosity) + 0.5*alpha*C*(2*nx*nx-ny*ny-nz*nz) - m9);
 			m10 = m10 + rlx_setA*( - m10);
-            //m10 = m10 + rlx_setA*(-0.5*rho0*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10);
+	        //m10 = m10 + rlx_setA*(-0.5*rho0*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10);
 			m11 = m11 + rlx_setA*(((uy*uy-uz*uz)*rho0/porosity) + 0.5*alpha*C*(ny*ny-nz*nz)- m11);
 			m12 = m12 + rlx_setA*( - m12);
-            //m12 = m12 + rlx_setA*(-0.5*(rho0*(uy*uy-uz*uz)/porosity)- m12);
+	        //m12 = m12 + rlx_setA*(-0.5*(rho0*(uy*uy-uz*uz)/porosity)- m12);
 			m13 = m13 + rlx_setA*( (ux*uy*rho0/porosity) + 0.5*alpha*C*nx*ny - m13);
 			m14 = m14 + rlx_setA*( (uy*uz*rho0/porosity) + 0.5*alpha*C*ny*nz - m14);
 			m15 = m15 + rlx_setA*( (ux*uz*rho0/porosity) + 0.5*alpha*C*nx*nz - m15);
 			m16 = m16 + rlx_setB*( - m16);
 			m17 = m17 + rlx_setB*( - m17);
 			m18 = m18 + rlx_setB*( - m18);
-            //----------------------------------------------------------------------//
+	        //----------------------------------------------------------------------//

-            //----------------With higher-order force ------------------------------//
+	        //----------------With higher-order force ------------------------------//
 			//if (C == 0.0)	nx = ny = nz = 0.0;
 			//m1 = m1 + rlx_setA*((19*(ux*ux+uy*uy+uz*uz)*rho0/porosity - 11*rho) -19*alpha*C - m1)
-            //        + (1-0.5*rlx_setA)*38*(Fx*ux+Fy*uy+Fz*uz)/porosity;
+	        //        + (1-0.5*rlx_setA)*38*(Fx*ux+Fy*uy+Fz*uz)/porosity;
 			//m2 = m2 + rlx_setA*((3*rho - 5.5*(ux*ux+uy*uy+uz*uz)*rho0/porosity)- m2)
-            //        + (1-0.5*rlx_setA)*11*(-Fx*ux-Fy*uy-Fz*uz)/porosity;
-            //jx = jx + Fx;
+	        //        + (1-0.5*rlx_setA)*11*(-Fx*ux-Fy*uy-Fz*uz)/porosity;
+	        //jx = jx + Fx;
 			//m4 = m4 + rlx_setB*((-0.6666666666666666*ux*rho0)- m4)
-            //        + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx);
-            //jy = jy + Fy;
+	        //        + (1-0.5*rlx_setB)*(-0.6666666666666666*Fx);
+	        //jy = jy + Fy;
 			//m6 = m6 + rlx_setB*((-0.6666666666666666*uy*rho0)- m6)
-            //        + (1-0.5*rlx_setB)*(-0.6666666666666666*Fy);
-            //jz = jz + Fz;
+	        //        + (1-0.5*rlx_setB)*(-0.6666666666666666*Fy);
+	        //jz = jz + Fz;
 			//m8 = m8 + rlx_setB*((-0.6666666666666666*uz*rho0)- m8)
-            //        + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz);
+	        //        + (1-0.5*rlx_setB)*(-0.6666666666666666*Fz);
 			//m9 = m9 + rlx_setA*(((2*ux*ux-uy*uy-uz*uz)*rho0/porosity) + 0.5*alpha*C*(2*nx*nx-ny*ny-nz*nz) - m9)
-            //        + (1-0.5*rlx_setA)*(4*Fx*ux-2*Fy*uy-2*Fz*uz)/porosity;
+	        //        + (1-0.5*rlx_setA)*(4*Fx*ux-2*Fy*uy-2*Fz*uz)/porosity;
 			////m10 = m10 + rlx_setA*( - m10);
-            //m10 = m10 + rlx_setA*(-0.5*rho0*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10)
-            //          + (1-0.5*rlx_setA)*(-2*Fx*ux+Fy*uy+Fz*uz)/porosity;
+	        //m10 = m10 + rlx_setA*(-0.5*rho0*((2*ux*ux-uy*uy-uz*uz)/porosity)- m10)
+	        //          + (1-0.5*rlx_setA)*(-2*Fx*ux+Fy*uy+Fz*uz)/porosity;
 			//m11 = m11 + rlx_setA*(((uy*uy-uz*uz)*rho0/porosity) + 0.5*alpha*C*(ny*ny-nz*nz)- m11)
-            //          + (1-0.5*rlx_setA)*(2*Fy*uy-2*Fz*uz)/porosity;
+	        //          + (1-0.5*rlx_setA)*(2*Fy*uy-2*Fz*uz)/porosity;
 			////m12 = m12 + rlx_setA*( - m12);
-            //m12 = m12 + rlx_setA*(-0.5*(rho0*(uy*uy-uz*uz)/porosity)- m12)
-            //          + (1-0.5*rlx_setA)*(-Fy*uy+Fz*uz)/porosity;
+	        //m12 = m12 + rlx_setA*(-0.5*(rho0*(uy*uy-uz*uz)/porosity)- m12)
+	        //          + (1-0.5*rlx_setA)*(-Fy*uy+Fz*uz)/porosity;
 			//m13 = m13 + rlx_setA*( (ux*uy*rho0/porosity) + 0.5*alpha*C*nx*ny - m13);
-            //          + (1-0.5*rlx_setA)*(Fy*ux+Fx*uy)/porosity;
+	        //          + (1-0.5*rlx_setA)*(Fy*ux+Fx*uy)/porosity;
 			//m14 = m14 + rlx_setA*( (uy*uz*rho0/porosity) + 0.5*alpha*C*ny*nz - m14);
-            //          + (1-0.5*rlx_setA)*(Fz*uy+Fy*uz)/porosity;
+	        //          + (1-0.5*rlx_setA)*(Fz*uy+Fy*uz)/porosity;
 			//m15 = m15 + rlx_setA*( (ux*uz*rho0/porosity) + 0.5*alpha*C*nx*nz - m15);
-            //          + (1-0.5*rlx_setA)*(Fz*ux+Fx*uz)/porosity;
+	        //          + (1-0.5*rlx_setA)*(Fz*ux+Fx*uz)/porosity;
 			//m16 = m16 + rlx_setB*( - m16);
 			//m17 = m17 + rlx_setB*( - m17);
 			//m18 = m18 + rlx_setB*( - m18);
-            //----------------------------------------------------------------------//
+	        //----------------------------------------------------------------------//

 			//.................inverse transformation......................................................
 			// q=0
@ -2882,21 +2944,27 @@ __global__  void dvc_ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dis
 			nAB = 1.0/(nA+nB);
 			Aq[n] = 0.3333333333333333*nA;
 			Bq[n] = 0.3333333333333333*nB;
-
+			
 			//...............................................
 			// q = 0,2,4
 			// Cq = {1,0,0}, {0,1,0}, {0,0,1}
+			jA = nA*ux;
+			jB = nB*ux;		
 			delta = beta*nA*nB*nAB*0.1111111111111111*nx;
 			if (!(nA*nB*nAB>0)) delta=0;
-            //----------------newly added for better control of recoloring---------------//
-            if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0) delta = 0.0; 
-            if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
-            //---------------------------------------------------------------------------//
-            if (RecoloringOff==true && porosity !=1.0) delta=0;
-			a1 = nA*(0.1111111111111111*(1+4.5*ux))+delta;
-			b1 = nB*(0.1111111111111111*(1+4.5*ux))-delta;
-			a2 = nA*(0.1111111111111111*(1-4.5*ux))-delta;
-			b2 = nB*(0.1111111111111111*(1-4.5*ux))+delta;
+	        //----------------newly added for better control of recoloring---------------//
+	        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){
+	        	delta = 0.0; 
+	    		jA = 0.5*ux*(nA+nB)*(1.0+mobility_ratio);
+	    		jB = 0.5*ux*(nA+nB)*(1.0-mobility_ratio);
+	        }
+	        if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
+	        //---------------------------------------------------------------------------//
+	        if (RecoloringOff==true && porosity !=1.0) delta=0;
+			a1 = (0.1111111111111111*(nA+4.5*jA))+delta;
+			b1 = (0.1111111111111111*(nB+4.5*jB))-delta;
+			a2 = (0.1111111111111111*(nA-4.5*jA))-delta;
+			b2 = (0.1111111111111111*(nB-4.5*jB))+delta;

 			Aq[1*Np+n] = a1;
 			Bq[1*Np+n] = b1;
@ -2904,45 +2972,57 @@ __global__  void dvc_ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dis
 			Bq[2*Np+n] = b2;

 			//...............................................
-			// q = 2
 			// Cq = {0,1,0}
+			jA = nA*uy;
+			jB = nB*uy;		
 			delta = beta*nA*nB*nAB*0.1111111111111111*ny;
 			if (!(nA*nB*nAB>0)) delta=0;
-            //----------------newly added for better control of recoloring---------------//
-            if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0) delta = 0.0; 
-            if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
-            //---------------------------------------------------------------------------//
-            if (RecoloringOff==true && porosity !=1.0) delta=0;
-			a1 = nA*(0.1111111111111111*(1+4.5*uy))+delta;
-			b1 = nB*(0.1111111111111111*(1+4.5*uy))-delta;
-			a2 = nA*(0.1111111111111111*(1-4.5*uy))-delta;
-			b2 = nB*(0.1111111111111111*(1-4.5*uy))+delta;
+	        //----------------newly added for better control of recoloring---------------//
+	        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){
+	        	delta = 0.0; 
+	    		jA = 0.5*uy*(nA+nB)*(1.0+mobility_ratio);
+	    		jB = 0.5*uy*(nA+nB)*(1.0-mobility_ratio);
+	        }
+	        if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
+	        //---------------------------------------------------------------------------//
+	        if (RecoloringOff==true && porosity !=1.0) delta=0;
+			a1 = (0.1111111111111111*(nA+4.5*jA))+delta;
+			b1 = (0.1111111111111111*(nB+4.5*jB))-delta;
+			a2 = (0.1111111111111111*(nA-4.5*jA))-delta;
+			b2 = (0.1111111111111111*(nB-4.5*jB))+delta;

 			Aq[3*Np+n] = a1;
 			Bq[3*Np+n] = b1;
 			Aq[4*Np+n] = a2;
 			Bq[4*Np+n] = b2;
+
 			//...............................................
 			// q = 4
 			// Cq = {0,0,1}
+			jA = nA*uz;
+			jB = nB*uz;		
 			delta = beta*nA*nB*nAB*0.1111111111111111*nz;
 			if (!(nA*nB*nAB>0)) delta=0;
-            //----------------newly added for better control of recoloring---------------//
-            if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0) delta = 0.0; 
-            if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
-            //---------------------------------------------------------------------------//
-            if (RecoloringOff==true && porosity !=1.0) delta=0;
-			a1 = nA*(0.1111111111111111*(1+4.5*uz))+delta;
-			b1 = nB*(0.1111111111111111*(1+4.5*uz))-delta;
-			a2 = nA*(0.1111111111111111*(1-4.5*uz))-delta;
-			b2 = nB*(0.1111111111111111*(1-4.5*uz))+delta;
+	        //----------------newly added for better control of recoloring---------------//
+	        if (nA/(nA+nB)>=Sn_grey && nA/(nA+nB) <= Sw_grey && porosity !=1.0){
+	        	delta = 0.0; 
+	    		jA = 0.5*uz*(nA+nB)*(1.0+mobility_ratio);
+	    		jB = 0.5*uz*(nA+nB)*(1.0-mobility_ratio);
+	        }
+	        if (nA/(nA+nB)>Sw_grey && porosity !=1.0) delta = -1.0*delta; 
+	        //---------------------------------------------------------------------------//
+	        if (RecoloringOff==true && porosity !=1.0) delta=0;
+
+			a1 = (0.1111111111111111*(nA+4.5*jA))+delta;
+			b1 = (0.1111111111111111*(nB+4.5*jB))-delta;
+			a2 = (0.1111111111111111*(nA-4.5*jA))-delta;
+			b2 = (0.1111111111111111*(nB-4.5*jB))+delta;

 			Aq[5*Np+n] = a1;
 			Bq[5*Np+n] = b1;
 			Aq[6*Np+n] = a2;
 			Bq[6*Np+n] = b2;
 			//...............................................
-
 		}
 	}
 }
@ -4530,11 +4610,11 @@ extern "C" void ScaLBL_PhaseField_InitFromRestart(double *Den, double *Aq, doubl

 //Model-1 & 4 with capillary pressure penalty
 extern "C" void ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dist, double *Aq, double *Bq, double *Den, 
-        double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *Poros,double *Perm,double *Vel, double *Pressure,
+        double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *GreyKn, double *GreyKw,  double *Poros,double *Perm,double *Vel, double *Pressure,
        double rhoA, double rhoB, double tauA, double tauB,double tauA_eff,double tauB_eff, double alpha, double beta,
 		double Fx, double Fy, double Fz, bool RecoloringOff, int strideY, int strideZ, int start, int finish, int Np){

-	dvc_ScaLBL_D3Q19_AAeven_GreyscaleColor_CP<<<NBLOCKS,NTHREADS >>>(Map, dist, Aq, Bq, Den, Phi, GreySolidW, GreySn, GreySw, Poros, Perm, Vel, Pressure,
+	dvc_ScaLBL_D3Q19_AAeven_GreyscaleColor_CP<<<NBLOCKS,NTHREADS >>>(Map, dist, Aq, Bq, Den, Phi, GreySolidW, GreySn, GreySw, GreyKn, GreyKw, Poros, Perm, Vel, Pressure,
            rhoA, rhoB, tauA, tauB, tauA_eff, tauB_eff, alpha, beta, Fx, Fy, Fz, RecoloringOff, strideY, strideZ, start, finish, Np);
 	
 	cudaError_t err = cudaGetLastError();
@ -4546,11 +4626,11 @@ extern "C" void ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(int *Map, double *dist, do

 //Model-1 & 4 with capillary pressure penalty
 extern "C" void ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(int *d_neighborList, int *Map, double *dist, double *Aq, double *Bq, double *Den, 
-		double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *Poros,double *Perm,double *Vel,double *Pressure, 
+		double *Phi, double *GreySolidW, double *GreySn, double *GreySw, double *GreyKn, double *GreyKw, double *Poros,double *Perm,double *Vel,double *Pressure, 
        double rhoA, double rhoB, double tauA, double tauB, double tauA_eff,double tauB_eff, double alpha, double beta,
 		double Fx, double Fy, double Fz, bool RecoloringOff, int strideY, int strideZ, int start, int finish, int Np){

-	dvc_ScaLBL_D3Q19_AAodd_GreyscaleColor_CP<<<NBLOCKS,NTHREADS >>>(d_neighborList, Map, dist, Aq, Bq, Den, Phi, GreySolidW, GreySn, GreySw, Poros, Perm,Vel,Pressure,
+	dvc_ScaLBL_D3Q19_AAodd_GreyscaleColor_CP<<<NBLOCKS,NTHREADS >>>(d_neighborList, Map, dist, Aq, Bq, Den, Phi, GreySolidW, GreySn, GreySw, GreyKn, GreyKw, Poros, Perm,Vel,Pressure,
 			rhoA, rhoB, tauA, tauB, tauA_eff, tauB_eff,alpha, beta, Fx, Fy, Fz, RecoloringOff, strideY, strideZ, start, finish, Np);

 	cudaError_t err = cudaGetLastError();
--- a/docs/source/userGuide/models/color/index.rst
+++ b/docs/source/userGuide/models/color/index.rst
@ -54,12 +54,12 @@ Model parameters

 The essential model parameters for the color model are

- ``alpha`` -- control the interfacial tension between fluids with :math:`0 < \alpha < 0.01`
- ``beta`` -- control the width of the interface with key :math:`\beta < 1`
- ``tauA`` -- control the viscosity of fluid A with :math:`0.7 < \tau_A < 1.5`
- ``tauB`` -- control the viscosity of fluid B with :math:`0.7 < \tau_B < 1.5`
- ``rhoA`` -- control the viscosity of fluid A with :math:`0.05 < \rho_A < 1.0`
- ``rhoB`` -- control the viscosity of fluid B with :math:`0.05 < \rho_B < 1.0`
+- ``alpha`` -- control the interfacial tension between fluids -- :math:`0 < \alpha < 0.01`
+- ``beta`` -- control the width of the interface -- :math:`\beta < 1`
+- ``tauA`` -- control the viscosity of fluid A -- :math:`0.7 < \tau_A < 1.5`
+- ``tauB`` -- control the viscosity of fluid B -- :math:`0.7 < \tau_B < 1.5`
+- ``rhoA`` -- control the density of fluid A -- :math:`0.05 < \rho_A < 1.0`
+- ``rhoB`` -- control the density of fluid B -- :math:`0.05 < \rho_B < 1.0`

 ****************************
 Model Formulation
--- a/hip/FreeLee.cu
+++ b/hip/FreeLee.cu
@ -3396,7 +3396,6 @@ extern "C" void ScaLBL_FreeLeeModel_PhaseField_Init(int *Map, double *Phi, doubl
 extern "C" void ScaLBL_D3Q7_AAodd_FreeLee_PhaseField(int *neighborList, int *Map, double *hq, double *Den, double *Phi, double *ColorGrad, double *Vel,
                                                          double rhoA, double rhoB, double tauM, double W, int start, int finish, int Np)
 {
-
 	hipFuncSetCacheConfig((void*)dvc_ScaLBL_D3Q7_AAodd_FreeLee_PhaseField, hipFuncCachePreferL1);
 	dvc_ScaLBL_D3Q7_AAodd_FreeLee_PhaseField<<<NBLOCKS,NTHREADS >>>(neighborList, Map, hq, Den, Phi, ColorGrad, Vel,
             rhoA,  rhoB, tauM, W, start, finish,  Np);
@ -3410,7 +3409,6 @@ extern "C" void ScaLBL_D3Q7_AAeven_FreeLee_PhaseField( int *Map, double *hq, dou
 		double rhoA, double rhoB, double tauM, double W, int start, int finish, int Np){

 	hipFuncSetCacheConfig((void*)dvc_ScaLBL_D3Q7_AAeven_FreeLee_PhaseField, hipFuncCachePreferL1);
-
 	dvc_ScaLBL_D3Q7_AAeven_FreeLee_PhaseField<<<NBLOCKS,NTHREADS >>>( Map, hq, Den, Phi, ColorGrad, Vel, rhoA, rhoB, tauM, W, start, finish, Np);
 	hipError_t err = hipGetLastError();
 	if (hipSuccess != err){
@ -3517,7 +3515,6 @@ extern "C" void ScaLBL_D3Q19_AAodd_FreeLeeModel_SingleFluid_BGK(int *neighborLis
 extern "C" void ScaLBL_D3Q19_AAeven_FreeLeeModel_SingleFluid_BGK(double *dist, double *Vel, double *Pressure, 
                                                                 double tau, double rho0, double Fx, double Fy, double Fz, int start, int finish, int Np){

-
 	hipFuncSetCacheConfig((void*)dvc_ScaLBL_D3Q19_AAeven_FreeLeeModel_SingleFluid_BGK, hipFuncCachePreferL1);
 	dvc_ScaLBL_D3Q19_AAeven_FreeLeeModel_SingleFluid_BGK<<<NBLOCKS,NTHREADS >>>(dist, Vel, Pressure, 
            tau, rho0,  Fx, Fy, Fz, start, finish, Np);
--- a/models/ColorModel.cpp
+++ b/models/ColorModel.cpp
@ -214,6 +214,7 @@ void ScaLBL_ColorModel::ReadParams(string filename){
 		domain_db->putScalar<int>( "BC", BoundaryCondition );
 	} 
 	else if (protocol == "core flooding"){
+	        if (rank == 0) printf("Using core flooding protocol \n");
 		if (BoundaryCondition != 4){
 			BoundaryCondition = 4;
 			if (rank==0) printf("WARNING: protocol (core flooding) supports only volumetric flux boundary condition \n");
@ -221,11 +222,12 @@ void ScaLBL_ColorModel::ReadParams(string filename){
 		domain_db->putScalar<int>( "BC", BoundaryCondition );
 		if (color_db->keyExists( "capillary_number" )){
 			double capillary_number = color_db->getScalar<double>( "capillary_number" );
+			if (rank==0) printf("   set flux to achieve Ca=%f \n", capillary_number);
 			double MuB = rhoB*(tauB - 0.5)/3.0;
 			double IFT = 6.0*alpha;
-			double CrossSectionalArea = (double) (nprocx*(Nx-2)*nprocy*(Ny-2));
-			flux = Dm->Porosity()*CrossSectionalArea*IFT*capillary_number/MuB;
-			if (rank==0) printf("  protocol (core flooding): set flux=%f to achieve Ca=%f \n",flux, capillary_number);
+			//double CrossSectionalArea = (double) (nprocx*(Nx-2)*nprocy*(Ny-2));
+			flux = Dm->Porosity()*nprocx*(Nx-2)*nprocy*(Ny-2)*IFT*capillary_number/MuB;
+			if (rank==0) printf("  flux=%f \n",flux);
 		}
 		color_db->putScalar<double>( "flux", flux );
 	} 
@ -627,7 +629,6 @@ double ScaLBL_ColorModel::Run(int returntime){
 	runAnalysis analysis( current_db, rank_info, ScaLBL_Comm, Dm, Np, Regular, Map );
 	auto t1 = std::chrono::system_clock::now();
 	int CURRENT_TIMESTEP = 0;
-	int START_TIMESTEP = timestep;
 	int EXIT_TIMESTEP = min(timestepMax,returntime);
 	while (timestep < EXIT_TIMESTEP ) {
 		//if ( rank==0 ) { printf("Running timestep %i (%i MB)\n",timestep+1,(int)(Utilities::getMemoryUsage()/1048576)); }
@ -896,7 +897,7 @@ double ScaLBL_ColorModel::Run(int returntime){
 	//************************************************************************
 	// Compute the walltime per timestep
 	auto t2 = std::chrono::system_clock::now();
-	double cputime = std::chrono::duration<double>( t2 - t1 ).count() / (timestep - START_TIMESTEP);
+	double cputime = std::chrono::duration<double>( t2 - t1 ).count() / CURRENT_TIMESTEP;
 	// Performance obtained from each node
 	double MLUPS = double(Np)/cputime/1000000;

--- a/models/GreyscaleColorModel.cpp
+++ b/models/GreyscaleColorModel.cpp
@ -246,8 +246,10 @@ void ScaLBL_GreyscaleColorModel::AssignComponentLabels()
 		ERROR("Error: ComponentLabels and ComponentAffinity must be the same length! \n");
 	}

-	double label_count[NLABELS];
-	double label_count_global[NLABELS];
+	std::vector<double> label_count_owner(NLABELS, 0.0);        // RAII storage for the per-label counters (no delete[] needed)
+	std::vector<double> label_count_global_owner(NLABELS, 0.0); // zero-initialised, released automatically at scope exit
+	double *label_count = label_count_owner.data();              // raw views keep the code below unchanged
+	double *label_count_global = label_count_global_owner.data();
 	// Assign the labels

 	for (size_t idx=0; idx<NLABELS; idx++) label_count[idx]=0;
@ -258,7 +260,7 @@ void ScaLBL_GreyscaleColorModel::AssignComponentLabels()
 				int n = k*Nx*Ny+j*Nx+i;
 				VALUE=id[n];
 				// Assign the affinity from the paired list
-				for (unsigned int idx=0; idx < NLABELS; idx++){
+				for (size_t idx=0; idx < NLABELS; idx++){
 				      //printf("idx=%i, value=%i, %i, \n",idx, VALUE,LabelList[idx]);
 					if (VALUE == LabelList[idx]){
 						AFFINITY=AffinityList[idx];
@ -311,23 +313,31 @@ void ScaLBL_GreyscaleColorModel::AssignGreySolidLabels()//apply capillary penalt
 	double *GreySolidW_host  = new double [Np];
 	double *GreySn_host      = new double [Np];
 	double *GreySw_host      = new double [Np];
-
+	double *GreyKn_host      = new double [Np];
+	double *GreyKw_host      = new double [Np];
+	
 	size_t NLABELS=0;
 	signed char VALUE=0;
 	double AFFINITY=0.f;
    double Sn,Sw;//end-point saturation of greynodes set by users
+    double Kn,Kw; // endpoint effective permeability

 	auto LabelList = greyscaleColor_db->getVector<int>( "GreySolidLabels" );
 	auto AffinityList = greyscaleColor_db->getVector<double>( "GreySolidAffinity" );
-	auto SnList = greyscaleColor_db->getVector<double>( "GreySnList" );
-	auto SwList = greyscaleColor_db->getVector<double>( "GreySwList" );
-
+	auto SnList = greyscaleColor_db->getVector<double>( "grey_endpoint_A" );
+	auto SwList = greyscaleColor_db->getVector<double>( "grey_endpoint_B" );
+	auto KnList = greyscaleColor_db->getVector<double>( "grey_endpoint_permeability_A" );
+	auto KwList = greyscaleColor_db->getVector<double>( "grey_endpoint_permeability_B" );
+	
 	NLABELS=LabelList.size();
 	if (NLABELS != AffinityList.size()){
 		ERROR("Error: GreySolidLabels and GreySolidAffinity must be the same length! \n");
 	}
 	if (NLABELS != SnList.size() || NLABELS != SwList.size()){
-		ERROR("Error: GreySolidLabels, GreySnList, and GreySwList must be the same length! \n");
+		ERROR("Error: GreySolidLabels, grey_endpoint_A, and grey_endpoint_B must be the same length! \n");
+	}
+	if (NLABELS != KnList.size() || NLABELS != KwList.size()){
+		ERROR("Error: GreySolidLabels, grey_endpoint_permeability_A, and grey_endpoint_permeability_B must be the same length! \n");
 	}

 	for (int k=0;k<Nz;k++){
@ -338,12 +348,16 @@ void ScaLBL_GreyscaleColorModel::AssignGreySolidLabels()//apply capillary penalt
 	            AFFINITY=0.f;//all nodes except the specified grey nodes have grey-solid affinity = 0.0
                Sn=99.0;
                Sw=-99.0;
+                Kn = 0.0;
+                Kw = 0.0;
 				// Assign the affinity from the paired list
 				for (unsigned int idx=0; idx < NLABELS; idx++){
 					if (VALUE == LabelList[idx]){
 						AFFINITY=AffinityList[idx];
                         Sn = SnList[idx];
                         Sw = SwList[idx];
+                        Kn = KnList[idx]; // endpoint permeability is read from grey_endpoint_permeability_A
+                        Kw = KwList[idx]; // endpoint permeability is read from grey_endpoint_permeability_B
 						idx = NLABELS;
 					}
 				}
@ -352,6 +366,8 @@ void ScaLBL_GreyscaleColorModel::AssignGreySolidLabels()//apply capillary penalt
                    GreySolidW_host[idx] = AFFINITY;
                    GreySn_host[idx]     = Sn;
                    GreySw_host[idx]     = Sw;
+                    GreyKn_host[idx]     = Kn;
+                    GreyKw_host[idx]     = Kw;
                }
 			}
 		}
@ -374,6 +390,8 @@ void ScaLBL_GreyscaleColorModel::AssignGreySolidLabels()//apply capillary penalt
 	ScaLBL_CopyToDevice(GreySolidW, GreySolidW_host, Np*sizeof(double));
 	ScaLBL_CopyToDevice(GreySn, GreySn_host, Np*sizeof(double));
 	ScaLBL_CopyToDevice(GreySw, GreySw_host, Np*sizeof(double));
+	ScaLBL_CopyToDevice(GreyKn, GreyKn_host, Np*sizeof(double)); // upload the permeability buffer, not the saturation one
+	ScaLBL_CopyToDevice(GreyKw, GreyKw_host, Np*sizeof(double)); // NOTE(review): GreyKn_host/GreyKw_host also need a delete[] next to the other host buffers
 	ScaLBL_Comm->Barrier();
    delete [] GreySolidW_host;
    delete [] GreySn_host;
@ -402,11 +420,13 @@ void ScaLBL_GreyscaleColorModel::AssignGreyPoroPermLabels()
 		ERROR("Error: GreySolidLabels and PorosityList must be the same length! \n");
 	}

-	double label_count[NLABELS];
-	double label_count_global[NLABELS];
+	std::vector<double> label_count_owner(NLABELS, 0.0);        // RAII storage for the per-label counters (no delete[] needed)
+	std::vector<double> label_count_global_owner(NLABELS, 0.0); // zero-initialised, released automatically at scope exit
+	double *label_count = label_count_owner.data();              // raw views keep the code below unchanged
+	double *label_count_global = label_count_global_owner.data();
 	// Assign the labels

-	for (int idx=0; idx<NLABELS; idx++) label_count[idx]=0;
+	for (size_t idx=0; idx<NLABELS; idx++) label_count[idx]=0;

 	for (int k=0;k<Nz;k++){
 		for (int j=0;j<Ny;j++){
@ -415,7 +435,7 @@ void ScaLBL_GreyscaleColorModel::AssignGreyPoroPermLabels()
 				VALUE=id[n];
                POROSITY=1.f;//default: label 1 or 2, i.e. open nodes and porosity=1.0
 				// Assign the affinity from the paired list
-				for (unsigned int idx=0; idx < NLABELS; idx++){
+				for (size_t idx=0; idx < NLABELS; idx++){
 				      //printf("idx=%i, value=%i, %i, \n",idx, VALUE,LabelList[idx]);
 					if (VALUE == LabelList[idx]){
 						POROSITY=PorosityList[idx];
@ -472,7 +492,7 @@ void ScaLBL_GreyscaleColorModel::AssignGreyPoroPermLabels()
 	// Set Dm to match Mask
 	for (int i=0; i<Nx*Ny*Nz; i++) Dm->id[i] = Mask->id[i]; 
 	
-	for (int idx=0; idx<NLABELS; idx++)		label_count_global[idx]=Dm->Comm.sumReduce(  label_count[idx]);
+	for (size_t idx=0; idx<NLABELS; idx++)		label_count_global[idx]=Dm->Comm.sumReduce(  label_count[idx]);

    //Initialize a weighted porosity after considering grey voxels
    GreyPorosity=0.0;
@ -620,7 +640,9 @@ void ScaLBL_GreyscaleColorModel::Create(){
    //ScaLBL_AllocateDeviceMemory((void **) &GreySolidGrad, 3*sizeof(double)*Np);		
    ScaLBL_AllocateDeviceMemory((void **) &GreySolidW, sizeof(double)*Np);		
    ScaLBL_AllocateDeviceMemory((void **) &GreySn, sizeof(double)*Np);		
-    ScaLBL_AllocateDeviceMemory((void **) &GreySw, sizeof(double)*Np);		
+    ScaLBL_AllocateDeviceMemory((void **) &GreySw, sizeof(double)*Np);	
+    ScaLBL_AllocateDeviceMemory((void **) &GreyKn, sizeof(double)*Np);		
+    ScaLBL_AllocateDeviceMemory((void **) &GreyKw, sizeof(double)*Np);	
    ScaLBL_AllocateDeviceMemory((void **) &Porosity_dvc, sizeof(double)*Np);
    ScaLBL_AllocateDeviceMemory((void **) &Permeability_dvc, sizeof(double)*Np);
 	//...........................................................................
@ -774,7 +796,6 @@ void ScaLBL_GreyscaleColorModel::Run(){
 	bool USE_MORPH = false;
 	bool USE_SEED = false;
 	bool USE_DIRECT = false;
-	bool USE_MORPHOPEN_OIL = false;
 	int MAX_MORPH_TIMESTEPS = 50000; // maximum number of LBM timesteps to spend in morphological adaptation routine
 	int MIN_STEADY_TIMESTEPS = 100000;
 	int MAX_STEADY_TIMESTEPS = 200000;
@ -915,7 +936,6 @@ void ScaLBL_GreyscaleColorModel::Run(){
 	//************ MAIN ITERATION LOOP ***************************************/
 	PROFILE_START("Loop");
    //std::shared_ptr<Database> analysis_db;
-	bool Regular = false;
 	auto current_db = db->cloneDatabase();
 	//runAnalysis analysis( current_db, rank_info, ScaLBL_Comm, Dm, Np, Regular, Map );
 	//analysis.createThreads( analysis_method, 4 );
@ -944,7 +964,7 @@ void ScaLBL_GreyscaleColorModel::Run(){
 		// Halo exchange for phase field
 		ScaLBL_Comm_Regular->SendHalo(Phi);
        //Model-1&4 with capillary pressure penalty for grey nodes
-        ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(NeighborList, dvcMap, fq, Aq, Bq, Den, Phi, GreySolidW,GreySn,GreySw,Porosity_dvc,Permeability_dvc,Velocity,Pressure,
+        ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(NeighborList, dvcMap, fq, Aq, Bq, Den, Phi, GreySolidW,GreySn,GreySw,GreyKn,GreyKw,Porosity_dvc,Permeability_dvc,Velocity,Pressure,
                rhoA, rhoB, tauA, tauB,tauA_eff, tauB_eff, 
                alpha, beta, Fx, Fy, Fz, RecoloringOff, Nx, Nx*Ny, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np);
        //Model-1&4
@ -973,7 +993,7 @@ void ScaLBL_GreyscaleColorModel::Run(){
 		}

        //Model-1&4 with capillary pressure penalty for grey nodes
-        ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(NeighborList, dvcMap, fq, Aq, Bq, Den, Phi, GreySolidW,GreySn,GreySw,Porosity_dvc,Permeability_dvc,Velocity,Pressure,
+        ScaLBL_D3Q19_AAodd_GreyscaleColor_CP(NeighborList, dvcMap, fq, Aq, Bq, Den, Phi, GreySolidW,GreySn,GreySw,GreyKn,GreyKw,Porosity_dvc,Permeability_dvc,Velocity,Pressure,
                rhoA, rhoB, tauA, tauB,tauA_eff, tauB_eff,
                alpha, beta, Fx, Fy, Fz, RecoloringOff, Nx, Nx*Ny, 0, ScaLBL_Comm->LastExterior(), Np);
        //Model-1&4
@ -1006,7 +1026,7 @@ void ScaLBL_GreyscaleColorModel::Run(){
 		}
 		ScaLBL_Comm_Regular->SendHalo(Phi);
        //Model-1&4 with capillary pressure penalty for grey nodes
-        ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(dvcMap, fq, Aq, Bq, Den, Phi, GreySolidW,GreySn,GreySw,Porosity_dvc,Permeability_dvc,Velocity,Pressure, 
+        ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(dvcMap, fq, Aq, Bq, Den, Phi, GreySolidW,GreySn,GreySw,GreyKn,GreyKw,Porosity_dvc,Permeability_dvc,Velocity,Pressure, 
                rhoA, rhoB, tauA, tauB,tauA_eff, tauB_eff,
                alpha, beta, Fx, Fy, Fz, RecoloringOff, Nx, Nx*Ny, ScaLBL_Comm->FirstInterior(), ScaLBL_Comm->LastInterior(), Np);
        //Model-1&4
@ -1035,7 +1055,7 @@ void ScaLBL_GreyscaleColorModel::Run(){
 		}

        //Model-1&4 with capillary pressure penalty for grey nodes
-        ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(dvcMap, fq, Aq, Bq, Den, Phi, GreySolidW,GreySn,GreySw,Porosity_dvc,Permeability_dvc,Velocity,Pressure,
+        ScaLBL_D3Q19_AAeven_GreyscaleColor_CP(dvcMap, fq, Aq, Bq, Den, Phi, GreySolidW,GreySn,GreySw,GreyKn,GreyKw,Porosity_dvc,Permeability_dvc,Velocity,Pressure,
                rhoA, rhoB, tauA, tauB,tauA_eff, tauB_eff,
                alpha, beta, Fx, Fy, Fz, RecoloringOff, Nx, Nx*Ny, 0, ScaLBL_Comm->LastExterior(), Np);
        //Model-1&4
@ -1117,14 +1137,7 @@ void ScaLBL_GreyscaleColorModel::Run(){
 			double muA = rhoA*(tauA-0.5)/3.f; 
 			double muB = rhoB*(tauB-0.5)/3.f;				
 			double force_mag = sqrt(Fx*Fx+Fy*Fy+Fz*Fz);
-			double dir_x = Fx/force_mag;
-			double dir_y = Fy/force_mag;
-			double dir_z = Fz/force_mag;
 			if (force_mag == 0.0){
-				// default to z direction
-				dir_x = 0.0;
-				dir_y = 0.0;
-				dir_z = 1.0;
 				force_mag = 1.0;
 			}
 			double current_saturation = Averages->saturation;
--- a/models/GreyscaleColorModel.h
+++ b/models/GreyscaleColorModel.h
@ -70,6 +70,8 @@ public:
    double *GreySolidW;
    double *GreySn;
    double *GreySw;
+    double *GreyKn;
+    double *GreyKw;
 	//double *ColorGrad;
 	double *Velocity;
 	double *Pressure;
--- a/models/GreyscaleModel.cpp
+++ b/models/GreyscaleModel.cpp
@ -457,16 +457,21 @@ void ScaLBL_GreyscaleModel::Initialize(){
 		if (rank==0){
 			printf("Initializing distributions from Restart! \n");
 		}
-		// Read in the restart file to CPU buffers
-        std::shared_ptr<double> cfq;
-        cfq = std::shared_ptr<double>(new double[19*Np],DeleteArray<double>);
-        FILE *File;
-        File=fopen(LocalRestartFile,"rb");
-        fread(cfq.get(),sizeof(double),19*Np,File);
-        fclose(File);
+		double value;
+        std::shared_ptr<double> cfq_owner(new double[19*Np],DeleteArray<double>); // owns the host buffer; released when it leaves scope
+        double *cfq = cfq_owner.get(); // raw view used by the read loop and the device copy
+		ifstream File(LocalRestartFile,ios::binary);
+		for (int n=0; n<Np; n++){
+			// Read 19 consecutive values from the file for index n and store them at cfq[q*Np+n]
+			for (int q=0; q<19; q++){
+				File.read((char*) &value, sizeof(value));
+				cfq[q*Np+n] = value;
+			}
+		}
+		File.close();

 		// Copy the restart data to the GPU
-		ScaLBL_CopyToDevice(fq,cfq.get(),19*Np*sizeof(double));
+		ScaLBL_CopyToDevice(fq,cfq,19*Np*sizeof(double));
 		ScaLBL_DeviceBarrier();

 		comm.barrier();
--- a/models/IonModel.cpp
+++ b/models/IonModel.cpp
@ -191,7 +191,13 @@ void ScaLBL_IonModel::ReadParams(string filename,vector<int> &num_iter){
                    case 1://fixed boundary ion concentration [mol/m^3]
                       Cin[i] = Cin[i]*(h*h*h*1.0e-18);//LB ion concentration has unit [mol/lu^3]
                       break;
-                    case 2://fixed boundary ion flux [mol/m^2/sec]
+                    case 21://fixed boundary ion flux [mol/m^2/sec]
+                       Cin[i] = Cin[i]*(h*h*1.0e-12)*time_conv[i];//LB ion flux has unit [mol/lu^2/lt]
+                       break; 
+                    case 22://fixed boundary ion flux [mol/m^2/sec]
+                       Cin[i] = Cin[i]*(h*h*1.0e-12)*time_conv[i];//LB ion flux has unit [mol/lu^2/lt]
+                       break; 
+                    case 23://fixed boundary ion flux [mol/m^2/sec]
                       Cin[i] = Cin[i]*(h*h*1.0e-12)*time_conv[i];//LB ion flux has unit [mol/lu^2/lt]
                       break; 
                }
@ -225,7 +231,13 @@ void ScaLBL_IonModel::ReadParams(string filename,vector<int> &num_iter){
                    case 1://fixed boundary ion concentration [mol/m^3]
                       Cout[i] = Cout[i]*(h*h*h*1.0e-18);//LB ion concentration has unit [mol/lu^3]
                       break;
-                    case 2://fixed boundary ion flux [mol/m^2/sec]
+                    case 21://fixed boundary ion flux [mol/m^2/sec]
+                       Cout[i] = Cout[i]*(h*h*1.0e-12)*time_conv[i];//LB ion flux has unit [mol/lu^2/lt]
+                       break; 
+                    case 22://fixed boundary ion flux [mol/m^2/sec]
+                       Cout[i] = Cout[i]*(h*h*1.0e-12)*time_conv[i];//LB ion flux has unit [mol/lu^2/lt]
+                       break; 
+                    case 23://fixed boundary ion flux [mol/m^2/sec]
                       Cout[i] = Cout[i]*(h*h*1.0e-12)*time_conv[i];//LB ion flux has unit [mol/lu^2/lt]
                       break; 
                }
@ -401,7 +413,13 @@ void ScaLBL_IonModel::ReadParams(string filename){
                    case 1://fixed boundary ion concentration [mol/m^3]
                       Cin[i] = Cin[i]*(h*h*h*1.0e-18);//LB ion concentration has unit [mol/lu^3]
                       break;
-                    case 2://fixed boundary ion flux [mol/m^2/sec]
+                    case 21://fixed boundary ion flux [mol/m^2/sec]
+                       Cin[i] = Cin[i]*(h*h*1.0e-12)*time_conv[i];//LB ion flux has unit [mol/lu^2/lt]
+                       break; 
+                    case 22://fixed boundary ion flux [mol/m^2/sec]
+                       Cin[i] = Cin[i]*(h*h*1.0e-12)*time_conv[i];//LB ion flux has unit [mol/lu^2/lt]
+                       break; 
+                    case 23://fixed boundary ion flux [mol/m^2/sec]
                       Cin[i] = Cin[i]*(h*h*1.0e-12)*time_conv[i];//LB ion flux has unit [mol/lu^2/lt]
                       break; 
                }
@ -435,7 +453,13 @@ void ScaLBL_IonModel::ReadParams(string filename){
                    case 1://fixed boundary ion concentration [mol/m^3]
                       Cout[i] = Cout[i]*(h*h*h*1.0e-18);//LB ion concentration has unit [mol/lu^3]
                       break;
-                    case 2://fixed boundary ion flux [mol/m^2/sec]
+                    case 21://fixed boundary ion flux [mol/m^2/sec]
+                       Cout[i] = Cout[i]*(h*h*1.0e-12)*time_conv[i];//LB ion flux has unit [mol/lu^2/lt]
+                       break; 
+                    case 22://fixed boundary ion flux [mol/m^2/sec]
+                       Cout[i] = Cout[i]*(h*h*1.0e-12)*time_conv[i];//LB ion flux has unit [mol/lu^2/lt]
+                       break; 
+                    case 23://fixed boundary ion flux [mol/m^2/sec]
                       Cout[i] = Cout[i]*(h*h*1.0e-12)*time_conv[i];//LB ion flux has unit [mol/lu^2/lt]
                       break; 
                }
@ -744,8 +768,14 @@ void ScaLBL_IonModel::Initialize(){
            case 1:
                if (rank==0) printf("LB Ion Solver: inlet boundary for Ion %zu is concentration = %.5g [mol/m^3] \n",i+1,Cin[i]/(h*h*h*1.0e-18));
                break;
-            case 2:
-                if (rank==0) printf("LB Ion Solver: inlet boundary for Ion %zu is (inward) flux = %.5g [mol/m^2/sec] \n",i+1,Cin[i]/(h*h*1.0e-12)/time_conv[i]);
+            case 21:
+                if (rank==0) printf("LB Ion Solver: inlet boundary for Ion %zu is (inward) flux = %.5g [mol/m^2/sec]; Diffusive flux only. \n",i+1,Cin[i]/(h*h*1.0e-12)/time_conv[i]);
+                break;
+            case 22:
+                if (rank==0) printf("LB Ion Solver: inlet boundary for Ion %zu is (inward) flux = %.5g [mol/m^2/sec]; Diffusive + advective flux. \n",i+1,Cin[i]/(h*h*1.0e-12)/time_conv[i]);
+                break;
+            case 23:
+                if (rank==0) printf("LB Ion Solver: inlet boundary for Ion %zu is (inward) flux = %.5g [mol/m^2/sec]; Diffusive + advective + electric flux. \n",i+1,Cin[i]/(h*h*1.0e-12)/time_conv[i]);
                break;
        }
        switch (BoundaryConditionOutlet[i]){
@ -755,8 +785,14 @@ void ScaLBL_IonModel::Initialize(){
            case 1:
                if (rank==0) printf("LB Ion Solver: outlet boundary for Ion %zu is concentration = %.5g [mol/m^3] \n",i+1,Cout[i]/(h*h*h*1.0e-18));
                break;
-            case 2:
-                if (rank==0) printf("LB Ion Solver: outlet boundary for Ion %zu is (inward) flux = %.5g [mol/m^2/sec] \n",i+1,Cout[i]/(h*h*1.0e-12)/time_conv[i]);
+            case 21:
+                if (rank==0) printf("LB Ion Solver: outlet boundary for Ion %zu is (inward) flux = %.5g [mol/m^2/sec]; Diffusive flux only. \n",i+1,Cout[i]/(h*h*1.0e-12)/time_conv[i]);
+                break;
+            case 22:
+                if (rank==0) printf("LB Ion Solver: outlet boundary for Ion %zu is (inward) flux = %.5g [mol/m^2/sec]; Diffusive + advective flux. \n",i+1,Cout[i]/(h*h*1.0e-12)/time_conv[i]);
+                break;
+            case 23:
+                if (rank==0) printf("LB Ion Solver: outlet boundary for Ion %zu is (inward) flux = %.5g [mol/m^2/sec]; Diffusive + advective + electric flux. \n",i+1,Cout[i]/(h*h*1.0e-12)/time_conv[i]);
                break;
        }
    }
@ -805,8 +841,14 @@ void ScaLBL_IonModel::Run(double *Velocity, double *ElectricField){
                    case 1: 
                        ScaLBL_Comm->D3Q7_Ion_Concentration_BC_z(NeighborList, &fq[ic*Np*7],  Cin[ic], timestep);
                        break;
-                    case 2: 
-                        ScaLBL_Comm->D3Q7_Ion_Flux_BC_z(NeighborList, &fq[ic*Np*7],  Cin[ic], tau[ic], &Velocity[2*Np], timestep);
+                    case 21: 
+                        ScaLBL_Comm->D3Q7_Ion_Flux_Diff_BC_z(NeighborList, &fq[ic*Np*7],  Cin[ic], tau[ic], &Velocity[2*Np], timestep);
+                        break;
+                    case 22: 
+                        ScaLBL_Comm->D3Q7_Ion_Flux_DiffAdvc_BC_z(NeighborList, &fq[ic*Np*7],  Cin[ic], tau[ic], &Velocity[2*Np], timestep);
+                        break;
+                    case 23: 
+                        ScaLBL_Comm->D3Q7_Ion_Flux_DiffAdvcElec_BC_z(NeighborList,&fq[ic*Np*7],Cin[ic],tau[ic],&Velocity[2*Np],&ElectricField[2*Np],IonDiffusivity[ic],IonValence[ic],Vt,timestep);
                        break;
                }
            }
@ -815,8 +857,14 @@ void ScaLBL_IonModel::Run(double *Velocity, double *ElectricField){
                    case 1: 
                        ScaLBL_Comm->D3Q7_Ion_Concentration_BC_Z(NeighborList, &fq[ic*Np*7],  Cout[ic], timestep);
                        break;
-                    case 2: 
-                        ScaLBL_Comm->D3Q7_Ion_Flux_BC_Z(NeighborList, &fq[ic*Np*7],  Cout[ic], tau[ic], &Velocity[2*Np], timestep);
+                    case 21: 
+                        ScaLBL_Comm->D3Q7_Ion_Flux_Diff_BC_Z(NeighborList, &fq[ic*Np*7],  Cout[ic], tau[ic], &Velocity[2*Np], timestep);
+                        break;
+                    case 22: 
+                        ScaLBL_Comm->D3Q7_Ion_Flux_DiffAdvc_BC_Z(NeighborList, &fq[ic*Np*7],  Cout[ic], tau[ic], &Velocity[2*Np], timestep);
+                        break;
+                    case 23: 
+                        ScaLBL_Comm->D3Q7_Ion_Flux_DiffAdvcElec_BC_Z(NeighborList,&fq[ic*Np*7],Cout[ic],tau[ic],&Velocity[2*Np],&ElectricField[2*Np],IonDiffusivity[ic],IonValence[ic],Vt,timestep);
                        break;
                }
            }
@ -849,8 +897,14 @@ void ScaLBL_IonModel::Run(double *Velocity, double *ElectricField){
                    case 1: 
                        ScaLBL_Comm->D3Q7_Ion_Concentration_BC_z(NeighborList, &fq[ic*Np*7],  Cin[ic], timestep);
                        break;
-                    case 2: 
-                        ScaLBL_Comm->D3Q7_Ion_Flux_BC_z(NeighborList, &fq[ic*Np*7],  Cin[ic], tau[ic], &Velocity[2*Np], timestep);
+                    case 21: 
+                        ScaLBL_Comm->D3Q7_Ion_Flux_Diff_BC_z(NeighborList, &fq[ic*Np*7],  Cin[ic], tau[ic], &Velocity[2*Np], timestep);
+                        break;
+                    case 22: 
+                        ScaLBL_Comm->D3Q7_Ion_Flux_DiffAdvc_BC_z(NeighborList, &fq[ic*Np*7],  Cin[ic], tau[ic], &Velocity[2*Np], timestep);
+                        break;
+                    case 23: 
+                        ScaLBL_Comm->D3Q7_Ion_Flux_DiffAdvcElec_BC_z(NeighborList,&fq[ic*Np*7],Cin[ic],tau[ic],&Velocity[2*Np],&ElectricField[2*Np],IonDiffusivity[ic],IonValence[ic],Vt,timestep);
                        break;
                }
            }
@ -859,8 +913,14 @@ void ScaLBL_IonModel::Run(double *Velocity, double *ElectricField){
                    case 1: 
                        ScaLBL_Comm->D3Q7_Ion_Concentration_BC_Z(NeighborList, &fq[ic*Np*7],  Cout[ic], timestep);
                        break;
-                    case 2: 
-                        ScaLBL_Comm->D3Q7_Ion_Flux_BC_Z(NeighborList, &fq[ic*Np*7],  Cout[ic], tau[ic], &Velocity[2*Np], timestep);
+                    case 21: 
+                        ScaLBL_Comm->D3Q7_Ion_Flux_Diff_BC_Z(NeighborList, &fq[ic*Np*7],  Cout[ic], tau[ic], &Velocity[2*Np], timestep);
+                        break;
+                    case 22: 
+                        ScaLBL_Comm->D3Q7_Ion_Flux_DiffAdvc_BC_Z(NeighborList, &fq[ic*Np*7],  Cout[ic], tau[ic], &Velocity[2*Np], timestep);
+                        break;
+                    case 23: 
+                        ScaLBL_Comm->D3Q7_Ion_Flux_DiffAdvcElec_BC_Z(NeighborList,&fq[ic*Np*7],Cout[ic],tau[ic],&Velocity[2*Np],&ElectricField[2*Np],IonDiffusivity[ic],IonValence[ic],Vt,timestep);
                        break;
                }
            }
--- a/models/StokesModel.cpp
+++ b/models/StokesModel.cpp
@ -297,7 +297,7 @@ void ScaLBL_StokesModel::AssignZetaPotentialSolid(double *zeta_potential_solid)

 	NLABELS=LabelList.size();
 	if (NLABELS != AffinityList.size()){
-		ERROR("Error: LB Stokes Solver: SolidLabels and ZetaPotentialSolidList must be the same length! \n");
+		ERROR("Error: LB Single-Fluid Solver: SolidLabels and ZetaPotentialSolidList must be the same length! \n");
 	}

 	double *label_count;
@ -331,7 +331,7 @@ void ScaLBL_StokesModel::AssignZetaPotentialSolid(double *zeta_potential_solid)
 		label_count_global[idx]=Dm->Comm.sumReduce(  label_count[idx]);

 	if (rank==0){
-		printf("LB Stokes Solver: number of solid labels: %lu \n",NLABELS);
+		printf("LB Single-Fluid Solver: number of solid labels: %lu \n",NLABELS);
 		for (unsigned int idx=0; idx<NLABELS; idx++){
 			VALUE=LabelList[idx];
 			AFFINITY=AffinityList[idx];
--- a/sample_scripts/configure_cascades_cpu
+++ b/sample_scripts/configure_cascades_cpu
@ -2,7 +2,7 @@

 module purge
 module load cmake/3.10.3
-module load gcc/6.1.0 mvapich2/2.1 atlas tpl/4.4.18
+module load gcc/7.3.0 openmpi/3.1.2 hdf5 silo

 # configure
 rm -rf CMake*
--- a/tests/TestWriter.cpp
+++ b/tests/TestWriter.cpp
@ -94,13 +94,13 @@ bool checkVar( const std::string &format, std::shared_ptr<IO::Mesh> mesh,
 {
    if ( format == "new" )
        IO::reformatVariable( *mesh, *variable2 );
-    bool pass                = true;
-    const IO::Variable &var1 = *variable1;
-    const IO::Variable &var2 = *variable2;
-    pass                     = var1.name == var2.name;
-    pass                     = pass && var1.dim == var2.dim;
-    pass                     = pass && var1.type == var2.type;
-    pass                     = pass && var1.data.length() == var2.data.length();
+    bool pass        = true;
+    const auto &var1 = *variable1;
+    const auto &var2 = *variable2;
+    pass             = var1.name == var2.name;
+    pass             = pass && var1.dim == var2.dim;
+    pass             = pass && var1.type == var2.type;
+    pass             = pass && var1.data.length() == var2.data.length();
    if ( pass ) {
        for ( size_t m = 0; m < var1.data.length(); m++ )
            pass = pass && approx_equal( var1.data( m ), var2.data( m ) );
@ -133,6 +133,12 @@ void testWriter(
    } else if ( format == "silo-float" ) {
        format2   = "silo";
        precision = IO::DataType::Float;
+    } else if ( format == "hdf5-double" ) {
+        format2   = "hdf5";
+        precision = IO::DataType::Double;
+    } else if ( format == "hdf5-float" ) {
+        format2   = "hdf5";
+        precision = IO::DataType::Float;
    }


@ -315,7 +321,7 @@ int main( int argc, char **argv )
    set_node_vec->data = point_node_vec->data;
    list_node_mag->data.resize( 3 * N_tri );
    list_node_vec->data.resize( 3 * N_tri, 3 );
-    for ( int i = 0; i < N_points; i++ ) {
+    for ( int i = 0; i < N_tri; i++ ) {
        list_node_mag->data( 3 * i + 0 )    = distance( trilist->A[i] );
        list_node_mag->data( 3 * i + 1 )    = distance( trilist->B[i] );
        list_node_mag->data( 3 * i + 2 )    = distance( trilist->C[i] );
@ -398,6 +404,8 @@ int main( int argc, char **argv )
    testWriter( "new", meshData, ut );
    testWriter( "silo-double", meshData, ut );
    testWriter( "silo-float", meshData, ut );
+    testWriter( "hdf5-double", meshData, ut );
+    testWriter( "hdf5-float", meshData, ut );

    // Finished
    ut.report();
--- a/tests/lbpm_minkowski_scalar.cpp
+++ b/tests/lbpm_minkowski_scalar.cpp
@ -77,6 +77,15 @@ int main(int argc, char **argv)
 		comm.barrier();
 		Dm->CommInit();
 		
+		/* read the data */
+		if (domain_db->keyExists( "Filename" )){
+			auto Filename = domain_db->getScalar<std::string>( "Filename" );
+			Dm->Decomp(Filename);
+		}
+		else{
+			Dm->ReadIDs();
+		}
+		
 		// Compute the Minkowski functionals
 		comm.barrier();
 		std::shared_ptr<Minkowski> Averages(new Minkowski(Dm));