From 4b35c00f0dc64a648f59d0829b5ccaef1b3d5199 Mon Sep 17 00:00:00 2001
From: Arne Morten Kvarving <arne.morten.kvarving@sintef.no>
Date: Tue, 9 Apr 2019 17:37:45 +0200
Subject: [PATCH] added: support for partitioning based parallel computations

automatically partition mesh and distribute in parallel.
this is rather suboptimal in many ways, but still useful.
---
 src/ASM/ASMbase.h             |   3 +
 src/ASM/ASMs2D.C              |  15 ++
 src/ASM/ASMs2D.h              |   3 +
 src/ASM/ASMs2Dmx.C            |  20 +-
 src/ASM/ASMs3D.C              |  18 ++
 src/ASM/ASMs3D.h              |   3 +
 src/ASM/ASMs3Dmx.C            |  15 +-
 src/ASM/DomainDecomposition.C | 279 +++++++++++++++++++++++--
 src/ASM/DomainDecomposition.h |  14 ++
 src/ASM/LR/ASMu2D.C           |  17 ++
 src/ASM/LR/ASMu2D.h           |   3 +
 src/ASM/LR/ASMu2Dmx.C         |  23 +-
 src/ASM/LR/ASMu3D.C           |  14 ++
 src/ASM/LR/ASMu3D.h           |   4 +
 src/ASM/LR/ASMu3Dmx.C         | 383 +++++++++++++++++-----------------
 src/ASM/SAMpatchPETSc.C       |  29 ++-
 src/IFEM.C                    |   2 +-
 src/LinAlg/PETScMatrix.C      |  53 +++--
 src/SIM/SIMbase.C             |  24 ++-
 src/SIM/SIMdummy.h            |   4 +-
 src/SIM/SIMoutput.C           |  62 +++++-
 src/Utility/HDF5Writer.C      |   2 +
 22 files changed, 733 insertions(+), 257 deletions(-)
diff --git a/src/ASM/ASMbase.h b/src/ASM/ASMbase.h
index f632c526..9f02d5bf 100644
--- a/src/ASM/ASMbase.h
+++ b/src/ASM/ASMbase.h
@@ -425,6 +425,8 @@ public:
   virtual void generateThreadGroups(const Integrand&, bool, bool) {}
   //! \brief Generates element groups for multi-threading of boundary integrals.
   virtual void generateThreadGroups(char, bool, bool) {}
+  //! \brief Generate element-groups for multi-threading based on a partition.
+  virtual void generateThreadGroupsFromElms(const std::vector<int>&) {}
 
 
   // Methods for integration of finite element quantities.
@@ -839,6 +841,7 @@ protected:
   IntVec myMLGE; //!< The actual Matrix of Local to Global Element numbers
   IntVec myMLGN; //!< The actual Matrix of Local to Global Node numbers
   IntMat myMNPC; //!< The actual Matrix of Nodal Point Correspondance
+  IntVec myElms; //!< Elements on patch - used with partitioning
 
   //! \brief Numerical integration scheme for this patch.
   //! \details A value in the range [1,10] means use that number of Gauss
diff --git a/src/ASM/ASMs2D.C b/src/ASM/ASMs2D.C
index 3f333d14..8d054698 100644
--- a/src/ASM/ASMs2D.C
+++ b/src/ASM/ASMs2D.C
@@ -2318,6 +2318,10 @@ bool ASMs2D::integrate (Integrand& integrand, int lIndex,
       fe.iel = abs(MLGE[doXelms+iel-1]);
       if (fe.iel < 1) continue; // zero-area element
 
+      if (!myElms.empty() && !glInt.threadSafe() &&
+          std::find(myElms.begin(), myElms.end(), iel-1) == myElms.end())
+        continue;
+
 #ifdef SP_DEBUG
       if (dbgElm < 0 && iel != -dbgElm)
         continue; // Skipping all elements, except for -dbgElm
@@ -3119,3 +3123,14 @@ void ASMs2D::getBoundaryElms (int lIndex, int, IntVec& elms) const
       elms.push_back(MLGE[i + (lIndex-3)*N1m*(N2m-1)] - 1);
   }
 }
+
+
+void ASMs2D::generateThreadGroupsFromElms(const std::vector<int>& elms)
+{
+  myElms.clear();
+  for (int elm : elms)
+    if (this->getElmIndex(elm+1) > 0)
+      myElms.push_back(this->getElmIndex(elm+1) - 1);
+
+  threadGroups = threadGroups.filter(myElms);
+}
diff --git a/src/ASM/ASMs2D.h b/src/ASM/ASMs2D.h
index 96f96b1c..410a0701 100644
--- a/src/ASM/ASMs2D.h
+++ b/src/ASM/ASMs2D.h
@@ -588,6 +588,9 @@ protected:
   void generateThreadGroups(size_t strip1, size_t strip2,
                             bool silence, bool ignoreGlobalLM);
 
+  //! \brief Generate element groups from a partition.
+  virtual void generateThreadGroupsFromElms(const std::vector<int>& elms);
+
 public:
   //! \brief Auxilliary function for computation of basis function indices.
   static void scatterInd(int n1, int n2, int p1, int p2,
diff --git a/src/ASM/ASMs2Dmx.C b/src/ASM/ASMs2Dmx.C
index 4085921e..61ad7f36 100644
--- a/src/ASM/ASMs2Dmx.C
+++ b/src/ASM/ASMs2Dmx.C
@@ -528,13 +528,17 @@ bool ASMs2Dmx::integrate (Integrand& integrand,
   const int n1 = surf->numCoefs_u();
   const int nel1 = n1 - p1 + 1;
 
+  ThreadGroups oneGroup;
+  if (glInt.threadSafe()) oneGroup.oneStripe(nel);
+  const ThreadGroups& groups = glInt.threadSafe() ? oneGroup : threadGroups;
+
 
   // === Assembly loop over all elements in the patch ==========================
 
   bool ok=true;
-  for (size_t g=0;g<threadGroups.size() && ok;++g) {
+  for (size_t g = 0; g < groups.size() && ok; g++) {
 #pragma omp parallel for schedule(static)
-    for (size_t t=0;t<threadGroups[g].size();++t) {
+    for (size_t t = 0; t < groups[g].size(); t++) {
       MxFiniteElement fe(elem_size);
       std::vector<Matrix> dNxdu(m_basis.size());
       std::vector<Matrix3D> d2Nxdu2(m_basis.size());
@@ -543,9 +547,9 @@ bool ASMs2Dmx::integrate (Integrand& integrand,
       Matrix Xnod, Jac;
       double   param[3] = { 0.0, 0.0, 0.0 };
       Vec4   X(param);
-      for (size_t i = 0; i < threadGroups[g][t].size() && ok; ++i)
+      for (size_t i = 0; i < groups[g][t].size() && ok; ++i)
       {
-        int iel = threadGroups[g][t][i];
+        int iel = groups[g][t][i];
         fe.iel = MLGE[iel];
         if (fe.iel < 1) continue; // zero-area element
 
@@ -737,6 +741,10 @@ bool ASMs2Dmx::integrate (Integrand& integrand, int lIndex,
       fe.iel = MLGE[iel-1];
       if (fe.iel < 1) continue; // zero-area element
 
+      if (!myElms.empty() && !glInt.threadSafe() &&
+          std::find(myElms.begin(), myElms.end(), iel-1) == myElms.end())
+        continue;
+
       // Skip elements that are not on current boundary edge
       bool skipMe = false;
       switch (edgeDir)
@@ -866,6 +874,10 @@ bool ASMs2Dmx::integrate (Integrand& integrand,
       fe.iel = abs(MLGE[iel]);
       if (fe.iel < 1) continue; // zero-area element
 
+      if (!myElms.empty() && !glInt.threadSafe() &&
+          std::find(myElms.begin(), myElms.end(), iel-1) == myElms.end())
+        continue;
+
       short int status = iChk.hasContribution(iel,i1,i2);
       if (!status) continue; // no interface contributions for this element
 
diff --git a/src/ASM/ASMs3D.C b/src/ASM/ASMs3D.C
index 449f2a59..4475553c 100644
--- a/src/ASM/ASMs3D.C
+++ b/src/ASM/ASMs3D.C
@@ -2706,6 +2706,10 @@ bool ASMs3D::integrateEdge (Integrand& integrand, int lEdge,
 	fe.iel = MLGE[iel-1];
 	if (fe.iel < 1) continue; // zero-volume element
 
+        if (!myElms.empty() &&
+            std::find(myElms.begin(), myElms.end(), fe.iel-1) == myElms.end())
+          continue;
+
 	// Skip elements that are not on current boundary edge
 	bool skipMe = false;
 	switch (lEdge)
@@ -3533,3 +3537,17 @@ void ASMs3D::getBoundaryElms (int lIndex, int, IntVec& elms) const
         elms.push_back(MLGE[i + j*N1m + (lIndex-5)*(N1m*N2m*(N3m-1))] - 1);
   }
 }
+
+
+void ASMs3D::generateThreadGroupsFromElms(const std::vector<int>& elms)
+{
+  myElms.clear();
+  for (int elm : elms)
+    if (this->getElmIndex(elm+1) > 0)
+      myElms.push_back(this->getElmIndex(elm+1)-1);
+
+  threadGroupsVol = threadGroupsVol.filter(myElms);
+
+  for (auto& group : threadGroupsFace)
+    group.second = group.second.filter(myElms);
+}
diff --git a/src/ASM/ASMs3D.h b/src/ASM/ASMs3D.h
index 75312eba..9d2ed726 100644
--- a/src/ASM/ASMs3D.h
+++ b/src/ASM/ASMs3D.h
@@ -653,6 +653,9 @@ protected:
   //! \param[in] silence If \e true, suppress threading group outprint
   virtual void generateThreadGroups(char lIndex, bool silence, bool);
 
+  //! \brief Generate element groups from a partition.
+  virtual void generateThreadGroupsFromElms(const std::vector<int>& elms);
+
 public:
   //! \brief Auxilliary function for computation of basis function indices.
   static void scatterInd(int n1, int n2, int n3, int p1, int p2, int p3,
diff --git a/src/ASM/ASMs3Dmx.C b/src/ASM/ASMs3Dmx.C
index b897ebcb..5f5b2587 100644
--- a/src/ASM/ASMs3Dmx.C
+++ b/src/ASM/ASMs3Dmx.C
@@ -483,12 +483,17 @@ bool ASMs3Dmx::integrate (Integrand& integrand,
   const int nel1 = n1 - p1 + 1;
   const int nel2 = n2 - p2 + 1;
 
+  ThreadGroups oneGroup;
+  if (glInt.threadSafe()) oneGroup.oneStripe(nel);
+  const ThreadGroups& groups = glInt.threadSafe() ? oneGroup : threadGroupsVol;
+
+
   // === Assembly loop over all elements in the patch ==========================
 
-  bool ok=true;
-  for (size_t g=0;g<threadGroupsVol.size() && ok;++g) {
+  bool ok = true;
+  for (size_t g = 0; g < groups.size() && ok; ++g) {
 #pragma omp parallel for schedule(static)
-    for (size_t t=0;t<threadGroupsVol[g].size();++t) {
+    for (size_t t = 0; t < groups[g].size(); ++t) {
       MxFiniteElement fe(elem_size);
       std::vector<Matrix> dNxdu(m_basis.size());
       std::vector<Matrix3D> d2Nxdu2(m_basis.size());
@@ -497,9 +502,9 @@ bool ASMs3Dmx::integrate (Integrand& integrand,
       Matrix Xnod, Jac;
       double   param[3];
       Vec4   X(param);
-      for (size_t l = 0; l < threadGroupsVol[g][t].size() && ok; ++l)
+      for (size_t l = 0; l < groups[g][t].size() && ok; ++l)
       {
-        int iel = threadGroupsVol[g][t][l];
+        int iel = groups[g][t][l];
         fe.iel = MLGE[iel];
         if (fe.iel < 1) continue; // zero-volume element
 
diff --git a/src/ASM/DomainDecomposition.C b/src/ASM/DomainDecomposition.C
index c01d5845..19b8b947 100644
--- a/src/ASM/DomainDecomposition.C
+++ b/src/ASM/DomainDecomposition.C
@@ -18,6 +18,7 @@
 #include "ASMunstruct.h"
 #include "LinSolParams.h"
 #include "ProcessAdm.h"
+#include "Profiler.h"
 #include "SAMpatch.h"
 #include "SIMbase.h"
 #include "Utilities.h"
@@ -26,6 +27,14 @@
 #include <functional>
 #include <numeric>
 
+#ifdef HAS_ZOLTAN
+#define OLD_MPI HAVE_MPI
+#undef HAVE_MPI
+#include <zoltan.h>
+#undef HAVE_MPI
+#define HAVE_MPI OLD_MPI
+#endif
+
 
 DomainDecomposition::
 OrientIterator::OrientIterator(const ASMbase* pch,
@@ -170,6 +179,95 @@ OrientIterator::OrientIterator(const ASMbase* pch, int orient, int lIdx)
 }
 
 
+#ifdef HAS_ZOLTAN
+static int getNumElements(void* mesh, int* err)
+{
+  *err = ZOLTAN_OK;
+  IntMat& neigh = *static_cast<IntMat*>(mesh);
+  return neigh.size();
+}
+
+
+static void getElementList(void* mesh, int numGlobalEntries,
+                           int, ZOLTAN_ID_PTR gids,
+                           ZOLTAN_ID_PTR lids, int,
+                           float*, int* err)
+{
+  IntMat& neigh = *static_cast<IntMat*>(mesh);
+  std::iota(gids, gids+neigh.size(), 0);
+  std::iota(lids, lids+neigh.size(), 0);
+  *err = ZOLTAN_OK;
+}
+
+
+static void getNumEdges(void* mesh, int sizeGID, int sizeLID,
+                        int numCells, ZOLTAN_ID_PTR globalID,
+                        ZOLTAN_ID_PTR localID, int* numEdges, int* err)
+{
+  IntMat& neigh = *static_cast<IntMat*>(mesh);
+  int* ne = numEdges;
+  for (const std::vector<int>& n : neigh)
+    *ne++ = std::accumulate(n.begin(), n.end(), 0,
+                            [](const int& a, const int& b)
+                            {
+                              return b != -1 ? a + 1 : a;
+                            });
+
+  *err = ZOLTAN_OK;
+}
+
+
+static void getEdges(void* mesh, int sizeGID, int sizeLID, int numCells,
+                     ZOLTAN_ID_PTR globalID, ZOLTAN_ID_PTR localID,
+                     int* numEdges, ZOLTAN_ID_PTR nborGID, int* nborProc,
+                     int wgtDim, float* egts, int* err)
+{
+  IntMat& neigh = *static_cast<IntMat*>(mesh);
+
+  memset(nborProc, 0, numCells*sizeof(int));
+  for (const std::vector<int>& elm : neigh)
+    for (int n : elm)
+      if (n != -1)
+        *nborGID++ = n;
+
+  *err = ZOLTAN_OK;
+}
+
+
+static int getNullElements(void*, int* err)
+{
+  *err = ZOLTAN_OK;
+  return 0;
+}
+
+
+static void getNullList(void*, int,
+                        int, ZOLTAN_ID_PTR,
+                        ZOLTAN_ID_PTR, int,
+                        float*, int* err)
+{
+  *err = ZOLTAN_OK;
+}
+
+
+static void getNumNullEdges(void*, int, int, int,
+                            ZOLTAN_ID_PTR, ZOLTAN_ID_PTR,
+                            int*, int* err)
+{
+  *err = ZOLTAN_OK;
+}
+
+
+static void getNullEdges(void*, int, int, int,
+                         ZOLTAN_ID_PTR, ZOLTAN_ID_PTR,
+                         int*, ZOLTAN_ID_PTR, int*,
+                         int, float*, int* err)
+{
+  *err = ZOLTAN_OK;
+}
+#endif
+
+
 std::vector<std::set<int>> DomainDecomposition::getSubdomains(int nx, int ny, int nz,
                                                               int overlap, size_t block) const
 {
@@ -811,6 +909,47 @@ bool DomainDecomposition::calcGlobalEqNumbers(const ProcessAdm& adm,
 }
 
 
+bool DomainDecomposition::calcGlobalEqNumbersPart(const ProcessAdm& adm,
+                                                  const SIMbase& sim)
+{
+#ifdef HAVE_MPI
+  blocks[0].MLGEQ.clear();
+  blocks[0].MLGEQ.resize(sim.getSAM()->getNoEquations(), -1);
+
+  blocks[0].minEq = 1;
+  blocks[0].maxEq = 0;
+
+  if (adm.getProcId() != 0) {
+    adm.receive(blocks[0].MLGEQ, adm.getProcId()-1);
+    adm.receive(blocks[0].minEq, adm.getProcId()-1);
+    blocks[0].maxEq = blocks[0].minEq;
+    blocks[0].minEq++;
+  }
+
+  for (size_t i = 0; i < myElms.size(); ++i) {
+    IntVec meen;
+    sim.getSAM()->getElmEqns(meen, myElms[i]+1);
+    for (int eq : meen)
+      if (eq > 0 && blocks[0].MLGEQ[eq-1] == -1)
+        blocks[0].MLGEQ[eq-1] = ++blocks[0].maxEq;
+  }
+
+  if (adm.getProcId() < adm.getNoProcs()-1) {
+    adm.send(blocks[0].MLGEQ, adm.getProcId()+1);
+    adm.send(blocks[0].maxEq, adm.getProcId()+1);
+  }
+
+  MPI_Bcast(&blocks[0].MLGEQ[0], blocks[0].MLGEQ.size(), MPI_INT,
+            adm.getNoProcs()-1, *adm.getCommunicator());
+
+  for (size_t i = 0; i < blocks[0].MLGEQ.size(); ++i)
+    blocks[0].G2LEQ[blocks[0].MLGEQ[i]] = i+1;
+#endif
+
+  return true;
+}
+
+
 int DomainDecomposition::getGlobalEq(int lEq, size_t idx) const
 {
   if (lEq < 1 || idx >= blocks.size())
@@ -933,37 +1072,44 @@ bool DomainDecomposition::setup(const ProcessAdm& adm, const SIMbase& sim)
 #endif
 #endif
 
+  if (ghostConnections.empty() && adm.getNoProcs() > 1 && myElms.empty())
+    if (!this->graphPartition(adm, sim))
+      return false;
+
   sam = dynamic_cast<const SAMpatch*>(sim.getSAM());
 
   int ok = 1;
-
-  // Establish global node numbers
-  if (!calcGlobalNodeNumbers(adm, sim))
-    ok = 0;
-
-  // sanity check the established domain decomposition
-  if (getMinNode() > getMaxNode()) {
-    std::cerr << "**DomainDecomposition::setup ** Process "
-              << adm.getProcId() << " owns no nodes." << std::endl;
-    ok = 0;
-  }
-
 #ifdef HAVE_MPI
   int lok = ok;
-  MPI_Allreduce(&lok, &ok, 1, MPI_INT, MPI_SUM, *adm.getCommunicator());
 #endif
+  if (myElms.empty()) {
+    // Establish global node numbers
+    if (!calcGlobalNodeNumbers(adm, sim))
+      ok = 0;
 
-  if (ok < adm.getNoProcs())
-    return false;
-
-  // sanity check all corners of the patches
-  if (!sanityCheckCorners(sim))
-    return false;
+    // sanity check the established domain decomposition
+    if (getMinNode() > getMaxNode()) {
+      std::cerr << "**DomainDecomposition::setup ** Process "
+                << adm.getProcId() << " owns no nodes." << std::endl;
+      ok = 0;
+    }
 
 #ifdef HAVE_MPI
-  ok = 1;
+    MPI_Allreduce(&lok, &ok, 1, MPI_INT, MPI_SUM, *adm.getCommunicator());
 #endif
 
+    if (ok < adm.getNoProcs())
+      return false;
+
+    // sanity check all corners of the patches
+    if (!sanityCheckCorners(sim))
+      return false;
+
+#ifdef HAVE_MPI
+    ok = 1;
+#endif
+  }
+
   // Establish local equation mappings for each block.
   if (sim.getSolParams() && sim.getSolParams()->getNoBlocks() > 1) {
     IFEM::cout << "\tEstablishing local block equation numbers" << std::endl;
@@ -1011,7 +1157,10 @@ bool DomainDecomposition::setup(const ProcessAdm& adm, const SIMbase& sim)
   }
 
   // Establish global equation numbers for all blocks.
-  if (!calcGlobalEqNumbers(adm, sim))
+  if (!myElms.empty()) {
+    if (!calcGlobalEqNumbersPart(adm, sim))
+      return false;
+  } else if (!calcGlobalEqNumbers(adm, sim))
 #ifdef HAVE_MPI
     ok = 0;
 #else
@@ -1072,3 +1221,91 @@ int DomainDecomposition::getPatchOwner(size_t p) const
 
   return it->second;
 }
+
+
+bool DomainDecomposition::graphPartition(const ProcessAdm& adm, const SIMbase& sim)
+{
+  PROFILE("Mesh partition");
+  myElms.clear();
+#ifdef HAS_ZOLTAN
+  static bool inited = false;
+  if (!inited)
+  {
+    float ver;
+    Zoltan_Initialize(0, nullptr, &ver);
+    inited = true;
+  }
+  struct Zoltan_Struct* zz = Zoltan_Create(*adm.getCommunicator());
+  IntMat neigh;
+  if (adm.getProcId() == 0) {
+    neigh = sim.getElmConnectivities();
+    Zoltan_Set_Num_Obj_Fn(zz, getNumElements, &neigh);
+    Zoltan_Set_Obj_List_Fn(zz, getElementList, &neigh);
+    Zoltan_Set_Num_Edges_Multi_Fn(zz, getNumEdges, &neigh);
+    Zoltan_Set_Edge_List_Multi_Fn(zz, getEdges, &neigh);
+  } else {
+    Zoltan_Set_Num_Obj_Fn(zz, getNullElements, nullptr);
+    Zoltan_Set_Obj_List_Fn(zz, getNullList, nullptr);
+    Zoltan_Set_Num_Edges_Multi_Fn(zz, getNumNullEdges, nullptr);
+    Zoltan_Set_Edge_List_Multi_Fn(zz, getNullEdges, nullptr);
+  }
+
+  Zoltan_Set_Param(zz, "DEBUG_LEVEL", "0");
+  Zoltan_Set_Param(zz, "LB_METHOD", "GRAPH");
+  Zoltan_Set_Param(zz, "GRAPH_PACKAGE", "Scotch");
+  Zoltan_Set_Param(zz, "LB_APPROACH", "PARTITION");
+  Zoltan_Set_Param(zz, "NUM_GID_ENTRIES", "1");
+  Zoltan_Set_Param(zz, "NUM_LID_ENTRIES", "1");
+  Zoltan_Set_Param(zz, "RETURN_LISTS", "ALL");
+  Zoltan_Set_Param(zz, "CHECK_GRAPH", "2");
+  Zoltan_Set_Param(zz,"EDGE_WEIGHT_DIM","0");
+  Zoltan_Set_Param(zz, "OBJ_WEIGHT_DIM", "0");
+  Zoltan_Set_Param(zz, "PHG_EDGE_SIZE_THRESHOLD", ".35");  /* 0-remove all, 1-remove none */
+  int changes, numGidEntries, numLidEntries, numImport, numExport;
+  ZOLTAN_ID_PTR importGlobalGids, importLocalGids, exportGlobalGids, exportLocalGids;
+  int* importProcs, *importToPart, *exportProcs, *exportToPart;
+  Zoltan_LB_Partition(zz, /* input (all remaining fields are output) */
+                           &changes,        /* 1 if partitioning was changed, 0 otherwise */
+                           &numGidEntries,  /* Number of integers used for a global ID */
+                           &numLidEntries,  /* Number of integers used for a local ID */
+                           &numImport,      /* Number of vertices to be sent to me */
+                           &importGlobalGids,  /* Global IDs of vertices to be sent to me */
+                           &importLocalGids,   /* Local IDs of vertices to be sent to me */
+                           &importProcs,    /* Process rank for source of each incoming vertex */
+                           &importToPart,   /* New partition for each incoming vertex */
+                           &numExport,      /* Number of vertices I must send to other processes*/
+                           &exportGlobalGids,  /* Global IDs of the vertices I must send */
+                           &exportLocalGids,   /* Local IDs of the vertices I must send */
+                           &exportProcs,    /* Process to which I send each of the vertices */
+                           &exportToPart);  /* Partition to which each vertex will belong */
+
+  if (sim.getProcessAdm().getProcId() == 0) {
+    std::vector<std::vector<int>> toExp(adm.getNoProcs());
+    std::vector<bool> offProc(sim.getNoElms(), false);
+    for (int i = 0; i < numExport; ++i) {
+      int gid = exportGlobalGids[i];
+      offProc[gid] = true;
+    }
+    myElms.reserve(numExport);
+    for (size_t i = 0; i < sim.getNoElms(); ++i)
+      if (!offProc[i])
+        myElms.push_back(i);
+  }
+  else {
+    myElms.resize(numImport);
+    std::copy(importGlobalGids, importGlobalGids+numImport, myElms.begin());
+  }
+  Zoltan_LB_Free_Part(&importGlobalGids, &importLocalGids, &importProcs, &importToPart);
+  Zoltan_LB_Free_Part(&exportGlobalGids, &exportLocalGids, &exportProcs, &exportToPart);
+  Zoltan_Destroy(&zz);
+#else
+  std::cerr << "*** DomainDecompositon::graphPartition: Compiled without Zoltan support. No partitioning available." << std::endl;
+  return false;
+#endif
+
+  if (!myElms.empty())
+    IFEM::cout << "Graph partitioning: " << myElms.size() << " elements in partition." << std::endl;
+
+  partitioned = !myElms.empty();
+  return partitioned;
+}
diff --git a/src/ASM/DomainDecomposition.h b/src/ASM/DomainDecomposition.h
index d28c77cd..f0442772 100644
--- a/src/ASM/DomainDecomposition.h
+++ b/src/ASM/DomainDecomposition.h
@@ -174,6 +174,12 @@ public:
   //! \brief Returns number of matrix blocks.
   size_t getNoBlocks() const { return blocks.size()-1; }
 
+  //! \brief Returns whether a graph based partition is used.
+  bool isPartitioned() const { return partitioned; }
+
+  //! \brief Returns elements in partition.
+  const std::vector<int>& getElms() const { return myElms; }
+
 private:
   //! \brief Calculates a 1D partitioning with a given overlap.
   //! \param[in] nel1 Number of knot-spans in first parameter direction.
@@ -237,11 +243,17 @@ private:
   //! \brief Calculate the global equation numbers for given finite element model.
   bool calcGlobalEqNumbers(const ProcessAdm& adm, const SIMbase& sim);
 
+  //! \brief Calculate the global equation numbers for a partitioned model.
+  bool calcGlobalEqNumbersPart(const ProcessAdm& adm, const SIMbase& sim);
+
   //! \brief Sanity check model.
   //! \details Collects the corners of all patches in the model and make sure nodes
   //!          with matching coordinates have the same global node ID.
   bool sanityCheckCorners(const SIMbase& sim);
 
+  //! \brief Setup domain decomposition based on graph partitioning.
+  bool graphPartition(const ProcessAdm& adm, const SIMbase& sim);
+
   std::map<int,int> patchOwner; //!< Process that owns a particular patch
 
   //! \brief Struct with information per matrix block.
@@ -258,12 +270,14 @@ private:
 
   std::vector<int> MLGN; //!< Process-local-to-global node numbers
   std::vector<BlockInfo> blocks; //!< Equation mappings for all matrix blocks.
+  std::vector<int> myElms; //!< Elements in partition
   int minDof = 0; //!< First DOF we own
   int maxDof = 0; //!< Last DOF we own
   int minNode = 0; //!< First node we own
   int maxNode = 0; //!< Last node we own
 
   const SAMpatch* sam = nullptr; //!< The assembly handler the DD is constructed for.
+  bool partitioned = false; //!< \e true if graph based decomposition
 };
 
 #endif
diff --git a/src/ASM/LR/ASMu2D.C b/src/ASM/LR/ASMu2D.C
index 9c99a0e0..bbaa29be 100644
--- a/src/ASM/LR/ASMu2D.C
+++ b/src/ASM/LR/ASMu2D.C
@@ -1574,6 +1574,12 @@ bool ASMu2D::integrate (Integrand& integrand, int lIndex,
   int iel = 0;
   for (const LR::Element* el : lrspline->getAllElements())
   {
+    if (!myElms.empty() && !glInt.threadSafe() &&
+        std::find(myElms.begin(), myElms.end(), iel) == myElms.end()) {
+        ++iel;
+      continue;
+    }
+
     fe.iel = MLGE[iel++];
 #ifdef SP_DEBUG
     if (dbgElm < 0 && iel != -dbgElm)
@@ -2705,3 +2711,14 @@ void ASMu2D::getBoundaryElms (int lIndex, int orient, IntVec& elms) const
   for (const LR::Element* elem : elements)
     elms.push_back(MLGE[elem->getId()]-1);
 }
+
+
+void ASMu2D::generateThreadGroupsFromElms(const std::vector<int>& elms)
+{
+  myElms.clear();
+  for (int elm : elms)
+    if (this->getElmIndex(elm+1) > 0)
+      myElms.push_back(this->getElmIndex(elm+1)-1);
+
+  threadGroups = threadGroups.filter(myElms);
+}
diff --git a/src/ASM/LR/ASMu2D.h b/src/ASM/LR/ASMu2D.h
index 2c0c0ec6..52d8d771 100644
--- a/src/ASM/LR/ASMu2D.h
+++ b/src/ASM/LR/ASMu2D.h
@@ -591,6 +591,9 @@ protected:
   void generateThreadGroups(const Integrand& integrand, bool silence,
                             bool ignoreGlobalLM);
 
+  //! \brief Generate element groups from a partition.
+  virtual void generateThreadGroupsFromElms(const std::vector<int>& elms);
+
   //! \brief Remap element wise errors to basis functions.
   //! \param     errors The remapped errors
   //! \param[in] origErr The element wise errors on the geometry mesh
diff --git a/src/ASM/LR/ASMu2Dmx.C b/src/ASM/LR/ASMu2Dmx.C
index 21c2e268..097a6f9e 100644
--- a/src/ASM/LR/ASMu2Dmx.C
+++ b/src/ASM/LR/ASMu2Dmx.C
@@ -301,16 +301,21 @@ bool ASMu2Dmx::integrate (Integrand& integrand,
   if (!xg || !wg) return false;
   bool use2ndDer = integrand.getIntegrandType() & Integrand::SECOND_DERIVATIVES;
 
+  ThreadGroups oneGroup;
+  if (glInt.threadSafe()) oneGroup.oneGroup(nel);
+  const IntMat& groups = glInt.threadSafe() ? oneGroup[0] : threadGroups[0];
+
+
   // === Assembly loop over all elements in the patch ==========================
   bool ok = true;
-  for (size_t t = 0; t < threadGroups[0].size() && ok; ++t)
+  for (size_t t = 0; t < groups.size() && ok; ++t)
   {
 //#pragma omp parallel for schedule(static)
-    for (size_t e = 0; e < threadGroups[0][t].size(); ++e)
+    for (size_t e = 0; e < groups[t].size(); ++e)
     {
       if (!ok)
         continue;
-      int iel = threadGroups[0][t][e] + 1;
+      int iel = groups[t][e] + 1;
       auto el1 = threadBasis->elementBegin()+iel-1;
       double uh = ((*el1)->umin()+(*el1)->umax())/2.0;
       double vh = ((*el1)->vmin()+(*el1)->vmax())/2.0;
@@ -524,6 +529,12 @@ bool ASMu2Dmx::integrate (Integrand& integrand, int lIndex,
     }
     if (skipMe) continue;
 
+    if (!myElms.empty() && !glInt.threadSafe() &&
+        std::find(myElms.begin(), myElms.end(), iel-1) == myElms.end()) {
+        ++iel;
+      continue;
+    }
+
     double uh = ((*el1)->umin()+(*el1)->umax())/2.0;
     double vh = ((*el1)->vmin()+(*el1)->vmax())/2.0;
     std::vector<size_t> els;
@@ -644,6 +655,12 @@ bool ASMu2Dmx::integrate (Integrand& integrand,
   for (int iel = 1; el1 != m_basis[0]->elementEnd(); ++el1, ++iel) {
     short int status = iChk.hasContribution(iel);
     if (!status) continue; // no interface contributions for this element
+
+    if (!myElms.empty() && !glInt.threadSafe() &&
+        std::find(myElms.begin(), myElms.end(), iel-1) == myElms.end()) {
+        ++iel, ++el1;
+      continue;
+    }
     // first push the hosting elements
     std::vector<size_t> els(1,iel);
     std::vector<size_t> elem_sizes(1,(*el1)->nBasisFunctions());
diff --git a/src/ASM/LR/ASMu3D.C b/src/ASM/LR/ASMu3D.C
index d34c2a04..a91d8531 100644
--- a/src/ASM/LR/ASMu3D.C
+++ b/src/ASM/LR/ASMu3D.C
@@ -1209,6 +1209,9 @@ bool ASMu3D::integrate (Integrand& integrand, int lIndex,
     if (dbgElm < 0 && iEl+1 != -dbgElm)
       continue; // Skipping all elements, except for -dbgElm
 #endif
+    if (!myElms.empty() && !glInt.threadSafe() &&
+        std::find(myElms.begin(), myElms.end(), iEl) == myElms.end())
+     continue;
 
     FiniteElement fe;
     fe.iel = MLGE[iEl];
@@ -2324,3 +2327,14 @@ void ASMu3D::getBoundaryElms (int lIndex, int orient, IntVec& elms) const
   for (const LR::Element* elem : elements)
     elms.push_back(MLGE[elem->getId()]-1);
 }
+
+
+void ASMu3D::generateThreadGroupsFromElms(const std::vector<int>& elms)
+{
+  myElms.clear();
+  for (int elm : elms)
+    if (this->getElmIndex(elm+1) > 0)
+      myElms.push_back(this->getElmIndex(elm+1)-1);
+
+  threadGroups = threadGroups.filter(myElms);
+}
diff --git a/src/ASM/LR/ASMu3D.h b/src/ASM/LR/ASMu3D.h
index 8735f1fa..58129f03 100644
--- a/src/ASM/LR/ASMu3D.h
+++ b/src/ASM/LR/ASMu3D.h
@@ -553,6 +553,9 @@ protected:
   void generateThreadGroups(const Integrand& integrand, bool silence,
                             bool ignoreGlobalLM);
 
+  //! \brief Generate element groups from a partition.
+  virtual void generateThreadGroupsFromElms(const std::vector<int>& elms);
+
   //! \brief Remap element wise errors to basis functions.
   //! \param     errors The remapped errors
   //! \param[in] origErr The element wise errors on the geometry mesh
@@ -587,6 +590,7 @@ protected:
   Matrices      myBezierExtract; //!< Bezier extraction matrices
 
   ThreadGroups threadGroups; //!< Element groups for multi-threaded assembly
+  IntVec myElms; //!< Elements on patch - used with partitioning
   mutable double vMin; //!< Minimum element volume for adaptive refinement
 };
 
diff --git a/src/ASM/LR/ASMu3Dmx.C b/src/ASM/LR/ASMu3Dmx.C
index f30a1269..baeaf01d 100644
--- a/src/ASM/LR/ASMu3Dmx.C
+++ b/src/ASM/LR/ASMu3Dmx.C
@@ -360,205 +360,214 @@ bool ASMu3Dmx::integrate (Integrand& integrand,
   else if (nRed < 0)
     nRed = nGauss; // The integrand needs to know nGauss
 
+  ThreadGroups oneGroup;
+  if (glInt.threadSafe()) oneGroup.oneGroup(nel);
+  const IntMat& group = glInt.threadSafe() ? oneGroup[0] : threadGroups[0];
+
 
   // === Assembly loop over all elements in the patch ==========================
 
-  bool ok=true;
-  for(LR::Element *el : this->getBasis(geoBasis)->getAllElements()) {
-    double uh = (el->umin()+el->umax())/2.0;
-    double vh = (el->vmin()+el->vmax())/2.0;
-    double wh = (el->wmin()+el->wmax())/2.0;
-    std::vector<size_t> els;
-    std::vector<size_t> elem_sizes;
-    for (size_t i=0; i < m_basis.size(); ++i) {
-      els.push_back(m_basis[i]->getElementContaining(uh, vh, wh)+1);
-      elem_sizes.push_back((*(m_basis[i]->elementBegin()+els.back()-1))->nBasisFunctions());
-    }
-    int iEl = el->getId();
-    MxFiniteElement fe(elem_sizes);
-    fe.iel = MLGE[iEl];
-
-    std::vector<Matrix> dNxdu(m_basis.size());
-    Matrix   Xnod, Jac;
-    std::vector<Matrix3D> d2Nxdu2(m_basis.size());
-    Matrix3D Hess;
-    double   dXidu[3];
-    double   param[3] = { 0.0, 0.0, 0.0 };
-    Vec4     X(param);
-    // Get element volume in the parameter space
-    double du = el->umax() - el->umin();
-    double dv = el->vmax() - el->vmin();
-    double dw = el->wmax() - el->wmin();
-    double vol = el->volume();
-    if (vol < 0.0)
+  bool ok = true;
+  for (size_t t = 0; t < group.size() && ok; t++)
+//#pragma omp parallel for schedule(static)
+    for (size_t e = 0; e < group[t].size(); e++)
     {
-      ok = false; // topology error (probably logic error)
-      break;
-    }
+      int iel = group[t][e] + 1;
+      const LR::Element* el = lrspline->getElement(iel-1);
+      double uh = (el->umin()+el->umax())/2.0;
+      double vh = (el->vmin()+el->vmax())/2.0;
+      double wh = (el->wmin()+el->wmax())/2.0;
+      std::vector<size_t> els;
+      std::vector<size_t> elem_sizes;
+      for (size_t i=0; i < m_basis.size(); ++i) {
+        els.push_back(m_basis[i]->getElementContaining(uh, vh, wh)+1);
+        elem_sizes.push_back((*(m_basis[i]->elementBegin()+els.back()-1))->nBasisFunctions());
+      }
+      int iEl = el->getId();
+      MxFiniteElement fe(elem_sizes);
+      fe.iel = MLGE[iEl];
 
-    // Set up control point (nodal) coordinates for current element
-    if (!this->getElementCoordinates(Xnod,iEl+1))
-    {
-      ok = false;
-      break;
-    }
-
-    // Compute parameter values of the Gauss points over the whole element
-    std::array<RealArray,3> gpar, redpar;
-    for (int d = 0; d < 3; d++)
-    {
-      this->getGaussPointParameters(gpar[d],d,nGauss,iEl+1,xg);
-      if (xr)
-        this->getGaussPointParameters(redpar[d],d,nRed,iEl+1,xr);
-    }
-
-
-    if (integrand.getIntegrandType() & Integrand::ELEMENT_CORNERS)
-      fe.h = this->getElementCorners(iEl+1, fe.XC);
-
-    if (integrand.getIntegrandType() & Integrand::G_MATRIX)
-    {
-      // Element size in parametric space
-      dXidu[0] = el->getParmin(0);
-      dXidu[1] = el->getParmin(1);
-      dXidu[2] = el->getParmin(2);
-    }
-    else if (integrand.getIntegrandType() & Integrand::AVERAGE)
-    {
-      // --- Compute average value of basis functions over the element -----
-
-      fe.Navg.resize(elem_sizes[0],true);
-      double vol = 0.0;
-      for (int k = 0; k < nGauss; k++)
-        for (int j = 0; j < nGauss; j++)
-          for (int i = 0; i < nGauss; i++)
-          {
-            // Fetch basis function derivatives at current integration point
-            for (size_t b = 1; b <= m_basis.size(); ++b)
-              this->evaluateBasis(iEl, fe, dNxdu[b-1], b);
-
-            // Compute Jacobian determinant of coordinate mapping
-            // and multiply by weight of current integration point
-            double detJac = utl::Jacobian(Jac,fe.grad(geoBasis),
-                                          Xnod,dNxdu[geoBasis-1],false);
-            double weight = 0.125*vol*wg[i]*wg[j]*wg[k];
-
-            // Numerical quadrature
-            fe.Navg.add(fe.N,detJac*weight);
-            vol += detJac*weight;
+      std::vector<Matrix> dNxdu(m_basis.size());
+      Matrix   Xnod, Jac;
+      std::vector<Matrix3D> d2Nxdu2(m_basis.size());
+      Matrix3D Hess;
+      double   dXidu[3];
+      double   param[3] = { 0.0, 0.0, 0.0 };
+      Vec4     X(param);
+      // Get element volume in the parameter space
+      double du = el->umax() - el->umin();
+      double dv = el->vmax() - el->vmin();
+      double dw = el->wmax() - el->wmin();
+      double vol = el->volume();
+      if (vol < 0.0)
+      {
+        ok = false; // topology error (probably logic error)
+        break;
       }
 
-      // Divide by element volume
-      fe.Navg /= vol;
-    }
+      // Set up control point (nodal) coordinates for current element
+      if (!this->getElementCoordinates(Xnod,iEl+1))
+      {
+        ok = false;
+        break;
+      }
 
-    else if (integrand.getIntegrandType() & Integrand::ELEMENT_CENTER)
-    {
-      // Compute the element center
-      Go::Point X0;
-      double u0 = 0.5*(el->getParmin(0) + el->getParmax(0));
-      double v0 = 0.5*(el->getParmin(1) + el->getParmax(1));
-      double w0 = 0.5*(el->getParmin(2) + el->getParmax(2));
-      this->getBasis(geoBasis)->point(X0,u0,v0,w0);
-      X = SplineUtils::toVec3(X0);
-    }
+      // Compute parameter values of the Gauss points over the whole element
+      std::array<RealArray,3> gpar, redpar;
+      for (int d = 0; d < 3; d++)
+      {
+        this->getGaussPointParameters(gpar[d],d,nGauss,iEl+1,xg);
+        if (xr)
+          this->getGaussPointParameters(redpar[d],d,nRed,iEl+1,xr);
+      }
+
+
+      if (integrand.getIntegrandType() & Integrand::ELEMENT_CORNERS)
+        fe.h = this->getElementCorners(iEl+1, fe.XC);
+
+      if (integrand.getIntegrandType() & Integrand::G_MATRIX)
+      {
+        // Element size in parametric space
+        dXidu[0] = el->getParmin(0);
+        dXidu[1] = el->getParmin(1);
+        dXidu[2] = el->getParmin(2);
+      }
+      else if (integrand.getIntegrandType() & Integrand::AVERAGE)
+      {
+        // --- Compute average value of basis functions over the element -----
+
+        fe.Navg.resize(elem_sizes[0],true);
+        double vol = 0.0;
+        for (int k = 0; k < nGauss; k++)
+          for (int j = 0; j < nGauss; j++)
+            for (int i = 0; i < nGauss; i++)
+            {
+              // Fetch basis function derivatives at current integration point
+              for (size_t b = 1; b <= m_basis.size(); ++b)
+                this->evaluateBasis(iEl, fe, dNxdu[b-1], b);
+
+              // Compute Jacobian determinant of coordinate mapping
+              // and multiply by weight of current integration point
+              double detJac = utl::Jacobian(Jac,fe.grad(geoBasis),
+                                            Xnod,dNxdu[geoBasis-1],false);
+              double weight = 0.125*vol*wg[i]*wg[j]*wg[k];
+
+              // Numerical quadrature
+              fe.Navg.add(fe.N,detJac*weight);
+              vol += detJac*weight;
+        }
+
+        // Divide by element volume
+        fe.Navg /= vol;
+      }
+
+      else if (integrand.getIntegrandType() & Integrand::ELEMENT_CENTER)
+      {
+        // Compute the element center
+        Go::Point X0;
+        double u0 = 0.5*(el->getParmin(0) + el->getParmax(0));
+        double v0 = 0.5*(el->getParmin(1) + el->getParmax(1));
+        double w0 = 0.5*(el->getParmin(2) + el->getParmax(2));
+        this->getBasis(geoBasis)->point(X0,u0,v0,w0);
+        X = SplineUtils::toVec3(X0);
+      }
+
+      // Initialize element quantities
+      LocalIntegral* A = integrand.getLocalIntegral(elem_sizes,fe.iel);
+      if (!integrand.initElement(MNPC[iEl],elem_sizes,nb,*A))
+      {
+        A->destruct();
+        ok = false;
+        break;
+      }
+
+      if (xr)
+      {
+        std::cerr << "Haven't really figured out what this part does yet\n";
+        exit(42142);
+      }
+
+      // --- Integration loop over all Gauss points in each direction --------
+
+      fe.iGP = iEl*nGauss*nGauss*nGauss; // Global integration point counter
+
+      std::vector<Matrix> B(m_basis.size());
+      size_t ig = 1;
+      for (int k = 0; k < nGauss; k++)
+        for (int j = 0; j < nGauss; j++)
+          for (int i = 0; i < nGauss; i++, fe.iGP++, ig++)
+          {
+            // Local element coordinates of current integration point
+            fe.xi   = xg[i];
+            fe.eta  = xg[j];
+            fe.zeta = xg[k];
+
+            // Parameter values of current integration point
+            fe.u = param[0] = gpar[0][i];
+            fe.v = param[1] = gpar[1][j];
+            fe.w = param[2] = gpar[2][k];
+
+            // Extract bezier basis functions
+            for (size_t b = 0; b < m_basis.size(); ++b) {
+              Matrix B(BN[b].rows(), 4);
+              B.fillColumn(1, BN[b].getColumn(ig));
+              B.fillColumn(2, BdNdu[b].getColumn(ig)*2.0/du);
+              B.fillColumn(3, BdNdv[b].getColumn(ig)*2.0/dv);
+              B.fillColumn(4, BdNdw[b].getColumn(ig)*2.0/dw);
+
+              // Fetch basis function derivatives at current integration point
+              if (integrand.getIntegrandType() & Integrand::SECOND_DERIVATIVES)
+                this->evaluateBasis(els[b]-1, fe, dNxdu[b], d2Nxdu2[b], b+1);
+              else
+                this->evaluateBasis(fe, dNxdu[b], bezierExtractmx[b][els[b]-1], B, b+1);
+            }
+
+            // Compute Jacobian inverse of coordinate mapping and derivatives
+            fe.detJxW = utl::Jacobian(Jac,fe.grad(geoBasis),Xnod,dNxdu[geoBasis-1]);
+            if (fe.detJxW == 0.0) continue; // skip singular points
+            for (size_t b = 0; b < m_basis.size(); ++b)
+              if (b != (size_t)geoBasis-1)
+                fe.grad(b+1).multiply(dNxdu[b],Jac);
+
+            // Compute Hessian of coordinate mapping and 2nd order derivatives
+            if (integrand.getIntegrandType() & Integrand::SECOND_DERIVATIVES) {
+              if (!utl::Hessian(Hess,fe.hess(geoBasis),Jac,Xnod,
+                                d2Nxdu2[geoBasis-1],dNxdu[geoBasis-1]))
+                return false;
+
+              for (size_t b = 0; b < m_basis.size() && ok; ++b)
+                if ((int)b != geoBasis)
+                  if (!utl::Hessian(Hess,fe.hess(b+1),Jac,Xnod,
+                                    d2Nxdu2[b],fe.grad(b+1),false))
+                    return false;
+            }
+
+            // Compute G-matrix
+            if (integrand.getIntegrandType() & Integrand::G_MATRIX)
+              utl::getGmat(Jac,dXidu,fe.G);
+
+            // Cartesian coordinates of current integration point
+            X.assign(Xnod * fe.basis(geoBasis));
+            X.t = time.t;
+
+            // Evaluate the integrand and accumulate element contributions
+            fe.detJxW *= 0.125*vol*wg[i]*wg[j]*wg[k];
+            if (!integrand.evalIntMx(*A,fe,time,X))
+              ok = false;
+
+      } // end gauss integrand
+
+      // Finalize the element quantities
+      if (ok && !integrand.finalizeElement(*A,time,0))
+        ok = false;
+
+      // Assembly of global system integral
+      if (ok && !glInt.assemble(A->ref(),fe.iel))
+        ok = false;
 
-    // Initialize element quantities
-    LocalIntegral* A = integrand.getLocalIntegral(elem_sizes,fe.iel);
-    if (!integrand.initElement(MNPC[iEl],elem_sizes,nb,*A))
-    {
       A->destruct();
-      ok = false;
-      break;
     }
 
-    if (xr)
-    {
-      std::cerr << "Haven't really figured out what this part does yet\n";
-      exit(42142);
-    }
-
-    // --- Integration loop over all Gauss points in each direction --------
-
-    fe.iGP = iEl*nGauss*nGauss*nGauss; // Global integration point counter
-
-    std::vector<Matrix> B(m_basis.size());
-    size_t ig = 1;
-    for (int k = 0; k < nGauss; k++)
-      for (int j = 0; j < nGauss; j++)
-        for (int i = 0; i < nGauss; i++, fe.iGP++, ig++)
-        {
-          // Local element coordinates of current integration point
-          fe.xi   = xg[i];
-          fe.eta  = xg[j];
-          fe.zeta = xg[k];
-
-          // Parameter values of current integration point
-          fe.u = param[0] = gpar[0][i];
-          fe.v = param[1] = gpar[1][j];
-          fe.w = param[2] = gpar[2][k];
-
-          // Extract bezier basis functions
-          for (size_t b = 0; b < m_basis.size(); ++b) {
-            Matrix B(BN[b].rows(), 4);
-            B.fillColumn(1, BN[b].getColumn(ig));
-            B.fillColumn(2, BdNdu[b].getColumn(ig)*2.0/du);
-            B.fillColumn(3, BdNdv[b].getColumn(ig)*2.0/dv);
-            B.fillColumn(4, BdNdw[b].getColumn(ig)*2.0/dw);
-
-            // Fetch basis function derivatives at current integration point
-            if (integrand.getIntegrandType() & Integrand::SECOND_DERIVATIVES)
-              this->evaluateBasis(els[b]-1, fe, dNxdu[b], d2Nxdu2[b], b+1);
-            else
-              this->evaluateBasis(fe, dNxdu[b], bezierExtractmx[b][els[b]-1], B, b+1);
-          }
-
-          // Compute Jacobian inverse of coordinate mapping and derivatives
-          fe.detJxW = utl::Jacobian(Jac,fe.grad(geoBasis),Xnod,dNxdu[geoBasis-1]);
-          if (fe.detJxW == 0.0) continue; // skip singular points
-          for (size_t b = 0; b < m_basis.size(); ++b)
-            if (b != (size_t)geoBasis-1)
-              fe.grad(b+1).multiply(dNxdu[b],Jac);
-
-          // Compute Hessian of coordinate mapping and 2nd order derivatives
-          if (integrand.getIntegrandType() & Integrand::SECOND_DERIVATIVES) {
-            if (!utl::Hessian(Hess,fe.hess(geoBasis),Jac,Xnod,
-                              d2Nxdu2[geoBasis-1],dNxdu[geoBasis-1]))
-              return false;
-
-            for (size_t b = 0; b < m_basis.size() && ok; ++b)
-              if ((int)b != geoBasis)
-                if (!utl::Hessian(Hess,fe.hess(b+1),Jac,Xnod,
-                                  d2Nxdu2[b],fe.grad(b+1),false))
-                  return false;
-          }
-
-          // Compute G-matrix
-          if (integrand.getIntegrandType() & Integrand::G_MATRIX)
-            utl::getGmat(Jac,dXidu,fe.G);
-
-          // Cartesian coordinates of current integration point
-          X.assign(Xnod * fe.basis(geoBasis));
-          X.t = time.t;
-
-          // Evaluate the integrand and accumulate element contributions
-          fe.detJxW *= 0.125*vol*wg[i]*wg[j]*wg[k];
-          if (!integrand.evalIntMx(*A,fe,time,X))
-            ok = false;
-
-    } // end gauss integrand
-
-    // Finalize the element quantities
-    if (ok && !integrand.finalizeElement(*A,time,0))
-      ok = false;
-
-    // Assembly of global system integral
-    if (ok && !glInt.assemble(A->ref(),fe.iel))
-      ok = false;
-
-    A->destruct();
-  }
-
   return ok;
 }
 
diff --git a/src/ASM/SAMpatchPETSc.C b/src/ASM/SAMpatchPETSc.C
index ebd2110c..f08e39bb 100644
--- a/src/ASM/SAMpatchPETSc.C
+++ b/src/ASM/SAMpatchPETSc.C
@@ -54,6 +54,9 @@ Real SAMpatchPETSc::dot (const Vector& x, const Vector& y, char dofType) const
 {
 #ifdef HAVE_MPI
   if (adm.isParallel()) {
+    if (adm.dd.isPartitioned())
+      return this->SAMpatch::dot(x,y,dofType);
+
     DofIS& dis = this->getIS(dofType);
 
     Vec lx, ly;
@@ -94,7 +97,7 @@ Real SAMpatchPETSc::dot (const Vector& x, const Vector& y, char dofType) const
 Real SAMpatchPETSc::normL2 (const Vector& x, char dofType) const
 {
 #ifdef HAVE_MPI
-  if (adm.isParallel()) {
+  if (adm.isParallel() && !adm.dd.isPartitioned()) {
     DofIS& dis = this->getIS(dofType);
 
     Vec lx;
@@ -130,7 +133,7 @@ Real SAMpatchPETSc::normInf (const Vector& x, size_t& comp, char dofType) const
 {
   Real max = this->SAM::normInf(x,comp,dofType);
 #ifdef HAVE_MPI
-  if (adm.isParallel())
+  if (adm.isParallel() && !adm.dd.isPartitioned())
     max = adm.allReduce(max, MPI_MAX);
   // TODO: comp (node index of the max value) is not necessarily correct here
 #endif
@@ -187,23 +190,33 @@ bool SAMpatchPETSc::expandSolution(const SystemVector& solVec,
 
 #ifdef HAVE_MPI
   if (adm.isParallel()) {
-    if (!glob2LocEq) {
+    if (!glob2LocEq && !adm.dd.isPartitioned()) {
       IntVec mlgeq(adm.dd.getMLGEQ());
       for (int& ieq : mlgeq) --ieq;
       ISCreateGeneral(*adm.getCommunicator(),adm.dd.getMLGEQ().size(),
                       mlgeq.data(), PETSC_COPY_VALUES, &glob2LocEq);
     }
 
-    Vec solution;
-    VecCreateSeq(PETSC_COMM_SELF, Bptr->dim(), &solution);
-    VecScatter ctx;
-    VecScatterCreate(Bptr->getVector(), glob2LocEq, solution, nullptr, &ctx);
+   VecScatter ctx;
+   Vec solution;
+   if (adm.dd.isPartitioned())
+      VecScatterCreateToAll(Bptr->getVector(), &ctx, &solution);
+    else {
+      VecCreateSeq(PETSC_COMM_SELF, Bptr->dim(), &solution);
+      VecScatterCreate(Bptr->getVector(), glob2LocEq, solution, nullptr, &ctx);
+   }
+
     VecScatterBegin(ctx, Bptr->getVector(), solution, INSERT_VALUES, SCATTER_FORWARD);
     VecScatterEnd(ctx, Bptr->getVector(),solution,INSERT_VALUES,SCATTER_FORWARD);
     VecScatterDestroy(&ctx);
     PetscScalar* data;
     VecGetArray(solution, &data);
-    std::copy(data, data + Bptr->dim(), Bptr->getPtr());
+    if (adm.dd.isPartitioned()) {
+      for (const auto& it : adm.dd.getG2LEQ(0))
+        (*Bptr)(it.second) = data[it.first-1];
+    } else
+      std::copy(data, data + Bptr->dim(), Bptr->getPtr());
+
     VecDestroy(&solution);
   } else
 #endif
diff --git a/src/IFEM.C b/src/IFEM.C
index 0ab0f78d..a1cfe3f6 100644
--- a/src/IFEM.C
+++ b/src/IFEM.C
@@ -93,7 +93,7 @@ int IFEM::Init (int arg_c, char** arg_v, const char* title)
 #endif
 
   std::cout <<"\n        MPI support: ";
-#if HAVE_MPI
+#ifdef HAVE_MPI
   std::cout <<"enabled";
 #else
   std::cout <<"disabled";
diff --git a/src/LinAlg/PETScMatrix.C b/src/LinAlg/PETScMatrix.C
index 23c80100..a676be22 100644
--- a/src/LinAlg/PETScMatrix.C
+++ b/src/LinAlg/PETScMatrix.C
@@ -91,8 +91,8 @@ void PETScVector::redim(size_t n)
 
 bool PETScVector::beginAssembly()
 {
-  for (size_t i = 0; i < size(); ++i)
-    VecSetValue(x , adm.dd.getGlobalEq(i+1)-1, (*this)[i], ADD_VALUES);
+  for (size_t i = 0; i < this->size(); ++i)
+    VecSetValue(x, adm.dd.getGlobalEq(i+1)-1, (*this)[i], ADD_VALUES);
 
   VecAssemblyBegin(x);
   return true;
@@ -180,31 +180,32 @@ PETScMatrix::~PETScMatrix ()
 
 void PETScMatrix::initAssembly (const SAM& sam, bool delayLocking)
 {
-  SparseMatrix::initAssembly(sam, delayLocking);
-  SparseMatrix::preAssemble(sam, delayLocking);
+  if (!adm.dd.isPartitioned()) {
+    SparseMatrix::initAssembly(sam, delayLocking);
+    SparseMatrix::preAssemble(sam, delayLocking);
+  } else
+    this->resize(sam.neq,sam.neq);
 
   const SAMpatchPETSc* samp = dynamic_cast<const SAMpatchPETSc*>(&sam);
   if (!samp)
     return;
 
   // Get number of local equations in linear system
-  const PetscInt neq  = adm.dd.getMaxEq()- adm.dd.getMinEq() + 1;
-
+  PetscInt neq;
+  neq  = adm.dd.getMaxEq() - adm.dd.getMinEq() + 1;
   // Set correct number of rows and columns for matrix.
-  MatSetSizes(pA,neq,neq,PETSC_DECIDE,PETSC_DECIDE);
+  MatSetSizes(pA,neq,neq,PETSC_DETERMINE,PETSC_DETERMINE);
 
   // Allocate sparsity pattern
   std::vector<std::set<int>> dofc;
-  sam.getDofCouplings(dofc);
+  if (!adm.dd.isPartitioned())
+    sam.getDofCouplings(dofc);
 
   if (matvec.empty()) {
-    // Set correct number of rows and columns for matrix.
-    MatSetSizes(pA,neq,neq,PETSC_DECIDE,PETSC_DECIDE);
-
     MatSetFromOptions(pA);
 
     // Allocate sparsity pattern
-    if (adm.isParallel()) {
+    if (adm.isParallel() && !adm.dd.isPartitioned()) {
       int ifirst = adm.dd.getMinEq();
       int ilast  = adm.dd.getMaxEq();
       PetscIntVec d_nnz(neq, 0);
@@ -235,6 +236,34 @@ void PETScMatrix::initAssembly (const SAM& sam, bool delayLocking)
 
       MatMPIAIJSetPreallocation(pA,PETSC_DEFAULT,d_nnz.data(),
                                    PETSC_DEFAULT,o_nnz.data());
+    } else if (adm.dd.isPartitioned()) {
+      // Setup sparsity pattern for global matrix
+      SparseMatrix* lA = new SparseMatrix(neq, sam.getNoEquations());
+      for (int elm : adm.dd.getElms()) {
+        IntVec meen;
+        sam.getElmEqns(meen,elm+1);
+        for (int i : meen)
+          if (i > 0 && i >= adm.dd.getMinEq(0) && i <= adm.dd.getMaxEq(0))
+            for (int j : meen)
+              if (j > 0)
+                (*lA)(i-adm.dd.getMinEq(0)+1,adm.dd.getGlobalEq(j)) = 0.0;
+      }
+      IntVec iA, jA;
+      SparseMatrix::calcCSR(iA, jA, neq, lA->getValues());
+      delete lA;
+      MatMPIAIJSetPreallocationCSR(pA, iA.data(), jA.data(), nullptr);
+
+      // Setup sparsity pattern for local matrix
+      for (int elm : adm.dd.getElms()) {
+        IntVec meen;
+        sam.getElmEqns(meen,elm+1);
+        for (int i : meen)
+          if (i > 0)
+            for (int j : meen)
+              if (j > 0)
+                (*this)(i,j) = 0.0;
+      }
+      this->optimiseSLU();
     } else {
       PetscIntVec Nnz;
       for (const auto& it : dofc)
diff --git a/src/SIM/SIMbase.C b/src/SIM/SIMbase.C
index 0e47e51c..097b3e31 100644
--- a/src/SIM/SIMbase.C
+++ b/src/SIM/SIMbase.C
@@ -107,6 +107,7 @@ void SIMbase::clearProperties ()
   myProps.clear();
   myInts.clear();
   mixedMADOFs.clear();
+  adm.dd.setElms({},"");
 }
 
 
@@ -173,13 +174,6 @@ bool SIMbase::preprocess (const IntVec& ignored, bool fixDup)
     return false;
   }
 
-  if (nProc > 1 && myPatches.empty() && adm.isParallel())
-  {
-    std::cerr <<" *** SIMbase::preprocess: No partitioning information for "
-              << nProc <<" processors found."<< std::endl;
-    return false;
-  }
-
   PROFILE1("Model preprocessing");
 
   static int substep = 10;
@@ -367,12 +361,23 @@ bool SIMbase::preprocess (const IntVec& ignored, bool fixDup)
   if (!static_cast<SAMpatch*>(mySam)->init(myModel,ngnod,dofTypes))
     return false;
 
+  if (nProc > 1 && myPatches.empty() && adm.isParallel())
+  {
+    IFEM::cout <<" *** SIMbase::preprocess: No partitioning information for "
+               << nProc <<" processors found. Using graph partitioning\n";
+  }
+
   if (!adm.dd.setup(adm,*this))
   {
     std::cerr <<"\n *** SIMbase::preprocess(): Failed to establish "
               <<" domain decomposition data."<< std::endl;
     return false;
   }
+  else if (adm.dd.isPartitioned()) // reuse thread groups for element list
+  {
+    for (ASMbase* pch : this->getFEModel())
+      pch->generateThreadGroupsFromElms(adm.dd.getElms());
+  }
 
   if (!myProblem)
   {
@@ -1488,8 +1493,9 @@ bool SIMbase::solutionNorms (const TimeDomain& time,
 
   delete norm;
 
-  for (k = 0; k < gNorm.size(); k++)
-    adm.allReduceAsSum(gNorm[k]);
+  if (!adm.dd.isPartitioned())
+    for (k = 0; k < gNorm.size(); k++)
+      adm.allReduceAsSum(gNorm[k]);
 
   if (ok)
     ok = this->postProcessNorms(gNorm, eNorm);
diff --git a/src/SIM/SIMdummy.h b/src/SIM/SIMdummy.h
index 31faba1f..76c0769c 100644
--- a/src/SIM/SIMdummy.h
+++ b/src/SIM/SIMdummy.h
@@ -41,7 +41,9 @@ public:
   virtual unsigned short int getNoParamDim() const { return 0; }
   //! \brief Creates the computational FEM model from the spline patches.
   virtual bool createFEMmodel(char) { return false; }
-
+  //! \brief Element-element connectivities.
+  virtual std::vector<std::vector<int>> getElmConnectivities() const
+  { return std::vector<std::vector<int>>(); }
 protected:
   //! \brief Preprocesses a user-defined Dirichlet boundary property.
   virtual bool addConstraint(int,int,int,int,int,int&,char)
diff --git a/src/SIM/SIMoutput.C b/src/SIM/SIMoutput.C
index 328f1188..1e57e627 100644
--- a/src/SIM/SIMoutput.C
+++ b/src/SIM/SIMoutput.C
@@ -286,18 +286,29 @@ bool SIMoutput::writeGlvG (int& nBlock, const char* inpFile, bool doClear)
     // Open a new VTF-file
     char* vtfName = new char[strlen(inpFile)+10];
     strtok(strcpy(vtfName,inpFile),".");
-    if (nProc > 1)
-      sprintf(vtfName+strlen(vtfName),"_p%04d%s",myPid,ext);
-    else
-      strcat(vtfName,ext);
+    if (adm.dd.isPartitioned()) {
+      if (adm.getProcId() == 0) {
+        strcat(vtfName,ext);
+        IFEM::cout <<"\nWriting VTF-file "<< vtfName << std::endl;
+        myVtf = new VTF(vtfName,opt.format);
+      }
+    } else {
+      if (nProc > 1)
+        sprintf(vtfName+strlen(vtfName),"_p%04d%s",myPid,ext);
+      else
+        strcat(vtfName,ext);
 
-    IFEM::cout <<"\nWriting VTF-file "<< vtfName << std::endl;
-    myVtf = new VTF(vtfName,opt.format);
+      IFEM::cout <<"\nWriting VTF-file "<< vtfName << std::endl;
+      myVtf = new VTF(vtfName,opt.format);
+    }
     delete[] vtfName;
   }
   else if (doClear)
     myVtf->clearGeometryBlocks();
 
+  if (adm.dd.isPartitioned() && adm.getProcId() != 0)
+    return true;
+
   ElementBlock* lvb;
   char pname[32];
 
@@ -337,6 +348,9 @@ bool SIMoutput::writeGlvG (int& nBlock, const char* inpFile, bool doClear)
 
 bool SIMoutput::writeGlvBC (int& nBlock, int iStep) const
 {
+  if (adm.dd.isPartitioned() && adm.getProcId() != 0)
+    return true;
+
   if (!myVtf) return false;
 
   Matrix field;
@@ -403,6 +417,9 @@ bool SIMoutput::writeGlvBC (int& nBlock, int iStep) const
 
 bool SIMoutput::writeGlvT (int iStep, int& geoBlk, int& nBlock) const
 {
+  if (adm.dd.isPartitioned() && adm.getProcId() != 0)
+    return true;
+
   if (myProblem->hasTractionValues())
   {
     if (msgLevel > 1)
@@ -417,6 +434,9 @@ bool SIMoutput::writeGlvT (int iStep, int& geoBlk, int& nBlock) const
 bool SIMoutput::writeGlvV (const Vector& vec, const char* fieldName,
                            int iStep, int& nBlock, int idBlock, int ncmp) const
 {
+  if (adm.dd.isPartitioned() && adm.getProcId() != 0)
+    return true;
+
   if (vec.empty())
     return true;
   else if (!myVtf)
@@ -459,6 +479,9 @@ bool SIMoutput::writeGlvV (const Vector& vec, const char* fieldName,
 bool SIMoutput::writeGlvS (const Vector& scl, const char* fieldName,
                            int iStep, int& nBlock, int idBlock) const
 {
+  if (adm.dd.isPartitioned() && adm.getProcId() != 0)
+    return true;
+
   if (scl.empty())
     return true;
   else if (!myVtf)
@@ -496,6 +519,9 @@ bool SIMoutput::writeGlvS (const Vector& psol, int iStep, int& nBlock,
                            double time, const char* pvecName,
                            int idBlock, int psolComps)
 {
+  if (adm.dd.isPartitioned() && adm.getProcId() != 0)
+    return true;
+
   idBlock = this->writeGlvS1(psol,iStep,nBlock,time,
                              pvecName,idBlock,psolComps);
   if (idBlock < 0)
@@ -526,6 +552,9 @@ int SIMoutput::writeGlvS1 (const Vector& psol, int iStep, int& nBlock,
                            double time, const char* pvecName,
                            int idBlock, int psolComps, bool scalarOnly)
 {
+  if (adm.dd.isPartitioned() && adm.getProcId() != 0)
+    return true;
+
   if (psol.empty())
     return 0;
   else if (!myVtf)
@@ -680,6 +709,9 @@ int SIMoutput::writeGlvS1 (const Vector& psol, int iStep, int& nBlock,
 bool SIMoutput::writeGlvS2 (const Vector& psol, int iStep, int& nBlock,
                             double time, int idBlock, int psolComps)
 {
+  if (adm.dd.isPartitioned() && adm.getProcId() != 0)
+    return true;
+
   if (psol.empty())
     return true; // No primary solution
   else if (!myVtf || !myProblem)
@@ -860,6 +892,9 @@ bool SIMoutput::writeGlvP (const Vector& ssol, int iStep, int& nBlock,
                            int idBlock, const char* prefix,
                            std::vector<PointValue>* maxVal)
 {
+  if (adm.dd.isPartitioned() && adm.getProcId() != 0)
+    return true;
+
   if (ssol.empty())
     return true;
   else if (!myVtf)
@@ -984,6 +1019,9 @@ bool SIMoutput::evalProjSolution (const Vector& ssol,
 bool SIMoutput::writeGlvF (const RealFunc& f, const char* fname,
                            int iStep, int& nBlock, int idBlock, double time)
 {
+  if (adm.dd.isPartitioned() && adm.getProcId() != 0)
+    return true;
+
   if (!myVtf) return false;
 
   IntVec sID;
@@ -1009,6 +1047,9 @@ bool SIMoutput::writeGlvF (const RealFunc& f, const char* fname,
 
 bool SIMoutput::writeGlvStep (int iStep, double value, int itype)
 {
+  if (adm.dd.isPartitioned() && adm.getProcId() != 0)
+    return true;
+
   myVtf->writeGeometryBlocks(iStep);
 
   if (itype == 0)
@@ -1020,6 +1061,9 @@ bool SIMoutput::writeGlvStep (int iStep, double value, int itype)
 
 bool SIMoutput::writeGlvM (const Mode& mode, bool freq, int& nBlock)
 {
+  if (adm.dd.isPartitioned() && adm.getProcId() != 0)
+    return true;
+
   if (mode.eigVec.empty())
     return true;
   else if (!myVtf)
@@ -1067,6 +1111,9 @@ bool SIMoutput::writeGlvM (const Mode& mode, bool freq, int& nBlock)
 bool SIMoutput::writeGlvN (const Matrix& norms, int iStep, int& nBlock,
                            const std::vector<std::string>& prefix, int idBlock)
 {
+  if (adm.dd.isPartitioned() && adm.getProcId() != 0)
+    return true;
+
   if (norms.empty())
     return true;
   else if (!myVtf)
@@ -1139,6 +1186,9 @@ bool SIMoutput::writeGlvN (const Matrix& norms, int iStep, int& nBlock,
 bool SIMoutput::writeGlvE (const Vector& vec, int iStep, int& nBlock,
                            const char* name)
 {
+  if (adm.dd.isPartitioned() && adm.getProcId() != 0)
+    return true;
+
   if (!myVtf)
     return false;
 
diff --git a/src/Utility/HDF5Writer.C b/src/Utility/HDF5Writer.C
index 7347879d..6cfe647e 100644
--- a/src/Utility/HDF5Writer.C
+++ b/src/Utility/HDF5Writer.C
@@ -309,6 +309,8 @@ void HDF5Writer::writeSIM (int level, const DataEntry& entry,
   size_t projOfs = 0;
   for (int i = 0; i < sim->getNoPatches(); ++i) {
     int loc = sim->getLocalPatchIndex(i+1);
+    if (sim->getProcessAdm().dd.isPartitioned() && sim->getProcessAdm().getProcId() != 0)
+      loc = 0;
     if (loc > 0 && (!(results & DataExporter::REDUNDANT) ||
                     sim->getGlobalProcessID() == 0)) // we own the patch
     {