added: support for partitioning-based parallel computations

Automatically partition the mesh and distribute it in parallel.
This is rather suboptimal in many ways, but still useful.
Arne Morten Kvarving 2019-04-09 17:37:45 +02:00
parent 41ce8928de
commit 4b35c00f0d
22 changed files with 733 additions and 257 deletions
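In outline, the new flow (a sketch assembled from the hunks below; the exact
call order is inferred, not verbatim code):

// 1. SIMbase::preprocess(): when no explicit partitioning is given and more
//    than one process is running, DomainDecomposition::setup() falls back to
//    graphPartition().
// 2. graphPartition() hands the element connectivity graph to Zoltan and
//    stores the elements assigned to this rank in myElms.
// 3. Each patch restricts its assembly loops to the partition via
//    generateThreadGroupsFromElms(adm.dd.getElms()).
// 4. calcGlobalEqNumbersPart() numbers the global equations rank by rank and
//    broadcasts the final mapping, which the PETSc matrix/vector classes use
//    for assembly and for scattering the solution back.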

View File

@@ -425,6 +425,8 @@ public:
virtual void generateThreadGroups(const Integrand&, bool, bool) {}
//! \brief Generates element groups for multi-threading of boundary integrals.
virtual void generateThreadGroups(char, bool, bool) {}
//! \brief Generates element groups for multi-threading based on a partition.
virtual void generateThreadGroupsFromElms(const std::vector<int>&) {}
// Methods for integration of finite element quantities.
@@ -839,6 +841,7 @@ protected:
IntVec myMLGE; //!< The actual Matrix of Local to Global Element numbers
IntVec myMLGN; //!< The actual Matrix of Local to Global Node numbers
IntMat myMNPC; //!< The actual Matrix of Nodal Point Correspondence
IntVec myElms; //!< Elements on patch - used with partitioning
//! \brief Numerical integration scheme for this patch.
//! \details A value in the range [1,10] means use that number of Gauss

View File

@@ -2318,6 +2318,10 @@ bool ASMs2D::integrate (Integrand& integrand, int lIndex,
fe.iel = abs(MLGE[doXelms+iel-1]);
if (fe.iel < 1) continue; // zero-area element
if (!myElms.empty() && !glInt.threadSafe() &&
std::find(myElms.begin(), myElms.end(), iel-1) == myElms.end())
continue;
#ifdef SP_DEBUG
if (dbgElm < 0 && iel != -dbgElm)
continue; // Skipping all elements, except for -dbgElm
@@ -3119,3 +3123,14 @@ void ASMs2D::getBoundaryElms (int lIndex, int, IntVec& elms) const
elms.push_back(MLGE[i + (lIndex-3)*N1m*(N2m-1)] - 1);
}
}
void ASMs2D::generateThreadGroupsFromElms(const std::vector<int>& elms)
{
myElms.clear();
for (int elm : elms)
if (this->getElmIndex(elm+1) > 0)
myElms.push_back(this->getElmIndex(elm+1) - 1);
threadGroups = threadGroups.filter(myElms);
}
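The same pattern recurs for the 3D and LR patch types below: each global
element number in the partition is mapped to a patch-local index through
getElmIndex, and the precomputed thread groups are reduced to those elements.
A minimal sketch of the filtering this relies on, assuming ThreadGroups::filter
keeps only the listed elements in each stripe (the real implementation lives in
ThreadGroups.h and is not part of this diff):

#include <algorithm>
#include <vector>

// Hypothetical stand-in for what filter() does to one stripe of elements:
std::vector<int> filterStripe(const std::vector<int>& stripe,
                              const std::vector<int>& keep)
{
  std::vector<int> out;
  for (int e : stripe) // retain only the elements present in 'keep'
    if (std::find(keep.begin(), keep.end(), e) != keep.end())
      out.push_back(e);
  return out;
}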

View File

@@ -588,6 +588,9 @@ protected:
void generateThreadGroups(size_t strip1, size_t strip2,
bool silence, bool ignoreGlobalLM);
//! \brief Generates element groups from a partition.
virtual void generateThreadGroupsFromElms(const std::vector<int>& elms);
public:
//! \brief Auxiliary function for computation of basis function indices.
static void scatterInd(int n1, int n2, int p1, int p2,

View File

@@ -528,13 +528,17 @@ bool ASMs2Dmx::integrate (Integrand& integrand,
const int n1 = surf->numCoefs_u();
const int nel1 = n1 - p1 + 1;
ThreadGroups oneGroup;
if (glInt.threadSafe()) oneGroup.oneStripe(nel);
const ThreadGroups& groups = glInt.threadSafe() ? oneGroup : threadGroups;
// === Assembly loop over all elements in the patch ==========================
bool ok=true;
for (size_t g=0;g<threadGroups.size() && ok;++g) {
for (size_t g = 0; g < groups.size() && ok; g++) {
#pragma omp parallel for schedule(static)
for (size_t t=0;t<threadGroups[g].size();++t) {
for (size_t t = 0; t < groups[g].size(); t++) {
MxFiniteElement fe(elem_size);
std::vector<Matrix> dNxdu(m_basis.size());
std::vector<Matrix3D> d2Nxdu2(m_basis.size());
@@ -543,9 +547,9 @@ bool ASMs2Dmx::integrate (Integrand& integrand,
Matrix Xnod, Jac;
double param[3] = { 0.0, 0.0, 0.0 };
Vec4 X(param);
for (size_t i = 0; i < threadGroups[g][t].size() && ok; ++i)
for (size_t i = 0; i < groups[g][t].size() && ok; ++i)
{
int iel = threadGroups[g][t][i];
int iel = groups[g][t][i];
fe.iel = MLGE[iel];
if (fe.iel < 1) continue; // zero-area element
@@ -737,6 +741,10 @@ bool ASMs2Dmx::integrate (Integrand& integrand, int lIndex,
fe.iel = MLGE[iel-1];
if (fe.iel < 1) continue; // zero-area element
if (!myElms.empty() && !glInt.threadSafe() &&
std::find(myElms.begin(), myElms.end(), iel-1) == myElms.end())
continue;
// Skip elements that are not on current boundary edge
bool skipMe = false;
switch (edgeDir)
@@ -866,6 +874,10 @@ bool ASMs2Dmx::integrate (Integrand& integrand,
fe.iel = abs(MLGE[iel]);
if (fe.iel < 1) continue; // zero-area element
if (!myElms.empty() && !glInt.threadSafe() &&
std::find(myElms.begin(), myElms.end(), iel-1) == myElms.end())
continue;
short int status = iChk.hasContribution(iel,i1,i2);
if (!status) continue; // no interface contributions for this element

View File

@@ -2706,6 +2706,10 @@ bool ASMs3D::integrateEdge (Integrand& integrand, int lEdge,
fe.iel = MLGE[iel-1];
if (fe.iel < 1) continue; // zero-volume element
if (!myElms.empty() &&
std::find(myElms.begin(), myElms.end(), fe.iel-1) == myElms.end())
continue;
// Skip elements that are not on current boundary edge
bool skipMe = false;
switch (lEdge)
@@ -3533,3 +3537,17 @@ void ASMs3D::getBoundaryElms (int lIndex, int, IntVec& elms) const
elms.push_back(MLGE[i + j*N1m + (lIndex-5)*(N1m*N2m*(N3m-1))] - 1);
}
}
void ASMs3D::generateThreadGroupsFromElms(const std::vector<int>& elms)
{
myElms.clear();
for (int elm : elms)
if (this->getElmIndex(elm+1) > 0)
myElms.push_back(this->getElmIndex(elm+1)-1);
threadGroupsVol = threadGroupsVol.filter(myElms);
for (auto& group : threadGroupsFace)
group.second = group.second.filter(myElms);
}

View File

@@ -653,6 +653,9 @@ protected:
//! \param[in] silence If \e true, suppress threading group printout
virtual void generateThreadGroups(char lIndex, bool silence, bool);
//! \brief Generates element groups from a partition.
virtual void generateThreadGroupsFromElms(const std::vector<int>& elms);
public:
//! \brief Auxiliary function for computation of basis function indices.
static void scatterInd(int n1, int n2, int n3, int p1, int p2, int p3,

View File

@@ -483,12 +483,17 @@ bool ASMs3Dmx::integrate (Integrand& integrand,
const int nel1 = n1 - p1 + 1;
const int nel2 = n2 - p2 + 1;
ThreadGroups oneGroup;
if (glInt.threadSafe()) oneGroup.oneStripe(nel);
const ThreadGroups& groups = glInt.threadSafe() ? oneGroup : threadGroupsVol;
// === Assembly loop over all elements in the patch ==========================
bool ok=true;
for (size_t g=0;g<threadGroupsVol.size() && ok;++g) {
bool ok = true;
for (size_t g = 0; g < groups.size() && ok; ++g) {
#pragma omp parallel for schedule(static)
for (size_t t=0;t<threadGroupsVol[g].size();++t) {
for (size_t t = 0; t < groups[g].size(); ++t) {
MxFiniteElement fe(elem_size);
std::vector<Matrix> dNxdu(m_basis.size());
std::vector<Matrix3D> d2Nxdu2(m_basis.size());
@@ -497,9 +502,9 @@ bool ASMs3Dmx::integrate (Integrand& integrand,
Matrix Xnod, Jac;
double param[3];
Vec4 X(param);
for (size_t l = 0; l < threadGroupsVol[g][t].size() && ok; ++l)
for (size_t l = 0; l < groups[g][t].size() && ok; ++l)
{
int iel = threadGroupsVol[g][t][l];
int iel = groups[g][t][l];
fe.iel = MLGE[iel];
if (fe.iel < 1) continue; // zero-volume element

View File

@@ -18,6 +18,7 @@
#include "ASMunstruct.h"
#include "LinSolParams.h"
#include "ProcessAdm.h"
#include "Profiler.h"
#include "SAMpatch.h"
#include "SIMbase.h"
#include "Utilities.h"
@@ -26,6 +27,14 @@
#include <functional>
#include <numeric>
#ifdef HAS_ZOLTAN
#define OLD_MPI HAVE_MPI
#undef HAVE_MPI
#include <zoltan.h>
#undef HAVE_MPI
#define HAVE_MPI OLD_MPI
#endif
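The save/undef/restore dance around the zoltan.h include presumably keeps
Zoltan's configuration headers from clashing with IFEM's own HAVE_MPI
definition. Note that the restore re-defines HAVE_MPI as the token OLD_MPI
rather than as its original value; that is sufficient for the #ifdef HAVE_MPI
tests used in this file, but would not survive a value comparison.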
DomainDecomposition::
OrientIterator::OrientIterator(const ASMbase* pch,
@@ -170,6 +179,95 @@ OrientIterator::OrientIterator(const ASMbase* pch, int orient, int lIdx)
}
#ifdef HAS_ZOLTAN
static int getNumElements(void* mesh, int* err)
{
*err = ZOLTAN_OK;
IntMat& neigh = *static_cast<IntMat*>(mesh);
return neigh.size();
}
static void getElementList(void* mesh, int numGlobalEntries,
int, ZOLTAN_ID_PTR gids,
ZOLTAN_ID_PTR lids, int,
float*, int* err)
{
IntMat& neigh = *static_cast<IntMat*>(mesh);
std::iota(gids, gids+neigh.size(), 0);
std::iota(lids, lids+neigh.size(), 0);
*err = ZOLTAN_OK;
}
static void getNumEdges(void* mesh, int sizeGID, int sizeLID,
int numCells, ZOLTAN_ID_PTR globalID,
ZOLTAN_ID_PTR localID, int* numEdges, int* err)
{
IntMat& neigh = *static_cast<IntMat*>(mesh);
int* ne = numEdges;
for (const std::vector<int>& n : neigh)
*ne++ = std::accumulate(n.begin(), n.end(), 0,
[](const int& a, const int& b)
{
return b != -1 ? a + 1 : a;
});
*err = ZOLTAN_OK;
}
static void getEdges(void* mesh, int sizeGID, int sizeLID, int numCells,
ZOLTAN_ID_PTR globalID, ZOLTAN_ID_PTR localID,
int* numEdges, ZOLTAN_ID_PTR nborGID, int* nborProc,
int wgtDim, float* egts, int* err)
{
IntMat& neigh = *static_cast<IntMat*>(mesh);
memset(nborProc, 0, numCells*sizeof(int));
for (const std::vector<int>& elm : neigh)
for (int n : elm)
if (n != -1)
*nborGID++ = n;
*err = ZOLTAN_OK;
}
static int getNullElements(void*, int* err)
{
*err = ZOLTAN_OK;
return 0;
}
static void getNullList(void*, int,
int, ZOLTAN_ID_PTR,
ZOLTAN_ID_PTR, int,
float*, int* err)
{
*err = ZOLTAN_OK;
}
static void getNumNullEdges(void*, int, int, int,
ZOLTAN_ID_PTR, ZOLTAN_ID_PTR,
int*, int* err)
{
*err = ZOLTAN_OK;
}
static void getNullEdges(void*, int, int, int,
ZOLTAN_ID_PTR, ZOLTAN_ID_PTR,
int*, ZOLTAN_ID_PTR, int*,
int, float*, int* err)
{
*err = ZOLTAN_OK;
}
#endif
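These callbacks assume getElmConnectivities() returns one row per element with
one entry per face neighbour and -1 marking an exterior face: getNumEdges
counts the non-negative entries and getEdges flattens them into Zoltan's edge
list. For illustration only (the neighbour ordering is an assumption), a 2x2
structured quad mesh would look like:

// Element numbering:  2 3   (row e lists {left, right, down, up};
//                     0 1    -1 marks an exterior face)
IntMat neigh = {{-1,  1, -1,  2},   // element 0
                { 0, -1, -1,  3},   // element 1
                {-1,  3,  0, -1},   // element 2
                { 2, -1,  1, -1}};  // element 3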
std::vector<std::set<int>> DomainDecomposition::getSubdomains(int nx, int ny, int nz,
int overlap, size_t block) const
{
@@ -811,6 +909,47 @@ bool DomainDecomposition::calcGlobalEqNumbers(const ProcessAdm& adm,
}
bool DomainDecomposition::calcGlobalEqNumbersPart(const ProcessAdm& adm,
const SIMbase& sim)
{
#ifdef HAVE_MPI
blocks[0].MLGEQ.clear();
blocks[0].MLGEQ.resize(sim.getSAM()->getNoEquations(), -1);
blocks[0].minEq = 1;
blocks[0].maxEq = 0;
if (adm.getProcId() != 0) {
adm.receive(blocks[0].MLGEQ, adm.getProcId()-1);
adm.receive(blocks[0].minEq, adm.getProcId()-1);
blocks[0].maxEq = blocks[0].minEq;
blocks[0].minEq++;
}
for (size_t i = 0; i < myElms.size(); ++i) {
IntVec meen;
sim.getSAM()->getElmEqns(meen, myElms[i]+1);
for (int eq : meen)
if (eq > 0 && blocks[0].MLGEQ[eq-1] == -1)
blocks[0].MLGEQ[eq-1] = ++blocks[0].maxEq;
}
if (adm.getProcId() < adm.getNoProcs()-1) {
adm.send(blocks[0].MLGEQ, adm.getProcId()+1);
adm.send(blocks[0].maxEq, adm.getProcId()+1);
}
MPI_Bcast(&blocks[0].MLGEQ[0], blocks[0].MLGEQ.size(), MPI_INT,
adm.getNoProcs()-1, *adm.getCommunicator());
for (size_t i = 0; i < blocks[0].MLGEQ.size(); ++i)
blocks[0].G2LEQ[blocks[0].MLGEQ[i]] = i+1;
#endif
return true;
}
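The numbering is a sequential sweep: rank p waits for rank p-1's mapping,
numbers the still-unnumbered equations touched by its own elements, and passes
the result on; the last rank broadcasts the completed mapping so every rank
can invert it into the global-to-local map G2LEQ. A worked example with two
ranks and five global equations (illustrative numbers only):

// rank 0 touches eqs {1,2,3}:  MLGEQ = {1,2,3,-1,-1}, maxEq = 3
// rank 0 sends MLGEQ and maxEq to rank 1
// rank 1 receives minEq = 3, touches eqs {3,4,5}; eq 3 is already numbered,
//   so it assigns MLGEQ = {1,2,3,4,5}, maxEq = 5
// rank 1 (the last rank) broadcasts MLGEQ via MPI_Bcast; every rank then
//   builds G2LEQ from it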
int DomainDecomposition::getGlobalEq(int lEq, size_t idx) const
{
if (lEq < 1 || idx >= blocks.size())
@@ -933,10 +1072,17 @@ bool DomainDecomposition::setup(const ProcessAdm& adm, const SIMbase& sim)
#endif
#endif
if (ghostConnections.empty() && adm.getNoProcs() > 1 && myElms.empty())
if (!this->graphPartition(adm, sim))
return false;
sam = dynamic_cast<const SAMpatch*>(sim.getSAM());
int ok = 1;
#ifdef HAVE_MPI
int lok = ok;
#endif
if (myElms.empty()) {
// Establish global node numbers
if (!calcGlobalNodeNumbers(adm, sim))
ok = 0;
@@ -949,7 +1095,6 @@ bool DomainDecomposition::setup(const ProcessAdm& adm, const SIMbase& sim)
}
#ifdef HAVE_MPI
int lok = ok;
MPI_Allreduce(&lok, &ok, 1, MPI_INT, MPI_SUM, *adm.getCommunicator());
#endif
@@ -963,6 +1108,7 @@ bool DomainDecomposition::setup(const ProcessAdm& adm, const SIMbase& sim)
#ifdef HAVE_MPI
ok = 1;
#endif
}
// Establish local equation mappings for each block.
if (sim.getSolParams() && sim.getSolParams()->getNoBlocks() > 1) {
@@ -1011,7 +1157,10 @@ bool DomainDecomposition::setup(const ProcessAdm& adm, const SIMbase& sim)
}
// Establish global equation numbers for all blocks.
if (!calcGlobalEqNumbers(adm, sim))
if (!myElms.empty()) {
if (!calcGlobalEqNumbersPart(adm, sim))
return false;
} else if (!calcGlobalEqNumbers(adm, sim))
#ifdef HAVE_MPI
ok = 0;
#else
@@ -1072,3 +1221,91 @@ int DomainDecomposition::getPatchOwner(size_t p) const
return it->second;
}
bool DomainDecomposition::graphPartition(const ProcessAdm& adm, const SIMbase& sim)
{
PROFILE("Mesh partition");
myElms.clear();
#ifdef HAS_ZOLTAN
static bool inited = false;
if (!inited)
{
float ver;
Zoltan_Initialize(0, nullptr, &ver);
inited = true;
}
struct Zoltan_Struct* zz = Zoltan_Create(*adm.getCommunicator());
IntMat neigh;
if (adm.getProcId() == 0) {
neigh = sim.getElmConnectivities();
Zoltan_Set_Num_Obj_Fn(zz, getNumElements, &neigh);
Zoltan_Set_Obj_List_Fn(zz, getElementList, &neigh);
Zoltan_Set_Num_Edges_Multi_Fn(zz, getNumEdges, &neigh);
Zoltan_Set_Edge_List_Multi_Fn(zz, getEdges, &neigh);
} else {
Zoltan_Set_Num_Obj_Fn(zz, getNullElements, nullptr);
Zoltan_Set_Obj_List_Fn(zz, getNullList, nullptr);
Zoltan_Set_Num_Edges_Multi_Fn(zz, getNumNullEdges, nullptr);
Zoltan_Set_Edge_List_Multi_Fn(zz, getNullEdges, nullptr);
}
Zoltan_Set_Param(zz, "DEBUG_LEVEL", "0");
Zoltan_Set_Param(zz, "LB_METHOD", "GRAPH");
Zoltan_Set_Param(zz, "GRAPH_PACKAGE", "Scotch");
Zoltan_Set_Param(zz, "LB_APPROACH", "PARTITION");
Zoltan_Set_Param(zz, "NUM_GID_ENTRIES", "1");
Zoltan_Set_Param(zz, "NUM_LID_ENTRIES", "1");
Zoltan_Set_Param(zz, "RETURN_LISTS", "ALL");
Zoltan_Set_Param(zz, "CHECK_GRAPH", "2");
Zoltan_Set_Param(zz,"EDGE_WEIGHT_DIM","0");
Zoltan_Set_Param(zz, "OBJ_WEIGHT_DIM", "0");
Zoltan_Set_Param(zz, "PHG_EDGE_SIZE_THRESHOLD", ".35"); /* 0-remove all, 1-remove none */
int changes, numGidEntries, numLidEntries, numImport, numExport;
ZOLTAN_ID_PTR importGlobalGids, importLocalGids, exportGlobalGids, exportLocalGids;
int* importProcs, *importToPart, *exportProcs, *exportToPart;
Zoltan_LB_Partition(zz, /* input (all remaining fields are output) */
&changes, /* 1 if partitioning was changed, 0 otherwise */
&numGidEntries, /* Number of integers used for a global ID */
&numLidEntries, /* Number of integers used for a local ID */
&numImport, /* Number of vertices to be sent to me */
&importGlobalGids, /* Global IDs of vertices to be sent to me */
&importLocalGids, /* Local IDs of vertices to be sent to me */
&importProcs, /* Process rank for source of each incoming vertex */
&importToPart, /* New partition for each incoming vertex */
&numExport, /* Number of vertices I must send to other processes*/
&exportGlobalGids, /* Global IDs of the vertices I must send */
&exportLocalGids, /* Local IDs of the vertices I must send */
&exportProcs, /* Process to which I send each of the vertices */
&exportToPart); /* Partition to which each vertex will belong */
if (sim.getProcessAdm().getProcId() == 0) {
std::vector<std::vector<int>> toExp(adm.getNoProcs());
std::vector<bool> offProc(sim.getNoElms(), false);
for (int i = 0; i < numExport; ++i) {
int gid = exportGlobalGids[i];
offProc[gid] = true;
}
myElms.reserve(numExport);
for (size_t i = 0; i < sim.getNoElms(); ++i)
if (!offProc[i])
myElms.push_back(i);
}
else {
myElms.resize(numImport);
std::copy(importGlobalGids, importGlobalGids+numImport, myElms.begin());
}
Zoltan_LB_Free_Part(&importGlobalGids, &importLocalGids, &importProcs, &importToPart);
Zoltan_LB_Free_Part(&exportGlobalGids, &exportLocalGids, &exportProcs, &exportToPart);
Zoltan_Destroy(&zz);
#else
std::cerr << "*** DomainDecompositon::graphPartition: Compiled without Zoltan support. No partitioning available." << std::endl;
return false;
#endif
if (!myElms.empty())
IFEM::cout << "Graph partitioning: " << myElms.size() << " elements in partition." << std::endl;
partitioned = !myElms.empty();
return partitioned;
}
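Note the asymmetry in reading the result back: with RETURN_LISTS set to "ALL",
Zoltan returns both import and export lists. Rank 0 supplied every vertex, so
its partition is "everything not exported", whereas the other ranks, which
registered the null callbacks and supplied nothing, receive their elements
entirely through the import lists.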

View File

@@ -174,6 +174,12 @@ public:
//! \brief Returns number of matrix blocks.
size_t getNoBlocks() const { return blocks.size()-1; }
//! \brief Returns whether a graph-based partition is used.
bool isPartitioned() const { return partitioned; }
//! \brief Returns the elements in the partition.
const std::vector<int>& getElms() const { return myElms; }
private:
//! \brief Calculates a 1D partitioning with a given overlap.
//! \param[in] nel1 Number of knot-spans in first parameter direction.
@@ -237,11 +243,17 @@ private:
//! \brief Calculates the global equation numbers for the given finite element model.
bool calcGlobalEqNumbers(const ProcessAdm& adm, const SIMbase& sim);
//! \brief Calculates the global equation numbers for a partitioned model.
bool calcGlobalEqNumbersPart(const ProcessAdm& adm, const SIMbase& sim);
//! \brief Sanity checks the model.
//! \details Collects the corners of all patches in the model and makes sure
//! nodes with matching coordinates have the same global node ID.
bool sanityCheckCorners(const SIMbase& sim);
//! \brief Sets up the domain decomposition based on graph partitioning.
bool graphPartition(const ProcessAdm& adm, const SIMbase& sim);
std::map<int,int> patchOwner; //!< Process that owns a particular patch
//! \brief Struct with information per matrix block.
@@ -258,12 +270,14 @@ private:
std::vector<int> MLGN; //!< Process-local-to-global node numbers
std::vector<BlockInfo> blocks; //!< Equation mappings for all matrix blocks.
std::vector<int> myElms; //!< Elements in partition
int minDof = 0; //!< First DOF we own
int maxDof = 0; //!< Last DOF we own
int minNode = 0; //!< First node we own
int maxNode = 0; //!< Last node we own
const SAMpatch* sam = nullptr; //!< The assembly handler the DD is constructed for.
bool partitioned = false; //!< \e true if a graph-based decomposition is used
};
#endif

View File

@@ -1574,6 +1574,12 @@ bool ASMu2D::integrate (Integrand& integrand, int lIndex,
int iel = 0;
for (const LR::Element* el : lrspline->getAllElements())
{
if (!myElms.empty() && !glInt.threadSafe() &&
std::find(myElms.begin(), myElms.end(), iel) == myElms.end()) {
++iel;
continue;
}
fe.iel = MLGE[iel++];
#ifdef SP_DEBUG
if (dbgElm < 0 && iel != -dbgElm)
@@ -2705,3 +2711,14 @@ void ASMu2D::getBoundaryElms (int lIndex, int orient, IntVec& elms) const
for (const LR::Element* elem : elements)
elms.push_back(MLGE[elem->getId()]-1);
}
void ASMu2D::generateThreadGroupsFromElms(const std::vector<int>& elms)
{
myElms.clear();
for (int elm : elms)
if (this->getElmIndex(elm+1) > 0)
myElms.push_back(this->getElmIndex(elm+1)-1);
threadGroups = threadGroups.filter(myElms);
}

View File

@@ -591,6 +591,9 @@ protected:
void generateThreadGroups(const Integrand& integrand, bool silence,
bool ignoreGlobalLM);
//! \brief Generates element groups from a partition.
virtual void generateThreadGroupsFromElms(const std::vector<int>& elms);
//! \brief Remap element wise errors to basis functions.
//! \param errors The remapped errors
//! \param[in] origErr The element wise errors on the geometry mesh

View File

@@ -301,16 +301,21 @@ bool ASMu2Dmx::integrate (Integrand& integrand,
if (!xg || !wg) return false;
bool use2ndDer = integrand.getIntegrandType() & Integrand::SECOND_DERIVATIVES;
ThreadGroups oneGroup;
if (glInt.threadSafe()) oneGroup.oneGroup(nel);
const IntMat& groups = glInt.threadSafe() ? oneGroup[0] : threadGroups[0];
// === Assembly loop over all elements in the patch ==========================
bool ok = true;
for (size_t t = 0; t < threadGroups[0].size() && ok; ++t)
for (size_t t = 0; t < groups.size() && ok; ++t)
{
//#pragma omp parallel for schedule(static)
for (size_t e = 0; e < threadGroups[0][t].size(); ++e)
for (size_t e = 0; e < groups[t].size(); ++e)
{
if (!ok)
continue;
int iel = threadGroups[0][t][e] + 1;
int iel = groups[t][e] + 1;
auto el1 = threadBasis->elementBegin()+iel-1;
double uh = ((*el1)->umin()+(*el1)->umax())/2.0;
double vh = ((*el1)->vmin()+(*el1)->vmax())/2.0;
@@ -524,6 +529,12 @@ bool ASMu2Dmx::integrate (Integrand& integrand, int lIndex,
}
if (skipMe) continue;
if (!myElms.empty() && !glInt.threadSafe() &&
std::find(myElms.begin(), myElms.end(), iel-1) == myElms.end()) {
++iel;
continue;
}
double uh = ((*el1)->umin()+(*el1)->umax())/2.0;
double vh = ((*el1)->vmin()+(*el1)->vmax())/2.0;
std::vector<size_t> els;
@@ -644,6 +655,12 @@ bool ASMu2Dmx::integrate (Integrand& integrand,
for (int iel = 1; el1 != m_basis[0]->elementEnd(); ++el1, ++iel) {
short int status = iChk.hasContribution(iel);
if (!status) continue; // no interface contributions for this element
if (!myElms.empty() && !glInt.threadSafe() &&
std::find(myElms.begin(), myElms.end(), iel-1) == myElms.end()) {
++iel, ++el1;
continue;
}
// first push the hosting elements
std::vector<size_t> els(1,iel);
std::vector<size_t> elem_sizes(1,(*el1)->nBasisFunctions());

View File

@@ -1209,6 +1209,9 @@ bool ASMu3D::integrate (Integrand& integrand, int lIndex,
if (dbgElm < 0 && iEl+1 != -dbgElm)
continue; // Skipping all elements, except for -dbgElm
#endif
if (!myElms.empty() && !glInt.threadSafe() &&
std::find(myElms.begin(), myElms.end(), iEl) == myElms.end())
continue;
FiniteElement fe;
fe.iel = MLGE[iEl];
@@ -2324,3 +2327,14 @@ void ASMu3D::getBoundaryElms (int lIndex, int orient, IntVec& elms) const
for (const LR::Element* elem : elements)
elms.push_back(MLGE[elem->getId()]-1);
}
void ASMu3D::generateThreadGroupsFromElms(const std::vector<int>& elms)
{
myElms.clear();
for (int elm : elms)
if (this->getElmIndex(elm+1) > 0)
myElms.push_back(this->getElmIndex(elm+1)-1);
threadGroups = threadGroups.filter(myElms);
}

View File

@@ -553,6 +553,9 @@ protected:
void generateThreadGroups(const Integrand& integrand, bool silence,
bool ignoreGlobalLM);
//! \brief Generates element groups from a partition.
virtual void generateThreadGroupsFromElms(const std::vector<int>& elms);
//! \brief Remap element wise errors to basis functions.
//! \param errors The remapped errors
//! \param[in] origErr The element wise errors on the geometry mesh
@@ -587,6 +590,7 @@ protected:
Matrices myBezierExtract; //!< Bezier extraction matrices
ThreadGroups threadGroups; //!< Element groups for multi-threaded assembly
IntVec myElms; //!< Elements on patch - used with partitioning
mutable double vMin; //!< Minimum element volume for adaptive refinement
};

View File

@@ -360,11 +360,20 @@ bool ASMu3Dmx::integrate (Integrand& integrand,
else if (nRed < 0)
nRed = nGauss; // The integrand needs to know nGauss
ThreadGroups oneGroup;
if (glInt.threadSafe()) oneGroup.oneGroup(nel);
const IntMat& group = glInt.threadSafe() ? oneGroup[0] : threadGroups[0];
// === Assembly loop over all elements in the patch ==========================
bool ok=true;
for(LR::Element *el : this->getBasis(geoBasis)->getAllElements()) {
bool ok = true;
for (size_t t = 0; t < group.size() && ok; t++)
//#pragma omp parallel for schedule(static)
for (size_t e = 0; e < group[t].size(); e++)
{
int iel = group[t][e] + 1;
const LR::Element* el = lrspline->getElement(iel-1);
double uh = (el->umin()+el->umax())/2.0;
double vh = (el->vmin()+el->vmax())/2.0;
double wh = (el->wmin()+el->wmax())/2.0;

View File

@@ -54,6 +54,9 @@ Real SAMpatchPETSc::dot (const Vector& x, const Vector& y, char dofType) const
{
#ifdef HAVE_MPI
if (adm.isParallel()) {
if (adm.dd.isPartitioned())
return this->SAMpatch::dot(x,y,dofType);
DofIS& dis = this->getIS(dofType);
Vec lx, ly;
@@ -94,7 +97,7 @@ Real SAMpatchPETSc::dot (const Vector& x, const Vector& y, char dofType) const
Real SAMpatchPETSc::normL2 (const Vector& x, char dofType) const
{
#ifdef HAVE_MPI
if (adm.isParallel()) {
if (adm.isParallel() && !adm.dd.isPartitioned()) {
DofIS& dis = this->getIS(dofType);
Vec lx;
@@ -130,7 +133,7 @@ Real SAMpatchPETSc::normInf (const Vector& x, size_t& comp, char dofType) const
{
Real max = this->SAM::normInf(x,comp,dofType);
#ifdef HAVE_MPI
if (adm.isParallel())
if (adm.isParallel() && !adm.dd.isPartitioned())
max = adm.allReduce(max, MPI_MAX);
// TODO: comp (node index of the max value) is not necessarily correct here
#endif
@@ -187,23 +190,33 @@ bool SAMpatchPETSc::expandSolution(const SystemVector& solVec,
#ifdef HAVE_MPI
if (adm.isParallel()) {
if (!glob2LocEq) {
if (!glob2LocEq && !adm.dd.isPartitioned()) {
IntVec mlgeq(adm.dd.getMLGEQ());
for (int& ieq : mlgeq) --ieq;
ISCreateGeneral(*adm.getCommunicator(),adm.dd.getMLGEQ().size(),
mlgeq.data(), PETSC_COPY_VALUES, &glob2LocEq);
}
Vec solution;
VecCreateSeq(PETSC_COMM_SELF, Bptr->dim(), &solution);
VecScatter ctx;
Vec solution;
if (adm.dd.isPartitioned())
VecScatterCreateToAll(Bptr->getVector(), &ctx, &solution);
else {
VecCreateSeq(PETSC_COMM_SELF, Bptr->dim(), &solution);
VecScatterCreate(Bptr->getVector(), glob2LocEq, solution, nullptr, &ctx);
}
VecScatterBegin(ctx, Bptr->getVector(), solution, INSERT_VALUES, SCATTER_FORWARD);
VecScatterEnd(ctx, Bptr->getVector(),solution,INSERT_VALUES,SCATTER_FORWARD);
VecScatterDestroy(&ctx);
PetscScalar* data;
VecGetArray(solution, &data);
if (adm.dd.isPartitioned()) {
for (const auto& it : adm.dd.getG2LEQ(0))
(*Bptr)(it.second) = data[it.first-1];
} else
std::copy(data, data + Bptr->dim(), Bptr->getPtr());
VecDestroy(&solution);
} else
#endif
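For a partitioned model the process-local equations are not a contiguous slice
of the global vector, so the solution is gathered to every rank and remapped
through G2LEQ instead of going through the index-set scatter. The generic
PETSc pattern behind the new branch, as a minimal sketch (assuming x is a
parallel Vec):

Vec        all;  // sequential Vec that will hold every entry of x
VecScatter ctx;
VecScatterCreateToAll(x, &ctx, &all);  // gather the full vector on all ranks
VecScatterBegin(ctx, x, all, INSERT_VALUES, SCATTER_FORWARD);
VecScatterEnd(ctx, x, all, INSERT_VALUES, SCATTER_FORWARD);
VecScatterDestroy(&ctx);
// ... read the gathered values via VecGetArray(all, &data) ...
VecDestroy(&all);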

View File

@@ -93,7 +93,7 @@ int IFEM::Init (int arg_c, char** arg_v, const char* title)
#endif
std::cout <<"\n MPI support: ";
#if HAVE_MPI
#ifdef HAVE_MPI
std::cout <<"enabled";
#else
std::cout <<"disabled";

View File

@@ -91,8 +91,8 @@ void PETScVector::redim(size_t n)
bool PETScVector::beginAssembly()
{
for (size_t i = 0; i < size(); ++i)
VecSetValue(x , adm.dd.getGlobalEq(i+1)-1, (*this)[i], ADD_VALUES);
for (size_t i = 0; i < this->size(); ++i)
VecSetValue(x, adm.dd.getGlobalEq(i+1)-1, (*this)[i], ADD_VALUES);
VecAssemblyBegin(x);
return true;
@@ -180,31 +180,32 @@ PETScMatrix::~PETScMatrix ()
void PETScMatrix::initAssembly (const SAM& sam, bool delayLocking)
{
if (!adm.dd.isPartitioned()) {
SparseMatrix::initAssembly(sam, delayLocking);
SparseMatrix::preAssemble(sam, delayLocking);
} else
this->resize(sam.neq,sam.neq);
const SAMpatchPETSc* samp = dynamic_cast<const SAMpatchPETSc*>(&sam);
if (!samp)
return;
// Get number of local equations in linear system
const PetscInt neq = adm.dd.getMaxEq()- adm.dd.getMinEq() + 1;
PetscInt neq;
neq = adm.dd.getMaxEq() - adm.dd.getMinEq() + 1;
// Set correct number of rows and columns for matrix.
MatSetSizes(pA,neq,neq,PETSC_DECIDE,PETSC_DECIDE);
MatSetSizes(pA,neq,neq,PETSC_DETERMINE,PETSC_DETERMINE);
// Allocate sparsity pattern
std::vector<std::set<int>> dofc;
if (!adm.dd.isPartitioned())
sam.getDofCouplings(dofc);
if (matvec.empty()) {
// Set correct number of rows and columns for matrix.
MatSetSizes(pA,neq,neq,PETSC_DECIDE,PETSC_DECIDE);
MatSetFromOptions(pA);
// Allocate sparsity pattern
if (adm.isParallel()) {
if (adm.isParallel() && !adm.dd.isPartitioned()) {
int ifirst = adm.dd.getMinEq();
int ilast = adm.dd.getMaxEq();
PetscIntVec d_nnz(neq, 0);
@@ -235,6 +236,34 @@ void PETScMatrix::initAssembly (const SAM& sam, bool delayLocking)
MatMPIAIJSetPreallocation(pA,PETSC_DEFAULT,d_nnz.data(),
PETSC_DEFAULT,o_nnz.data());
} else if (adm.dd.isPartitioned()) {
// Setup sparsity pattern for global matrix
SparseMatrix* lA = new SparseMatrix(neq, sam.getNoEquations());
for (int elm : adm.dd.getElms()) {
IntVec meen;
sam.getElmEqns(meen,elm+1);
for (int i : meen)
if (i > 0 && i >= adm.dd.getMinEq(0) && i <= adm.dd.getMaxEq(0))
for (int j : meen)
if (j > 0)
(*lA)(i-adm.dd.getMinEq(0)+1,adm.dd.getGlobalEq(j)) = 0.0;
}
IntVec iA, jA;
SparseMatrix::calcCSR(iA, jA, neq, lA->getValues());
delete lA;
MatMPIAIJSetPreallocationCSR(pA, iA.data(), jA.data(), nullptr);
// Setup sparsity pattern for local matrix
for (int elm : adm.dd.getElms()) {
IntVec meen;
sam.getElmEqns(meen,elm+1);
for (int i : meen)
if (i > 0)
for (int j : meen)
if (j > 0)
(*this)(i,j) = 0.0;
}
this->optimiseSLU();
} else {
PetscIntVec Nnz;
for (const auto& it : dofc)
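MatMPIAIJSetPreallocationCSR expects the locally owned rows in CSR form:
iA[r]..iA[r+1] delimits row r's entries in jA, which holds global column
indices. A tiny illustration with made-up numbers, for a rank owning two rows
that couple to columns {1,2} and {2,3} (0-based here; whether calcCSR produces
0- or 1-based indices is not visible in this diff):

PetscInt iA[] = {0, 2, 4};        // row pointers into jA
PetscInt jA[] = {1, 2, 2, 3};     // global column indices
MatMPIAIJSetPreallocationCSR(pA, iA, jA, nullptr);  // values may be null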

View File

@@ -107,6 +107,7 @@ void SIMbase::clearProperties ()
myProps.clear();
myInts.clear();
mixedMADOFs.clear();
adm.dd.setElms({},"");
}
@@ -173,13 +174,6 @@ bool SIMbase::preprocess (const IntVec& ignored, bool fixDup)
return false;
}
if (nProc > 1 && myPatches.empty() && adm.isParallel())
{
std::cerr <<" *** SIMbase::preprocess: No partitioning information for "
<< nProc <<" processors found."<< std::endl;
return false;
}
PROFILE1("Model preprocessing");
static int substep = 10;
@@ -367,12 +361,23 @@ bool SIMbase::preprocess (const IntVec& ignored, bool fixDup)
if (!static_cast<SAMpatch*>(mySam)->init(myModel,ngnod,dofTypes))
return false;
if (nProc > 1 && myPatches.empty() && adm.isParallel())
{
IFEM::cout <<" *** SIMbase::preprocess: No partitioning information for "
<< nProc <<" processors found. Using graph partitioning\n";
}
if (!adm.dd.setup(adm,*this))
{
std::cerr <<"\n *** SIMbase::preprocess(): Failed to establish "
<<" domain decomposition data."<< std::endl;
return false;
}
else if (adm.dd.isPartitioned()) // restrict thread groups to the partition
{
for (ASMbase* pch : this->getFEModel())
pch->generateThreadGroupsFromElms(adm.dd.getElms());
}
if (!myProblem)
{
@@ -1488,6 +1493,7 @@ bool SIMbase::solutionNorms (const TimeDomain& time,
delete norm;
if (!adm.dd.isPartitioned())
for (k = 0; k < gNorm.size(); k++)
adm.allReduceAsSum(gNorm[k]);

View File

@@ -41,7 +41,9 @@ public:
virtual unsigned short int getNoParamDim() const { return 0; }
//! \brief Creates the computational FEM model from the spline patches.
virtual bool createFEMmodel(char) { return false; }
//! \brief Returns the element-element connectivities.
virtual std::vector<std::vector<int>> getElmConnectivities() const
{ return std::vector<std::vector<int>>(); }
protected:
//! \brief Preprocesses a user-defined Dirichlet boundary property.
virtual bool addConstraint(int,int,int,int,int,int&,char)

View File

@@ -286,6 +286,13 @@ bool SIMoutput::writeGlvG (int& nBlock, const char* inpFile, bool doClear)
// Open a new VTF-file
char* vtfName = new char[strlen(inpFile)+10];
strtok(strcpy(vtfName,inpFile),".");
if (adm.dd.isPartitioned()) {
if (adm.getProcId() == 0) {
strcat(vtfName,ext);
IFEM::cout <<"\nWriting VTF-file "<< vtfName << std::endl;
myVtf = new VTF(vtfName,opt.format);
}
} else {
if (nProc > 1)
sprintf(vtfName+strlen(vtfName),"_p%04d%s",myPid,ext);
else
@@ -293,11 +300,15 @@ bool SIMoutput::writeGlvG (int& nBlock, const char* inpFile, bool doClear)
IFEM::cout <<"\nWriting VTF-file "<< vtfName << std::endl;
myVtf = new VTF(vtfName,opt.format);
}
delete[] vtfName;
}
else if (doClear)
myVtf->clearGeometryBlocks();
if (adm.dd.isPartitioned() && adm.getProcId() != 0)
return true;
ElementBlock* lvb;
char pname[32];
@@ -337,6 +348,9 @@ bool SIMoutput::writeGlvG (int& nBlock, const char* inpFile, bool doClear)
bool SIMoutput::writeGlvBC (int& nBlock, int iStep) const
{
if (adm.dd.isPartitioned() && adm.getProcId() != 0)
return true;
if (!myVtf) return false;
Matrix field;
@@ -403,6 +417,9 @@ bool SIMoutput::writeGlvBC (int& nBlock, int iStep) const
bool SIMoutput::writeGlvT (int iStep, int& geoBlk, int& nBlock) const
{
if (adm.dd.isPartitioned() && adm.getProcId() != 0)
return true;
if (myProblem->hasTractionValues())
{
if (msgLevel > 1)
@@ -417,6 +434,9 @@ bool SIMoutput::writeGlvT (int iStep, int& geoBlk, int& nBlock) const
bool SIMoutput::writeGlvV (const Vector& vec, const char* fieldName,
int iStep, int& nBlock, int idBlock, int ncmp) const
{
if (adm.dd.isPartitioned() && adm.getProcId() != 0)
return true;
if (vec.empty())
return true;
else if (!myVtf)
@@ -459,6 +479,9 @@ bool SIMoutput::writeGlvV (const Vector& vec, const char* fieldName,
bool SIMoutput::writeGlvS (const Vector& scl, const char* fieldName,
int iStep, int& nBlock, int idBlock) const
{
if (adm.dd.isPartitioned() && adm.getProcId() != 0)
return true;
if (scl.empty())
return true;
else if (!myVtf)
@@ -496,6 +519,9 @@ bool SIMoutput::writeGlvS (const Vector& psol, int iStep, int& nBlock,
double time, const char* pvecName,
int idBlock, int psolComps)
{
if (adm.dd.isPartitioned() && adm.getProcId() != 0)
return true;
idBlock = this->writeGlvS1(psol,iStep,nBlock,time,
pvecName,idBlock,psolComps);
if (idBlock < 0)
@@ -526,6 +552,9 @@ int SIMoutput::writeGlvS1 (const Vector& psol, int iStep, int& nBlock,
double time, const char* pvecName,
int idBlock, int psolComps, bool scalarOnly)
{
if (adm.dd.isPartitioned() && adm.getProcId() != 0)
return true;
if (psol.empty())
return 0;
else if (!myVtf)
@@ -680,6 +709,9 @@ int SIMoutput::writeGlvS1 (const Vector& psol, int iStep, int& nBlock,
bool SIMoutput::writeGlvS2 (const Vector& psol, int iStep, int& nBlock,
double time, int idBlock, int psolComps)
{
if (adm.dd.isPartitioned() && adm.getProcId() != 0)
return true;
if (psol.empty())
return true; // No primary solution
else if (!myVtf || !myProblem)
@@ -860,6 +892,9 @@ bool SIMoutput::writeGlvP (const Vector& ssol, int iStep, int& nBlock,
int idBlock, const char* prefix,
std::vector<PointValue>* maxVal)
{
if (adm.dd.isPartitioned() && adm.getProcId() != 0)
return true;
if (ssol.empty())
return true;
else if (!myVtf)
@@ -984,6 +1019,9 @@ bool SIMoutput::evalProjSolution (const Vector& ssol,
bool SIMoutput::writeGlvF (const RealFunc& f, const char* fname,
int iStep, int& nBlock, int idBlock, double time)
{
if (adm.dd.isPartitioned() && adm.getProcId() != 0)
return true;
if (!myVtf) return false;
IntVec sID;
@@ -1009,6 +1047,9 @@ bool SIMoutput::writeGlvF (const RealFunc& f, const char* fname,
bool SIMoutput::writeGlvStep (int iStep, double value, int itype)
{
if (adm.dd.isPartitioned() && adm.getProcId() != 0)
return true;
myVtf->writeGeometryBlocks(iStep);
if (itype == 0)
@@ -1020,6 +1061,9 @@ bool SIMoutput::writeGlvStep (int iStep, double value, int itype)
bool SIMoutput::writeGlvM (const Mode& mode, bool freq, int& nBlock)
{
if (adm.dd.isPartitioned() && adm.getProcId() != 0)
return true;
if (mode.eigVec.empty())
return true;
else if (!myVtf)
@@ -1067,6 +1111,9 @@ bool SIMoutput::writeGlvM (const Mode& mode, bool freq, int& nBlock)
bool SIMoutput::writeGlvN (const Matrix& norms, int iStep, int& nBlock,
const std::vector<std::string>& prefix, int idBlock)
{
if (adm.dd.isPartitioned() && adm.getProcId() != 0)
return true;
if (norms.empty())
return true;
else if (!myVtf)
@@ -1139,6 +1186,9 @@ bool SIMoutput::writeGlvN (const Matrix& norms, int iStep, int& nBlock,
bool SIMoutput::writeGlvE (const Vector& vec, int iStep, int& nBlock,
const char* name)
{
if (adm.dd.isPartitioned() && adm.getProcId() != 0)
return true;
if (!myVtf)
return false;

View File

@@ -309,6 +309,8 @@ void HDF5Writer::writeSIM (int level, const DataEntry& entry,
size_t projOfs = 0;
for (int i = 0; i < sim->getNoPatches(); ++i) {
int loc = sim->getLocalPatchIndex(i+1);
if (sim->getProcessAdm().dd.isPartitioned() && sim->getProcessAdm().getProcId() != 0)
loc = 0;
if (loc > 0 && (!(results & DataExporter::REDUNDANT) ||
sim->getGlobalProcessID() == 0)) // we own the patch
{