diff --git a/src/ASM/ASMbase.h b/src/ASM/ASMbase.h
index b6ad84a3..f329b630 100644
--- a/src/ASM/ASMbase.h
+++ b/src/ASM/ASMbase.h
@@ -252,6 +252,11 @@ public:
   //! \brief Computes the number of boundary integration points in this patch.
   virtual void getNoBouPoints(size_t& nPt, char ldim, char lindx) = 0;
 
+  //! \brief Generates element groups for multi-threading of interior integrals.
+  virtual void generateThreadGroups() {}
+  //! \brief Generates element groups for multi-threading of boundary integrals.
+  virtual void generateThreadGroups(char) {}
+
 
   // Methods for integration of finite element quantities.
   // These are the main computational methods of the ASM class hierarchy.
diff --git a/src/ASM/ASMs2D.C b/src/ASM/ASMs2D.C
index a71f652c..4813fee1 100644
--- a/src/ASM/ASMs2D.C
+++ b/src/ASM/ASMs2D.C
@@ -1008,9 +1008,6 @@ bool ASMs2D::integrate (Integrand& integrand,
   const int n1 = surf->numCoefs_u();
   const int nel1 = n1 - p1 + 1;
 
-  if (threadGroups.empty())
-    generateThreadGroups();
-
 
   // === Assembly loop over all elements in the patch ==========================
 
@@ -1046,10 +1043,17 @@ bool ASMs2D::integrate (Integrand& integrand,
           break;
         }
 
-        // Compute characteristic element length, if needed
         if (integrand.getIntegrandType() == 2)
+        {
+          // Compute characteristic element length
           fe.h = getElmSize(p1,p2,Xnod);
 
+          // Element size in parametric space
+          int inod = MNPC[iel-1].back();
+          dXidu[0] = surf->knotSpan(0,nodeInd[inod].I);
+          dXidu[1] = surf->knotSpan(1,nodeInd[inod].J);
+        }
+
         else if (integrand.getIntegrandType() == 3)
         {
           // --- Compute average value of basis functions over the element -----
@@ -1088,11 +1092,6 @@ bool ASMs2D::integrate (Integrand& integrand,
             X[i] = X0[i];
         }
 
-        // TODO: Do this conditionally (only when the CFD solver requires it)
-        int inod = MNPC[iel-1].back();
-        dXidu[0] = surf->knotSpan(0,nodeInd[inod].I);
-        dXidu[1] = surf->knotSpan(1,nodeInd[inod].J);
-
         // Initialize element quantities
         LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel);
         if (!integrand.initElement(MNPC[iel-1],X,nRed*nRed,*A))
@@ -1167,14 +1166,15 @@ bool ASMs2D::integrate (Integrand& integrand,
 
             // Compute Hessian of coordinate mapping and 2nd order derivatives
             if (integrand.getIntegrandType() == 2)
+            {
               if (!utl::Hessian(Hess,fe.d2NdX2,Jac,Xnod,d2Ndu2,dNdu))
               {
                 ok = false;
                 break;
               }
 
-	    // RUNAR: Do this conditionally (when the CFD solver requires it)
-	    utl::getGmat(Jac,dXidu,fe.G);
+              utl::getGmat(Jac,dXidu,fe.G);
+            }
 
 #if SP_DEBUG > 4
             std::cout <<"\niel, ip = "<< iel <<" "<< ip
@@ -1716,23 +1716,26 @@ bool ASMs2D::evalSolution (Matrix& sField, const Integrand& integrand,
 }
 
 
-void ASMs2D::generateThreadGroups()
+void ASMs2D::generateThreadGroups ()
 {
   const int p1 = surf->order_u();
   const int p2 = surf->order_v();
   const int n1 = surf->numCoefs_u();
   const int n2 = surf->numCoefs_v();
+
   const int nel1 = n1 - p1 + 1;
   const int nel2 = n2 - p2 + 1;
 
   utl::calcThreadGroups(nel1,nel2,threadGroups);
+  if (threadGroups.size() < 2) return;
 
-  if (threadGroups.size() > 1)
-    for (size_t i = 0; i < threadGroups.size(); i++) {
-      std::cout <<" Thread group "<< i+1;
-      for (size_t j = 0; j < threadGroups[i].size(); j++)
-	std::cout <<"\n\t thread "<< j+1
-		  << ": "<< threadGroups[i][j].size() <<" elements";
-      std::cout << std::endl;
-    }
+  std::cout <<"\nMultiple threads are utilized during element assembly.";
+  for (size_t i = 0; i < threadGroups.size(); i++)
+  {
+    std::cout <<"\n Thread group "<< i+1;
+    for (size_t j = 0; j < threadGroups[i].size(); j++)
+      std::cout <<"\n\tthread "<< j+1
+		<< ": "<< threadGroups[i][j].size() <<" elements";
+  }
+  std::cout << std::endl;
 }
diff --git a/src/ASM/ASMs2D.h b/src/ASM/ASMs2D.h
index 479417b0..22afad0a 100644
--- a/src/ASM/ASMs2D.h
+++ b/src/ASM/ASMs2D.h
@@ -101,7 +101,7 @@ public:
   //! This is used to reinitialize the patch after it has been refined.
   virtual void clear(bool retainGeometry = false);
 
-    //! \brief Returns a matrix with nodal coordinates for an element.
+  //! \brief Returns a matrix with nodal coordinates for an element.
   //! \param[in] iel Element index
   //! \param[out] X 3\f$\times\f$n-matrix, where \a n is the number of nodes
   //! in one element
@@ -346,6 +346,9 @@ protected:
   //! \param[in] basis Which basis to return size parameters for (mixed methods)
   virtual bool getSize(int& n1, int& n2, int basis = 0) const;
 
+  //! \brief Generates element groups for multi-threading of interior integrals.
+  virtual void generateThreadGroups();
+
   //! \brief Establishes matrices with basis functions and 1st derivatives.
   static void extractBasis(const Go::BasisDerivsSf& spline,
 			   Vector& N, Matrix& dNdu);
@@ -377,11 +380,8 @@ protected:
   const IndexVec& nodeInd; //!< IJ-pairs for the control points (nodes)
   IndexVec      myNodeInd; //!< The actual IJ-pair container
 
-  //! Element groups for multithreaded assembly
+  //! Element groups for multi-threaded assembly
   utl::ThreadGroups threadGroups;
-
-  //! \brief Generate thread groups
-  virtual void generateThreadGroups();
 };
 
 #endif
diff --git a/src/ASM/ASMs2DLag.C b/src/ASM/ASMs2DLag.C
index e0341070..643fd795 100644
--- a/src/ASM/ASMs2DLag.C
+++ b/src/ASM/ASMs2DLag.C
@@ -187,6 +187,27 @@ bool ASMs2DLag::getSize (int& n1, int& n2, int) const
 }
 
 
+size_t ASMs2DLag::getNoBoundaryElms (char lIndex, char ldim) const
+{
+  if (!surf) return 0;
+
+  if (ldim < 1 && lIndex > 0)
+    return 1;
+
+  switch (lIndex)
+    {
+    case 1:
+    case 2:
+      return (ny-1)/(surf->order_v()-1);
+    case 3:
+    case 4:
+      return (nx-1)/(surf->order_u()-1);
+    }
+
+  return 0;
+}
+
+
 bool ASMs2DLag::integrate (Integrand& integrand,
 			   GlobalIntegral& glInt,
 			   const TimeDomain& time)
@@ -218,9 +239,6 @@ bool ASMs2DLag::integrate (Integrand& integrand,
   const int p1 = surf->order_u();
   const int p2 = surf->order_v();
 
-  if (threadGroups.empty())
-    generateThreadGroups();
-
 
   // === Assembly loop over all elements in the patch ==========================
 
@@ -631,18 +649,13 @@ bool ASMs2DLag::evalSolution (Matrix& sField, const Integrand& integrand,
 }
 
 
-void ASMs2DLag::generateThreadGroups()
+void ASMs2DLag::generateThreadGroups ()
 {
   const int p1 = surf->order_u();
   const int p2 = surf->order_v();
 
-  // Evaluate the parametric values
-  RealArray gpar[2];
-  getGridParameters(gpar[0],0,p1-1);
-  getGridParameters(gpar[1],1,p2-1);
-
-  const int nel1 = (gpar[0].size()-1)/(p1-1);
-  const int nel2 = (gpar[1].size()-1)/(p2-1);
+  const int nel1 = (nx-1)/(p1-1);
+  const int nel2 = (ny-1)/(p2-1);
 
   utl::calcThreadGroups(nel1,nel2,threadGroups);
 }
diff --git a/src/ASM/ASMs2DLag.h b/src/ASM/ASMs2DLag.h
index d8b6bb4c..6cc554bd 100644
--- a/src/ASM/ASMs2DLag.h
+++ b/src/ASM/ASMs2DLag.h
@@ -48,9 +48,9 @@ public:
   virtual void clear(bool retainGeometry = false);
 
   //! \brief Returns a matrix with nodal coordinates for an element.
-  //! \param[in] iel Element index
   //! \param[out] X 3\f$\times\f$n-matrix, where \a n is the number of nodes
   //! in one element
+  //! \param[in] iel Element index
   virtual bool getElementCoordinates(Matrix& X, int iel) const;
 
   //! \brief Returns a matrix with all nodal coordinates within the patch.
@@ -137,15 +137,17 @@ public:
   virtual bool evalSolution(Matrix& sField, const Integrand& integrand,
 			    const RealArray* gpar, bool regular = true) const;
 
-protected:
-  //! \brief Generate thread groups
-  void generateThreadGroups();
-public:
   //! \brief Returns the number of nodal points in each parameter direction.
   //! \param[out] n1 Number of nodes in first (u) direction
   //! \param[out] n2 Number of nodes in second (v) direction
   virtual bool getSize(int& n1, int& n2, int = 0) const;
 
+  //! \brief Generates element groups for multi-threading of interior integrals.
+  virtual void generateThreadGroups();
+
+  //! \brief Returns the number of elements on a boundary.
+  virtual size_t getNoBoundaryElms(char lIndex, char ldim) const;
+
 private:
   size_t nx; //!< Number of nodes in first parameter direction
   size_t ny; //!< Number of nodes in second parameter direction
diff --git a/src/ASM/ASMs2DSpec.C b/src/ASM/ASMs2DSpec.C
index fe35db10..8f7ebe96 100644
--- a/src/ASM/ASMs2DSpec.C
+++ b/src/ASM/ASMs2DSpec.C
@@ -96,9 +96,6 @@ bool ASMs2DSpec::integrate (Integrand& integrand,
   if (!Legendre::basisDerivatives(p1,D1)) return false;
   if (!Legendre::basisDerivatives(p2,D2)) return false;
 
-  if (threadGroups.empty())
-    generateThreadGroups();
-
 
   // === Assembly loop over all elements in the patch ==========================
 
@@ -113,7 +110,8 @@ bool ASMs2DSpec::integrate (Integrand& integrand,
         int iel = threadGroups[g][t][e]+1;
 
         // Set up control point coordinates for current element
-        if (!this->getElementCoordinates(Xnod,iel)) {
+        if (!this->getElementCoordinates(Xnod,iel))
+        {
           ok = false;
           break;
         }
@@ -121,11 +119,13 @@ bool ASMs2DSpec::integrate (Integrand& integrand,
         // Initialize element quantities
         fe.iel = MLGE[iel-1];
         LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel);
-        if (!integrand.initElement(MNPC[iel-1],*A)) {
+        if (!integrand.initElement(MNPC[iel-1],*A))
+        {
           ok = false;
           break;
         }
 
+
         // --- Integration loop over all Gauss points in each direction --------
 
         int count = 1;
@@ -147,14 +147,16 @@ bool ASMs2DSpec::integrate (Integrand& integrand,
 
             // Evaluate the integrand and accumulate element contributions
             fe.detJxW *= wg1(i)*wg2(j);
-            if (!integrand.evalInt(*A,fe,time,X)) {
+            if (!integrand.evalInt(*A,fe,time,X))
+            {
               ok = false;
               break;
             }
           }
 
         // Assembly of global system integral
-        if (!glInt.assemble(A->ref(),fe.iel)) {
+        if (!glInt.assemble(A->ref(),fe.iel))
+        {
           ok = false;
           break;
         }
@@ -235,6 +237,7 @@ bool ASMs2DSpec::integrate (Integrand& integrand, int lIndex,
       LocalIntegral* A = integrand.getLocalIntegral(nen,fe.iel,true);
       if (!integrand.initElementBou(MNPC[iel-1],*A)) return false;
 
+
       // --- Integration loop over all Gauss points along the edge -------------
 
       for (int i = 0; i < p[t2-1]; i++)
diff --git a/src/ASM/ASMs2Dmx.C b/src/ASM/ASMs2Dmx.C
index 7ce25789..51f5c8f5 100644
--- a/src/ASM/ASMs2Dmx.C
+++ b/src/ASM/ASMs2Dmx.C
@@ -506,9 +506,6 @@ bool ASMs2Dmx::integrate (Integrand& integrand,
   const int n1 = surf->numCoefs_u();
   const int nel1 = n1 - p1 + 1;
 
-  if (threadGroups.empty())
-    generateThreadGroups();
-
 
   // === Assembly loop over all elements in the patch ==========================
 
diff --git a/src/ASM/ASMs2DmxLag.C b/src/ASM/ASMs2DmxLag.C
index 80f6b14f..2dd3f324 100644
--- a/src/ASM/ASMs2DmxLag.C
+++ b/src/ASM/ASMs2DmxLag.C
@@ -231,9 +231,6 @@ bool ASMs2DmxLag::integrate (Integrand& integrand,
   const int q1 = p1 - 1;
   const int q2 = p2 - 1;
 
-  if (threadGroups.empty())
-    generateThreadGroups();
-
 
   // === Assembly loop over all elements in the patch ==========================
 
diff --git a/src/ASM/ASMs3D.C b/src/ASM/ASMs3D.C
index 23d616db..aa045411 100644
--- a/src/ASM/ASMs3D.C
+++ b/src/ASM/ASMs3D.C
@@ -1025,6 +1025,8 @@ bool ASMs3D::getSize (int& n1, int& n2, int& n3, int) const
 
 size_t ASMs3D::getNoBoundaryElms (char lIndex, char ldim) const
 {
+  if (!svol) return 0;
+
   if (ldim < 1 && lIndex > 0)
     return 1;
   else if (ldim < 2 && lIndex > 0 && lIndex <= 12)
@@ -1222,9 +1224,6 @@ bool ASMs3D::integrate (Integrand& integrand,
   const int nel1 = n1 - p1 + 1;
   const int nel2 = n2 - p2 + 1;
 
-  if (threadGroupsVol.empty())
-    generateThreadGroups();
-
 
   // === Assembly loop over all elements in the patch ==========================
 
@@ -1261,10 +1260,18 @@ bool ASMs3D::integrate (Integrand& integrand,
           break;
         }
 
-        // Compute characteristic element length, if needed
         if (integrand.getIntegrandType() == 2)
+        {
+          // Compute characteristic element length
           fe.h = getElmSize(p1,p2,p3,Xnod);
 
+          // Element size in parametric space
+          int inod = MNPC[iel-1].back();
+          dXidu[0] = svol->knotSpan(0,nodeInd[inod].I);
+          dXidu[1] = svol->knotSpan(1,nodeInd[inod].J);
+          dXidu[2] = svol->knotSpan(2,nodeInd[inod].K);
+        }
+
         else if (integrand.getIntegrandType() == 3)
         {
           // --- Compute average value of basis functions over the element -----
@@ -1306,13 +1313,7 @@ bool ASMs3D::integrate (Integrand& integrand,
           X.z = X0[2];
         }
 
-	// TODO: Do this conditionally (only when the CFD solver requires it)
-	int inod = MNPC[iel-1].back();
-	dXidu[0] = svol->knotSpan(0,nodeInd[inod].I);
-	dXidu[1] = svol->knotSpan(1,nodeInd[inod].J);
-	dXidu[2] = svol->knotSpan(2,nodeInd[inod].K);
-
-	// Initialize element quantities
+        // Initialize element quantities
         LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel);
         if (!integrand.initElement(MNPC[iel-1],X,nRed*nRed*nRed,*A))
         {
@@ -1391,14 +1392,15 @@ bool ASMs3D::integrate (Integrand& integrand,
 
               // Compute Hessian of coordinate mapping and 2nd order derivatives
               if (integrand.getIntegrandType() == 2)
+              {
                 if (!utl::Hessian(Hess,fe.d2NdX2,Jac,Xnod,d2Ndu2,dNdu))
                 {
                   ok = false;
                   break;
                 }
 
-              // RUNAR: Do this conditionally (when the CFD solver requires it)
-              utl::getGmat(Jac,dXidu,fe.G);
+                utl::getGmat(Jac,dXidu,fe.G);
+              }
 
 #if SP_DEBUG > 4
               std::cout <<"\niel, ip = "<< iel <<" "<< ip
@@ -1418,16 +1420,16 @@ bool ASMs3D::integrate (Integrand& integrand,
               }
             }
 
-	// Finalize the element quantities
-	if (!integrand.finalizeElement(*A,time,firstIp+jp))
-	{
+        // Finalize the element quantities
+        if (!integrand.finalizeElement(*A,time,firstIp+jp))
+        {
           ok = false;
           break;
         }
 
         // Assembly of global system integral
         if (!glInt.assemble(A->ref(),fe.iel))
-	{
+        {
           ok = false;
           break;
         }
@@ -1449,6 +1451,15 @@ bool ASMs3D::integrate (Integrand& integrand, int lIndex,
 
   PROFILE2("ASMs3D::integrate(B)");
 
+  std::map<char,utl::ThreadGroups>::const_iterator tit;
+  if ((tit = threadGroupsFace.find(lIndex)) == threadGroupsFace.end())
+  {
+    std::cerr <<" *** ASMs3D::integrate: No thread groups for face "<< lIndex
+	      << std::endl;
+    return false;
+  }
+  const utl::ThreadGroups& threadGrp = tit->second;
+
   // Get Gaussian quadrature points and weights
   const double* xg = GaussQuadrature::getCoord(nGauss);
   const double* wg = GaussQuadrature::getWeight(nGauss);
@@ -1493,16 +1504,13 @@ bool ASMs3D::integrate (Integrand& integrand, int lIndex,
   const int nel1 = n1 - p1 + 1;
   const int nel2 = n2 - p2 + 1;
 
-  if (threadGroupsFace.empty())
-    generateThreadGroups();
-
 
   // === Assembly loop over all elements on the patch face =====================
 
-  bool ok=true;
-  for (size_t g=0;g<threadGroupsFace[lIndex-1].size() && ok;++g) {
+  bool ok = true;
+  for (size_t g = 0; g < threadGrp.size() && ok; ++g) {
 #pragma omp parallel for schedule(static)
-    for (size_t t=0;t<threadGroupsFace[lIndex-1][g].size();++t) {
+    for (size_t t = 0; t < threadGrp[g].size(); ++t) {
       FiniteElement fe(p1*p2*p3);
       fe.xi = fe.eta = fe.zeta = faceDir < 0 ? -1.0 : 1.0;
       fe.u = gpar[0](1,1);
@@ -1512,8 +1520,8 @@ bool ASMs3D::integrate (Integrand& integrand, int lIndex,
       Matrix dNdu, Xnod, Jac;
       Vec4   X;
       Vec3   normal;
-      for (size_t l=0;l<threadGroupsFace[lIndex-1][g][t].size();++l) {
-        int iel = threadGroupsFace[lIndex-1][g][t][l];
+      for (size_t l = 0; l < threadGrp[g][t].size(); ++l) {
+        int iel = threadGrp[g][t][l];
         fe.iel = MLGE[iel];
         if (fe.iel < 1) continue; // zero-volume element
 
@@ -2170,60 +2178,81 @@ bool ASMs3D::evalSolution (Matrix& sField, const Integrand& integrand,
 }
 
 
-void ASMs3D::generateThreadGroups()
+void ASMs3D::generateThreadGroups ()
 {
+  const int nel1 = svol->numCoefs(0) - svol->order(0) + 1;
+  const int nel2 = svol->numCoefs(1) - svol->order(1) + 1;
+  const int nel3 = svol->numCoefs(2) - svol->order(2) + 1;
+
+  utl::calcThreadGroups(nel1,nel2,nel3,threadGroupsVol);
+  if (threadGroupsVol.size() < 2) return;
+
+  std::cout <<"\nMultiple threads are utilized during element assembly.";
+  for (size_t i = 0; i < threadGroupsVol.size(); i++)
+  {
+    std::cout <<"\n Thread group "<< i+1;
+    for (size_t j = 0; j < threadGroupsVol[i].size(); j++)
+      std::cout <<"\n\tthread "<< j+1
+		<< ": "<< threadGroupsVol[i][j].size() <<" elements";
+  }
+  std::cout << std::endl;
+}
+
+
+void ASMs3D::generateThreadGroups (char lIndex)
+{
+  if (threadGroupsFace.find(lIndex) != threadGroupsFace.end()) return;
+
   const int p1 = svol->order(0);
   const int p2 = svol->order(1);
   const int p3 = svol->order(2);
   const int n1 = svol->numCoefs(0);
   const int n2 = svol->numCoefs(1);
   const int n3 = svol->numCoefs(2);
-  const int nel1 = n1 - p1 + 1;
-  const int nel2 = n2 - p2 + 1;
-  const int nel3 = n3 - p3 + 1;
 
-  utl::calcThreadGroups(nel1,nel2,nel3,threadGroupsVol);
-
-  // now the faces
-  threadGroupsFace.resize(6);
-  for (int face=0;face<6;++face) {
-    const int faceDir = (face+2)/((face+1)%2 ? -2 : 2);
-    std::vector<int> map;
-    int iel=0;
-    for (int i3 = p3; i3 <= n3; i3++) {
-      for (int i2 = p2; i2 <= n2; i2++) {
-        for (int i1 = p1; i1 <= n1; i1++, iel++) {
-          // Skip elements that are not on current boundary face
-          bool skipMe = false;
-          switch (faceDir)
+  // Find elements that are on the boundary face 'lIndex'
+  IntVec map; map.reserve(this->getNoBoundaryElms(lIndex,2));
+  int d1, d2, iel = 0;
+  for (int i3 = p3; i3 <= n3; i3++)
+    for (int i2 = p2; i2 <= n2; i2++)
+      for (int i1 = p1; i1 <= n1; i1++, iel++)
+	switch (lIndex)
           {
-            case -1: if (i1 > p1) skipMe = true; break;
-            case  1: if (i1 < n1) skipMe = true; break;
-            case -2: if (i2 > p2) skipMe = true; break;
-            case  2: if (i2 < n2) skipMe = true; break;
-            case -3: if (i3 > p3) skipMe = true; break;
-            case  3: if (i3 < n3) skipMe = true; break;
+	  case 1: if (i1 == p1) map.push_back(iel); break;
+	  case 2: if (i1 == n1) map.push_back(iel); break;
+	  case 3: if (i2 == p2) map.push_back(iel); break;
+	  case 4: if (i2 == n2) map.push_back(iel); break;
+	  case 5: if (i3 == p3) map.push_back(iel); break;
+	  case 6: if (i3 == n3) map.push_back(iel); break;
           }
-          if (skipMe)
-            continue;
 
-          map.push_back(iel);
-        }
-      }
+  switch (lIndex)
+    {
+    case 1:
+    case 2:
+      d1 = n2 - p2 + 1;
+      d2 = n3 - p3 + 1;
+      break;
+    case 3:
+    case 4:
+      d1 = n1 - p1 + 1;
+      d2 = n3 - p3 + 1;
+      break;
+    default:
+      d1 = n1 - p1 + 1;
+      d2 = n2 - p2 + 1;
     }
 
-    int d1, d2;
-    if (face < 2) {
-      d1 = nel2;
-      d2 = nel3;
-    } else if (face < 4) {
-      d1 = nel1;
-      d2 = nel3;
-    } else {
-      d1 = nel1;
-      d2 = nel2;
+  utl::ThreadGroups& fGrp = threadGroupsFace[lIndex];
+  utl::calcThreadGroups(d1,d2,fGrp);
+  utl::mapThreadGroups(fGrp,map);
+
+  if (fGrp.size() > 1)
+    for (size_t i = 0; i < threadGroupsVol.size(); i++)
+    {
+      std::cout <<"\n Thread group "<< i+1 <<" for boundary face "<<(int)lIndex;
+      for (size_t j = 0; j < fGrp.size(); j++)
+	std::cout <<"\n\tthread "<< j+1
+		  << ": "<< fGrp[i][j].size() <<" elements";
     }
-    utl::calcThreadGroups(d1,d2,threadGroupsFace[face]);
-    utl::mapThreadGroups(threadGroupsFace[face],map);
-  }
 }
diff --git a/src/ASM/ASMs3D.h b/src/ASM/ASMs3D.h
index 213edfd8..ff37af04 100644
--- a/src/ASM/ASMs3D.h
+++ b/src/ASM/ASMs3D.h
@@ -337,6 +337,7 @@ public:
   //! \param[in] integrand Object with problem-specific data and methods
   Go::SplineVolume* projectSolution(const Integrand& integrand) const;
   //! \brief Projects the secondary solution field onto the primary basis.
+  //! \param[in] integrand Object with problem-specific data and methods
   virtual Go::GeomObject* evalSolution(const Integrand& integrand) const;
 
   //! \brief Evaluates the secondary solution field at the given points.
@@ -352,8 +353,8 @@ public:
   //! \a gpar[0].size() \a X \a gpar[1].size() \a X \a gpar[2].size().
   //! Otherwise, we assume that it contains the \a u, \a v and \a w parameters
   //! directly for each sampling point.
-  bool evalSolution(Matrix& sField, const Integrand& integrand,
-		    const RealArray* gpar, bool regular = true) const;
+  virtual bool evalSolution(Matrix& sField, const Integrand& integrand,
+			    const RealArray* gpar, bool regular = true) const;
 
   //! \brief Calculates parameter values for visualization nodal points.
   //! \param[out] prm Parameter values in given direction for all points
@@ -405,6 +406,12 @@ protected:
   //! \param[in] basis Which basis to return size parameters for (mixed methods)
   virtual bool getSize(int& n1, int& n2, int& n3, int basis = 0) const;
 
+  //! \brief Generates element groups for multi-threading of interior integrals.
+  virtual void generateThreadGroups();
+  //! \brief Generates element groups for multi-threading of boundary integrals.
+  //! \param[in] lIndex Local index [1,6] of the boundary face
+  virtual void generateThreadGroups(char lIndex);
+
   //! \brief Establishes matrices with basis functions and 1st derivatives.
   static void extractBasis(const Go::BasisDerivs& spline,
 			   Vector& N, Matrix& dNdu);
@@ -438,13 +445,10 @@ protected:
   const IndexVec& nodeInd; //!< IJK-triplets for the control points (nodes)
   IndexVec      myNodeInd; //!< The actual IJK-triplet container
 
-  //! Element groups for multithreaded volume assembly
+  //! Element groups for multi-threaded volume assembly
   utl::ThreadGroups threadGroupsVol;
-  //! Element groups for multithreaded face assembly
-  std::vector<utl::ThreadGroups> threadGroupsFace;
-
-  //! \brief Generate thread groups
-  virtual void generateThreadGroups();
+  //! Element groups for multi-threaded face assembly
+  std::map<char,utl::ThreadGroups> threadGroupsFace;
 };
 
 #endif
diff --git a/src/ASM/ASMs3DLag.C b/src/ASM/ASMs3DLag.C
index a470297a..2560ef62 100644
--- a/src/ASM/ASMs3DLag.C
+++ b/src/ASM/ASMs3DLag.C
@@ -201,6 +201,38 @@ bool ASMs3DLag::getSize (int& n1, int& n2, int& n3, int) const
 }
 
 
+size_t ASMs3DLag::getNoBoundaryElms (char lIndex, char ldim) const
+{
+  if (!svol) return 0;
+
+  if (ldim < 1 && lIndex > 0)
+    return 1;
+
+  int nel[3]; // Number of elements in each direction
+  nel[0] = (nx-1)/(svol->order(0)-1);
+  nel[1] = (ny-1)/(svol->order(1)-1);
+  nel[2] = (nz-1)/(svol->order(2)-1);
+
+  if (ldim < 2 && lIndex > 0 && lIndex <= 12)
+    return nel[(lIndex-1)/4];
+
+  switch (lIndex)
+    {
+    case 1:
+    case 2:
+      return nel[1]*nel[2];
+    case 3:
+    case 4:
+      return nel[0]*nel[2];
+    case 5:
+    case 6:
+      return nel[0]*nel[1];
+    }
+
+  return 0;
+}
+
+
 bool ASMs3DLag::integrate (Integrand& integrand,
 			   GlobalIntegral& glInt,
 			   const TimeDomain& time)
@@ -235,9 +267,6 @@ bool ASMs3DLag::integrate (Integrand& integrand,
   const int p2 = svol->order(1);
   const int p3 = svol->order(2);
 
-  if (threadGroupsVol.empty())
-    generateThreadGroups();
-
 
   // === Assembly loop over all elements in the patch ==========================
 
@@ -272,7 +301,7 @@ bool ASMs3DLag::integrate (Integrand& integrand,
           X *= 1.0/(double)Xnod.cols();
         }
 
-	// Initialize element quantities
+        // Initialize element quantities
         fe.iel = MLGE[iel-1];
         LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel);
         if (!integrand.initElement(MNPC[iel-1],X,nRed*nRed*nRed,*A))
@@ -367,8 +396,8 @@ bool ASMs3DLag::integrate (Integrand& integrand,
               }
             }
 
-	// Finalize the element quantities
- 	if (!integrand.finalizeElement(*A,time,firstIp+jp))
+        // Finalize the element quantities
+        if (!integrand.finalizeElement(*A,time,firstIp+jp))
         {
           ok = false;
           break;
@@ -396,6 +425,15 @@ bool ASMs3DLag::integrate (Integrand& integrand, int lIndex,
 {
   if (!svol) return true; // silently ignore empty patches
 
+  std::map<char,utl::ThreadGroups>::const_iterator tit;
+  if ((tit = threadGroupsFace.find(lIndex)) == threadGroupsFace.end())
+  {
+    std::cerr <<" *** ASMs3DLag::integrate: No thread groups for face "<< lIndex
+	      << std::endl;
+    return false;
+  }
+  const utl::ThreadGroups& threadGrp = tit->second;
+
   // Get Gaussian quadrature points and weights
   const double* xg = GaussQuadrature::getCoord(nGauss);
   const double* wg = GaussQuadrature::getWeight(nGauss);
@@ -430,16 +468,13 @@ bool ASMs3DLag::integrate (Integrand& integrand, int lIndex,
   if (vpar.empty()) this->getGridParameters(vpar,1,1);
   if (wpar.empty()) this->getGridParameters(wpar,2,1);
 
-  if (threadGroupsFace.empty())
-    generateThreadGroups();
-
 
   // === Assembly loop over all elements on the patch face =====================
 
-  bool ok=true;
-  for (size_t g=0;g<threadGroupsFace[lIndex-1].size() && ok;++g) {
+  bool ok = true;
+  for (size_t g = 0; g < threadGrp.size() && ok; ++g) {
 #pragma omp parallel for schedule(static)
-    for (size_t t=0;t<threadGroupsFace[lIndex-1][g].size();++t) {
+    for (size_t t = 0; t < threadGrp[g].size(); ++t) {
       FiniteElement fe(p1*p2*p3);
       fe.u = upar.front();
       fe.v = vpar.front();
@@ -449,8 +484,8 @@ bool ASMs3DLag::integrate (Integrand& integrand, int lIndex,
       Vec4   X;
       Vec3   normal;
       double xi[3];
-      for (size_t l=0;l<threadGroupsFace[lIndex-1][g][t].size();++l) {
-        int iel = threadGroupsFace[lIndex-1][g][t][l];
+      for (size_t l = 0; l < threadGrp[g][t].size(); ++l) {
+        int iel = threadGrp[g][t][l];
         int i1  =  iel % nel1;
         int i2  = (iel / nel1) % nel2;
         int i3  =  iel / (nel1*nel2);
@@ -542,8 +577,8 @@ bool ASMs3DLag::integrate (Integrand& integrand, int lIndex,
             }
 	  }
 
-	// Assembly of global system integral
-	if (!glInt.assemble(A->ref(),fe.iel))
+        // Assembly of global system integral
+        if (!glInt.assemble(A->ref(),fe.iel))
         {
           ok = false;
           break;
@@ -840,63 +875,64 @@ bool ASMs3DLag::evalSolution (Matrix& sField, const Integrand& integrand,
 }
 
 
-void ASMs3DLag::generateThreadGroups()
+void ASMs3DLag::generateThreadGroups ()
 {
   const int p1 = svol->order(0);
   const int p2 = svol->order(1);
   const int p3 = svol->order(2);
 
-  // Evaluate the parametric values
-  RealArray gpar[3];
-  getGridParameters(gpar[0],0,p1-1);
-  getGridParameters(gpar[1],1,p2-1);
-  getGridParameters(gpar[2],2,p3-1);
-
-  const int nel1 = (gpar[0].size()-1)/(p1-1);
-  const int nel2 = (gpar[1].size()-1)/(p2-1);
-  const int nel3 = (gpar[2].size()-1)/(p3-1);
+  const int nel1 = (nx-1)/(p1-1);
+  const int nel2 = (ny-1)/(p2-1);
+  const int nel3 = (nz-1)/(p3-1);
 
   utl::calcThreadGroups(nel1,nel2,nel3,threadGroupsVol);
-
-  // now the faces
-  threadGroupsFace.resize(6);
-  for (int face=0;face<6;++face) {
-    const int faceDir = (face+2)/((face+1)%2 ? -2 : 2);
-    std::vector<int> map;
-    int iel=0;
-    for (int i3 = 0; i3 < nel3; i3++) {
-      for (int i2 = 0; i2 < nel2; i2++) {
-        for (int i1 = 0; i1 < nel1; i1++, iel++) {
-          // Skip elements that are not on current boundary face
-          bool skipMe = false;
-          switch (faceDir)
-          {
-            case -1: if (i1 > 0)      skipMe = true; break;
-            case  1: if (i1 < nel1-1) skipMe = true; break;
-            case -2: if (i2 > 0)      skipMe = true; break;
-            case  2: if (i2 < nel2-1) skipMe = true; break;
-            case -3: if (i3 > 0)      skipMe = true; break;
-            case  3: if (i3 < nel3-1) skipMe = true; break;
-          }
-          if (skipMe)
-            continue;
-
-          map.push_back(iel);
-        }
-      }
-    }
-    int d1, d2;
-    if (face < 2) {
-      d1 = nel2;
-      d2 = nel3;
-    } else if (face < 4) {
-      d1 = nel1;
-      d2 = nel3;
-    } else {
-      d1 = nel1;
-      d2 = nel2;
-    }
-    utl::calcThreadGroups(d1,d2,threadGroupsFace[face]);
-    utl::mapThreadGroups(threadGroupsFace[face],map);
-  }
+}
+
+
+void ASMs3DLag::generateThreadGroups (char lIndex)
+{
+  if (threadGroupsFace.find(lIndex) != threadGroupsFace.end()) return;
+
+  const int p1 = svol->order(0);
+  const int p2 = svol->order(1);
+  const int p3 = svol->order(2);
+  const int n1 = (nx-1)/(p1-1);
+  const int n2 = (ny-1)/(p2-1);
+  const int n3 = (nz-1)/(p3-1);
+
+  // Find elements that are on the boundary face 'lIndex'
+  IntVec map; map.reserve(this->getNoBoundaryElms(lIndex,2));
+  int d1, d2, iel = 0;
+  for (int i3 = 1; i3 <= n3; i3++)
+    for (int i2 = 1; i2 <= n2; i2++)
+      for (int i1 = 1; i1 <= n1; i1++, iel++)
+	switch (lIndex)
+          {
+	  case 1: if (i1 ==  1) map.push_back(iel); break;
+	  case 2: if (i1 == n1) map.push_back(iel); break;
+	  case 3: if (i2 ==  1) map.push_back(iel); break;
+	  case 4: if (i2 == n2) map.push_back(iel); break;
+	  case 5: if (i3 ==  1) map.push_back(iel); break;
+	  case 6: if (i3 == n3) map.push_back(iel); break;
+          }
+
+  switch (lIndex)
+    {
+    case 1:
+    case 2:
+      d1 = n2;
+      d2 = n3;
+      break;
+    case 3:
+    case 4:
+      d1 = n1;
+      d2 = n3;
+      break;
+    default:
+      d1 = n1;
+      d2 = n2;
+    }
+
+  utl::calcThreadGroups(d1,d2,threadGroupsFace[lIndex]);
+  utl::mapThreadGroups(threadGroupsFace[lIndex],map);
 }
diff --git a/src/ASM/ASMs3DLag.h b/src/ASM/ASMs3DLag.h
index 003ce70a..aea305ff 100644
--- a/src/ASM/ASMs3DLag.h
+++ b/src/ASM/ASMs3DLag.h
@@ -47,6 +47,17 @@ public:
   //! This is used to reinitialize the patch after it has been refined.
   virtual void clear(bool retainGeometry = false);
 
+  //! \brief Returns a matrix with nodal coordinates for an element.
+  //! \param[out] X 3\f$\times\f$n-matrix, where \a n is the number of nodes
+  //! in one element
+  //! \param[in] iel Element index
+  virtual bool getElementCoordinates(Matrix& X, int iel) const;
+
+  //! \brief Returns a matrix with all nodal coordinates within the patch.
+  //! \param[out] X 3\f$\times\f$n-matrix, where \a n is the number of nodes
+  //! in the patch
+  virtual void getNodalCoordinates(Matrix& X) const;
+
   //! \brief Returns the global coordinates for the given node.
   //! \param[in] inod 1-based node index local to current patch
   virtual Vec3 getCoord(size_t inod) const;
@@ -134,26 +145,21 @@ public:
   virtual bool evalSolution(Matrix& sField, const Integrand& integrand,
 			    const RealArray* gpar, bool regular = true) const;
 
-  //! \brief Returns a matrix with nodal coordinates for an element.
-  //! \param[in] iel Element index
-  //! \param[out] X 3\f$\times\f$n-matrix, where \a n is the number of nodes
-  //! in one element
-  virtual bool getElementCoordinates(Matrix& X, int iel) const;
-
-  //! \brief Generate thread groups
-  void generateThreadGroups();
-public:
-  //! \brief Returns a matrix with all nodal coordinates within the patch.
-  //! \param[out] X 3\f$\times\f$n-matrix, where \a n is the number of nodes
-  //! in the patch
-  virtual void getNodalCoordinates(Matrix& X) const;
-
   //! \brief Returns the number of nodal points in each parameter direction.
   //! \param[out] n1 Number of nodes in first (u) direction
   //! \param[out] n2 Number of nodes in second (v) direction
   //! \param[out] n3 Number of nodes in third (w) direction
   virtual bool getSize(int& n1, int& n2, int& n3, int = 0) const;
 
+  //! \brief Generates element groups for multi-threading of interior integrals.
+  virtual void generateThreadGroups();
+  //! \brief Generates element groups for multi-threading of boundary integrals.
+  //! \param[in] lIndex Local index [1,6] of the boundary face
+  virtual void generateThreadGroups(char lIndex);
+
+  //! \brief Returns the number of elements on a boundary.
+  virtual size_t getNoBoundaryElms(char lIndex, char ldim) const;
+
 private:
   size_t nx; //!< Number of nodes in first parameter direction
   size_t ny; //!< Number of nodes in second parameter direction
diff --git a/src/ASM/ASMs3DSpec.C b/src/ASM/ASMs3DSpec.C
index a380ab6c..184962ac 100644
--- a/src/ASM/ASMs3DSpec.C
+++ b/src/ASM/ASMs3DSpec.C
@@ -101,9 +101,6 @@ bool ASMs3DSpec::integrate (Integrand& integrand,
   if (!Legendre::basisDerivatives(p2,D2)) return false;
   if (!Legendre::basisDerivatives(p3,D3)) return false;
 
-  if (threadGroupsVol.empty())
-    generateThreadGroups();
-
 
   // === Assembly loop over all elements in the patch ==========================
 
@@ -133,6 +130,7 @@ bool ASMs3DSpec::integrate (Integrand& integrand,
           break;
         }
 
+
         // --- Integration loop over all Gauss points in each direction --------
 
         int count = 1;
@@ -185,6 +183,15 @@ bool ASMs3DSpec::integrate (Integrand& integrand, int lIndex,
 {
   if (!svol) return true; // silently ignore empty patches
 
+  std::map<char,utl::ThreadGroups>::const_iterator tit;
+  if ((tit = threadGroupsFace.find(lIndex)) == threadGroupsFace.end())
+  {
+    std::cerr <<" *** ASMs3DSpec::integrate: No thread groups for face "<<lIndex
+	      << std::endl;
+    return false;
+  }
+  const utl::ThreadGroups& threadGrp = tit->second;
+
   // Find the parametric direction of the face normal {-3,-2,-1, 1, 2, 3}
   const int faceDir = (lIndex+1)/(lIndex%2 ? -2 : 2);
 
@@ -212,33 +219,38 @@ bool ASMs3DSpec::integrate (Integrand& integrand, int lIndex,
 
   int nen = p[0]*p[1]*p[2];
 
+
   // === Assembly loop over all elements on the patch face =====================
-  bool ok=true;
-  for (size_t g=0;g<threadGroupsFace[lIndex-1].size() && ok;++g) {
+
+  bool ok = true;
+  for (size_t g = 0; g < threadGrp.size() && ok; ++g) {
 #pragma omp parallel for schedule(static)
-    for (size_t t=0;t<threadGroupsFace[lIndex-1][g].size();++t) {
+    for (size_t t = 0; t < threadGrp[g].size(); ++t) {
       FiniteElement fe(nen);
       Matrix dNdu(nen,3), Xnod, Jac;
       Vec4   X;
       Vec3   normal;
       int    xi[3];
-      for (size_t l=0;l<threadGroupsFace[lIndex-1][g][t].size();++l) {
-        int iel = threadGroupsFace[lIndex-1][g][t][l];
+      for (size_t l = 0; l < threadGrp[g][t].size(); ++l) {
+        int iel = threadGrp[g][t][l];
 
 	// Set up nodal point coordinates for current element
-	if (!this->getElementCoordinates(Xnod,++iel)) {
+        if (!this->getElementCoordinates(Xnod,++iel))
+        {
           ok = false;
           break;
         }
 
 	// Initialize element quantities
-	fe.iel = MLGE[iel-1];
+        fe.iel = MLGE[iel-1];
         LocalIntegral* A = integrand.getLocalIntegral(nen,fe.iel,true);
-        if (!integrand.initElementBou(MNPC[iel-1],*A)) {
+        if (!integrand.initElementBou(MNPC[iel-1],*A))
+        {
           ok = false;
           break;
         }
 
+
 	// --- Integration loop over all Gauss points in each direction --------
 
 	for (int j = 0; j < p[t2-1]; j++)
@@ -265,14 +277,16 @@ bool ASMs3DSpec::integrate (Integrand& integrand, int lIndex,
 
 	    // Evaluate the integrand and accumulate element contributions
 	    fe.detJxW *=  wg[t1-1][i]*wg[t2-1][j];
-            if (!integrand.evalBou(*A,fe,time,X,normal)) {
+            if (!integrand.evalBou(*A,fe,time,X,normal))
+            {
               ok = false;
               break;
             }
 	  }
 
 	// Assembly of global system integral
-	if (!glInt.assemble(A->ref(),fe.iel)) {
+        if (!glInt.assemble(A->ref(),fe.iel))
+        {
           ok = false;
           break;
         }
@@ -379,6 +393,7 @@ bool ASMs3DSpec::integrateEdge (Integrand& integrand, int lEdge,
         LocalIntegral* A = integrand.getLocalIntegral(nen,fe.iel,true);
         if (!integrand.initElementBou(MNPC[iel-1],*A)) return false;
 
+
 	// --- Integration loop over all Gauss points along the edge -----------
 
 	for (int i = 0; i < pe; i++)
diff --git a/src/ASM/ASMs3Dmx.C b/src/ASM/ASMs3Dmx.C
index 824458dc..035106fd 100644
--- a/src/ASM/ASMs3Dmx.C
+++ b/src/ASM/ASMs3Dmx.C
@@ -538,9 +538,6 @@ bool ASMs3Dmx::integrate (Integrand& integrand,
   const int nel2 = n2 - p2 + 1;
   const int nel3 = n3 - p3 + 1;
 
-  if (threadGroupsVol.empty())
-    generateThreadGroups();
-
 
   // === Assembly loop over all elements in the patch ==========================
 
@@ -664,6 +661,15 @@ bool ASMs3Dmx::integrate (Integrand& integrand, int lIndex,
 
   PROFILE2("ASMs3Dmx::integrate(B)");
 
+  std::map<char,utl::ThreadGroups>::const_iterator tit;
+  if ((tit = threadGroupsFace.find(lIndex)) == threadGroupsFace.end())
+  {
+    std::cerr <<" *** ASMs3D::integrate: No thread groups for face "<< lIndex
+	      << std::endl;
+    return false;
+  }
+  const utl::ThreadGroups& threadGrp = tit->second;
+
   // Get Gaussian quadrature points and weights
   const double* xg = GaussQuadrature::getCoord(nGauss);
   const double* wg = GaussQuadrature::getWeight(nGauss);
@@ -706,16 +712,13 @@ bool ASMs3Dmx::integrate (Integrand& integrand, int lIndex,
   const int nel1 = n1 - p1 + 1;
   const int nel2 = n2 - p2 + 1;
 
-  if (threadGroupsFace.empty())
-    generateThreadGroups();
-
 
   // === Assembly loop over all elements on the patch face =====================
 
-  bool ok=true;
-  for (size_t g=0;g<threadGroupsFace[lIndex-1].size() && ok;++g) {
+  bool ok = true;
+  for (size_t g = 0; g < threadGrp.size() && ok; ++g) {
 #pragma omp parallel for schedule(static)
-    for (size_t t=0;t<threadGroupsFace[lIndex-1][g].size();++t) {
+    for (size_t t = 0; t < threadGrp[g].size(); ++t) {
       MxFiniteElement fe(basis1->order(0)*basis1->order(1)*basis1->order(2),
                          basis2->order(0)*basis2->order(1)*basis2->order(2));
       fe.xi = fe.eta = fe.zeta = faceDir < 0 ? -1.0 : 1.0;
@@ -726,8 +729,8 @@ bool ASMs3Dmx::integrate (Integrand& integrand, int lIndex,
       Matrix dN1du, dN2du, Xnod, Jac;
       Vec4   X;
       Vec3   normal;
-      for (size_t l=0;l<threadGroupsFace[lIndex-1][g][t].size();++l) {
-        int iel = threadGroupsFace[lIndex-1][g][t][l];
+      for (size_t l = 0; l < threadGrp[g][t].size(); ++l) {
+        int iel = threadGrp[g][t][l];
         fe.iel = MLGE[iel];
         if (fe.iel < 1) continue; // zero-volume element
 
diff --git a/src/ASM/ASMs3DmxLag.C b/src/ASM/ASMs3DmxLag.C
index 74204a47..1338d405 100644
--- a/src/ASM/ASMs3DmxLag.C
+++ b/src/ASM/ASMs3DmxLag.C
@@ -256,9 +256,6 @@ bool ASMs3DmxLag::integrate (Integrand& integrand,
   const int q2 = p2 - 1;
   const int q3 = p3 - 1;
 
-  if (threadGroupsVol.empty())
-    generateThreadGroups();
-
 
   // === Assembly loop over all elements in the patch ==========================
 
@@ -364,6 +361,15 @@ bool ASMs3DmxLag::integrate (Integrand& integrand, int lIndex,
 {
   if (!svol) return true; // silently ignore empty patches
 
+  std::map<char,utl::ThreadGroups>::const_iterator tit;
+  if ((tit = threadGroupsFace.find(lIndex)) == threadGroupsFace.end())
+  {
+    std::cerr <<" *** ASMs3DLag::integrate: No thread groups for face "<< lIndex
+	      << std::endl;
+    return false;
+  }
+  const utl::ThreadGroups& threadGrp = tit->second;
+
   // Get Gaussian quadrature points and weights
   const double* xg = GaussQuadrature::getCoord(nGauss);
   const double* wg = GaussQuadrature::getWeight(nGauss);
@@ -389,23 +395,20 @@ bool ASMs3DmxLag::integrate (Integrand& integrand, int lIndex,
   const int nel1 = (nx2-1)/(p1-1);
   const int nel2 = (ny2-1)/(p2-1);
 
-  if (threadGroupsFace.empty())
-    generateThreadGroups();
-
 
   // === Assembly loop over all elements on the patch face =====================
 
-  bool ok=true;
-  for (size_t g=0;g<threadGroupsFace[lIndex-1].size() && ok;++g) {
+  bool ok = true;
+  for (size_t g = 0; g < threadGrp.size() && ok; ++g) {
 #pragma omp parallel for schedule(static)
-    for (size_t t=0;t<threadGroupsFace[lIndex-1][g].size();++t) {
+    for (size_t t = 0; t < threadGrp[g].size(); ++t) {
       MxFiniteElement fe(p1*p2*p3,q1*q2*q3);
       Matrix dN1du, dN2du, Xnod, Jac;
       Vec4   X;
       Vec3   normal;
       double xi[3];
-      for (size_t l=0;l<threadGroupsFace[lIndex-1][g][t].size();++l) {
-        int iel = threadGroupsFace[lIndex-1][g][t][l];
+      for (size_t l = 0; l < threadGrp[g][t].size(); ++l) {
+        int iel = threadGrp[g][t][l];
         int i1  =  iel % nel1;
         int i2  = (iel / nel1) % nel2;
         int i3  =  iel / (nel1*nel2);
diff --git a/src/SIM/SIMbase.C b/src/SIM/SIMbase.C
index 2e740b9c..8119285d 100644
--- a/src/SIM/SIMbase.C
+++ b/src/SIM/SIMbase.C
@@ -667,18 +667,19 @@ bool SIMbase::preprocess (const std::vector<int>& ignored, bool fixDup)
   std::cout <<"\nResolving Dirichlet boundary conditions"<< std::endl;
 
   bool ok = true;
-  for (PropertyVec::const_iterator p = myProps.begin(); p != myProps.end(); p++)
-    switch (p->pcode) {
+  PropertyVec::const_iterator q;
+  for (q = myProps.begin(); q != myProps.end(); q++)
+    switch (q->pcode) {
 
     case Property::DIRICHLET:
-      if (this->addConstraint(p->patch,p->lindx,p->ldim,p->pindx))
+      if (this->addConstraint(q->patch,q->lindx,q->ldim,q->pindx))
 	std::cout << std::endl;
       else
 	ok = false;
       break;
 
     case Property::DIRICHLET_INHOM:
-      if (this->addConstraint(p->patch,p->lindx,p->ldim,p->pindx%1000,p->pindx))
+      if (this->addConstraint(q->patch,q->lindx,q->ldim,q->pindx%1000,q->pindx))
 	std::cout << std::endl;
       else
 	ok = false;
@@ -700,6 +701,15 @@ bool SIMbase::preprocess (const std::vector<int>& ignored, bool fixDup)
   // Resolve possibly chaining of the MPC equations
   if (!allMPCs.empty()) ASMbase::resolveMPCchains(allMPCs);
 
+  // Generate element groups for multi-threading
+  for (mit = myModel.begin(); mit != myModel.end(); mit++)
+    (*mit)->generateThreadGroups();
+  for (q = myProps.begin(); q != myProps.end(); q++)
+    if (q->pcode == Property::NEUMANN)
+      if (q->patch > 0 && q->patch <= myModel.size())
+	if (q->ldim+1 == myModel[q->patch-1]->getNoParamDim())
+	  myModel[q->patch-1]->generateThreadGroups(q->lindx);
+
   // Preprocess the result points
   for (ResPointVec::iterator p = myPoints.begin(); p != myPoints.end();)
   {