Added: Multi-threaded assembly

git-svn-id: http://svn.sintef.no/trondheim/IFEM/trunk@1406 e10b68d5-8a6e-419e-a041-bce267b0401d
2012-01-18 13:43:34 +00:00 · 2012-01-18 13:43:34 +00:00 · 35564f1ec9
commit 35564f1ec9
parent f8e50c2d6e
16 changed files with 1759 additions and 1183 deletions
--- a/src/ASM/ASMs2D.C
+++ b/src/ASM/ASMs2D.C
@ -20,6 +20,7 @@
 #include "TimeDomain.h"
 #include "FiniteElement.h"
 #include "GlobalIntegral.h"
+#include "LocalIntegral.h"
 #include "IntegrandBase.h"
 #include "CoordinateMapping.h"
 #include "GaussQuadrature.h"
@ -29,8 +30,6 @@
 #include "Vec3Oper.h"
 #include "Tensor.h"
 #include "MPC.h"
-#include <algorithm>
-#include "ElmNorm.h"


 ASMs2D::ASMs2D (unsigned char n_s, unsigned char n_f)
@ -904,7 +903,7 @@ static bool getG (const Matrix& Jinv, const Vector& du, Matrix& G)

  G.resize(nsd,nsd,true);

-  for (size_t k = 1;k <= nsd;k++) 
+  for (size_t k = 1;k <= nsd;k++)
    for (size_t l = 1;l <= nsd;l++)
      for (size_t m = 1;m <= nsd;m++)
 	G(k,l) += Jinv(m,k)*Jinv(m,l)/(du(k)*du(l));
@ -1009,174 +1008,211 @@ bool ASMs2D::integrate (Integrand& integrand,
  const int p1 = surf->order_u();
  const int p2 = surf->order_v();
  const int n1 = surf->numCoefs_u();
-  const int n2 = surf->numCoefs_v();
  const int nel1 = n1 - p1 + 1;

-  FiniteElement fe(p1*p2);
-  Matrix   dNdu, Xnod, Jac;
-  Matrix3D d2Ndu2, Hess;
-  Vec4     X;
+  if (threadGroups.empty())
+    generateThreadGroups();
+

  // === Assembly loop over all elements in the patch ==========================

-  int iel = 1;
-  for (int i2 = p2; i2 <= n2; i2++)
-    for (int i1 = p1; i1 <= n1; i1++, iel++)
-    {
-      fe.iel = MLGE[iel-1];
-      if (fe.iel < 1) continue; // zero-area element
+  bool ok=true;
+  for (size_t g=0;g<threadGroups.size() && ok;++g) {
+#pragma omp parallel for schedule(static)
+    for (size_t t=0;t<threadGroups[g].size();++t) {
+      FiniteElement fe(p1*p2);
+      Matrix   dNdu, Xnod, Jac;
+      Matrix3D d2Ndu2, Hess;
+      Vec4     X;
+      for (size_t i=0;i<threadGroups[g][t].size();++i) {
+        int iel = threadGroups[g][t][i];
+        fe.iel = MLGE[iel];
+        if (fe.iel < 1) continue; // zero-area element

-      // Get element area in the parameter space
-      double dA = this->getParametricArea(iel);
-      if (dA < 0.0) return false; // topology error (probably logic error)
+        int i1 = p1 + iel % nel1;
+        int i2 = p2 + iel / nel1;

-      // Set up control point (nodal) coordinates for current element
-      if (!this->getElementCoordinates(Xnod,iel)) return false;
+        // Get element area in the parameter space
+        double dA = this->getParametricArea(++iel);
+        if (dA < 0.0) // topology error (probably logic error)
+        {
+          ok = false;
+          break;
+        }

-      // Compute characteristic element length, if needed
-      if (integrand.getIntegrandType() == 2) 
-	fe.h   = getElmSize(p1,p2,Xnod);
-      else if (integrand.getIntegrandType() == 3)
-      {
-	// --- Compute average value of basis functions over the element -------
+        // Set up control point (nodal) coordinates for current element
+        if (!this->getElementCoordinates(Xnod,iel))
+        {
+          ok = false;
+          break;
+        }

-	fe.Navg.resize(p1*p2,true);
-	double area = 0.0;
-	int ip = ((i2-p2)*nGauss*nel1 + i1-p1)*nGauss;
-	for (int j = 0; j < nGauss; j++, ip += nGauss*(nel1-1))
-	  for (int i = 0; i < nGauss; i++, ip++)
-	  {
-	    // Fetch basis function derivatives at current integration point
-	    extractBasis(spline[ip],fe.N,dNdu);
+        // Compute characteristic element length, if needed
+        if (integrand.getIntegrandType() == 2)
+          fe.h = getElmSize(p1,p2,Xnod);

-	    // Compute Jacobian determinant of coordinate mapping
-	    // and multiply by weight of current integration point
-	    double detJac = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu,false);
-	    double weight = 0.25*dA*wg[i]*wg[j];
+        else if (integrand.getIntegrandType() == 3)
+        {
+          // --- Compute average value of basis functions over the element -----

-	    // Numerical quadrature
-	    fe.Navg.add(fe.N,detJac*weight);
-	    area += detJac*weight;
-	  }
+          fe.Navg.resize(p1*p2,true);
+          double area = 0.0;
+          int ip = ((i2-p2)*nGauss*nel1 + i1-p1)*nGauss;
+          for (int j = 0; j < nGauss; j++, ip += nGauss*(nel1-1))
+            for (int i = 0; i < nGauss; i++, ip++)
+            {
+              // Fetch basis function derivatives at current integration point
+              extractBasis(spline[ip],fe.N,dNdu);

-	// Divide by element area
-	fe.Navg /= area;
-      }
+              // Compute Jacobian determinant of coordinate mapping
+              // and multiply by weight of current integration point
+              double detJac = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu,false);
+              double weight = 0.25*dA*wg[i]*wg[j];

-      else if (integrand.getIntegrandType() == 4)
-      {
-	// Compute the element center
-	Go::Point X0;
-	double u0 = 0.5*(gpar[0](1,i1-p1+1) + gpar[0](nGauss,i1-p1+1));
-	double v0 = 0.5*(gpar[1](1,i2-p2+1) + gpar[1](nGauss,i2-p2+1));
-	surf->point(X0,u0,v0);
-	for (unsigned char i = 0; i < nsd; i++)
-	  X[i] = X0[i];
-      }
+              // Numerical quadrature
+              fe.Navg.add(fe.N,detJac*weight);
+              area += detJac*weight;
+            }

-      // Initialize element quantities
-       Vectors vec;
-       LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,false);
-       if (!integrand.initElement(MNPC[iel-1],X,nRed*nRed,vec,*A))
-         return false;
+          // Divide by element area
+          fe.Navg /= area;
+        }

-/*      if (integrand.getIntegrandType() > 10)
-      {
-	// --- Selective reduced integration loop ------------------------------
+        else if (integrand.getIntegrandType() == 4)
+        {
+          // Compute the element center
+          Go::Point X0;
+          double u0 = 0.5*(gpar[0](1,i1-p1+1) + gpar[0](nGauss,i1-p1+1));
+          double v0 = 0.5*(gpar[1](1,i2-p2+1) + gpar[1](nGauss,i2-p2+1));
+          surf->point(X0,u0,v0);
+          for (unsigned char i = 0; i < nsd; i++)
+            X[i] = X0[i];
+        }

-	int ip = ((i2-p2)*nRed*nel1 + i1-p1)*nRed;
-	for (int j = 0; j < nRed; j++, ip += nRed*(nel1-1))
-	  for (int i = 0; i < nRed; i++, ip++)
-	  {
-	    // Local element coordinates of current integration point
-	    fe.xi  = xr[i];
-	    fe.eta = xr[j];
+        // Initialize element quantities
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel);
+        if (!integrand.initElement(MNPC[iel-1],X,nRed*nRed,*A))
+        {
+          ok = false;
+          break;
+        }

-	    // Parameter values of current integration point
-	    fe.u = redpar[0](i+1,i1-p1+1);
-	    fe.v = redpar[1](j+1,i2-p2+1);
+        if (integrand.getIntegrandType() > 10)
+        {
+          // --- Selective reduced integration loop ----------------------------

-	    // Fetch basis function derivatives at current point
-	    extractBasis(splineRed[ip],fe.N,dNdu);
+          int ip = ((i2-p2)*nRed*nel1 + i1-p1)*nRed;
+          for (int j = 0; j < nRed; j++, ip += nRed*(nel1-1))
+            for (int i = 0; i < nRed; i++, ip++)
+            {
+              // Local element coordinates of current integration point
+              fe.xi  = xr[i];
+              fe.eta = xr[j];

-	    // Compute Jacobian inverse and derivatives
-	    fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
+              // Parameter values of current integration point
+              fe.u = redpar[0](i+1,i1-p1+1);
+              fe.v = redpar[1](j+1,i2-p2+1);

-	    // Cartesian coordinates of current integration point
-	    X = Xnod * fe.N;
-	    X.t = time.t;
+              // Fetch basis function derivatives at current point
+              extractBasis(splineRed[ip],fe.N,dNdu);

-	    // Compute the reduced integration terms of the integrand
-	    if (!integrand.reducedInt(fe,X))
-	      return false;
-	  }
-      }
-*/
+              // Compute Jacobian inverse and derivatives
+              fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);

-      // --- Integration loop over all Gauss points in each direction ----------
+              // Cartesian coordinates of current integration point
+              X = Xnod * fe.N;
+              X.t = time.t;

-      int ip = ((i2-p2)*nGauss*nel1 + i1-p1)*nGauss;
-      for (int j = 0; j < nGauss; j++, ip += nGauss*(nel1-1))
-	for (int i = 0; i < nGauss; i++, ip++)
-	{
-	  // Local element coordinates of current integration point
-	  fe.xi  = xg[i];
-	  fe.eta = xg[j];
+              // Compute the reduced integration terms of the integrand
+              if (!integrand.reducedInt(*A,fe,X))
+              {
+                ok = false;
+                break;
+              }
+            }
+        }

-	  // Parameter values of current integration point
-	  fe.u = gpar[0](i+1,i1-p1+1);
-	  fe.v = gpar[1](j+1,i2-p2+1);

-	  // Fetch basis function derivatives at current integration point
-	  if (integrand.getIntegrandType() == 2)
-	    extractBasis(spline2[ip],fe.N,dNdu,d2Ndu2);
-	  else
-	    extractBasis(spline[ip],fe.N,dNdu);
+        // --- Integration loop over all Gauss points in each direction --------

-	  // Compute Jacobian inverse of coordinate mapping and derivatives
-	  fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
-	  if (fe.detJxW == 0.0) continue; // skip singular points
+        int ip = ((i2-p2)*nGauss*nel1 + i1-p1)*nGauss;
+        for (int j = 0; j < nGauss; j++, ip += nGauss*(nel1-1))
+          for (int i = 0; i < nGauss; i++, ip++)
+          {
+            // Local element coordinates of current integration point
+            fe.xi  = xg[i];
+            fe.eta = xg[j];

-	  // Compute Hessian of coordinate mapping and 2nd order derivatives
-	  if (integrand.getIntegrandType() == 2) {
-	    if (!utl::Hessian(Hess,fe.d2NdX2,Jac,Xnod,d2Ndu2,dNdu))
-	      return false;
+            // Parameter values of current integration point
+            fe.u = gpar[0](i+1,i1-p1+1);
+            fe.v = gpar[1](j+1,i2-p2+1);
+
+            // Fetch basis function derivatives at current integration point
+            if (integrand.getIntegrandType() == 2)
+              extractBasis(spline2[ip],fe.N,dNdu,d2Ndu2);
+            else
+              extractBasis(spline[ip],fe.N,dNdu);
+
+            // Compute Jacobian inverse of coordinate mapping and derivatives
+            fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
+            if (fe.detJxW == 0.0) continue; // skip singular points
+
+            // Compute Hessian of coordinate mapping and 2nd order derivatives
+            if (integrand.getIntegrandType() == 2)
+              if (!utl::Hessian(Hess,fe.d2NdX2,Jac,Xnod,d2Ndu2,dNdu))
+              {
+                ok = false;
+                break;
+              }

 	    // RUNAR
 	    Vector dXidu(2);
 	    dXidu(1) = this->getParametricLength(iel,1);
 	    dXidu(2) = this->getParametricLength(iel,2);
 	    if (!getG(Jac,dXidu,fe.G))
-	      return false;
-	  }
+            {
+              ok = false;
+              break;
+            }

 #if SP_DEBUG > 4
-	  std::cout <<"\niel, ip = "<< iel <<" "<< ip
-		    <<"\nN ="<< fe.N <<"dNdX ="<< fe.dNdX << std::endl;
+            std::cout <<"\niel, ip = "<< iel <<" "<< ip
+                      <<"\nN ="<< fe.N <<"dNdX ="<< fe.dNdX << std::endl;
 #endif

-	  // Cartesian coordinates of current integration point
-	  X = Xnod * fe.N;
-	  X.t = time.t;
+            // Cartesian coordinates of current integration point
+            X = Xnod * fe.N;
+            X.t = time.t;

-	  // Evaluate the integrand and accumulate element contributions
-	  fe.detJxW *= 0.25*dA*wg[i]*wg[j];
-	  if (!integrand.evalInt(*A,fe,time,X,vec))
-	    return false;
-	}
+            // Evaluate the integrand and accumulate element contributions
+            fe.detJxW *= 0.25*dA*wg[i]*wg[j];
+            if (!integrand.evalInt(*A,fe,time,X))
+            {
+              ok = false;
+              break;
+            }
+          }

-      // Finalize the element quantities
-      if (!integrand.finalizeElement(A,time))
-	return false;
+        // Finalize the element quantities
+        if (!integrand.finalizeElement(*A,time))
+        {
+          ok = false;
+          break;
+        }

-      // Assembly of global system integral
-      if (!glInt.assemble(A,fe.iel))
-	return false;
-      if (!dynamic_cast<ElmNorm*>(A))
-        delete A;
+        // Assembly of global system integral
+        if (!glInt.assemble(A->ref(),fe.iel))
+        {
+          ok = false;
+          break;
+        }
+
+        A->destruct();
+      }
    }
+  }

-  return true;
+  return ok;
 }


@ -1262,9 +1298,8 @@ bool ASMs2D::integrate (Integrand& integrand, int lIndex,
      if (!this->getElementCoordinates(Xnod,iel)) return false;

      // Initialize element quantities
-      Vectors vec;
-      LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,true);
-      if (!integrand.initElementBou(MNPC[iel-1],vec,*A)) return false;
+      LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel,true);
+      if (!integrand.initElementBou(MNPC[iel-1],*A)) return false;

      // --- Integration loop over all Gauss points along the edge -------------

@ -1299,15 +1334,15 @@ bool ASMs2D::integrate (Integrand& integrand, int lIndex,

 	// Evaluate the integrand and accumulate element contributions
 	fe.detJxW *= 0.5*dS*wg[i];
-	if (!integrand.evalBou(*A,fe,time,X,normal,vec))
+	if (!integrand.evalBou(*A,fe,time,X,normal))
 	  return false;
      }

      // Assembly of global system integral
-      if (!glInt.assemble(A,fe.iel))
+      if (!glInt.assemble(A->ref(),fe.iel))
 	return false;
-      if (!dynamic_cast<ElmNorm*>(A))
-        delete A;
+
+      A->destruct();
    }

  return true;
@ -1675,3 +1710,16 @@ bool ASMs2D::evalSolution (Matrix& sField, const Integrand& integrand,

  return true;
 }
+
+
+void ASMs2D::generateThreadGroups()
+{
+  const int p1 = surf->order_u();
+  const int p2 = surf->order_v();
+  const int n1 = surf->numCoefs_u();
+  const int n2 = surf->numCoefs_v();
+  const int nel1 = n1 - p1 + 1;
+  const int nel2 = n2 - p2 + 1;
+
+  utl::calcThreadGroups(nel1,nel2,threadGroups);
+}
--- a/src/ASM/ASMs2D.h
+++ b/src/ASM/ASMs2D.h
@ -16,6 +16,7 @@

 #include "ASMstruct.h"
 #include "ASM2D.h"
+#include "Utilities.h"

 namespace Go {
  class SplineSurface;
@ -207,7 +208,6 @@ public:
  //! \param integrand Object with problem-specific data and methods
  //! \param glbInt The integrated quantity
  //! \param[in] time Parameters for nonlinear/time-dependent simulations
-  //! \param locInt Vector of element-wise contributions to \a glbInt
  virtual bool integrate(Integrand& integrand,
 			 GlobalIntegral& glbInt, const TimeDomain& time);

@ -216,7 +216,6 @@ public:
  //! \param[in] lIndex Local index of the boundary edge
  //! \param glbInt The integrated quantity
  //! \param[in] time Parameters for nonlinear/time-dependent simulations
-  //! \param locInt Vector of element-wise contributions to \a glbInt
  virtual bool integrate(Integrand& integrand, int lIndex,
 			 GlobalIntegral& glbInt, const TimeDomain& time);

@ -374,6 +373,12 @@ protected:

  const IndexVec& nodeInd; //!< IJ-pairs for the control points (nodes)
  IndexVec      myNodeInd; //!< The actual IJ-pair container
+
+  //! \brief Element groups for multithreaded assembly
+  utl::ThreadGroups threadGroups;
+
+  //! \brief Generate thread groups
+  virtual void generateThreadGroups();
 };

 #endif
--- a/src/ASM/ASMs2DLag.C
+++ b/src/ASM/ASMs2DLag.C
@ -18,13 +18,13 @@
 #include "TimeDomain.h"
 #include "FiniteElement.h"
 #include "GlobalIntegral.h"
+#include "LocalIntegral.h"
 #include "IntegrandBase.h"
 #include "CoordinateMapping.h"
 #include "GaussQuadrature.h"
 #include "ElementBlock.h"
 #include "Utilities.h"
 #include "Vec3Oper.h"
-#include "ElmNorm.h"


 ASMs2DLag::ASMs2DLag (unsigned char n_s, unsigned char n_f)
@ -213,120 +213,145 @@ bool ASMs2DLag::integrate (Integrand& integrand,

  // Number of elements in each direction
  const int nelx = upar.size() - 1;
-  const int nely = vpar.size() - 1;

  // Order of basis in the two parametric directions (order = degree + 1)
  const int p1 = surf->order_u();
  const int p2 = surf->order_v();

-  FiniteElement fe(p1*p2);
-  Matrix dNdu, Xnod, Jac;
-  Vec4   X;
+  if (threadGroups.empty())
+    generateThreadGroups();


  // === Assembly loop over all elements in the patch ==========================

-  int iel = 1;
-  for (int i2 = 0; i2 < nely; i2++)
-    for (int i1 = 0; i1 < nelx; i1++, iel++)
-    {
-      fe.iel = MLGE[iel-1];
+  bool ok=true;
+  for (size_t g=0;g<threadGroups.size() && ok;++g) {
+#pragma omp parallel for schedule(static)
+    for (size_t t=0;t<threadGroups[g].size();++t) {
+      FiniteElement fe(p1*p2);
+      Matrix dNdu, Xnod, Jac;
+      Vec4   X;
+      for (size_t i=0;i<threadGroups[g][t].size();++i) {
+        int iel = threadGroups[g][t][i];
+        int i1  = iel % nelx;
+        int i2  = iel / nelx;

-      // Set up nodal point coordinates for current element
-      if (!this->getElementCoordinates(Xnod,iel)) return false;
+        // Set up nodal point coordinates for current element
+        if (!this->getElementCoordinates(Xnod,++iel)) {
+          ok = false;
+          break;
+        }

-      if (integrand.getIntegrandType() == 4)
-      {
-	// Compute the element "center" (average of element node coordinates)
-	X = 0.0;
-	for (size_t i = 1; i <= nsd; i++)
-	  for (size_t j = 1; j <= Xnod.cols(); j++)
-	    X[i-1] += Xnod(i,j);
+        if (integrand.getIntegrandType() == 4)
+        {
+          // Compute the element "center" (average of element node coordinates)
+          X = 0.0;
+          for (size_t i = 1; i <= nsd; i++)
+            for (size_t j = 1; j <= Xnod.cols(); j++)
+              X[i-1] += Xnod(i,j);

-	X *= 1.0/(double)Xnod.cols();
+          X *= 1.0/(double)Xnod.cols();
+        }
+
+        // Initialize element quantities
+        fe.iel = MLGE[iel-1];
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel);
+        if (!integrand.initElement(MNPC[iel-1],X,nRed*nRed,*A)) {
+          ok = false;
+          break;
+        }
+
+        // --- Selective reduced integration loop ------------------------------
+
+        if (integrand.getIntegrandType() > 10)
+          for (int j = 0; j < nRed; j++)
+            for (int i = 0; i < nRed; i++)
+            {
+              // Local element coordinates of current integration point
+              fe.xi  = xr[i];
+              fe.eta = xr[j];
+
+              // Parameter value of current integration point
+              fe.u = 0.5*(upar[i1]*(1.0-xr[i]) + upar[i1+1]*(1.0+xr[i]));
+              fe.v = 0.5*(vpar[i2]*(1.0-xr[j]) + vpar[i2+1]*(1.0+xr[j]));
+
+              // Compute basis function derivatives at current point
+              // using tensor product of one-dimensional Lagrange polynomials
+              if (!Lagrange::computeBasis(fe.N,dNdu,p1,xr[i],p2,xr[j]))
+              {
+                ok = false;
+                break;
+              }
+
+              // Compute Jacobian inverse and derivatives
+              fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
+
+              // Cartesian coordinates of current integration point
+              X = Xnod * fe.N;
+              X.t = time.t;
+
+              // Compute the reduced integration terms of the integrand
+              if (!integrand.reducedInt(*A,fe,X))
+              {
+                ok = false;
+                break;
+              }
+            }
+
+        // --- Integration loop over all Gauss points in each direction --------
+
+        for (int j = 0; j < nGauss; j++)
+          for (int i = 0; i < nGauss; i++)
+          {
+            // Local element coordinates of current integration point
+            fe.xi  = xg[i];
+            fe.eta = xg[j];
+
+            // Parameter value of current integration point
+            fe.u = 0.5*(upar[i1]*(1.0-xg[i]) + upar[i1+1]*(1.0+xg[i]));
+            fe.v = 0.5*(vpar[i2]*(1.0-xg[j]) + vpar[i2+1]*(1.0+xg[j]));
+
+            // Compute basis function derivatives at current integration point
+            // using tensor product of one-dimensional Lagrange polynomials
+            if (!Lagrange::computeBasis(fe.N,dNdu,p1,xg[i],p2,xg[j])) {
+              ok = false;
+              break;
+            }
+
+            // Compute Jacobian inverse of coordinate mapping and derivatives
+            fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
+            if (fe.detJxW == 0.0) continue; // skip singular points
+
+            // Cartesian coordinates of current integration point
+            X = Xnod * fe.N;
+            X.t = time.t;
+
+            // Evaluate the integrand and accumulate element contributions
+            fe.detJxW *= wg[i]*wg[j];
+            if (!integrand.evalInt(*A,fe,time,X)) {
+              ok = false;
+              break;
+            }
+          }
+
+        // Finalize the element quantities
+        if (!integrand.finalizeElement(*A,time)) {
+          ok = false;
+          break;
+        }
+
+        // Assembly of global system integral
+        if (!glInt.assemble(A->ref(),fe.iel)) {
+          ok = false;
+          break;
+        }
+
+        A->destruct();
      }
-
-      // Initialize element quantities
-      Vectors vec;
-      LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,false);
-      if (!integrand.initElement(MNPC[iel-1],X,nRed*nRed,vec,*A)) return false;
-
-      // --- Selective reduced integration loop --------------------------------
-
-/*      if (integrand.getIntegrandType() > 10)
-	for (int j = 0; j < nRed; j++)
-	  for (int i = 0; i < nRed; i++)
-	  {
-	    // Local element coordinates of current integration point
-	    fe.xi  = xr[i];
-	    fe.eta = xr[j];
-
-	    // Parameter value of current integration point
-	    fe.u = 0.5*(upar[i1]*(1.0-xr[i]) + upar[i1+1]*(1.0+xr[i]));
-	    fe.v = 0.5*(vpar[i2]*(1.0-xr[j]) + vpar[i2+1]*(1.0+xr[j]));
-
-	    // Compute basis function derivatives at current point
-	    // using tensor product of one-dimensional Lagrange polynomials
-	    if (!Lagrange::computeBasis(fe.N,dNdu,p1,xr[i],p2,xr[j]))
-	      return false;
-
-	    // Compute Jacobian inverse and derivatives
-	    fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
-
-	    // Cartesian coordinates of current integration point
-	    X = Xnod * fe.N;
-	    X.t = time.t;
-
-	    // Compute the reduced integration terms of the integrand
-	    if (!integrand.reducedInt(fe,X))
-	      return false;
-	  }
-*/
-
-      // --- Integration loop over all Gauss points in each direction ----------
-
-      for (int j = 0; j < nGauss; j++)
-	for (int i = 0; i < nGauss; i++)
-	{
-	  // Local element coordinates of current integration point
-	  fe.xi  = xg[i];
-	  fe.eta = xg[j];
-
-	  // Parameter value of current integration point
-	  fe.u = 0.5*(upar[i1]*(1.0-xg[i]) + upar[i1+1]*(1.0+xg[i]));
-	  fe.v = 0.5*(vpar[i2]*(1.0-xg[j]) + vpar[i2+1]*(1.0+xg[j]));
-
-	  // Compute basis function derivatives at current integration point
-	  // using tensor product of one-dimensional Lagrange polynomials
-	  if (!Lagrange::computeBasis(fe.N,dNdu,p1,xg[i],p2,xg[j]))
-	    return false;
-
-	  // Compute Jacobian inverse of coordinate mapping and derivatives
-	  fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
-	  if (fe.detJxW == 0.0) continue; // skip singular points
-
-	  // Cartesian coordinates of current integration point
-	  X = Xnod * fe.N;
-	  X.t = time.t;
-
-	  // Evaluate the integrand and accumulate element contributions
-	  fe.detJxW *= wg[i]*wg[j];
-          if (!integrand.evalInt(*A,fe,time,X,vec))
-	    return false;
-	}
-
-      // Finalize the element quantities
-      if (!integrand.finalizeElement(A,time))
-	return false;
-
-      // Assembly of global system integral
-      if (!glInt.assemble(A,fe.iel))
-	return false;
-      if (!dynamic_cast<ElmNorm*>(A))
-        delete A;
    }
+  }

-  return true;
+  return ok;
 }


@ -381,8 +406,6 @@ bool ASMs2DLag::integrate (Integrand& integrand, int lIndex,
  for (int i2 = 0; i2 < nely; i2++)
    for (int i1 = 0; i1 < nelx; i1++, iel++)
    {
-      fe.iel = MLGE[iel-1];
-
      // Skip elements that are not on current boundary edge
      bool skipMe = false;
      switch (edgeDir)
@ -398,9 +421,9 @@ bool ASMs2DLag::integrate (Integrand& integrand, int lIndex,
      if (!this->getElementCoordinates(Xnod,iel)) return false;

      // Initialize element quantities
-      Vectors vec;
-      LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,true);
-      if (!integrand.initElementBou(MNPC[iel-1],vec,*A)) return false;
+      fe.iel = MLGE[iel-1];
+      LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel,true);
+      if (!integrand.initElementBou(MNPC[iel-1],*A)) return false;

      // --- Integration loop over all Gauss points along the edge -------------

@ -435,15 +458,15 @@ bool ASMs2DLag::integrate (Integrand& integrand, int lIndex,

 	// Evaluate the integrand and accumulate element contributions
 	fe.detJxW *= wg[i];
-        if (!integrand.evalBou(*A,fe,time,X,normal,vec))
+        if (!integrand.evalBou(*A,fe,time,X,normal))
 	  return false;
      }

      // Assembly of global system integral
-      if (!glInt.assemble(A,fe.iel))
+      if (!glInt.assemble(A->ref(),fe.iel))
 	return false;
-      if (!dynamic_cast<ElmNorm*>(A))
-        delete A;
+
+      A->destruct();
    }

  return true;
@ -592,3 +615,20 @@ bool ASMs2DLag::evalSolution (Matrix& sField, const Integrand& integrand,

  return true;
 }
+
+
+void ASMs2DLag::generateThreadGroups()
+{
+  const int p1 = surf->order_u();
+  const int p2 = surf->order_v();
+
+  // Evaluate the parametric values
+  RealArray gpar[2];
+  getGridParameters(gpar[0],0,p1-1);
+  getGridParameters(gpar[1],1,p2-1);
+
+  const int nel1 = (gpar[0].size()-1)/(p1-1);
+  const int nel2 = (gpar[1].size()-1)/(p2-1);
+
+  utl::calcThreadGroups(nel1,nel2,threadGroups);
+}
--- a/src/ASM/ASMs2DLag.h
+++ b/src/ASM/ASMs2DLag.h
@ -137,6 +137,10 @@ public:
  virtual bool evalSolution(Matrix& sField, const Integrand& integrand,
 			    const RealArray* gpar, bool regular = true) const;

+protected:
+  //! \brief Generate thread groups
+  void generateThreadGroups();
+public:
  //! \brief Returns the number of nodal points in each parameter direction.
  //! \param[out] n1 Number of nodes in first (u) direction
  //! \param[out] n2 Number of nodes in second (v) direction
--- a/src/ASM/ASMs2DSpec.C
+++ b/src/ASM/ASMs2DSpec.C
@ -17,11 +17,11 @@
 #include "TimeDomain.h"
 #include "FiniteElement.h"
 #include "GlobalIntegral.h"
+#include "LocalIntegral.h"
 #include "IntegrandBase.h"
 #include "CoordinateMapping.h"
 #include "Vec3Oper.h"
 #include "Legendre.h"
-#include "ElmNorm.h"


 bool ASMs2DSpec::getGridParameters (RealArray& prm, int dir,
@ -96,57 +96,75 @@ bool ASMs2DSpec::integrate (Integrand& integrand,
  if (!Legendre::basisDerivatives(p1,D1)) return false;
  if (!Legendre::basisDerivatives(p2,D2)) return false;

-  FiniteElement fe(p1*p2);
-  Matrix dNdu(p1*p2,2), Xnod, Jac;
-  Vec4   X;
+  if (threadGroups.empty())
+    generateThreadGroups();


  // === Assembly loop over all elements in the patch ==========================

-  const int nel = this->getNoElms();
-  for (int iel = 1; iel <= nel; iel++)
-    {
-      // Set up control point coordinates for current element
-      if (!this->getElementCoordinates(Xnod,iel)) return false;
+  bool ok=true;
+  for (size_t g=0;g<threadGroups.size() && ok;++g) {
+#pragma omp parallel for schedule(static)
+    for (size_t t=0;t<threadGroups[g].size();++t) {
+      FiniteElement fe(p1*p2);
+      Matrix dNdu(p1*p2,2), Xnod, Jac;
+      Vec4   X;
+      for (size_t e=0;e<threadGroups[g][t].size();++e) {
+        int iel = threadGroups[g][t][e]+1;

-      // Initialize element quantities
-      Vectors vec;
-      LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,false);
-      if (!integrand.initElement(MNPC[iel-1],vec,*A)) return false;
+        // Set up control point coordinates for current element
+        if (!this->getElementCoordinates(Xnod,iel)) {
+          ok = false;
+          break;
+        }

-      // --- Integration loop over all Gauss points in each direction ----------
+        // Initialize element quantities
+        fe.iel = MLGE[iel-1];
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel);
+        if (!integrand.initElement(MNPC[iel-1],*A)) {
+          ok = false;
+          break;
+        }

-      int count = 1;
-      for (int j = 1; j <= p2; j++)
-	for (int i = 1; i <= p1; i++, count++)
-	{
-	  // Evaluate the basis functions and gradients using
-	  // tensor product of one-dimensional Lagrange polynomials
-	  evalBasis(i,j,p1,p2,D1,D2,fe.N,dNdu);
+        // --- Integration loop over all Gauss points in each direction --------

-	  // Compute Jacobian inverse of coordinate mapping and derivatives
-	  fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
-	  if (fe.detJxW == 0.0) continue; // skip singular points
+        int count = 1;
+        for (int j = 1; j <= p2; j++)
+          for (int i = 1; i <= p1; i++, count++)
+          {
+            // Evaluate the basis functions and gradients using
+            // tensor product of one-dimensional Lagrange polynomials
+            evalBasis(i,j,p1,p2,D1,D2,fe.N,dNdu);

-	  // Cartesian coordinates of current integration point
-	  X.x = Xnod(1,count);
-	  X.y = Xnod(2,count);
-	  X.t = time.t;
+            // Compute Jacobian inverse of coordinate mapping and derivatives
+            fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
+            if (fe.detJxW == 0.0) continue; // skip singular points

-	  // Evaluate the integrand and accumulate element contributions
-	  fe.detJxW *= wg1(i)*wg2(j);
-          if (!integrand.evalInt(*A,fe,time,X,vec))
-	    return false;
-	}
+            // Cartesian coordinates of current integration point
+            X.x = Xnod(1,count);
+            X.y = Xnod(2,count);
+            X.t = time.t;

-      // Assembly of global system integral
-      if (!glInt.assemble(A,MLGE[iel-1]))
-	return false;
-      if (!dynamic_cast<ElmNorm*>(A))
-        delete A;
+            // Evaluate the integrand and accumulate element contributions
+            fe.detJxW *= wg1(i)*wg2(j);
+            if (!integrand.evalInt(*A,fe,time,X)) {
+              ok = false;
+              break;
+            }
+          }
+
+        // Assembly of global system integral
+        if (!glInt.assemble(A->ref(),fe.iel)) {
+          ok = false;
+          break;
+        }
+
+        A->destruct();
+      }
    }
+  }

-  return true;
+  return ok;
 }


@ -213,9 +231,9 @@ bool ASMs2DSpec::integrate (Integrand& integrand, int lIndex,
      if (!this->getElementCoordinates(Xnod,iel)) return false;

      // Initialize element quantities
-      Vectors vec;
-      LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,true);
-      if (!integrand.initElementBou(MNPC[iel-1],vec,*A)) return false;
+      fe.iel = MLGE[iel-1];
+      LocalIntegral* A = integrand.getLocalIntegral(nen,fe.iel,true);
+      if (!integrand.initElementBou(MNPC[iel-1],*A)) return false;

      // --- Integration loop over all Gauss points along the edge -------------

@ -241,15 +259,15 @@ bool ASMs2DSpec::integrate (Integrand& integrand, int lIndex,

 	// Evaluate the integrand and accumulate element contributions
 	fe.detJxW *= wg[t2-1][i];
-        if (!integrand.evalBou(*A,fe,time,X,normal,vec))
+        if (!integrand.evalBou(*A,fe,time,X,normal))
 	  return false;
      }

      // Assembly of global system integral
-      if (!glInt.assemble(A,MLGE[iel-1]))
+      if (!glInt.assemble(A->ref(),fe.iel))
 	return false;
-      if (!dynamic_cast<ElmNorm*>(A))
-        delete A;
+
+      A->destruct();
    }

  return true;
--- a/src/ASM/ASMs2Dmx.C
+++ b/src/ASM/ASMs2Dmx.C
@ -18,6 +18,7 @@
 #include "TimeDomain.h"
 #include "FiniteElement.h"
 #include "GlobalIntegral.h"
+#include "LocalIntegral.h"
 #include "IntegrandBase.h"
 #include "CoordinateMapping.h"
 #include "GaussQuadrature.h"
@ -25,7 +26,6 @@
 #include "Profiler.h"
 #include "Vec3Oper.h"
 #include "Vec3.h"
-#include "ElmNorm.h"


 ASMs2Dmx::ASMs2Dmx (unsigned char n_s,
@ -190,7 +190,7 @@ bool ASMs2Dmx::generateFEMTopology ()
 							     ug,vg,XYZ,ndim,
 							     false,XYZ);
    }
-    else 
+    else
    {
      // Order-elevate basis1 such that it is of one degree higher than basis2
      // but only C^p-2 continuous
@ -198,7 +198,7 @@ bool ASMs2Dmx::generateFEMTopology ()
      basis1->raiseOrder(1,1);
    }
    basis2 = surf;
-    
+
    if (useLowOrderBasis1) {
      basis2 = basis1;
      basis1 = surf;
@ -503,87 +503,110 @@ bool ASMs2Dmx::integrate (Integrand& integrand,
  const int p1 = surf->order_u();
  const int p2 = surf->order_v();
  const int n1 = surf->numCoefs_u();
-  const int n2 = surf->numCoefs_v();
  const int nel1 = n1 - p1 + 1;

-  MxFiniteElement fe(basis1->order_u()*basis1->order_v(),
-		     basis2->order_u()*basis2->order_v());
-  Matrix dN1du, dN2du, Xnod, Jac;
-  Vec4   X;
+  if (threadGroups.empty())
+    generateThreadGroups();
+

  // === Assembly loop over all elements in the patch ==========================

-  int iel = 1;
-  for (int i2 = p2; i2 <= n2; i2++)
-    for (int i1 = p1; i1 <= n1; i1++, iel++)
-    {
-      fe.iel = MLGE[iel-1];
-      if (fe.iel < 1) continue; // zero-area element
+  bool ok=true;
+  for (size_t g=0;g<threadGroups.size() && ok;++g) {
+#pragma omp parallel for schedule(static)
+    for (size_t t=0;t<threadGroups[g].size();++t) {
+      MxFiniteElement fe(basis1->order_u()*basis1->order_v(),
+                         basis2->order_u()*basis2->order_v());
+      Matrix dN1du, dN2du, Xnod, Jac;
+      Vec4   X;
+      for (size_t i=0;i<threadGroups[g][t].size();++i) {
+        int iel = threadGroups[g][t][i];
+        fe.iel = MLGE[iel];
+        if (fe.iel < 1) continue; // zero-area element

-      // Get element area in the parameter space
-      double dA = this->getParametricArea(iel);
-      if (dA < 0.0) return false; // topology error (probably logic error)
+        int i1 = p1 + iel % nel1;
+        int i2 = p2 + iel / nel1;

-      // Set up control point (nodal) coordinates for current element
-      if (!this->getElementCoordinates(Xnod,iel)) return false;
+        // Get element area in the parameter space
+        double dA = this->getParametricArea(++iel);
+        if (dA < 0.0) { // topology error (probably logic error)
+          ok = false;
+          break;
+        }

-      // Initialize element quantities
-      IntVec::const_iterator f2start = MNPC[iel-1].begin() + fe.N1.size();
-      LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,false);
-      if (!integrand.initElement(IntVec(MNPC[iel-1].begin(),f2start),
-				 IntVec(f2start,MNPC[iel-1].end()),nb1,*A))
-	return false;
+        // Set up control point (nodal) coordinates for current element
+        if (!this->getElementCoordinates(Xnod,iel)) {
+          ok = false;
+          break;
+        }

-      // --- Integration loop over all Gauss points in each direction ----------
+        // Initialize element quantities
+        IntVec::const_iterator f2start = MNPC[iel-1].begin() + fe.N1.size();
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N1.size(),fe.N2.size(),
+                                                      fe.iel,false);
+        if (!integrand.initElement(IntVec(MNPC[iel-1].begin(),f2start),
+                                   IntVec(f2start,MNPC[iel-1].end()),nb1,*A))
+        {
+          ok = false;
+          break;
+        }

-      int ip = ((i2-p2)*nGauss*nel1 + i1-p1)*nGauss;
-      for (int j = 0; j < nGauss; j++, ip += nGauss*(nel1-1))
-	for (int i = 0; i < nGauss; i++, ip++)
-	{
-          // Local element coordinates of current integration point
-          fe.xi  = xg[i];
-          fe.eta = xg[j];
+        // --- Integration loop over all Gauss points in each direction --------

-	  // Parameter values of current integration point
-	  fe.u = gpar[0](i+1,i1-p1+1);
-	  fe.v = gpar[1](j+1,i2-p2+1);
+        int ip = ((i2-p2)*nGauss*nel1 + i1-p1)*nGauss;
+        for (int j = 0; j < nGauss; j++, ip += nGauss*(nel1-1))
+          for (int i = 0; i < nGauss; i++, ip++)
+          {
+            // Local element coordinates of current integration point
+            fe.xi  = xg[i];
+            fe.eta = xg[j];

-	  // Fetch basis function derivatives at current integration point
-	  extractBasis(spline1[ip],fe.N1,dN1du);
-	  extractBasis(spline2[ip],fe.N2,dN2du);
+            // Parameter values of current integration point
+            fe.u = gpar[0](i+1,i1-p1+1);
+            fe.v = gpar[1](j+1,i2-p2+1);

-	  // Compute Jacobian inverse of the coordinate mapping and
-	  // basis function derivatives w.r.t. Cartesian coordinates
-	  if (geoUsesBasis1)
-	  {
-	    fe.detJxW = utl::Jacobian(Jac,fe.dN1dX,Xnod,dN1du);
-	    fe.dN2dX.multiply(dN2du,Jac); // dN2dX = dN2du * J^-1
-	  }
-	  else
-	  {
-	    fe.detJxW = utl::Jacobian(Jac,fe.dN2dX,Xnod,dN2du);
-	    fe.dN1dX.multiply(dN1du,Jac); // dN1dX = dN1du * J^-1
-	  }
-	  if (fe.detJxW == 0.0) continue; // skip singular points
+            // Fetch basis function derivatives at current integration point
+            extractBasis(spline1[ip],fe.N1,dN1du);
+            extractBasis(spline2[ip],fe.N2,dN2du);

-	  // Cartesian coordinates of current integration point
-	  X = Xnod * (geoUsesBasis1 ? fe.N1 : fe.N2);
-	  X.t = time.t;
+            // Compute Jacobian inverse of the coordinate mapping and
+            // basis function derivatives w.r.t. Cartesian coordinates
+            if (geoUsesBasis1)
+            {
+              fe.detJxW = utl::Jacobian(Jac,fe.dN1dX,Xnod,dN1du);
+              fe.dN2dX.multiply(dN2du,Jac); // dN2dX = dN2du * J^-1
+            }
+            else
+            {
+              fe.detJxW = utl::Jacobian(Jac,fe.dN2dX,Xnod,dN2du);
+              fe.dN1dX.multiply(dN1du,Jac); // dN1dX = dN1du * J^-1
+            }
+            if (fe.detJxW == 0.0) continue; // skip singular points

-	  // Evaluate the integrand and accumulate element contributions
-	  fe.detJxW *= 0.25*dA*wg[i]*wg[j];
-	  if (!integrand.evalIntMx(*A,fe,time,X))
-	    return false;
-	}
+            // Cartesian coordinates of current integration point
+            X = Xnod * (geoUsesBasis1 ? fe.N1 : fe.N2);
+            X.t = time.t;

-      // Assembly of global system integral
-      if (!glInt.assemble(A,fe.iel))
-	return false;
-      if (!dynamic_cast<ElmNorm*>(A))
-        delete A;
+            // Evaluate the integrand and accumulate element contributions
+            fe.detJxW *= 0.25*dA*wg[i]*wg[j];
+            if (!integrand.evalIntMx(*A,fe,time,X)) {
+              ok = false;
+              break;
+            }
+          }
+
+        // Assembly of global system integral
+        if (!glInt.assemble(A->ref(),fe.iel)) {
+          ok = false;
+          break;
+        }
+
+        A->destruct();
+      }
    }
+  }

-  return true;
+  return ok;
 }


@ -673,7 +696,8 @@ bool ASMs2Dmx::integrate (Integrand& integrand, int lIndex,

      // Initialize element quantities
      IntVec::const_iterator f2start = MNPC[iel-1].begin() + fe.N1.size();
-      LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,true);
+      LocalIntegral* A = integrand.getLocalIntegral(fe.N1.size(),fe.N2.size(),
+						    fe.iel,true);
      if (!integrand.initElementBou(IntVec(MNPC[iel-1].begin(),f2start),
 				    IntVec(f2start,MNPC[iel-1].end()),nb1,*A))
 	return false;
@ -721,15 +745,15 @@ bool ASMs2Dmx::integrate (Integrand& integrand, int lIndex,

 	// Evaluate the integrand and accumulate element contributions
 	fe.detJxW *= 0.5*dS*wg[i];
-	if (!integrand.evalBouMx(A,fe,time,X,normal))
+	if (!integrand.evalBouMx(*A,fe,time,X,normal))
 	  return false;
      }

      // Assembly of global system integral
-      if (!glInt.assemble(A,fe.iel))
+      if (!glInt.assemble(A->ref(),fe.iel))
 	return false;
-      if (!dynamic_cast<ElmNorm*>(A))
-        delete A;
+
+      A->destruct();
    }

  return true;
--- a/src/ASM/ASMs2DmxLag.C
+++ b/src/ASM/ASMs2DmxLag.C
@ -18,12 +18,12 @@
 #include "TimeDomain.h"
 #include "FiniteElement.h"
 #include "GlobalIntegral.h"
+#include "LocalIntegral.h"
 #include "IntegrandBase.h"
 #include "CoordinateMapping.h"
 #include "GaussQuadrature.h"
 #include "Utilities.h"
 #include "Vec3Oper.h"
-#include "ElmNorm.h"


 ASMs2DmxLag::ASMs2DmxLag (unsigned char n_s,
@ -222,9 +222,7 @@ bool ASMs2DmxLag::integrate (Integrand& integrand,
  this->getGridParameters(upar,0,1);
  this->getGridParameters(vpar,1,1);

-  // Number of elements in each direction
  const int nelx = upar.size() - 1;
-  const int nely = vpar.size() - 1;

  // Order of basis in the two parametric directions (order = degree + 1)
  const int p1 = surf->order_u();
@ -233,73 +231,93 @@ bool ASMs2DmxLag::integrate (Integrand& integrand,
  const int q1 = p1 - 1;
  const int q2 = p2 - 1;

-  MxFiniteElement fe(p1*p2,q1*q2);
-  Matrix dN1du, dN2du, Xnod, Jac;
-  Vec4   X;
+  if (threadGroups.empty())
+    generateThreadGroups();


  // === Assembly loop over all elements in the patch ==========================

-  int iel = 1;
-  for (int i2 = 0; i2 < nely; i2++)
-    for (int i1 = 0; i1 < nelx; i1++, iel++)
-    {
-      fe.iel = MLGE[iel-1];
+  bool ok=true;
+  for (size_t g=0;g<threadGroups.size() && ok;++g) {
+#pragma omp parallel for schedule(static)
+    for (size_t t=0;t<threadGroups[g].size();++t) {
+      MxFiniteElement fe(p1*p2,q1*q2);
+      Matrix dN1du, dN2du, Xnod, Jac;
+      Vec4   X;
+      for (size_t i=0;i<threadGroups[g][t].size();++i) {
+        int iel = threadGroups[g][t][i];
+        int i1  = iel % nelx;
+        int i2  = iel / nelx;

-      // Set up control point coordinates for current element
-      if (!this->getElementCoordinates(Xnod,iel)) return false;
+        // Set up control point coordinates for current element
+        if (!this->getElementCoordinates(Xnod,++iel)) {
+          ok = false;
+          break;
+        }

-      // Initialize element quantities
-      IntVec::const_iterator f2start = MNPC[iel-1].begin() + fe.N1.size();
-      LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,false);
-      if (!integrand.initElement(IntVec(MNPC[iel-1].begin(),f2start),
-				 IntVec(f2start,MNPC[iel-1].end()),nb1,*A))
-	return false;
+        // Initialize element quantities
+        fe.iel = MLGE[iel-1];
+        IntVec::const_iterator f2start = MNPC[iel-1].begin() + fe.N1.size();
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N1.size(),fe.N2.size(),
+                                                      fe.iel,false);
+        if (!integrand.initElement(IntVec(MNPC[iel-1].begin(),f2start),
+                                   IntVec(f2start,MNPC[iel-1].end()),nb1,*A))
+        {
+          ok = false;
+          break;
+        }

-      // --- Integration loop over all Gauss points in each direction ----------
+        // --- Integration loop over all Gauss points in each direction --------

-      for (int j = 0; j < nGauss; j++)
-	for (int i = 0; i < nGauss; i++)
-	{
-	  // Parameter value of current integration point
-	  fe.u = 0.5*(upar[i1]*(1.0-xg[i]) + upar[i1+1]*(1.0+xg[i]));
-	  fe.v = 0.5*(vpar[i2]*(1.0-xg[j]) + vpar[i2+1]*(1.0+xg[j]));
+        for (int j = 0; j < nGauss; j++)
+          for (int i = 0; i < nGauss; i++)
+          {
+            // Parameter value of current integration point
+            fe.u = 0.5*(upar[i1]*(1.0-xg[i]) + upar[i1+1]*(1.0+xg[i]));
+            fe.v = 0.5*(vpar[i2]*(1.0-xg[j]) + vpar[i2+1]*(1.0+xg[j]));

-	  // Local coordinates of current integration point
-	  fe.xi  = xg[i];
-	  fe.eta = xg[j];
+            // Local coordinates of current integration point
+            fe.xi  = xg[i];
+            fe.eta = xg[j];

-	  // Compute basis function derivatives at current integration point
-	  // using tensor product of one-dimensional Lagrange polynomials
-	  if (!Lagrange::computeBasis(fe.N1,dN1du,p1,xg[i],p2,xg[j]))
-	    return false;
-	  if (!Lagrange::computeBasis(fe.N2,dN2du,q1,xg[i],q2,xg[j]))
-	    return false;
+            // Compute basis function derivatives at current integration point
+            // using tensor product of one-dimensional Lagrange polynomials
+            if (!Lagrange::computeBasis(fe.N1,dN1du,p1,xg[i],p2,xg[j]) ||
+                !Lagrange::computeBasis(fe.N2,dN2du,q1,xg[i],q2,xg[j])) {
+              ok = false;
+              break;
+            }

-	  // Compute Jacobian inverse of coordinate mapping and derivatives
-	  fe.detJxW = utl::Jacobian(Jac,fe.dN1dX,Xnod,dN1du);
-	  if (fe.detJxW == 0.0) continue; // skip singular points
+            // Compute Jacobian inverse of coordinate mapping and derivatives
+            fe.detJxW = utl::Jacobian(Jac,fe.dN1dX,Xnod,dN1du);
+            if (fe.detJxW == 0.0) continue; // skip singular points

-	  fe.dN2dX.multiply(dN2du,Jac); // dN2dX = dN2du * J^-1
+            fe.dN2dX.multiply(dN2du,Jac); // dN2dX = dN2du * J^-1

-	  // Cartesian coordinates of current integration point
-	  X = Xnod * fe.N1;
-	  X.t = time.t;
+            // Cartesian coordinates of current integration point
+            X = Xnod * fe.N1;
+            X.t = time.t;

-	  // Evaluate the integrand and accumulate element contributions
-	  fe.detJxW *= wg[i]*wg[j];
-	  if (!integrand.evalIntMx(*A,fe,time,X))
-	    return false;
-	}
+            // Evaluate the integrand and accumulate element contributions
+            fe.detJxW *= wg[i]*wg[j];
+            if (!integrand.evalIntMx(*A,fe,time,X)) {
+              ok = false;
+              break;
+            }
+          }

-      // Assembly of global system integral
-      if (!glInt.assemble(A,fe.iel))
-	return false;
-        if (!dynamic_cast<ElmNorm*>(A))
-          delete A;
+        // Assembly of global system integral
+        if (!glInt.assemble(A->ref(),fe.iel)) {
+          ok = false;
+          break;
+        }
+
+        A->destruct();
+      }
    }
+  }

-  return true;
+  return ok;
 }


@ -344,8 +362,6 @@ bool ASMs2DmxLag::integrate (Integrand& integrand, int lIndex,
  for (int i2 = 0; i2 < nely; i2++)
    for (int i1 = 0; i1 < nelx; i1++, iel++)
    {
-      fe.iel = MLGE[iel-1];
-
      // Skip elements that are not on current boundary edge
      bool skipMe = false;
      switch (edgeDir)
@ -361,8 +377,10 @@ bool ASMs2DmxLag::integrate (Integrand& integrand, int lIndex,
      if (!this->getElementCoordinates(Xnod,iel)) return false;

      // Initialize element quantities
+      fe.iel = MLGE[iel-1];
      IntVec::const_iterator f2start = MNPC[iel-1].begin() + fe.N1.size();
-      LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,true);
+      LocalIntegral* A = integrand.getLocalIntegral(fe.N1.size(),fe.N2.size(),
+                                                    fe.iel,true);
      if (!integrand.initElementBou(IntVec(MNPC[iel-1].begin(),f2start),
 				    IntVec(f2start,MNPC[iel-1].end()),nb1,*A))
 	return false;
@ -396,15 +414,15 @@ bool ASMs2DmxLag::integrate (Integrand& integrand, int lIndex,

 	// Evaluate the integrand and accumulate element contributions
 	fe.detJxW *= wg[i];
-	if (!integrand.evalBouMx(A,fe,time,X,normal))
+	if (!integrand.evalBouMx(*A,fe,time,X,normal))
 	  return false;
      }

      // Assembly of global system integral
-      if (!glInt.assemble(A,fe.iel))
+      if (!glInt.assemble(A->ref(),fe.iel))
 	return false;
-      if (!dynamic_cast<ElmNorm*>(A))
-        delete A;
+
+      A->destruct();
    }

  return true;
--- a/src/ASM/ASMs3D.C
+++ b/src/ASM/ASMs3D.C
@ -19,6 +19,7 @@
 #include "TimeDomain.h"
 #include "FiniteElement.h"
 #include "GlobalIntegral.h"
+#include "LocalIntegral.h"
 #include "IntegrandBase.h"
 #include "CoordinateMapping.h"
 #include "GaussQuadrature.h"
@ -26,8 +27,6 @@
 #include "Utilities.h"
 #include "Profiler.h"
 #include "Vec3Oper.h"
-#include "ElmMats.h"
-#include "ElmNorm.h"


 ASMs3D::ASMs3D (unsigned char n_f)
@ -1112,7 +1111,7 @@ static bool getG (const Matrix& Jinv, const Vector& du, Matrix& G)

  G.resize(nsd,nsd,true);

-  for (size_t k = 1;k <= nsd;k++) 
+  for (size_t k = 1;k <= nsd;k++)
    for (size_t l = 1;l <= nsd;l++)
      for (size_t m = 1;m <= nsd;m++)
 	G(k,l) += Jinv(m,k)*Jinv(m,l)/(du(k)*du(l));
@ -1206,7 +1205,6 @@ bool ASMs3D::integrate (Integrand& integrand,

  const int n1 = svol->numCoefs(0);
  const int n2 = svol->numCoefs(1);
-  const int n3 = svol->numCoefs(2);

  const int p1 = svol->order(0);
  const int p2 = svol->order(1);
@ -1215,184 +1213,208 @@ bool ASMs3D::integrate (Integrand& integrand,
  const int nel1 = n1 - p1 + 1;
  const int nel2 = n2 - p2 + 1;

-  FiniteElement fe(p1*p2*p3);
-  Matrix   dNdu, Xnod, Jac;
-  Matrix3D d2Ndu2, Hess;
-  Vec4     X;
+  if (threadGroupsVol.empty())
+    generateThreadGroups();


  // === Assembly loop over all elements in the patch ==========================

-  int iel = 1;
-  for (int i3 = p3; i3 <= n3; i3++)
-    for (int i2 = p2; i2 <= n2; i2++)
-      for (int i1 = p1; i1 <= n1; i1++, iel++)
-      {
-	fe.iel = MLGE[iel-1];
-	if (fe.iel < 1) continue; // zero-volume element
+  bool ok=true;
+  for (size_t g=0;g<threadGroupsVol.size() && ok;++g) {
+#pragma omp parallel for schedule(static)
+    for (size_t t=0;t<threadGroupsVol[g].size();++t) {
+      FiniteElement fe(p1*p2*p3);
+      Matrix   dNdu, Xnod, Jac;
+      Matrix3D d2Ndu2, Hess;
+      Vec4     X;
+      for (size_t l=0;l<threadGroupsVol[g][t].size();++l) {
+        int iel = threadGroupsVol[g][t][l];
+        fe.iel = MLGE[iel];
+        if (fe.iel < 1) continue; // zero-volume element

-	// Get element volume in the parameter space
-	double dV = this->getParametricVolume(iel);
-	if (dV < 0.0) return false; // topology error (probably logic error)
+        int i1 = p1 + iel % nel1;
+        int i2 = p2 + (iel / nel1) % nel2;
+        int i3 = p3 + iel / (nel1*nel2);

-	// Set up control point (nodal) coordinates for current element
-	if (!this->getElementCoordinates(Xnod,iel)) return false;
+        // Get element volume in the parameter space
+        double dV = this->getParametricVolume(++iel);
+        if (dV < 0.0) {
+          ok = false; // topology error (probably logic error)
+          break;
+        }

-	// Compute characteristic element length, if needed
-	if (integrand.getIntegrandType() == 2)
-	  fe.h = getElmSize(p1,p2,p3,Xnod);
+        // Set up control point (nodal) coordinates for current element
+        if (!this->getElementCoordinates(Xnod,iel)) {
+          ok = false;
+          break;
+        }

-	else if (integrand.getIntegrandType() == 3)
-	{
-	  // --- Compute average value of basis functions over the element -----
+        // Compute characteristic element length, if needed
+        if (integrand.getIntegrandType() == 2)
+          fe.h = getElmSize(p1,p2,p3,Xnod);

-	  fe.Navg.resize(p1*p2*p3,true);
-	  double vol = 0.0;
-	  int ip = (((i3-p3)*nGauss*nel2 + i2-p2)*nGauss*nel1 + i1-p1)*nGauss;
-	  for (int k = 0; k < nGauss; k++, ip += nGauss*(nel2-1)*nGauss*nel1)
-	    for (int j = 0; j < nGauss; j++, ip += nGauss*(nel1-1))
-	      for (int i = 0; i < nGauss; i++, ip++)
-	      {
-		// Fetch basis function derivatives at current integration point
-		extractBasis(spline[ip],fe.N,dNdu);
+        else if (integrand.getIntegrandType() == 3)
+        {
+          // --- Compute average value of basis functions over the element -----

-		// Compute Jacobian determinant of coordinate mapping
-		// and multiply by weight of current integration point
-		double detJac = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu,false);
-		double weight = 0.125*dV*wg[i]*wg[j]*wg[k];
+          fe.Navg.resize(p1*p2*p3,true);
+          double vol = 0.0;
+          int ip = (((i3-p3)*nGauss*nel2 + i2-p2)*nGauss*nel1 + i1-p1)*nGauss;
+          for (int k = 0; k < nGauss; k++, ip += nGauss*(nel2-1)*nGauss*nel1)
+            for (int j = 0; j < nGauss; j++, ip += nGauss*(nel1-1))
+              for (int i = 0; i < nGauss; i++, ip++)
+              {
+                // Fetch basis function derivatives at current integration point
+                extractBasis(spline[ip],fe.N,dNdu);

-		// Numerical quadrature
-		fe.Navg.add(fe.N,detJac*weight);
-		vol += detJac*weight;
-	      }
+                // Compute Jacobian determinant of coordinate mapping
+                // and multiply by weight of current integration point
+                double detJac = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu,false);
+                double weight = 0.125*dV*wg[i]*wg[j]*wg[k];

-	  // Divide by element volume
-	  fe.Navg /= vol;
-	}
+                // Numerical quadrature
+                fe.Navg.add(fe.N,detJac*weight);
+                vol += detJac*weight;
+              }

-	else if (integrand.getIntegrandType() == 4)
-	{
-	  // Compute the element center
-	  Go::Point X0;
-	  double u0 = 0.5*(gpar[0](1,i1-p1+1) + gpar[0](nGauss,i1-p1+1));
-	  double v0 = 0.5*(gpar[1](1,i2-p2+1) + gpar[1](nGauss,i2-p2+1));
-	  double w0 = 0.5*(gpar[2](1,i3-p3+1) + gpar[2](nGauss,i3-p3+1));
-	  svol->point(X0,u0,v0,w0);
-	  X.x = X0[0];
-	  X.y = X0[1];
-	  X.z = X0[2];
-	}
+          // Divide by element volume
+          fe.Navg /= vol;
+        }
+
+        else if (integrand.getIntegrandType() == 4)
+        {
+          // Compute the element center
+          Go::Point X0;
+          double u0 = 0.5*(gpar[0](1,i1-p1+1) + gpar[0](nGauss,i1-p1+1));
+          double v0 = 0.5*(gpar[1](1,i2-p2+1) + gpar[1](nGauss,i2-p2+1));
+          double w0 = 0.5*(gpar[2](1,i3-p3+1) + gpar[2](nGauss,i3-p3+1));
+          svol->point(X0,u0,v0,w0);
+          X.x = X0[0];
+          X.y = X0[1];
+          X.z = X0[2];
+        }

 	// Initialize element quantities
-        Vectors vec;
-        LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,false);
-        if (!integrand.initElement(MNPC[iel-1],X,nRed*nRed*nRed,vec,*A))
-	  return false;
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel);
+        if (!integrand.initElement(MNPC[iel-1],X,nRed*nRed*nRed,*A)) {
+          ok = false;
+          break;
+        }

-	/*if (integrand.getIntegrandType() > 10)
-	{
-	  // --- Selective reduced integration loop ----------------------------
+        if (integrand.getIntegrandType() > 10)
+        {
+          // --- Selective reduced integration loop ----------------------------

-	  int ip = (((i3-p3)*nRed*nel2 + i2-p2)*nRed*nel1 + i1-p1)*nRed;
-	  for (int k = 0; k < nRed; k++, ip += nRed*(nel2-1)*nRed*nel1)
-	    for (int j = 0; j < nRed; j++, ip += nRed*(nel1-1))
-	      for (int i = 0; i < nRed; i++, ip++)
-	      {
-		// Local element coordinates of current integration point
-		fe.xi   = xr[i];
-		fe.eta  = xr[j];
-		fe.zeta = xr[k];
+          int ip = (((i3-p3)*nRed*nel2 + i2-p2)*nRed*nel1 + i1-p1)*nRed;
+          for (int k = 0; k < nRed; k++, ip += nRed*(nel2-1)*nRed*nel1)
+            for (int j = 0; j < nRed; j++, ip += nRed*(nel1-1))
+              for (int i = 0; i < nRed; i++, ip++)
+              {
+                // Local element coordinates of current integration point
+                fe.xi   = xr[i];
+                fe.eta  = xr[j];
+                fe.zeta = xr[k];

-		// Parameter values of current integration point
-		fe.u = redpar[0](i+1,i1-p1+1);
-		fe.v = redpar[1](j+1,i2-p2+1);
-		fe.w = redpar[2](k+1,i3-p3+1);
+                // Parameter values of current integration point
+                fe.u = redpar[0](i+1,i1-p1+1);
+                fe.v = redpar[1](j+1,i2-p2+1);
+                fe.w = redpar[2](k+1,i3-p3+1);

-		// Fetch basis function derivatives at current point
-		extractBasis(splineRed[ip],fe.N,dNdu);
+                // Fetch basis function derivatives at current point
+                extractBasis(splineRed[ip],fe.N,dNdu);

-		// Compute Jacobian inverse and derivatives
-		fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
+                // Compute Jacobian inverse and derivatives
+                fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);

-		// Cartesian coordinates of current integration point
-		X = Xnod * fe.N;
-		X.t = time.t;
+                // Cartesian coordinates of current integration point
+                X = Xnod * fe.N;
+                X.t = time.t;

-		// Compute the reduced integration terms of the integrand
-		if (!integrand.reducedInt(fe,X))
-		  return false;
-	      }
-	}*/
+                // Compute the reduced integration terms of the integrand
+                if (!integrand.reducedInt(*A,fe,X)) {
+                  ok = false;
+                  break;
+                }
+              }
+        }

+        // --- Integration loop over all Gauss points in each direction --------

-	// --- Integration loop over all Gauss points in each direction --------
+        int ip = (((i3-p3)*nGauss*nel2 + i2-p2)*nGauss*nel1 + i1-p1)*nGauss;
+        for (int k = 0; k < nGauss; k++, ip += nGauss*(nel2-1)*nGauss*nel1)
+          for (int j = 0; j < nGauss; j++, ip += nGauss*(nel1-1))
+            for (int i = 0; i < nGauss; i++, ip++)
+            {
+              // Local element coordinates of current integration point
+              fe.xi   = xg[i];
+              fe.eta  = xg[j];
+              fe.zeta = xg[k];

-	int ip = (((i3-p3)*nGauss*nel2 + i2-p2)*nGauss*nel1 + i1-p1)*nGauss;
-	for (int k = 0; k < nGauss; k++, ip += nGauss*(nel2-1)*nGauss*nel1)
-	  for (int j = 0; j < nGauss; j++, ip += nGauss*(nel1-1))
-	    for (int i = 0; i < nGauss; i++, ip++)
-	    {
-	      // Local element coordinates of current integration point
-	      fe.xi   = xg[i];
-	      fe.eta  = xg[j];
-	      fe.zeta = xg[k];
+              // Parameter values of current integration point
+              fe.u = gpar[0](i+1,i1-p1+1);
+              fe.v = gpar[1](j+1,i2-p2+1);
+              fe.w = gpar[2](k+1,i3-p3+1);

-	      // Parameter values of current integration point
-	      fe.u = gpar[0](i+1,i1-p1+1);
-	      fe.v = gpar[1](j+1,i2-p2+1);
-	      fe.w = gpar[2](k+1,i3-p3+1);
+              // Fetch basis function derivatives at current integration point
+              if (integrand.getIntegrandType() == 2)
+                extractBasis(spline2[ip],fe.N,dNdu,d2Ndu2);
+              else
+                extractBasis(spline[ip],fe.N,dNdu);

-	      // Fetch basis function derivatives at current integration point
-	      if (integrand.getIntegrandType() == 2)
-		extractBasis(spline2[ip],fe.N,dNdu,d2Ndu2);
-	      else
-		extractBasis(spline[ip],fe.N,dNdu);
+              // Compute Jacobian inverse of coordinate mapping and derivatives
+              fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
+              if (fe.detJxW == 0.0) continue; // skip singular points

- 	      // Compute Jacobian inverse of coordinate mapping and derivatives
-	      fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
-	      if (fe.detJxW == 0.0) continue; // skip singular points
-
- 	      // Compute Hessian of coordinate mapping and 2nd order derivatives
-	      if (integrand.getIntegrandType() == 2) {
-		if (!utl::Hessian(Hess,fe.d2NdX2,Jac,Xnod,d2Ndu2,dNdu))
-		  return false;
-
-		// RUNAR
-		// Vector dXidu(3);
-// 		dXidu(1) = this->getParametricLength(iel,1);
-// 		dXidu(2) = this->getParametricLength(iel,2);
-// 		dXidu(3) = this->getParametricLength(iel,3);
-// 		if (!getG(Jac,dXidu,fe.G))
-// 		  return false;
-	      }
+              // Compute Hessian of coordinate mapping and 2nd order derivatives
+              if (integrand.getIntegrandType() == 2)
+                if (!utl::Hessian(Hess,fe.d2NdX2,Jac,Xnod,d2Ndu2,dNdu)) {
+                  ok = false;
+                  break;
+                }

+/*	      // RUNAR
+	      Vector dXidu(3);
+ 	      dXidu(1) = this->getParametricLength(iel,1);
+ 	      dXidu(2) = this->getParametricLength(iel,2);
+ 	      dXidu(3) = this->getParametricLength(iel,3);
+ 	      if (!getG(Jac,dXidu,fe.G))
+ 	        return false;
+*/
 #if SP_DEBUG > 4
-	      std::cout <<"\niel, ip = "<< iel <<" "<< ip
-			<<"\nN ="<< fe.N <<"dNdX ="<< fe.dNdX << std::endl;
+              std::cout <<"\niel, ip = "<< iel <<" "<< ip
+                <<"\nN ="<< fe.N <<"dNdX ="<< fe.dNdX << std::endl;
 #endif

-	      // Cartesian coordinates of current integration point
-	      X = Xnod * fe.N;
-	      X.t = time.t;
+              // Cartesian coordinates of current integration point
+              X = Xnod * fe.N;
+              X.t = time.t;

-	      // Evaluate the integrand and accumulate element contributions
-	      fe.detJxW *= 0.125*dV*wg[i]*wg[j]*wg[k];
-              if (!integrand.evalInt(*A,fe,time,X,vec))
-		return false;
-	    }
+              // Evaluate the integrand and accumulate element contributions
+              fe.detJxW *= 0.125*dV*wg[i]*wg[j]*wg[k];
+              if (!integrand.evalInt(*A,fe,time,X)) {
+                ok = false;
+                break;
+              }
+            }

 	// Finalize the element quantities
-	if (!integrand.finalizeElement(A,time))
-	  return false;
+	if (!integrand.finalizeElement(*A,time)) {
+          ok = false;
+          break;
+        }

-	// Assembly of global system integral
-	if (!glInt.assemble(A,fe.iel))
-	  return false;
-        if (!dynamic_cast<ElmNorm*>(A))
-          delete A;
+        // Assembly of global system integral
+        if (!glInt.assemble(A->ref(),fe.iel)) {
+          ok = false;
+          break;
+        }
+
+        A->destruct();
      }
+    }
+  }

-  return true;
+  return ok;
 }


@ -1440,7 +1462,6 @@ bool ASMs3D::integrate (Integrand& integrand, int lIndex,

  const int n1 = svol->numCoefs(0);
  const int n2 = svol->numCoefs(1);
-  const int n3 = svol->numCoefs(2);

  const int p1 = svol->order(0);
  const int p2 = svol->order(1);
@ -1449,120 +1470,127 @@ bool ASMs3D::integrate (Integrand& integrand, int lIndex,
  const int nel1 = n1 - p1 + 1;
  const int nel2 = n2 - p2 + 1;

-  FiniteElement fe(p1*p2*p3);
-  fe.xi = fe.eta = fe.zeta = faceDir < 0 ? -1.0 : 1.0;
-  fe.u = gpar[0](1,1);
-  fe.v = gpar[1](1,1);
-  fe.w = gpar[2](1,1);
-
-  Matrix dNdu, Xnod, Jac;
-  Vec4   X;
-  Vec3   normal;
-
+  if (threadGroupsFace.empty())
+    generateThreadGroups();

  // === Assembly loop over all elements on the patch face =====================

-  int iel = 1;
-  for (int i3 = p3; i3 <= n3; i3++)
-    for (int i2 = p2; i2 <= n2; i2++)
-      for (int i1 = p1; i1 <= n1; i1++, iel++)
-      {
-	fe.iel = MLGE[iel-1];
-	if (fe.iel < 1) continue; // zero-volume element
+  bool ok=true;
+  for (size_t g=0;g<threadGroupsFace[lIndex-1].size() && ok;++g) {
+#pragma omp parallel for schedule(static)
+    for (size_t t=0;t<threadGroupsFace[lIndex-1][g].size();++t) {
+      FiniteElement fe(p1*p2*p3);
+      fe.xi = fe.eta = fe.zeta = faceDir < 0 ? -1.0 : 1.0;
+      fe.u = gpar[0](1,1);
+      fe.v = gpar[1](1,1);
+      fe.w = gpar[2](1,1);

-	// Skip elements that are not on current boundary face
-	bool skipMe = false;
-	switch (faceDir)
-	  {
-	  case -1: if (i1 > p1) skipMe = true; break;
-	  case  1: if (i1 < n1) skipMe = true; break;
-	  case -2: if (i2 > p2) skipMe = true; break;
-	  case  2: if (i2 < n2) skipMe = true; break;
-	  case -3: if (i3 > p3) skipMe = true; break;
-	  case  3: if (i3 < n3) skipMe = true; break;
-	  }
-	if (skipMe) continue;
+      Matrix dNdu, Xnod, Jac;
+      Vec4   X;
+      Vec3   normal;
+      for (size_t l=0;l<threadGroupsFace[lIndex-1][g][t].size();++l) {
+        int iel = threadGroupsFace[lIndex-1][g][t][l];
+        fe.iel = MLGE[iel];
+        if (fe.iel < 1) continue; // zero-volume element

-	// Get element face area in the parameter space
-	double dA = this->getParametricArea(iel,abs(faceDir));
-	if (dA < 0.0) return false; // topology error (probably logic error)
+        int i1 = p1 + iel % nel1;
+        int i2 = p2 + (iel / nel1) % nel2;
+        int i3 = p3 + iel / (nel1*nel2);

-	// Set up control point coordinates for current element
-	if (!this->getElementCoordinates(Xnod,iel)) return false;
+        // Get element face area in the parameter space
+        double dA = this->getParametricArea(++iel,abs(faceDir));
+        if (dA < 0.0) { // topology error (probably logic error)
+          ok = false;
+          break;
+        }

-	// Initialize element quantities
-        Vectors vec;
-        LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,true);
-        if (!integrand.initElementBou(MNPC[iel-1],vec,*A)) return false;
+        // Set up control point coordinates for current element
+        if (!this->getElementCoordinates(Xnod,iel)) {
+          ok = false;
+          break;
+        }

-	// Define some loop control variables depending on which face we are on
-	int nf1, j1, j2;
-	switch (abs(faceDir))
-	  {
-	  case 1: nf1 = nel2; j2 = i3-p3; j1 = i2-p2; break;
-	  case 2: nf1 = nel1; j2 = i3-p3; j1 = i1-p1; break;
-	  case 3: nf1 = nel1; j2 = i2-p2; j1 = i1-p1; break;
-	  default: nf1 = j1 = j2 = 0;
-	  }
+        // Initialize element quantities
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel,true);
+        if (!integrand.initElementBou(MNPC[iel-1],*A)) {
+          ok = false;
+          break;
+        }

-	// --- Integration loop over all Gauss points in each direction --------
+        // Define some loop control variables depending on which face we are on
+        int nf1, j1, j2;
+        switch (abs(faceDir))
+        {
+          case 1: nf1 = nel2; j2 = i3-p3; j1 = i2-p2; break;
+          case 2: nf1 = nel1; j2 = i3-p3; j1 = i1-p1; break;
+          case 3: nf1 = nel1; j2 = i2-p2; j1 = i1-p1; break;
+          default: nf1 = j1 = j2 = 0;
+        }

-	int k1, k2, k3;
-	int ip = (j2*nGauss*nf1 + j1)*nGauss;
-	for (int j = 0; j < nGauss; j++, ip += nGauss*(nf1-1))
-	  for (int i = 0; i < nGauss; i++, ip++)
-	  {
-	    // Local element coordinates and parameter values
-	    // of current integration point
-	    switch (abs(faceDir)) {
-	    case 1: k2 = i+1; k3 = j+1; k1 = 0; break;
-	    case 2: k1 = i+1; k3 = j+1; k2 = 0; break;
-	    case 3: k1 = i+1; k2 = j+1; k3 = 0; break;
-	    default: k1 = k2 = k3 = 0;
-	    }
-	    if (gpar[0].size() > 1)
-	    {
-	      fe.xi = xg[k1];
-	      fe.u = gpar[0](k1,i1-p1+1);
-	    }
-	    if (gpar[1].size() > 1)
-	    {
-	      fe.eta = xg[k2];
-	      fe.v = gpar[1](k2,i2-p2+1);
-	    }
-	    if (gpar[2].size() > 1)
-	    {
-	      fe.zeta = xg[k3];
-	      fe.w = gpar[2](k3,i3-p3+1);
-	    }
+        // --- Integration loop over all Gauss points in each direction --------

-	    // Fetch basis function derivatives at current integration point
-	    extractBasis(spline[ip],fe.N,dNdu);
+        int k1, k2, k3;
+        int ip = (j2*nGauss*nf1 + j1)*nGauss;
+        for (int j = 0; j < nGauss; j++, ip += nGauss*(nf1-1))
+          for (int i = 0; i < nGauss; i++, ip++)
+          {
+            // Local element coordinates and parameter values
+            // of current integration point
+            switch (abs(faceDir)) {
+              case 1: k2 = i+1; k3 = j+1; k1 = 0; break;
+              case 2: k1 = i+1; k3 = j+1; k2 = 0; break;
+              case 3: k1 = i+1; k2 = j+1; k3 = 0; break;
+              default: k1 = k2 = k3 = 0;
+            }
+            if (gpar[0].size() > 1)
+            {
+              fe.xi = xg[k1];
+              fe.u = gpar[0](k1,i1-p1+1);
+            }
+            if (gpar[1].size() > 1)
+            {
+              fe.eta = xg[k2];
+              fe.v = gpar[1](k2,i2-p2+1);
+            }
+            if (gpar[2].size() > 1)
+            {
+              fe.zeta = xg[k3];
+              fe.w = gpar[2](k3,i3-p3+1);
+            }

-	    // Compute basis function derivatives and the face normal
-	    fe.detJxW = utl::Jacobian(Jac,normal,fe.dNdX,Xnod,dNdu,t1,t2);
-	    if (fe.detJxW == 0.0) continue; // skip singular points
+            // Fetch basis function derivatives at current integration point
+            extractBasis(spline[ip],fe.N,dNdu);

-	    if (faceDir < 0) normal *= -1.0;
+            // Compute basis function derivatives and the face normal
+            fe.detJxW = utl::Jacobian(Jac,normal,fe.dNdX,Xnod,dNdu,t1,t2);
+            if (fe.detJxW == 0.0) continue; // skip singular points

-	    // Cartesian coordinates of current integration point
-	    X = Xnod * fe.N;
-	    X.t = time.t;
+            if (faceDir < 0) normal *= -1.0;

-	    // Evaluate the integrand and accumulate element contributions
-	    fe.detJxW *= 0.25*dA*wg[i]*wg[j];
-            if (!integrand.evalBou(*A,fe,time,X,normal,vec))
-	      return false;
-	  }
+            // Cartesian coordinates of current integration point
+            X = Xnod * fe.N;
+            X.t = time.t;

-	// Assembly of global system integral
-	if (!glInt.assemble(A,fe.iel))
-	  return false;
-        if (!dynamic_cast<ElmNorm*>(A))
-          delete A;
+            // Evaluate the integrand and accumulate element contributions
+            fe.detJxW *= 0.25*dA*wg[i]*wg[j];
+            if (!integrand.evalBou(*A,fe,time,X,normal)) {
+              ok = false;
+              break;
+            }
+          }
+
+        // Assembly of global system integral
+        if (!glInt.assemble(A->ref(),fe.iel)) {
+          ok = false;
+          break;
+        }
+
+        A->destruct();
      }
+    }
+  }

-  return true;
+  return ok;
 }


@ -1695,10 +1723,8 @@ bool ASMs3D::integrateEdge (Integrand& integrand, int lEdge,
 	if (!this->getElementCoordinates(Xnod,iel)) return false;

 	// Initialize element quantities
-        Vectors vec;
-        LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,false);
-        if (!integrand.initElementBou(MNPC[iel-1],vec,*A)) return false;
-
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel,true);
+        if (!integrand.initElementBou(MNPC[iel-1],*A)) return false;

 	// --- Integration loop over all Gauss points along the edge -----------

@ -1722,13 +1748,15 @@ bool ASMs3D::integrateEdge (Integrand& integrand, int lEdge,

 	  // Evaluate the integrand and accumulate element contributions
 	  fe.detJxW *= 0.5*dS*wg[i];
-          if (!integrand.evalBou(*A,fe,time,X,tang,vec))
+          if (!integrand.evalBou(*A,fe,time,X,tang))
 	    return false;
 	}

 	// Assembly of global system integral
-	if (!glInt.assemble(A,fe.iel))
+	if (!glInt.assemble(A->ref(),fe.iel))
 	  return false;
+
+	A->destruct();
      }

  return true;
@ -2103,3 +2131,62 @@ bool ASMs3D::evalSolution (Matrix& sField, const Integrand& integrand,

  return true;
 }
+
+
+void ASMs3D::generateThreadGroups()
+{
+  const int p1 = svol->order(0);
+  const int p2 = svol->order(1);
+  const int p3 = svol->order(2);
+  const int n1 = svol->numCoefs(0);
+  const int n2 = svol->numCoefs(1);
+  const int n3 = svol->numCoefs(2);
+  const int nel1 = n1 - p1 + 1;
+  const int nel2 = n2 - p2 + 1;
+  const int nel3 = n3 - p3 + 1;
+
+  utl::calcThreadGroups(nel1,nel2,nel3,threadGroupsVol);
+
+  // now the faces
+  threadGroupsFace.resize(6);
+  for (int face=0;face<6;++face) {
+    const int faceDir = (face+2)/((face+1)%2 ? -2 : 2);
+    std::vector<int> map;
+    int iel=0;
+    for (int i3 = p3; i3 <= n3; i3++) {
+      for (int i2 = p2; i2 <= n2; i2++) {
+        for (int i1 = p1; i1 <= n1; i1++, iel++) {
+          // Skip elements that are not on current boundary face
+          bool skipMe = false;
+          switch (faceDir)
+          {
+            case -1: if (i1 > p1) skipMe = true; break;
+            case  1: if (i1 < n1) skipMe = true; break;
+            case -2: if (i2 > p2) skipMe = true; break;
+            case  2: if (i2 < n2) skipMe = true; break;
+            case -3: if (i3 > p3) skipMe = true; break;
+            case  3: if (i3 < n3) skipMe = true; break;
+          }
+          if (skipMe)
+            continue;
+
+          map.push_back(iel);
+        }
+      }
+    }
+
+    int d1, d2;
+    if (face < 2) {
+      d1 = nel2;
+      d2 = nel3;
+    } else if (face < 4) {
+      d1 = nel1;
+      d2 = nel3;
+    } else {
+      d1 = nel1;
+      d2 = nel2;
+    }
+    utl::calcThreadGroups(d1,d2,threadGroupsFace[face]);
+    utl::mapThreadGroups(threadGroupsFace[face],map);
+  }
+}
--- a/src/ASM/ASMs3D.h
+++ b/src/ASM/ASMs3D.h
@ -15,6 +15,7 @@
 #define _ASM_S3D_H

 #include "ASMstruct.h"
+#include "Utilities.h"

 namespace Go {
  class SplineVolume;
@ -433,6 +434,15 @@ protected:

  const IndexVec& nodeInd; //!< IJK-triplets for the control points (nodes)
  IndexVec      myNodeInd; //!< The actual IJK-triplet container
+
+  //! \brief Element groups for multithreaded volume assembly 
+  utl::ThreadGroups threadGroupsVol;
+
+  //! \brief Element groups for multithreaded face assembly 
+  std::vector<utl::ThreadGroups> threadGroupsFace;
+
+  //! \brief Generate thread groups
+  virtual void generateThreadGroups();
 };

 #endif
--- a/src/ASM/ASMs3DLag.C
+++ b/src/ASM/ASMs3DLag.C
@ -18,13 +18,13 @@
 #include "TimeDomain.h"
 #include "FiniteElement.h"
 #include "GlobalIntegral.h"
+#include "LocalIntegral.h"
 #include "IntegrandBase.h"
 #include "CoordinateMapping.h"
 #include "GaussQuadrature.h"
 #include "ElementBlock.h"
 #include "Utilities.h"
 #include "Vec3Oper.h"
-#include "ElmNorm.h"


 ASMs3DLag::ASMs3DLag (unsigned char n_f) : ASMs3D(n_f), coord(myCoord)
@ -227,132 +227,162 @@ bool ASMs3DLag::integrate (Integrand& integrand,
  this->getGridParameters(wpar,2,1);

  // Number of elements in each direction
-  const int nelx = upar.size() - 1;
-  const int nely = vpar.size() - 1;
-  const int nelz = wpar.size() - 1;
+  const int nel1 = upar.size() - 1;
+  const int nel2 = vpar.size() - 1;

  // Order of basis in the three parametric directions (order = degree + 1)
  const int p1 = svol->order(0);
  const int p2 = svol->order(1);
  const int p3 = svol->order(2);

-  FiniteElement fe(p1*p2*p3);
-  Matrix dNdu, Xnod, Jac;
-  Vec4   X;
+  if (threadGroupsVol.empty())
+    generateThreadGroups();


  // === Assembly loop over all elements in the patch ==========================

-  int iel = 1;
-  for (int i3 = 0; i3 < nelz; i3++)
-    for (int i2 = 0; i2 < nely; i2++)
-      for (int i1 = 0; i1 < nelx; i1++, iel++)
-      {
-	fe.iel = MLGE[iel-1];
+  bool ok=true;
+  for (size_t g=0;g<threadGroupsVol.size() && ok;++g) {
+#pragma omp parallel for schedule(static)
+    for (size_t t=0;t<threadGroupsVol[g].size();++t) {
+      FiniteElement fe(p1*p2*p3);
+      Matrix dNdu, Xnod, Jac;
+      Vec4   X;
+      for (size_t l=0;l<threadGroupsVol[g][t].size();++l) {
+        int iel = threadGroupsVol[g][t][l];
+        int i1  =  iel % nel1;
+        int i2  = (iel / nel1) % nel2;
+        int i3  =  iel / (nel1*nel2);

-	// Set up nodal point coordinates for current element
-	if (!this->getElementCoordinates(Xnod,iel)) return false;
+        // Set up nodal point coordinates for current element
+        if (!this->getElementCoordinates(Xnod,++iel))
+        {
+          ok = false;
+          break;
+        }

-	if (integrand.getIntegrandType() == 4)
-	{
-	  // Compute the element "center" (average of element node coordinates)
-	  X = 0.0;
-	  for (size_t i = 1; i <= 3; i++)
-	    for (size_t j = 1; j <= Xnod.cols(); j++)
-	      X[i-1] += Xnod(i,j);
+        if (integrand.getIntegrandType() == 4)
+        {
+          // Compute the element "center" (average of element node coordinates)
+          X = 0.0;
+          for (size_t i = 1; i <= 3; i++)
+            for (size_t j = 1; j <= Xnod.cols(); j++)
+              X[i-1] += Xnod(i,j);

-	  X *= 1.0/(double)Xnod.cols();
-	}
+          X *= 1.0/(double)Xnod.cols();
+        }

 	// Initialize element quantities
-        Vectors vec;
-        LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,false);
-        if (!integrand.initElement(MNPC[iel-1],X,nRed*nRed*nRed,vec,*A))
-	  return false;
+        fe.iel = MLGE[iel-1];
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel);
+        if (!integrand.initElement(MNPC[iel-1],X,nRed*nRed*nRed,*A))
+        {
+          ok = false;
+          break;
+        }

-	// --- Selective reduced integration loop ------------------------------
+        // --- Selective reduced integration loop ------------------------------

-	/*if (integrand.getIntegrandType() > 10)
-	  for (int k = 0; k < nRed; k++)
-	    for (int j = 0; j < nRed; j++)
-	      for (int i = 0; i < nRed; i++)
-	      {
-		// Local element coordinates of current integration point
-		fe.xi   = xr[i];
-		fe.eta  = xr[j];
-		fe.zeta = xr[k];
+        if (integrand.getIntegrandType() > 10)
+          for (int k = 0; k < nRed; k++)
+            for (int j = 0; j < nRed; j++)
+              for (int i = 0; i < nRed; i++)
+              {
+                // Local element coordinates of current integration point
+                fe.xi   = xr[i];
+                fe.eta  = xr[j];
+                fe.zeta = xr[k];

-		// Parameter value of current integration point
-		fe.u = 0.5*(upar[i1]*(1.0-xr[i]) + upar[i1+1]*(1.0+xr[i]));
-		fe.v = 0.5*(vpar[i2]*(1.0-xr[j]) + vpar[i2+1]*(1.0+xr[j]));
-		fe.w = 0.5*(wpar[i3]*(1.0-xr[k]) + wpar[i3+1]*(1.0+xr[k]));
+                // Parameter value of current integration point
+                fe.u = 0.5*(upar[i1]*(1.0-xr[i]) + upar[i1+1]*(1.0+xr[i]));
+                fe.v = 0.5*(vpar[i2]*(1.0-xr[j]) + vpar[i2+1]*(1.0+xr[j]));
+                fe.w = 0.5*(wpar[i3]*(1.0-xr[k]) + wpar[i3+1]*(1.0+xr[k]));

-		// Compute basis function derivatives at current point
-		// using tensor product of one-dimensional Lagrange polynomials
-		if (!Lagrange::computeBasis(fe.N,dNdu,
-					    p1,xr[i],p2,xr[j],p3,xr[k]))
-		  return false;
+                // Compute basis function derivatives at current point
+                // using tensor product of one-dimensional Lagrange polynomials
+                if (!Lagrange::computeBasis(fe.N,dNdu,
+                                            p1,xr[i],p2,xr[j],p3,xr[k]))
+                {
+                  ok = false;
+                  break;
+                }

-		// Compute Jacobian inverse and derivatives
-		fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
+                // Compute Jacobian inverse and derivatives
+                fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);

-		// Cartesian coordinates of current integration point
-		X = Xnod * fe.N;
-		X.t = time.t;
+                // Cartesian coordinates of current integration point
+                X = Xnod * fe.N;
+                X.t = time.t;

-		// Compute the reduced integration terms of the integrand
-		if (!integrand.reducedInt(fe,X))
-		  return false;
-	      }
+                // Compute the reduced integration terms of the integrand
+                if (!integrand.reducedInt(*A,fe,X))
+                {
+                  ok = false;
+                  break;
+                }
+              }

-        */
-	// --- Integration loop over all Gauss points in each direction --------
+        // --- Integration loop over all Gauss points in each direction --------

-	for (int k = 0; k < nGauss; k++)
-	  for (int j = 0; j < nGauss; j++)
-	    for (int i = 0; i < nGauss; i++)
-	    {
-	      // Local element coordinates of current integration point
-	      fe.xi   = xg[i];
-	      fe.eta  = xg[j];
-	      fe.zeta = xg[k];
+        for (int k = 0; k < nGauss; k++)
+          for (int j = 0; j < nGauss; j++)
+            for (int i = 0; i < nGauss; i++)
+            {
+              // Local element coordinates of current integration point
+              fe.xi   = xg[i];
+              fe.eta  = xg[j];
+              fe.zeta = xg[k];

-	      // Parameter value of current integration point
-	      fe.u = 0.5*(upar[i1]*(1.0-xg[i]) + upar[i1+1]*(1.0+xg[i]));
-	      fe.v = 0.5*(vpar[i2]*(1.0-xg[j]) + vpar[i2+1]*(1.0+xg[j]));
-	      fe.w = 0.5*(wpar[i3]*(1.0-xg[k]) + wpar[i3+1]*(1.0+xg[k]));
+              // Parameter value of current integration point
+              fe.u = 0.5*(upar[i1]*(1.0-xg[i]) + upar[i1+1]*(1.0+xg[i]));
+              fe.v = 0.5*(vpar[i2]*(1.0-xg[j]) + vpar[i2+1]*(1.0+xg[j]));
+              fe.w = 0.5*(wpar[i3]*(1.0-xg[k]) + wpar[i3+1]*(1.0+xg[k]));

-	      // Compute basis function derivatives at current integration point
-	      // using tensor product of one-dimensional Lagrange polynomials
-	      if (!Lagrange::computeBasis(fe.N,dNdu,p1,xg[i],p2,xg[j],p3,xg[k]))
-		return false;
+              // Compute basis function derivatives at current integration point
+              // using tensor product of one-dimensional Lagrange polynomials
+              if (!Lagrange::computeBasis(fe.N,dNdu,p1,xg[i],p2,xg[j],p3,xg[k]))
+              {
+                ok = false;
+                break;
+              }

-	      // Compute Jacobian inverse of coordinate mapping and derivatives
-	      fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
-	      if (fe.detJxW == 0.0) continue; // skip singular points
+              // Compute Jacobian inverse of coordinate mapping and derivatives
+              fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
+              if (fe.detJxW == 0.0) continue; // skip singular points

-	      // Cartesian coordinates of current integration point
-	      X = Xnod * fe.N;
-	      X.t = time.t;
+              // Cartesian coordinates of current integration point
+              X = Xnod * fe.N;
+              X.t = time.t;

-	      // Evaluate the integrand and accumulate element contributions
-	      fe.detJxW *= wg[i]*wg[j]*wg[k];
-              if (!integrand.evalInt(*A,fe,time,X,vec))
-		return false;
-	    }
+              // Evaluate the integrand and accumulate element contributions
+              fe.detJxW *= wg[i]*wg[j]*wg[k];
+              if (!integrand.evalInt(*A,fe,time,X))
+              {
+                ok = false;
+                break;
+              }
+            }

 	// Finalize the element quantities
- 	if (!integrand.finalizeElement(A,time))
- 	  return false;
+ 	if (!integrand.finalizeElement(*A,time))
+        {
+          ok = false;
+          break;
+        }

-	// Assembly of global system integral
-	if (!glInt.assemble(A,fe.iel))
-	  return false;
-        if (!dynamic_cast<ElmNorm*>(A))
-          delete A;
+        // Assembly of global system integral
+        if (!glInt.assemble(A->ref(),fe.iel))
+        {
+          ok = false;
+          break;
+        }
+
+        A->destruct();
      }
+    }
+  }

-  return true;
+  return ok;
 }


@ -380,9 +410,8 @@ bool ASMs3DLag::integrate (Integrand& integrand, int lIndex,
  const int p3 = svol->order(2);

  // Number of elements in each direction
-  const int nelx = (nx-1)/(p1-1);
-  const int nely = (ny-1)/(p2-1);
-  const int nelz = (nz-1)/(p3-1);
+  const int nel1 = (nx-1)/(p1-1);
+  const int nel2 = (ny-1)/(p2-1);

  // Get parametric coordinates of the elements
  RealArray upar, vpar, wpar;
@ -397,46 +426,43 @@ bool ASMs3DLag::integrate (Integrand& integrand, int lIndex,
  if (vpar.empty()) this->getGridParameters(vpar,1,1);
  if (wpar.empty()) this->getGridParameters(wpar,2,1);

-  FiniteElement fe(p1*p2*p3);
-  fe.u = upar.front();
-  fe.v = vpar.front();
-  fe.w = wpar.front();
-
-  Matrix dNdu, Xnod, Jac;
-  Vec4   X;
-  Vec3   normal;
-  double xi[3];
-
+  if (threadGroupsFace.empty())
+    generateThreadGroups();

  // === Assembly loop over all elements on the patch face =====================
+  bool ok=true;
+  for (size_t g=0;g<threadGroupsFace[lIndex-1].size() && ok;++g) {
+#pragma omp parallel for schedule(static)
+    for (size_t t=0;t<threadGroupsFace[lIndex-1][g].size();++t) {
+      FiniteElement fe(p1*p2*p3);
+      fe.u = upar.front();
+      fe.v = vpar.front();
+      fe.w = wpar.front();

-  int iel = 1;
-  for (int i3 = 0; i3 < nelz; i3++)
-    for (int i2 = 0; i2 < nely; i2++)
-      for (int i1 = 0; i1 < nelx; i1++, iel++)
-      {
-	fe.iel = MLGE[iel-1];
-
-	// Skip elements that are not on current boundary face
-	bool skipMe = false;
-	switch (faceDir)
-	  {
-	  case -1: if (i1 > 0)      skipMe = true; break;
-	  case  1: if (i1 < nelx-1) skipMe = true; break;
-	  case -2: if (i2 > 0)      skipMe = true; break;
-	  case  2: if (i2 < nely-1) skipMe = true; break;
-	  case -3: if (i3 > 0)      skipMe = true; break;
-	  case  3: if (i3 < nelz-1) skipMe = true; break;
-	  }
-	if (skipMe) continue;
+      Matrix dNdu, Xnod, Jac;
+      Vec4   X;
+      Vec3   normal;
+      double xi[3];
+      for (size_t l=0;l<threadGroupsFace[lIndex-1][g][t].size();++l) {
+        int iel = threadGroupsFace[lIndex-1][g][t][l];
+        fe.iel = MLGE[iel];
+        int i1  =  iel % nel1;
+        int i2  = (iel / nel1) % nel2;
+        int i3  =  iel / (nel1*nel2);

 	// Set up nodal point coordinates for current element
-	if (!this->getElementCoordinates(Xnod,iel)) return false;
+	if (!this->getElementCoordinates(Xnod,++iel)) {
+          ok = false;
+          break;
+        }

 	// Initialize element quantities
-        Vectors vec;
-        LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,true);
-        if (!integrand.initElementBou(MNPC[iel-1],vec,*A)) return false;
+	fe.iel = MLGE[iel-1];
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel,true);
+        if (!integrand.initElementBou(MNPC[iel-1],*A)) {
+          ok = false;
+          break;
+        }

 	// --- Integration loop over all Gauss points in each direction --------

@ -469,8 +495,10 @@ bool ASMs3DLag::integrate (Integrand& integrand, int lIndex,

 	    // Compute the basis functions and their derivatives, using
 	    // tensor product of one-dimensional Lagrange polynomials
-	    if (!Lagrange::computeBasis(fe.N,dNdu,p1,xi[0],p2,xi[1],p3,xi[2]))
-	      return false;
+	    if (!Lagrange::computeBasis(fe.N,dNdu,p1,xi[0],p2,xi[1],p3,xi[2])) {
+              ok = false;
+              break;
+            }

 	    // Compute basis function derivatives and the face normal
 	    fe.detJxW = utl::Jacobian(Jac,normal,fe.dNdX,Xnod,dNdu,t1,t2);
@ -484,18 +512,24 @@ bool ASMs3DLag::integrate (Integrand& integrand, int lIndex,

 	    // Evaluate the integrand and accumulate element contributions
 	    fe.detJxW *= wg[i]*wg[j];
-            if (!integrand.evalBou(*A,fe,time,X,normal,vec))
-	      return false;
+            if (!integrand.evalBou(*A,fe,time,X,normal)) {
+              ok = false;
+              break;
+            }
 	  }

 	// Assembly of global system integral
-	if (!glInt.assemble(A,fe.iel))
-	  return false;
-        if (!dynamic_cast<ElmNorm*>(A))
-          delete A;
-      }
+	if (!glInt.assemble(A->ref(),fe.iel)) {
+          ok = false;
+          break;
+        }

-  return true;
+        A->destruct();
+      }
+    }
+  }
+
+  return ok;
 }


@ -553,8 +587,6 @@ bool ASMs3DLag::integrateEdge (Integrand& integrand, int lEdge,
    for (int i2 = 0; i2 < nely; i2++)
      for (int i1 = 0; i1 < nelx; i1++, iel++)
      {
-	fe.iel = MLGE[iel-1];
-
 	// Skip elements that are not on current boundary edge
 	bool skipMe = false;
 	switch (lEdge)
@ -578,10 +610,9 @@ bool ASMs3DLag::integrateEdge (Integrand& integrand, int lEdge,
 	if (!this->getElementCoordinates(Xnod,iel)) return false;

 	// Initialize element quantities
-        Vectors vec;
-        LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,false);
-        if (!integrand.initElementBou(MNPC[iel-1],vec,*A)) return false;
-
+	fe.iel = MLGE[iel-1];
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel,true);
+        if (!integrand.initElementBou(MNPC[iel-1],*A)) return false;

 	// --- Integration loop over all Gauss points along the edge -----------

@ -604,12 +635,12 @@ bool ASMs3DLag::integrateEdge (Integrand& integrand, int lEdge,
 	  X.t = time.t;

 	  // Evaluate the integrand and accumulate element contributions
-          if (!integrand.evalBou(*A,fe,time,X,tangent,vec))
+          if (!integrand.evalBou(*A,fe,time,X,tangent))
 	    return false;
 	}

 	// Assembly of global system integral
-	if (!glInt.assemble(A,fe.iel))
+	if (!glInt.assemble(A->ref(),fe.iel))
 	  return false;
      }

@ -772,3 +803,65 @@ bool ASMs3DLag::evalSolution (Matrix& sField, const Integrand& integrand,

  return true;
 }
+
+
+void ASMs3DLag::generateThreadGroups()
+{
+  const int p1 = svol->order(0);
+  const int p2 = svol->order(1);
+  const int p3 = svol->order(2);
+
+  // Evaluate the parametric values
+  RealArray gpar[3];
+  getGridParameters(gpar[0],0,p1-1);
+  getGridParameters(gpar[1],1,p2-1);
+  getGridParameters(gpar[2],2,p3-1);
+
+  const int nel1 = (gpar[0].size()-1)/(p1-1);
+  const int nel2 = (gpar[1].size()-1)/(p2-1);
+  const int nel3 = (gpar[2].size()-1)/(p3-1);
+
+  utl::calcThreadGroups(nel1,nel2,nel3,threadGroupsVol);
+
+  // now the faces
+  threadGroupsFace.resize(6);
+  for (int face=0;face<6;++face) {
+    const int faceDir = (face+2)/((face+1)%2 ? -2 : 2);
+    std::vector<int> map;
+    int iel=0;
+    for (int i3 = 0; i3 < nel3; i3++) {
+      for (int i2 = 0; i2 < nel2; i2++) {
+        for (int i1 = 0; i1 < nel1; i1++, iel++) {
+          // Skip elements that are not on current boundary face
+          bool skipMe = false;
+          switch (faceDir)
+          {
+            case -1: if (i1 > 0)      skipMe = true; break;
+            case  1: if (i1 < nel1-1) skipMe = true; break;
+            case -2: if (i2 > 0)      skipMe = true; break;
+            case  2: if (i2 < nel2-1) skipMe = true; break;
+            case -3: if (i3 > 0)      skipMe = true; break;
+            case  3: if (i3 < nel3-1) skipMe = true; break;
+          }
+          if (skipMe)
+            continue;
+
+          map.push_back(iel);
+        }
+      }
+    }
+    int d1, d2;
+    if (face < 2) {
+      d1 = nel2;
+      d2 = nel3;
+    } else if (face < 4) {
+      d1 = nel1;
+      d2 = nel3;
+    } else {
+      d1 = nel1;
+      d2 = nel2;
+    }
+    utl::calcThreadGroups(d1,d2,threadGroupsFace[face]);
+    utl::mapThreadGroups(threadGroupsFace[face],map);
+  }
+}
--- a/src/ASM/ASMs3DLag.h
+++ b/src/ASM/ASMs3DLag.h
@ -140,6 +140,9 @@ public:
  //! in one element
  virtual bool getElementCoordinates(Matrix& X, int iel) const;

+  //! \brief Generate thread groups
+  void generateThreadGroups();
+public:
  //! \brief Returns a matrix with all nodal coordinates within the patch.
  //! \param[out] X 3\f$\times\f$n-matrix, where \a n is the number of nodes
  //! in the patch
--- a/src/ASM/ASMs3DSpec.C
+++ b/src/ASM/ASMs3DSpec.C
@ -17,11 +17,11 @@
 #include "TimeDomain.h"
 #include "FiniteElement.h"
 #include "GlobalIntegral.h"
+#include "LocalIntegral.h"
 #include "IntegrandBase.h"
 #include "CoordinateMapping.h"
 #include "Vec3Oper.h"
 #include "Legendre.h"
-#include "ElmNorm.h"


 bool ASMs3DSpec::getGridParameters (RealArray& prm, int dir,
@ -101,59 +101,81 @@ bool ASMs3DSpec::integrate (Integrand& integrand,
  if (!Legendre::basisDerivatives(p2,D2)) return false;
  if (!Legendre::basisDerivatives(p3,D3)) return false;

-  FiniteElement fe(p1*p2*p3);
-  Matrix dNdu(p1*p2*p3,3), Xnod, Jac;
-  Vec4   X;
+  if (threadGroupsVol.empty())
+    generateThreadGroups();


  // === Assembly loop over all elements in the patch ==========================

-  const int nel = this->getNoElms();
-  for (int iel = 1; iel <= nel; iel++)
-    {
-	// Set up nodal point coordinates for current element
-	if (!this->getElementCoordinates(Xnod,iel)) return false;
+  bool ok=true;
+  for (size_t g=0;g<threadGroupsVol.size() && ok;++g) {
+#pragma omp parallel for schedule(static)
+    for (size_t t=0;t<threadGroupsVol[g].size();++t) {
+      FiniteElement fe(p1*p2*p3);
+      Matrix   dNdu(p1*p2*p3,3), Xnod, Jac;
+      Vec4     X;
+      for (size_t l=0;l<threadGroupsVol[g][t].size();++l) {
+        int iel = threadGroupsVol[g][t][l]+1;
+
+        // Set up nodal point coordinates for current element
+        if (!this->getElementCoordinates(Xnod,iel))
+        {
+          ok = false;
+          break;
+        }

 	// Initialize element quantities
-        Vectors vec;
-        LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,false);
-        if (!integrand.initElement(MNPC[iel-1],vec,*A)) return false;
+        fe.iel = MLGE[iel-1];
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N.size(),fe.iel);
+        if (!integrand.initElement(MNPC[iel-1],*A))
+        {
+          ok = false;
+          break;
+        }

-	// --- Integration loop over all Gauss points in each direction --------
+        // --- Integration loop over all Gauss points in each direction --------

-	int count = 1;
-	for (int k = 1; k <= p3; k++)
-	  for (int j = 1; j <= p2; j++)
-	    for (int i = 1; i <= p1; i++, count++)
-	    {
-	      // Evaluate the basis functions and gradients using tensor product
-	      // of the one-dimensional Lagrange polynomials
-	      evalBasis(i,j,k,p1,p2,p3,D1,D2,D3,fe.N,dNdu);
+        int count = 1;
+        for (int k = 1; k <= p3; k++)
+          for (int j = 1; j <= p2; j++)
+            for (int i = 1; i <= p1; i++, count++)
+            {
+              // Evaluate the basis functions and gradients using tensor product
+              // of the one-dimensional Lagrange polynomials
+              evalBasis(i,j,k,p1,p2,p3,D1,D2,D3,fe.N,dNdu);

-	      // Compute Jacobian inverse of coordinate mapping and derivatives
-	      fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
-	      if (fe.detJxW == 0.0) continue; // skip singular points
+              // Compute Jacobian inverse of coordinate mapping and derivatives
+              fe.detJxW = utl::Jacobian(Jac,fe.dNdX,Xnod,dNdu);
+              if (fe.detJxW == 0.0) continue; // skip singular points

-	      // Cartesian coordinates of current integration point
-	      X.x = Xnod(1,count);
-	      X.y = Xnod(2,count);
-	      X.z = Xnod(3,count);
-	      X.t = time.t;
+              // Cartesian coordinates of current integration point
+              X.x = Xnod(1,count);
+              X.y = Xnod(2,count);
+              X.z = Xnod(3,count);
+              X.t = time.t;

-	      // Evaluate the integrand and accumulate element contributions
-	      fe.detJxW *= wg1(i)*wg2(j)*wg3(k);
-              if (!integrand.evalInt(*A,fe,time,X,vec))
-		return false;
-	    }
+              // Evaluate the integrand and accumulate element contributions
+              fe.detJxW *= wg1(i)*wg2(j)*wg3(k);
+              if (!integrand.evalInt(*A,fe,time,X))
+              {
+                ok = false;
+                break;
+              }
+            }

 	// Assembly of global system integral
-	if (!glInt.assemble(A,MLGE[iel-1]))
-	  return false;
-        if (!dynamic_cast<ElmNorm*>(A))
-          delete A;
-    }
+	if (!glInt.assemble(A->ref(),fe.iel))
+        {
+          ok = false;
+          break;
+        }

-  return true;
+        A->destruct();
+      }
+    }
+  }
+
+  return ok;
 }


@ -176,13 +198,6 @@ bool ASMs3DSpec::integrate (Integrand& integrand, int lIndex,
  p[1] = svol->order(1);
  p[2] = svol->order(2);

-  // Number of elements in each direction
-  int n1, n2, n3;
-  this->getSize(n1,n2,n3);
-  const int nelx = (n1-1)/(p[0]-1);
-  const int nely = (n2-1)/(p[1]-1);
-  const int nelz = (n3-1)/(p[2]-1);
-
  // Evaluate integration points (=nodal points) and weights

  Vector wg[3],xg1,xg2,xg3;
@ -197,40 +212,32 @@ bool ASMs3DSpec::integrate (Integrand& integrand, int lIndex,

  int nen = p[0]*p[1]*p[2];

-  FiniteElement fe(nen);
-  Matrix dNdu(nen,3), Xnod, Jac;
-  Vec4   X;
-  Vec3   normal;
-  int    xi[3];
-
-
  // === Assembly loop over all elements on the patch face =====================
-
-  int iel = 1;
-  for (int i3 = 0; i3 < nelz; i3++)
-    for (int i2 = 0; i2 < nely; i2++)
-      for (int i1 = 0; i1 < nelx; i1++, iel++)
-      {
-	// Skip elements that are not on current boundary face
-	bool skipMe = false;
-	switch (faceDir)
-	  {
-	  case -1: if (i1 > 0)      skipMe = true; break;
-	  case  1: if (i1 < nelx-1) skipMe = true; break;
-	  case -2: if (i2 > 0)      skipMe = true; break;
-	  case  2: if (i2 < nely-1) skipMe = true; break;
-	  case -3: if (i3 > 0)      skipMe = true; break;
-	  case  3: if (i3 < nelz-1) skipMe = true; break;
-	  }
-	if (skipMe) continue;
+  bool ok=true;
+  for (size_t g=0;g<threadGroupsFace[lIndex-1].size() && ok;++g) {
+#pragma omp parallel for schedule(static)
+    for (size_t t=0;t<threadGroupsFace[lIndex-1][g].size();++t) {
+      FiniteElement fe(nen);
+      Matrix dNdu(nen,3), Xnod, Jac;
+      Vec4   X;
+      Vec3   normal;
+      int    xi[3];
+      for (size_t l=0;l<threadGroupsFace[lIndex-1][g][t].size();++l) {
+        int iel = threadGroupsFace[lIndex-1][g][t][l];

 	// Set up nodal point coordinates for current element
-	if (!this->getElementCoordinates(Xnod,iel)) return false;
+	if (!this->getElementCoordinates(Xnod,++iel)) {
+          ok = false;
+          break;
+        }

 	// Initialize element quantities
-        Vectors vec;
-        LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,true);
-        if (!integrand.initElementBou(MNPC[iel-1],vec,*A)) return false;
+	fe.iel = MLGE[iel-1];
+        LocalIntegral* A = integrand.getLocalIntegral(nen,fe.iel,true);
+        if (!integrand.initElementBou(MNPC[iel-1],*A)) {
+          ok = false;
+          break;
+        }

 	// --- Integration loop over all Gauss points in each direction --------

@ -258,18 +265,24 @@ bool ASMs3DSpec::integrate (Integrand& integrand, int lIndex,

 	    // Evaluate the integrand and accumulate element contributions
 	    fe.detJxW *=  wg[t1-1][i]*wg[t2-1][j];
-            if (!integrand.evalBou(*A,fe,time,X,normal,vec))
-	      return false;
+            if (!integrand.evalBou(*A,fe,time,X,normal)) {
+              ok = false;
+              break;
+            }
 	  }

 	// Assembly of global system integral
-	if (!glInt.assemble(A,MLGE[iel-1]))
-	  return false;
-        if (!dynamic_cast<ElmNorm*>(A))
-          delete A;
-      }
+	if (!glInt.assemble(A->ref(),fe.iel)) {
+          ok = false;
+          break;
+        }

-  return true;
+        A->destruct();
+      }
+    }
+  }
+
+  return ok;
 }


@ -362,9 +375,9 @@ bool ASMs3DSpec::integrateEdge (Integrand& integrand, int lEdge,
 	if (!this->getElementCoordinates(Xnod,iel)) return false;

 	// Initialize element quantities
-        Vectors vec;
-        LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,false);
-        if (!integrand.initElementBou(MNPC[iel-1],vec,*A)) return false;
+	fe.iel = MLGE[iel-1];
+        LocalIntegral* A = integrand.getLocalIntegral(nen,fe.iel,true);
+        if (!integrand.initElementBou(MNPC[iel-1],*A)) return false;

 	// --- Integration loop over all Gauss points along the edge -----------

@ -387,12 +400,12 @@ bool ASMs3DSpec::integrateEdge (Integrand& integrand, int lEdge,

 	  // Evaluate the integrand and accumulate element contributions
 	  fe.detJxW *= wg[lDir][i];
-          if (!integrand.evalBou(*A,fe,time,X,tangent,vec))
+          if (!integrand.evalBou(*A,fe,time,X,tangent))
 	    return false;
 	}

 	// Assembly of global system integral
-	if (!glInt.assemble(A,MLGE[iel-1]))
+	if (!glInt.assemble(A->ref(),fe.iel))
 	  return false;
      }

--- a/src/ASM/ASMs3Dmx.C
+++ b/src/ASM/ASMs3Dmx.C
@ -18,6 +18,7 @@
 #include "TimeDomain.h"
 #include "FiniteElement.h"
 #include "GlobalIntegral.h"
+#include "LocalIntegral.h"
 #include "IntegrandBase.h"
 #include "CoordinateMapping.h"
 #include "GaussQuadrature.h"
@ -25,7 +26,6 @@
 #include "Profiler.h"
 #include "Vec3Oper.h"
 #include "Vec3.h"
-#include "ElmNorm.h"


 ASMs3Dmx::ASMs3Dmx (unsigned char n_f1, unsigned char n_f2)
@ -154,7 +154,7 @@ bool ASMs3Dmx::generateFEMTopology ()
      Go::BsplineBasis b1 = svol->basis(0).extendedBasis(svol->order(0)+1);
      Go::BsplineBasis b2 = svol->basis(1).extendedBasis(svol->order(1)+1);
      Go::BsplineBasis b3 = svol->basis(2).extendedBasis(svol->order(2)+1);
-      /*  To lower order and regularity this can be used instead     
+      /* To lower order and regularity this can be used instead
      std::vector<double>::const_iterator first =  ++svol->basis(0).begin();
      std::vector<double>::const_iterator last  =  --svol->basis(0).end();
      Go::BsplineBasis b1 = Go::BsplineBasis(svol->order(0)-1,first,last);
@ -536,90 +536,114 @@ bool ASMs3Dmx::integrate (Integrand& integrand,

  const int nel1 = n1 - p1 + 1;
  const int nel2 = n2 - p2 + 1;
+  const int nel3 = n3 - p3 + 1;

-  MxFiniteElement fe(basis1->order(0)*basis1->order(1)*basis1->order(2),
-		     basis2->order(0)*basis2->order(1)*basis2->order(2));
-  Matrix dN1du, dN2du, Xnod, Jac;
-  Vec4   X;
+  if (threadGroupsVol.empty())
+    generateThreadGroups();


  // === Assembly loop over all elements in the patch ==========================

-  int iel = 1;
-  for (int i3 = p3; i3 <= n3; i3++)
-    for (int i2 = p2; i2 <= n2; i2++)
-      for (int i1 = p1; i1 <= n1; i1++, iel++)
-      {
-	fe.iel = MLGE[iel-1];
-	if (fe.iel < 1) continue; // zero-volume element
+  bool ok=true;
+  for (size_t g=0;g<threadGroupsVol.size() && ok;++g) {
+#pragma omp parallel for schedule(static)
+    for (size_t t=0;t<threadGroupsVol[g].size();++t) {
+      MxFiniteElement fe(basis1->order(0)*basis1->order(1)*basis1->order(2),
+                         basis2->order(0)*basis2->order(1)*basis2->order(2));
+      Matrix dN1du, dN2du, Xnod, Jac;
+      Vec4   X;
+      for (size_t l=0;l<threadGroupsVol[g][t].size();++l) {
+        int iel = threadGroupsVol[g][t][l];
+        fe.iel = MLGE[iel];
+        if (fe.iel < 1) continue; // zero-volume element

-	// Get element volume in the parameter space
-	double dV = this->getParametricVolume(iel);
-	if (dV < 0.0) return false; // topology error (probably logic error)
+        int i1 = p1 + iel % nel1;
+        int i2 = p2 + (iel / nel1) % nel3;
+        int i3 = p3 + iel / (nel1*nel2);

-	// Set up control point (nodal) coordinates for current element
-	if (!this->getElementCoordinates(Xnod,iel)) return false;
+        // Get element volume in the parameter space
+        double dV = this->getParametricVolume(++iel);
+        if (dV < 0.0) { // topology error (probably logic error)
+          ok = false;
+          break;
+        }

-	// Initialize element quantities
-	IntVec::const_iterator f2start = MNPC[iel-1].begin() + fe.N1.size();
-        LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,false);
-	if (!integrand.initElement(IntVec(MNPC[iel-1].begin(),f2start),
-				   IntVec(f2start,MNPC[iel-1].end()),nb1,*A))
-	  return false;
+        // Set up control point (nodal) coordinates for current element
+        if (!this->getElementCoordinates(Xnod,iel)) {
+          ok = false;
+          break;
+        }

-	// --- Integration loop over all Gauss points in each direction --------
+        // Initialize element quantities
+        IntVec::const_iterator f2start = MNPC[iel-1].begin() + fe.N1.size();
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N1.size(),fe.N2.size(),
+                                                      fe.iel,false);
+        if (!integrand.initElement(IntVec(MNPC[iel-1].begin(),f2start),
+                                   IntVec(f2start,MNPC[iel-1].end()),nb1,*A))
+        {
+          ok = false;
+          break;
+        }

-	int ip = (((i3-p3)*nGauss*nel2 + i2-p2)*nGauss*nel1 + i1-p1)*nGauss;
-	for (int k = 0; k < nGauss; k++, ip += nGauss*(nel2-1)*nGauss*nel1)
-	  for (int j = 0; j < nGauss; j++, ip += nGauss*(nel1-1))
-	    for (int i = 0; i < nGauss; i++, ip++)
-	    {
-	      // Local element coordinates of current integration point
-	      fe.xi   = xg[i];
-	      fe.eta  = xg[j];
-	      fe.zeta = xg[k];
+        // --- Integration loop over all Gauss points in each direction --------

-	      // Parameter values of current integration point
-	      fe.u = gpar[0](i+1,i1-p1+1);
-	      fe.v = gpar[1](j+1,i2-p2+1);
-	      fe.w = gpar[2](k+1,i3-p3+1);
+        int ip = (((i3-p3)*nGauss*nel2 + i2-p2)*nGauss*nel1 + i1-p1)*nGauss;
+        for (int k = 0; k < nGauss; k++, ip += nGauss*(nel2-1)*nGauss*nel1)
+          for (int j = 0; j < nGauss; j++, ip += nGauss*(nel1-1))
+            for (int i = 0; i < nGauss; i++, ip++)
+            {
+              // Local element coordinates of current integration point
+              fe.xi   = xg[i];
+              fe.eta  = xg[j];
+              fe.zeta = xg[k];

-	      // Fetch basis function derivatives at current integration point
-	      extractBasis(spline1[ip],fe.N1,dN1du);
-	      extractBasis(spline2[ip],fe.N2,dN2du);
+              // Parameter values of current integration point
+              fe.u = gpar[0](i+1,i1-p1+1);
+              fe.v = gpar[1](j+1,i2-p2+1);
+              fe.w = gpar[2](k+1,i3-p3+1);

-	      // Compute Jacobian inverse of the coordinate mapping and
-	      // basis function derivatives w.r.t. Cartesian coordinates
-	      if (geoUsesBasis1)
-	      {
-		fe.detJxW = utl::Jacobian(Jac,fe.dN1dX,Xnod,dN1du);
-		fe.dN2dX.multiply(dN2du,Jac); // dN2dX = dN2du * J^-1
-	      }
-	      else
-	      {
-		fe.detJxW = utl::Jacobian(Jac,fe.dN2dX,Xnod,dN2du);
-		fe.dN1dX.multiply(dN1du,Jac); // dN1dX = dN1du * J^-1
-	      }
-	      if (fe.detJxW == 0.0) continue; // skip singular points
+              // Fetch basis function derivatives at current integration point
+              extractBasis(spline1[ip],fe.N1,dN1du);
+              extractBasis(spline2[ip],fe.N2,dN2du);

-	      // Cartesian coordinates of current integration point
-	      X = Xnod * (geoUsesBasis1 ? fe.N1 : fe.N2);
-	      X.t = time.t;
+              // Compute Jacobian inverse of the coordinate mapping and
+              // basis function derivatives w.r.t. Cartesian coordinates
+              if (geoUsesBasis1)
+              {
+                fe.detJxW = utl::Jacobian(Jac,fe.dN1dX,Xnod,dN1du);
+                fe.dN2dX.multiply(dN2du,Jac); // dN2dX = dN2du * J^-1
+              }
+              else
+              {
+                fe.detJxW = utl::Jacobian(Jac,fe.dN2dX,Xnod,dN2du);
+                fe.dN1dX.multiply(dN1du,Jac); // dN1dX = dN1du * J^-1
+              }
+              if (fe.detJxW == 0.0) continue; // skip singular points

-	      // Evaluate the integrand and accumulate element contributions
-	      fe.detJxW *= 0.125*dV*wg[i]*wg[j]*wg[k];
-	      if (!integrand.evalIntMx(*A,fe,time,X))
-		return false;
-	    }
+              // Cartesian coordinates of current integration point
+              X = Xnod * (geoUsesBasis1 ? fe.N1 : fe.N2);
+              X.t = time.t;

-	// Assembly of global system integral
-	if (!glInt.assemble(A,fe.iel))
-	  return false;
-        if (!dynamic_cast<ElmNorm*>(A))
-          delete A;
+              // Evaluate the integrand and accumulate element contributions
+              fe.detJxW *= 0.125*dV*wg[i]*wg[j]*wg[k];
+              if (!integrand.evalIntMx(*A,fe,time,X)) {
+                ok = false;
+                break;
+              }
+            }
+
+        // Assembly of global system integral
+        if (!glInt.assemble(A->ref(),fe.iel)) {
+          ok = false;
+          break;
+        }
+
+        A->destruct();
      }
+    }
+  }

-  return true;
+  return ok;
 }


@ -666,7 +690,6 @@ bool ASMs3Dmx::integrate (Integrand& integrand, int lIndex,

  const int n1 = svol->numCoefs(0);
  const int n2 = svol->numCoefs(1);
-  const int n3 = svol->numCoefs(2);

  const int p1 = svol->order(0);
  const int p2 = svol->order(1);
@ -675,134 +698,141 @@ bool ASMs3Dmx::integrate (Integrand& integrand, int lIndex,
  const int nel1 = n1 - p1 + 1;
  const int nel2 = n2 - p2 + 1;

-  MxFiniteElement fe(basis1->order(0)*basis1->order(1)*basis1->order(2),
-		     basis2->order(0)*basis2->order(1)*basis2->order(2));
-  fe.xi = fe.eta = fe.zeta = faceDir < 0 ? -1.0 : 1.0;
-  fe.u = gpar[0](1,1);
-  fe.v = gpar[1](1,1);
-  fe.w = gpar[2](1,1);
-
-  Matrix dN1du, dN2du, Xnod, Jac;
-  Vec4   X;
-  Vec3   normal;
-
+  if (threadGroupsFace.empty())
+    generateThreadGroups();

  // === Assembly loop over all elements on the patch face =====================
+  bool ok=true;
+  for (size_t g=0;g<threadGroupsFace[lIndex-1].size() && ok;++g) {
+#pragma omp parallel for schedule(static)
+    for (size_t t=0;t<threadGroupsFace[lIndex-1][g].size();++t) {
+      MxFiniteElement fe(basis1->order(0)*basis1->order(1)*basis1->order(2),
+                         basis2->order(0)*basis2->order(1)*basis2->order(2));
+      fe.xi = fe.eta = fe.zeta = faceDir < 0 ? -1.0 : 1.0;
+      fe.u = gpar[0](1,1);
+      fe.v = gpar[1](1,1);
+      fe.w = gpar[2](1,1);

-  int iel = 1;
-  for (int i3 = p3; i3 <= n3; i3++)
-    for (int i2 = p2; i2 <= n2; i2++)
-      for (int i1 = p1; i1 <= n1; i1++, iel++)
-      {
-	fe.iel = MLGE[iel-1];
-	if (fe.iel < 1) continue; // zero-volume element
+      Matrix dN1du, dN2du, Xnod, Jac;
+      Vec4   X;
+      Vec3   normal;
+      for (size_t l=0;l<threadGroupsFace[lIndex-1][g][t].size();++l) {
+        int iel = threadGroupsFace[lIndex-1][g][t][l];
+        fe.iel = MLGE[iel];
+        if (fe.iel < 1) continue; // zero-volume element

-	// Skip elements that are not on current boundary face
-	bool skipMe = false;
-	switch (faceDir)
-	  {
-	  case -1: if (i1 > p1) skipMe = true; break;
-	  case  1: if (i1 < n1) skipMe = true; break;
-	  case -2: if (i2 > p2) skipMe = true; break;
-	  case  2: if (i2 < n2) skipMe = true; break;
-	  case -3: if (i3 > p3) skipMe = true; break;
-	  case  3: if (i3 < n3) skipMe = true; break;
-	  }
-	if (skipMe) continue;
+        int i1 = p1 + iel % nel1;
+        int i2 = p2 + (iel / nel1) % nel2;
+        int i3 = p3 + iel / (nel1*nel2);

 	// Get element face area in the parameter space
-	double dA = this->getParametricArea(iel,abs(faceDir));
-	if (dA < 0.0) return false; // topology error (probably logic error)
+	double dA = this->getParametricArea(++iel,abs(faceDir));
+	if (dA < 0.0) { // topology error (probably logic error)
+          ok = false;
+          break;
+        }

 	// Set up control point coordinates for current element
-	if (!this->getElementCoordinates(Xnod,iel)) return false;
+	if (!this->getElementCoordinates(Xnod,iel)) {
+          ok = false;
+          break;
+        }

 	// Initialize element quantities
 	IntVec::const_iterator f2start = MNPC[iel-1].begin() + fe.N1.size();
-        LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,true);
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N1.size(),fe.N2.size(),
+                                                      fe.iel,true);
 	if (!integrand.initElementBou(IntVec(MNPC[iel-1].begin(),f2start),
-				      IntVec(f2start,MNPC[iel-1].end()),nb1,*A))
-	  return false;
+				      IntVec(f2start,MNPC[iel-1].end()),nb1,*A)) {
+          ok = false;
+          break;
+        }

-	// Define some loop control variables depending on which face we are on
-	int nf1, j1, j2;
-	switch (abs(faceDir))
-	  {
-	  case 1: nf1 = nel2; j2 = i3-p3; j1 = i2-p2; break;
-	  case 2: nf1 = nel1; j2 = i3-p3; j1 = i1-p1; break;
-	  case 3: nf1 = nel1; j2 = i2-p2; j1 = i1-p1; break;
-	  default: nf1 = j1 = j2 = 0;
-	  }
+        // Define some loop control variables depending on which face we are on
+        int nf1, j1, j2;
+        switch (abs(faceDir))
+        {
+          case 1: nf1 = nel2; j2 = i3-p3; j1 = i2-p2; break;
+          case 2: nf1 = nel1; j2 = i3-p3; j1 = i1-p1; break;
+          case 3: nf1 = nel1; j2 = i2-p2; j1 = i1-p1; break;
+          default: nf1 = j1 = j2 = 0;
+        }

 	// --- Integration loop over all Gauss points in each direction --------

-	int k1, k2, k3;
-	int ip = (j2*nGauss*nf1 + j1)*nGauss;
-	for (int j = 0; j < nGauss; j++, ip += nGauss*(nf1-1))
-	  for (int i = 0; i < nGauss; i++, ip++)
-	  {
- 	    // Local element coordinates and parameter values
- 	    // of current integration point
-	    switch (abs(faceDir)) {
-	    case 1: k2 = i+1; k3 = j+1; k1 = 0; break;
-	    case 2: k1 = i+1; k3 = j+1; k2 = 0; break;
-	    case 3: k1 = i+1; k2 = j+1; k3 = 0; break;
-	    default: k1 = k2 = k3 = 0;
-	    }
-	    if (gpar[0].size() > 1)
-	    {
-	      fe.xi = xg[k1];
-	      fe.u = gpar[0](k1,i1-p1+1);
-	    }
-	    if (gpar[1].size() > 1)
-	    {
-	      fe.eta = xg[k2];
-	      fe.v = gpar[1](k2,i2-p2+1);
-	    }
-	    if (gpar[2].size() > 1)
-	    {
-	      fe.zeta = xg[k3];
-	      fe.w = gpar[2](k3,i3-p3+1);
-	    }
+        int k1, k2, k3;
+        int ip = (j2*nGauss*nf1 + j1)*nGauss;
+        for (int j = 0; j < nGauss; j++, ip += nGauss*(nf1-1))
+          for (int i = 0; i < nGauss; i++, ip++)
+          {
+            // Local element coordinates and parameter values
+            // of current integration point
+            switch (abs(faceDir)) {
+              case 1: k2 = i+1; k3 = j+1; k1 = 0; break;
+              case 2: k1 = i+1; k3 = j+1; k2 = 0; break;
+              case 3: k1 = i+1; k2 = j+1; k3 = 0; break;
+              default: k1 = k2 = k3 = 0;
+            }
+            if (gpar[0].size() > 1)
+            {
+              fe.xi = xg[k1];
+              fe.u = gpar[0](k1,i1-p1+1);
+            }
+            if (gpar[1].size() > 1)
+            {
+              fe.eta = xg[k2];
+              fe.v = gpar[1](k2,i2-p2+1);
+            }
+            if (gpar[2].size() > 1)
+            {
+              fe.zeta = xg[k3];
+              fe.w = gpar[2](k3,i3-p3+1);
+            }

-	    // Fetch basis function derivatives at current integration point
-	    extractBasis(spline1[ip],fe.N1,dN1du);
-	    extractBasis(spline2[ip],fe.N2,dN2du);
+            // Fetch basis function derivatives at current integration point
+            extractBasis(spline1[ip],fe.N1,dN1du);
+            extractBasis(spline2[ip],fe.N2,dN2du);

-	    // Compute Jacobian inverse of the coordinate mapping and
-	    // basis function derivatives w.r.t. Cartesian coordinates
-	    if (geoUsesBasis1)
-	    {
-	      fe.detJxW = utl::Jacobian(Jac,normal,fe.dN1dX,Xnod,dN1du,t1,t2);
-	      fe.dN2dX.multiply(dN2du,Jac); // dN2dX = dN2du * J^-1
-	    }
-	    else
-	    {
-	      fe.detJxW = utl::Jacobian(Jac,normal,fe.dN2dX,Xnod,dN2du,t1,t2);
-	      fe.dN1dX.multiply(dN1du,Jac); // dN1dX = dN1du * J^-1
-	    }
-	    if (fe.detJxW == 0.0) continue; // skip singular points
+            // Compute Jacobian inverse of the coordinate mapping and
+            // basis function derivatives w.r.t. Cartesian coordinates
+            if (geoUsesBasis1)
+            {
+              fe.detJxW = utl::Jacobian(Jac,normal,fe.dN1dX,Xnod,dN1du,t1,t2);
+              fe.dN2dX.multiply(dN2du,Jac); // dN2dX = dN2du * J^-1
+            }
+            else
+            {
+              fe.detJxW = utl::Jacobian(Jac,normal,fe.dN2dX,Xnod,dN2du,t1,t2);
+              fe.dN1dX.multiply(dN1du,Jac); // dN1dX = dN1du * J^-1
+            }
+            if (fe.detJxW == 0.0) continue; // skip singular points

-	    if (faceDir < 0) normal *= -1.0;
+            if (faceDir < 0) normal *= -1.0;

-	    // Cartesian coordinates of current integration point
-	    X = Xnod * (geoUsesBasis1 ? fe.N1 : fe.N2);
-	    X.t = time.t;
+            // Cartesian coordinates of current integration point
+            X = Xnod * (geoUsesBasis1 ? fe.N1 : fe.N2);
+            X.t = time.t;

-	    // Evaluate the integrand and accumulate element contributions
-	    fe.detJxW *= 0.25*dA*wg[i]*wg[j];
-	    if (!integrand.evalBouMx(A,fe,time,X,normal))
-	      return false;
-	  }
+            // Evaluate the integrand and accumulate element contributions
+            fe.detJxW *= 0.25*dA*wg[i]*wg[j];
+            if (!integrand.evalBouMx(*A,fe,time,X,normal)) {
+              ok = false;
+              break;
+            }
+          }

 	// Assembly of global system integral
-	if (!glInt.assemble(A,fe.iel))
-	  return false;
-        if (!dynamic_cast<ElmNorm*>(A))
-          delete A;
-      }
+	if (!glInt.assemble(A->ref(),fe.iel)) {
+          ok = false;
+          break;
+        }

-  return true;
+        A->destruct();
+      }
+    }
+  }
+
+  return ok;
 }


--- a/src/ASM/ASMs3DmxLag.C
+++ b/src/ASM/ASMs3DmxLag.C
@ -18,12 +18,12 @@
 #include "TimeDomain.h"
 #include "FiniteElement.h"
 #include "GlobalIntegral.h"
+#include "LocalIntegral.h"
 #include "IntegrandBase.h"
 #include "CoordinateMapping.h"
 #include "GaussQuadrature.h"
 #include "Utilities.h"
 #include "Vec3Oper.h"
-#include "ElmNorm.h"


 ASMs3DmxLag::ASMs3DmxLag (unsigned char n_f1, unsigned char n_f2)
@ -246,7 +246,6 @@ bool ASMs3DmxLag::integrate (Integrand& integrand,
  // Number of elements in each direction
  const int nelx = upar.size() - 1;
  const int nely = vpar.size() - 1;
-  const int nelz = wpar.size() - 1;

  // Order of basis in the two parametric directions (order = degree + 1)
  const int p1 = svol->order(0);
@ -257,77 +256,96 @@ bool ASMs3DmxLag::integrate (Integrand& integrand,
  const int q2 = p2 - 1;
  const int q3 = p3 - 1;

-  MxFiniteElement fe(p1*p2*p3,q1*q2*q3);
-  Matrix dN1du, dN2du, Xnod, Jac;
-  Vec4   X;
-
+  if (threadGroupsVol.empty())
+    generateThreadGroups();

  // === Assembly loop over all elements in the patch ==========================

-  int iel = 1;
-  for (int i3 = 0; i3 < nelz; i3++)
-    for (int i2 = 0; i2 < nely; i2++)
-      for (int i1 = 0; i1 < nelx; i1++, iel++)
-      {
-	fe.iel = MLGE[iel-1];
+  bool ok=true;
+  for (size_t g=0;g<threadGroupsVol.size() && ok;++g) {
+#pragma omp parallel for schedule(static)
+    for (size_t t=0;t<threadGroupsVol[g].size();++t) {
+      MxFiniteElement fe(p1*p2*p3,q1*q2*q3);
+      Matrix dN1du, dN2du, Xnod, Jac;
+      Vec4   X;
+      for (size_t l=0;l<threadGroupsVol[g][t].size();++l) {
+        int iel = threadGroupsVol[g][t][l];
+        int i1  =  iel % nelx;
+        int i2  = (iel / nelx) % nely;
+        int i3  =  iel / (nelx*nely);

        // Set up control point coordinates for current element
-        if (!this->getElementCoordinates(Xnod,iel)) return false;
+        if (!this->getElementCoordinates(Xnod,++iel)) {
+          ok = false;
+          break;
+        }

-	// Initialize element quantities
-	IntVec::const_iterator f2start = MNPC[iel-1].begin() + fe.N1.size();
-        LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,false);
-	if (!integrand.initElement(IntVec(MNPC[iel-1].begin(),f2start),
-				   IntVec(f2start,MNPC[iel-1].end()),nb1,*A))
-	  return false;
+        // Initialize element quantities
+        fe.iel = MLGE[iel-1];
+        IntVec::const_iterator f2start = MNPC[iel-1].begin() + fe.N1.size();
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N1.size(),fe.N2.size(),
+                                                      fe.iel,false);
+        if (!integrand.initElement(IntVec(MNPC[iel-1].begin(),f2start),
+                                   IntVec(f2start,MNPC[iel-1].end()),nb1,*A))
+        {
+          ok = false;
+          break;
+        }

-	// --- Integration loop over all Gauss points in each direction --------
+        // --- Integration loop over all Gauss points in each direction --------

-	for (int k = 0; k < nGauss; k++)
-	  for (int j = 0; j < nGauss; j++)
-	    for (int i = 0; i < nGauss; i++)
-	    {
-	      // Parameter value of current integration point
-	      fe.u = 0.5*(upar[i1]*(1.0-x[i]) + upar[i1+1]*(1.0+x[i]));
-	      fe.v = 0.5*(vpar[i2]*(1.0-x[j]) + vpar[i2+1]*(1.0+x[j]));
-	      fe.w = 0.5*(wpar[i3]*(1.0-x[k]) + wpar[i3+1]*(1.0+x[k]));
+        for (int k = 0; k < nGauss; k++)
+          for (int j = 0; j < nGauss; j++)
+            for (int i = 0; i < nGauss; i++)
+            {
+              // Parameter value of current integration point
+              fe.u = 0.5*(upar[i1]*(1.0-x[i]) + upar[i1+1]*(1.0+x[i]));
+              fe.v = 0.5*(vpar[i2]*(1.0-x[j]) + vpar[i2+1]*(1.0+x[j]));
+              fe.w = 0.5*(wpar[i3]*(1.0-x[k]) + wpar[i3+1]*(1.0+x[k]));

-	      // Local coordinate of current integration point
-	      fe.xi   = x[i];
-	      fe.eta  = x[j];
-	      fe.zeta = x[k];
+              // Local coordinate of current integration point
+              fe.xi   = x[i];
+              fe.eta  = x[j];
+              fe.zeta = x[k];

-	      // Compute basis function derivatives at current integration point
-	      // using tensor product of one-dimensional Lagrange polynomials
-	      if (!Lagrange::computeBasis(fe.N1,dN1du,p1,x[i],p2,x[j],p3,x[k]))
-		return false;
-	      if (!Lagrange::computeBasis(fe.N2,dN2du,q1,x[i],q2,x[j],q3,x[k]))
-		return false;
+              // Compute basis function derivatives at current integration point
+              // using tensor product of one-dimensional Lagrange polynomials
+              if (!Lagrange::computeBasis(fe.N1,dN1du,p1,x[i],p2,x[j],p3,x[k]) ||
+                  !Lagrange::computeBasis(fe.N2,dN2du,q1,x[i],q2,x[j],q3,x[k])) {
+                ok = false;
+                break;
+              }

-	      // Compute Jacobian inverse of coordinate mapping and derivatives
-	      fe.detJxW = utl::Jacobian(Jac,fe.dN1dX,Xnod,dN1du);
-	      if (fe.detJxW == 0.0) continue; // skip singular points
+              // Compute Jacobian inverse of coordinate mapping and derivatives
+              fe.detJxW = utl::Jacobian(Jac,fe.dN1dX,Xnod,dN1du);
+              if (fe.detJxW == 0.0) continue; // skip singular points

-	      fe.dN2dX.multiply(dN2du,Jac); // dN2dX = dN2du * J^-1
+              fe.dN2dX.multiply(dN2du,Jac); // dN2dX = dN2du * J^-1

-	      // Cartesian coordinates of current integration point
-	      X = Xnod * fe.N1;
-	      X.t = time.t;
+              // Cartesian coordinates of current integration point
+              X = Xnod * fe.N1;
+              X.t = time.t;

-	      // Evaluate the integrand and accumulate element contributions
-	      fe.detJxW *= w[i]*w[j]*w[k];
-	      if (!integrand.evalIntMx(*A,fe,time,X))
-		return false;
-	    }
+              // Evaluate the integrand and accumulate element contributions
+              fe.detJxW *= w[i]*w[j]*w[k];
+              if (!integrand.evalIntMx(*A,fe,time,X)) {
+                ok = false;
+                break;
+              }
+            }

-	// Assembly of global system integral
-	if (!glInt.assemble(A,fe.iel))
-	  return false;
-        if (!dynamic_cast<ElmNorm*>(A))
-          delete A;
+        // Assembly of global system integral
+        if (!glInt.assemble(A->ref(),fe.iel)) {
+          ok = false;
+          break;
+        }
+
+        A->destruct();
      }
+    }
+  }

-  return true;
+  return ok;
 }


@ -358,49 +376,38 @@ bool ASMs3DmxLag::integrate (Integrand& integrand, int lIndex,
  const int q2 = p2 - 1;
  const int q3 = p3 - 1;

-  // Number of elements in each direction
-  const int nelx = (nx2-1)/(q1-1);
-  const int nely = (ny2-1)/(q2-1);
-  const int nelz = (nz2-1)/(q3-1);
-
-  MxFiniteElement fe(p1*p2*p3,q1*q2*q3);
-  Matrix dN1du, dN2du, Xnod, Jac;
-  Vec4   X;
-  Vec3   normal;
-  double xi[3];
-
+  if (threadGroupsFace.empty())
+    generateThreadGroups();

  // === Assembly loop over all elements on the patch face =====================
-
-  int iel = 1;
-  for (int i3 = 0; i3 < nelz; i3++)
-    for (int i2 = 0; i2 < nely; i2++)
-      for (int i1 = 0; i1 < nelx; i1++, iel++)
-      {
-	fe.iel = MLGE[iel-1];
-
-	// Skip elements that are not on current boundary face
-	bool skipMe = false;
-	switch (faceDir)
-	  {
-	  case -1: if (i1 > 0)      skipMe = true; break;
-	  case  1: if (i1 < nelx-1) skipMe = true; break;
-	  case -2: if (i2 > 0)      skipMe = true; break;
-	  case  2: if (i2 < nely-1) skipMe = true; break;
-	  case -3: if (i3 > 0)      skipMe = true; break;
-	  case  3: if (i3 < nelz-1) skipMe = true; break;
-	  }
-	if (skipMe) continue;
+  bool ok=true;
+  for (size_t g=0;g<threadGroupsFace[lIndex-1].size() && ok;++g) {
+#pragma omp parallel for schedule(static)
+    for (size_t t=0;t<threadGroupsFace[lIndex-1][g].size();++t) {
+      MxFiniteElement fe(p1*p2*p3,q1*q2*q3);
+      Matrix dN1du, dN2du, Xnod, Jac;
+      Vec4   X;
+      Vec3   normal;
+      double xi[3];
+      for (size_t l=0;l<threadGroupsFace[lIndex-1][g][t].size();++l) {
+        int iel = threadGroupsFace[lIndex-1][g][t][l];

 	// Set up control point coordinates for current element
-	if (!this->getElementCoordinates(Xnod,iel)) return false;
+	if (!this->getElementCoordinates(Xnod,++iel)) {
+          ok = false;
+          break;
+        }

 	// Initialize element quantities
+	fe.iel = MLGE[iel-1];
 	IntVec::const_iterator f2start = MNPC[iel-1].begin() + fe.N1.size();
-        LocalIntegral* A = integrand.getLocalIntegral(MNPC[iel-1].size(),iel-1,true);
+        LocalIntegral* A = integrand.getLocalIntegral(fe.N1.size(),fe.N2.size(),
+                                                      fe.iel,true);
 	if (!integrand.initElementBou(IntVec(MNPC[iel-1].begin(),f2start),
-				      IntVec(f2start,MNPC[iel-1].end()),nb1,*A))
-	  return false;
+				      IntVec(f2start,MNPC[iel-1].end()),nb1,*A)) {
+          ok = false;
+          break;
+        }

 	// --- Integration loop over all Gauss points in each direction --------

@ -414,10 +421,14 @@ bool ASMs3DmxLag::integrate (Integrand& integrand, int lIndex,

 	    // Compute the basis functions and their derivatives, using
 	    // tensor product of one-dimensional Lagrange polynomials
-	    if (!Lagrange::computeBasis(fe.N1,dN1du,p1,xi[0],p2,xi[1],p3,xi[2]))
-	      return false;
-	    if (!Lagrange::computeBasis(fe.N2,dN2du,q1,xi[0],q2,xi[1],q3,xi[2]))
-	      return false;
+	    if (!Lagrange::computeBasis(fe.N1,dN1du,p1,xi[0],p2,xi[1],p3,xi[2])) {
+              ok = false;
+              break;
+            }
+	    if (!Lagrange::computeBasis(fe.N2,dN2du,q1,xi[0],q2,xi[1],q3,xi[2])) {
+              ok = false;
+              break;
+            }

 	    // Compute basis function derivatives and the edge normal
 	    fe.detJxW = utl::Jacobian(Jac,normal,fe.dN1dX,Xnod,dN1du,t1,t2);
@ -433,18 +444,24 @@ bool ASMs3DmxLag::integrate (Integrand& integrand, int lIndex,

 	    // Evaluate the integrand and accumulate element contributions
 	    fe.detJxW *= wg[i]*wg[j];
-	    if (!integrand.evalBouMx(A,fe,time,X,normal))
-	      return false;
+	    if (!integrand.evalBouMx(*A,fe,time,X,normal)) {
+              ok = false;
+              break;
+            }
 	  }

 	// Assembly of global system integral
-	if (!glInt.assemble(A,fe.iel))
-	  return false;
-        if (!dynamic_cast<ElmNorm*>(A))
-          delete A;
-      }
+	if (!glInt.assemble(A->ref(),fe.iel)) {
+          ok = false;
+          break;
+        }

-  return true;
+        A->destruct();
+      }
+    }
+  }
+
+  return ok;
 }


--- a/src/Utility/Utilities.C
+++ b/src/Utility/Utilities.C
@ -14,6 +14,10 @@
 #include "Utilities.h"
 #include <cstdlib>

+#ifdef USE_OPENMP
+#include <omp.h>
+#endif
+

 void utl::parseIntegers (std::vector<int>& values, const char* argv)
 {
@ -254,3 +258,145 @@ size_t utl::find_closest (const std::vector<real>& a, real v)
  else
    return i-1;
 }
+
+
+void utl::calcThreadGroups(int nel1, int nel2, utl::ThreadGroups& result)
+{
+  int threads=1;
+  int groups=1;
+#ifdef USE_OPENMP
+  threads = omp_get_max_threads();
+  if (threads > 1)
+    groups = 2;
+#endif
+
+  int stripsize = nel1/(groups*threads);
+  if (stripsize < 2) {
+    std::cerr << __FUNCTION__ << ": Warning: too many threads available." << std::endl
+              << "Reducing to a suitable amount" << std::endl;
+    while (((stripsize = nel1/(groups*threads)) < 2) && threads > 1)
+      threads--;
+    if (threads == 1)
+      groups=1;
+    stripsize = nel1/(groups*threads);
+  }
+  int remainder = nel1-(stripsize*groups*threads);
+  result.resize(groups);
+
+#if SP_DEBUG > 1
+  std::cout << "we have " << threads << " threads available" << std::endl;
+  std::cout << "nel1 " << nel1 << std::endl;
+  std::cout << "nel2 " << nel2 << std::endl;
+  std::cout << "stripsize " << stripsize << std::endl;
+  std::cout << "# of strips " << nel1/stripsize << std::endl;
+  std::cout << "remainder " << remainder << std::endl;
+#endif
+
+  if (groups == 1) {
+     result[0].resize(1);
+    for (int i=0;i<nel1*nel2;++i)
+      result[0][0].push_back(i);
+  } else {
+    for (size_t g=0;g<result.size();++g) { // loop over groups
+      result[g].resize(threads);
+      for (int t=0;t<threads;++t) { // loop over threads
+        size_t startel = g*stripsize+result.size()*t*stripsize;
+        int curstripsize = stripsize;
+        if (t == threads-1 && g == result.size()-1)
+          curstripsize += remainder;
+        for (int i2=0; i2 < nel2; ++i2) { // loop in y direction
+          for (int i1=0;i1<curstripsize; ++i1) {
+            int iEl = startel+i1+i2*nel1;
+            result[g][t].push_back(iEl);
+          }
+        }
+      }
+    }
+  }
+
+#if SP_DEBUG > 1
+  for (size_t i=0;i<result.size();++i) {
+    std::cout << "group " << i << std::endl;
+    for (size_t j=0;j<result[i].size();++j) {
+      std::cout << "\t thread " << j << ": ";
+      for (size_t k=0;k<result[i][j].size();++k)
+        std::cout << result[i][j][k] << " ";
+      std::cout << std::endl;
+    }
+  }
+#endif
+}
+
+
+void utl::calcThreadGroups(int nel1, int nel2, int nel3, ThreadGroups& result)
+{
+  int threads=1;
+  int groups=1;
+#ifdef USE_OPENMP
+  threads = omp_get_max_threads();
+  if (threads > 1)
+    groups = 2;
+#endif
+
+  int stripsize = nel1/(groups*threads);
+  if (stripsize < 2) {
+    std::cerr << __FUNCTION__ << ": Warning: too many threads available." << std::endl
+              << "Reducing to a suitable amount" << std::endl;
+    while ((stripsize = nel1/(groups*threads)) < 2 && threads > 1)
+      threads--;
+    if (threads == 1)
+      groups=1;
+    stripsize = nel1/(groups*threads);
+  }
+  int remainder = nel1-(stripsize*groups*threads);
+  result.resize(groups);
+
+#if SP_DEBUG > 1
+  std::cout << "we have " << threads << " threads available" << std::endl;
+  std::cout << "nel1 " << nel1 << std::endl;
+  std::cout << "nel2 " << nel2 << std::endl;
+  std::cout << "nel3 " << nel3 << std::endl;
+  std::cout << "stripsize " << stripsize << std::endl;
+  std::cout << "# of strips " << (stripsize?nel1/stripsize:0) << std::endl;
+  std::cout << "remainder " << remainder << std::endl;
+#endif
+
+  for (size_t g=0;g<result.size();++g) { // loop over groups
+    result[g].resize(threads);
+    for (int t=0;t<threads;++t) { // loop over threads
+      size_t startel = g*stripsize+result.size()*t*stripsize;
+      int curstripsize = stripsize;
+      if (t == threads-1 && g == result.size()-1)
+        curstripsize += remainder;
+      for (int i2=0; i2 < nel2; ++i2) { // loop in y direction
+        for (int i3=0; i3 < nel3; ++i3) {
+          for (int i1=0;i1<curstripsize; ++i1) {
+            int iEl = startel+i1+i3*nel1*nel2+i2*nel1;
+            result[g][t].push_back(iEl);
+          }
+        }
+      }
+    }
+  }
+
+#if SP_DEBUG > 1
+  for (size_t i=0;i<result.size();++i) {
+    std::cout << "group " << i << std::endl;
+    for (size_t j=0;j<result[i].size();++j) {
+      std::cout << "\t thread " << j << " (" << result[i][j].size() << "): ";
+      for (size_t k=0;k<result[i][j].size();++k)
+        std::cout << result[i][j][k] << " ";
+      std::cout << std::endl;
+    }
+  }
+#endif
+}
+
+
+void utl::mapThreadGroups(ThreadGroups& result, const std::vector<int>& map)
+{
+  for (size_t l=0;l<result.size();++l)
+    for (size_t k=0;k<result[l].size();++k)
+      for (size_t j=0;j<result[l][k].size();++j)
+        result[l][k][j] = map[result[l][k][j]];
+}
--- a/src/Utility/Utilities.h
+++ b/src/Utility/Utilities.h
@ -113,6 +113,26 @@ namespace utl
  //! \note It is assumed that the array \a a is sorted in encreasing order on
  //! input. If this is not the case the method will deliver incorrect result.
  size_t find_closest(const std::vector<real>& a, real v);
+
+  //! \brief Used to store thread-groups of finite elements
+  typedef std::vector<std::vector<std::vector<int> > > ThreadGroups;
+  
+  //! \brief Calculate a 2D thread group partitioning based on strips
+  //! \param[in] nel1 Number of elements in the first direction
+  //! \param[in] nel2 Number of elements in the second direction
+  //! \param[out] result The partitioning
+  void calcThreadGroups(int nel1, int nel2, ThreadGroups& result);
+
+  //! \brief Calculate a 3D thread group partitioning based on strips
+  //! \param[in] nel1 Number of elements in the first direction
+  //! \param[in] nel2 Number of elements in the second direction
+  //! \param[in] nel3 Number of elements in the third direction
+  //! \param[out] result The partitioning
+  void calcThreadGroups(int nel1, int nel2, int nel3, ThreadGroups& result);
+
+  //! \brief Map one partitioning through a map
+  //! \details The original entry n in the thread group is mapped to map[n]
+  void mapThreadGroups(ThreadGroups& result, const std::vector<int>& map);
 }

 #endif