Merge pull request #1 from atgeirr/performance-mods

Performance mods from atgeirr, great job.
2025-02-25 18:55:30 -06:00 · 2014-12-05 14:47:18 +01:00 · 2014-12-05 14:47:18 +01:00 · 1cd3dcadc6
commit 1cd3dcadc6
parent 6f55c862ce cda742ab0e
6 changed files with 226 additions and 351 deletions
--- a/opm/autodiff/AutoDiffBlock.hpp
+++ b/opm/autodiff/AutoDiffBlock.hpp
@ -22,9 +22,9 @@
 #include <opm/core/utility/platform_dependent/disable_warnings.h>
 #include <opm/autodiff/ConservativeSparseSparseProduct.h>
 #include <Eigen/Eigen>
 #include <Eigen/Sparse>
 #include <opm/autodiff/fastSparseProduct.hpp>
 #include <opm/core/utility/platform_dependent/reenable_warnings.h>
@ -441,7 +441,8 @@ namespace Opm
        std::vector<typename AutoDiffBlock<Scalar>::M> jac(num_blocks);
        assert(lhs.cols() == rhs.value().rows());
        for (int block = 0; block < num_blocks; ++block) {
-            jac[block] = lhs*rhs.derivative()[block];
+            // jac[block] = lhs*rhs.derivative()[block];
            fastSparseProduct(lhs, rhs.derivative()[block], jac[block]);
        }
        typename AutoDiffBlock<Scalar>::V val = lhs*rhs.value().matrix();
        return AutoDiffBlock<Scalar>::function(val, jac);
--- a/opm/autodiff/BlackoilPropsAdFromDeck.cpp
+++ b/opm/autodiff/BlackoilPropsAdFromDeck.cpp
@ -392,7 +392,7 @@ namespace Opm
        const int num_blocks = pw.numBlocks();
        std::vector<ADB::M> jacs(num_blocks);
        for (int block = 0; block < num_blocks; ++block) {
-            jacs[block] = dmudp_diag * pw.derivative()[block];
+            fastSparseProduct(dmudp_diag, pw.derivative()[block], jacs[block]);
        }
        return ADB::function(mu, jacs);
    }
@ -427,7 +427,10 @@ namespace Opm
        const int num_blocks = po.numBlocks();
        std::vector<ADB::M> jacs(num_blocks);
        for (int block = 0; block < num_blocks; ++block) {
-            jacs[block] = dmudp_diag * po.derivative()[block] + dmudr_diag * rs.derivative()[block];
+            fastSparseProduct(dmudp_diag, po.derivative()[block], jacs[block]);
            ADB::M temp;
            fastSparseProduct(dmudr_diag, rs.derivative()[block], temp);
            jacs[block] += temp;
        }
        return ADB::function(mu, jacs);
    }
@ -458,7 +461,7 @@ namespace Opm
        const int num_blocks = pg.numBlocks();
        std::vector<ADB::M> jacs(num_blocks);
        for (int block = 0; block < num_blocks; ++block) {
-            jacs[block] = dmudp_diag * pg.derivative()[block];
+            fastSparseProduct(dmudp_diag, pg.derivative()[block], jacs[block]);
        }
        return ADB::function(mu, jacs);
    }
@ -493,7 +496,10 @@ namespace Opm
        const int num_blocks = pg.numBlocks();
        std::vector<ADB::M> jacs(num_blocks);
        for (int block = 0; block < num_blocks; ++block) {
-            jacs[block] = dmudp_diag * pg.derivative()[block] + dmudr_diag * rv.derivative()[block];
+            fastSparseProduct(dmudp_diag, pg.derivative()[block], jacs[block]);
            ADB::M temp;
            fastSparseProduct(dmudr_diag, rv.derivative()[block], temp);
            jacs[block] += temp;
        }
        return ADB::function(mu, jacs);
    }
@ -653,7 +659,7 @@ namespace Opm
        const int num_blocks = pw.numBlocks();
        std::vector<ADB::M> jacs(num_blocks);
        for (int block = 0; block < num_blocks; ++block) {
-            jacs[block] = dbdp_diag * pw.derivative()[block];
+            fastSparseProduct(dbdp_diag, pw.derivative()[block], jacs[block]);
        }
        return ADB::function(b, jacs);
    }
@ -689,7 +695,10 @@ namespace Opm
        const int num_blocks = po.numBlocks();
        std::vector<ADB::M> jacs(num_blocks);
        for (int block = 0; block < num_blocks; ++block) {
-            jacs[block] = dbdp_diag * po.derivative()[block] + dbdr_diag * rs.derivative()[block];
+            fastSparseProduct(dbdp_diag, po.derivative()[block], jacs[block]);
            ADB::M temp;
            fastSparseProduct(dbdr_diag, rs.derivative()[block], temp);
            jacs[block] += temp;
        }
        return ADB::function(b, jacs);
    }
@ -721,7 +730,7 @@ namespace Opm
        const int num_blocks = pg.numBlocks();
        std::vector<ADB::M> jacs(num_blocks);
        for (int block = 0; block < num_blocks; ++block) {
-            jacs[block] = dbdp_diag * pg.derivative()[block];
+            fastSparseProduct(dbdp_diag, pg.derivative()[block], jacs[block]);
        }
        return ADB::function(b, jacs);
    }
@ -753,11 +762,14 @@ namespace Opm
                                               b.data(), dbdp.data(), dbdr.data());
        ADB::M dbdp_diag = spdiag(dbdp);
-        ADB::M dmudr_diag = spdiag(dbdr);
+        ADB::M dbdr_diag = spdiag(dbdr);
        const int num_blocks = pg.numBlocks();
        std::vector<ADB::M> jacs(num_blocks);
        for (int block = 0; block < num_blocks; ++block) {
-            jacs[block] = dbdp_diag * pg.derivative()[block] + dmudr_diag * rv.derivative()[block];;
+            fastSparseProduct(dbdp_diag, pg.derivative()[block], jacs[block]);
            ADB::M temp;
            fastSparseProduct(dbdr_diag, rv.derivative()[block], temp);
            jacs[block] += temp;
        }
        return ADB::function(b, jacs);
    }
@ -817,7 +829,7 @@ namespace Opm
        const int num_blocks = po.numBlocks();
        std::vector<ADB::M> jacs(num_blocks);
        for (int block = 0; block < num_blocks; ++block) {
-            jacs[block] = drbubdp_diag * po.derivative()[block];
+            fastSparseProduct(drbubdp_diag, po.derivative()[block], jacs[block]);
        }
        return ADB::function(rbub, jacs);
    }
@ -889,7 +901,7 @@ namespace Opm
        const int num_blocks = po.numBlocks();
        std::vector<ADB::M> jacs(num_blocks);
        for (int block = 0; block < num_blocks; ++block) {
-            jacs[block] = drvdp_diag * po.derivative()[block];
+            fastSparseProduct(drvdp_diag, po.derivative()[block], jacs[block]);
        }
        return ADB::function(rv, jacs);
    }
@ -1004,7 +1016,9 @@ namespace Opm
                    const int column = phase1_pos + np*phase2_pos; // Recall: Fortran ordering from props_.relperm()
                    ADB::M dkr1_ds2_diag = spdiag(dkr.col(column));
                    for (int block = 0; block < num_blocks; ++block) {
-                        jacs[block] += dkr1_ds2_diag * s[phase2]->derivative()[block];
+                        ADB::M temp;
                        fastSparseProduct(dkr1_ds2_diag, s[phase2]->derivative()[block], temp);
                        jacs[block] += temp;
                    }
                }
                relperms.emplace_back(ADB::function(kr.col(phase1_pos), jacs));
@ -1062,7 +1076,9 @@ namespace Opm
                    const int column = phase1_pos + numActivePhases*phase2_pos; // Recall: Fortran ordering from props_.relperm()
                    ADB::M dpc1_ds2_diag = spdiag(dpc.col(column));
                    for (int block = 0; block < numBlocks; ++block) {
-                        jacs[block] += dpc1_ds2_diag * s[phase2]->derivative()[block];
+                        ADB::M temp;
                        fastSparseProduct(dpc1_ds2_diag, s[phase2]->derivative()[block], temp);
                        jacs[block] += temp;
                    }
                }
                adbCapPressures.emplace_back(ADB::function(pc.col(phase1_pos), jacs));
--- a/opm/autodiff/ConservativeSparseSparseProduct.h
+++ b/opm/autodiff/ConservativeSparseSparseProduct.h
@ -1,332 +0,0 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #ifndef EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
 #define EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
 #warning "Using overloaded Eigen::ConservativeSparseSparseProduct.h"
 #include <algorithm>
 #include <iterator>
 #include <functional>
 #include <limits>
 #include <vector>
 #include <Eigen/Core>
 namespace Eigen {
 // forward declaration of SparseMatrix
 template<typename _Scalar, int _Options, typename _Index>
 class SparseMatrix;
 namespace internal {
 /// Depth-limited quicksort: partitions the range around its first element at
 /// most `depth` times and hands every remaining sub-range to std::sort
 /// (through the QuickSort<0> specialization below).
 template < unsigned int depth >
 struct QuickSort
 {
  /// Sort [begin, end) into ascending order as defined by std::less.
  /// T must be an iterator type accepted by std::partition and std::sort,
  /// and its value type must be copyable.
  template <typename T>
  static inline void sort(T begin, T end)
  {
    if (begin != end)
    {
      typedef typename std::iterator_traits<T>::value_type value_type;
      // Copy the pivot up front: std::partition moves elements around, so
      // *begin is not guaranteed to keep its value.
      const value_type pivot = *begin;
      const std::less<value_type> less;
      // A lambda is used instead of std::bind2nd, which was removed in C++17.
      T middle = std::partition(begin, end,
                                [&pivot, &less](const value_type& x) { return less(x, pivot); });
      // [begin, middle) now holds everything below the pivot and
      // [middle, end) everything else, so sorting the two parts
      // independently sorts the whole range. This does not rely on where
      // std::partition happens to leave the pivot element.
      QuickSort< depth-1 >::sort(begin, middle);
      QuickSort< depth-1 >::sort(middle, end);
    }
  }
 };
 /// Recursion end point: no further partitioning, just std::sort.
 template <>
 struct QuickSort< 0 >
 {
  /// Sort [begin, end) into ascending order with std::sort.
  template <typename T>
  static inline void sort(T begin, T end)
  {
    std::sort( begin, end );
  }
 };
 template<typename Lhs, typename Rhs, typename ResultType>
 static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res)
 {
  // if one of the matrices does not contain non zero elements
  // the result will only contain an empty matrix
  if( lhs.nonZeros() == 0 || rhs.nonZeros() == 0 )
    return ;
  typedef typename remove_all<Lhs>::type::Scalar Scalar;
  typedef typename remove_all<Lhs>::type::Index Index;
  // make sure to call innerSize/outerSize since we fake the storage order.
  Index rows = lhs.innerSize();
  Index cols = rhs.outerSize();
  eigen_assert(lhs.outerSize() == rhs.innerSize());
  std::vector<bool> mask(rows,false);
  Matrix<Scalar,Dynamic,1> values(rows);
  Matrix<Index,Dynamic,1>  indices(rows);
  // estimate the number of non zero entries
  // given a rhs column containing Y non zeros, we assume that the respective Y columns
  // of the lhs differs in average of one non zeros, thus the number of non zeros for
  // the product of a rhs column with the lhs is X+Y where X is the average number of non zero
  // per column of the lhs.
  // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
  Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros();
  res.setZero();
  res.reserve(Index(estimated_nnz_prod));
  //const Scalar epsilon = std::numeric_limits< Scalar >::epsilon();
  const Scalar epsilon = 1e-15 ;
  // we compute each column of the result, one after the other
  for (Index j=0; j<cols; ++j)
  {
    Index nnz = 0;
    for (typename Rhs::InnerIterator rhsIt(rhs, j); rhsIt; ++rhsIt)
    {
      const Scalar y = rhsIt.value();
      for (typename Lhs::InnerIterator lhsIt(lhs, rhsIt.index()); lhsIt; ++lhsIt)
      {
        const Scalar val = lhsIt.value() * y;
        if( std::abs( val ) > epsilon )
        {
          const Index i = lhsIt.index();
          if(!mask[i])
          {
            mask[i] = true;
            values[i] = val;
            indices[nnz] = i;
            ++nnz;
          }
          else
            values[i] += val;
        }
      }
    }
    if( nnz > 1 )
    {
      // sort indices for sorted insertion to avoid later copying
      QuickSort< 1 >::sort( indices.data(), indices.data()+nnz );
    }
    res.startVec(j);
    // ordered insertion
    // still using insertBackByOuterInnerUnordered since we know what we are doing
    for(Index k=0; k<nnz; ++k)
    {
      const Index i = indices[k];
      res.insertBackByOuterInnerUnordered(j,i) = values[i];
      mask[i] = false;
    }
 #if 0
    // alternative ordered insertion code:
    Index t200 = rows/(log2(200)*1.39);
    Index t = (rows*100)/139;
    // FIXME reserve nnz non zeros
    // FIXME implement fast sort algorithms for very small nnz
    // if the result is sparse enough => use a quick sort
    // otherwise => loop through the entire vector
    // In order to avoid to perform an expensive log2 when the
    // result is clearly very sparse we use a linear bound up to 200.
    //if((nnz<200 && nnz<t200) || nnz * log2(nnz) < t)
    //res.startVec(j);
    if(true)
    {
      if(nnz>1) std::sort(indices.data(),indices.data()+nnz);
      for(Index k=0; k<nnz; ++k)
      {
        Index i = indices[k];
        res.insertBackByOuterInner(j,i) = values[i];
        mask[i] = false;
      }
    }
    else
    {
      // dense path
      for(Index i=0; i<rows; ++i)
      {
        if(mask[i])
        {
          mask[i] = false;
          res.insertBackByOuterInner(j,i) = values[i];
        }
      }
    }
 #endif
  }
  res.finalize();
 }
 } // end namespace internal
 namespace internal {
 // Dispatcher for the sparse * sparse product, specialized below on the
 // storage order of the left operand, the right operand and the result.
 // Each storage-order parameter defaults to RowMajor when RowMajorBit is set
 // in the corresponding type's traits<>::Flags, and to ColMajor otherwise.
 template<typename Lhs, typename Rhs, typename ResultType,
  int LhsStorageOrder = (traits<Lhs>::Flags&RowMajorBit) ? RowMajor : ColMajor,
  int RhsStorageOrder = (traits<Rhs>::Flags&RowMajorBit) ? RowMajor : ColMajor,
  int ResStorageOrder = (traits<ResultType>::Flags&RowMajorBit) ? RowMajor : ColMajor>
 struct conservative_sparse_sparse_product_selector;
 template<typename Lhs, typename Rhs, typename ResultType>
 struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,ColMajor>
 {
  typedef typename remove_all<Lhs>::type LhsCleaned;
  typedef typename LhsCleaned::Scalar Scalar;
  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
  {
    //typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
    //ColMajorMatrix resCol(lhs.rows(),rhs.cols());
    res = ColMajorMatrix(lhs.rows(),rhs.cols());
    internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, res);
    //internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol);
    // sort the non zeros:
    //RowMajorMatrix resRow(resCol);
    //res = resRow;
  }
 };
 template<typename Lhs, typename Rhs, typename ResultType>
 struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor,ColMajor>
 {
  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
  {
     typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
     //RowMajorMatrix rhsRow = rhs;
     //RowMajorMatrix resRow(lhs.rows(), rhs.cols());
     ColMajorMatrix lhsCol = lhs;
     res = ResultType( lhs.rows(), rhs.cols() );
     internal::conservative_sparse_sparse_product_impl<ColMajorMatrix, Rhs, ResultType>( lhsCol, rhs, res );
     //internal::conservative_sparse_sparse_product_impl<RowMajorMatrix,Lhs,RowMajorMatrix>(rhsRow, lhs, resRow);
     //res = resRow;
  }
 };
 template<typename Lhs, typename Rhs, typename ResultType>
 struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,RowMajor,ColMajor>
 {
  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
  {
    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
    ColMajorMatrix rhsCol = rhs;
    res = ResultType( lhs.rows(), rhs.cols() );
    internal::conservative_sparse_sparse_product_impl<Lhs, ColMajorMatrix, ResultType>( lhs, rhsCol, res);
    /*
    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
    RowMajorMatrix lhsRow = lhs;
    RowMajorMatrix resRow(lhs.rows(), rhs.cols());
    internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorMatrix,RowMajorMatrix>(rhs, lhsRow, resRow);
    res = resRow;
    */
  }
 };
 template<typename Lhs, typename Rhs, typename ResultType>
 struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,RowMajor,ColMajor>
 {
  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
  {
    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
    RowMajorMatrix resRow(lhs.rows(), rhs.cols());
    internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorMatrix>(rhs, lhs, resRow);
    res = resRow;
  }
 };
 template<typename Lhs, typename Rhs, typename ResultType>
 struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,RowMajor>
 {
  typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
  {
    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
    ColMajorMatrix resCol(lhs.rows(), rhs.cols());
    internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol);
    res = resCol;
  }
 };
 /// Selector for a row-major lhs, column-major rhs and row-major result:
 /// rhs is copied into row-major storage, then the kernel writes into
 /// \a res.
 template<typename Lhs, typename Rhs, typename ResultType>
 struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor,RowMajor>
 {
  /// Overwrite \a res with lhs * rhs.
  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
  {
    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
    // Row-major copy of the only operand whose layout differs.
    RowMajorMatrix rhsRow = rhs;
    res = ResultType( lhs.rows(), rhs.cols() );
    // The operands are passed in swapped order: the kernel works on
    // inner/outer dimensions, so row-major matrices look like their
    // transposes to it, and (lhs*rhs)^T = rhs^T * lhs^T.
    // The explicit template arguments must follow the argument order
    // (rhsRow is a RowMajorMatrix, lhs is an Lhs); listing them the other
    // way round only compiles when Lhs happens to be RowMajorMatrix itself.
    internal::conservative_sparse_sparse_product_impl<RowMajorMatrix, Lhs, ResultType>(rhsRow, lhs, res);
  }
 };
 template<typename Lhs, typename Rhs, typename ResultType>
 struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,RowMajor,RowMajor>
 {
  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
  {
    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
    RowMajorMatrix lhsRow = lhs;
    res = RowMajorMatrix( lhs.rows(), rhs.cols() );
    internal::conservative_sparse_sparse_product_impl<Rhs, RowMajorMatrix, ResultType>(rhs, lhsRow, res);
    /*
    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
    ColMajorMatrix rhsCol = rhs;
    ColMajorMatrix resCol(lhs.rows(), rhs.cols());
    internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorMatrix,ColMajorMatrix>(lhs, rhsCol, resCol);
    res = resCol;
    */
  }
 };
 template<typename Lhs, typename Rhs, typename ResultType>
 struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,RowMajor,RowMajor>
 {
  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
  {
    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
    //typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
    res = RowMajorMatrix( lhs.rows(),rhs.cols() );
    //RowMajorMatrix resRow(lhs.rows(),rhs.cols());
    internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorMatrix>(rhs, lhs, res);
    // sort the non zeros:
    //ColMajorMatrix resCol(resRow);
    //res = resCol;
  }
 };
 } // end namespace internal
 } // end namespace Eigen
 #endif // EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
--- a/opm/autodiff/FullyImplicitBlackoilSolver_impl.hpp
+++ b/opm/autodiff/FullyImplicitBlackoilSolver_impl.hpp
@ -2059,7 +2059,7 @@ namespace {
            const int num_blocks = p.numBlocks();
            std::vector<ADB::M> jacs(num_blocks);
            for (int block = 0; block < num_blocks; ++block) {
-                jacs[block] = dpm_diag * p.derivative()[block];
+                fastSparseProduct(dpm_diag, p.derivative()[block], jacs[block]);
            }
            return ADB::function(pm, jacs);
        } else {
@ -2087,7 +2087,7 @@ namespace {
            const int num_blocks = p.numBlocks();
            std::vector<ADB::M> jacs(num_blocks);
            for (int block = 0; block < num_blocks; ++block) {
-                jacs[block] = dtm_diag * p.derivative()[block];
+                fastSparseProduct(dtm_diag, p.derivative()[block], jacs[block]);
            }
            return ADB::function(tm, jacs);
        } else {
--- a/opm/autodiff/NewtonIterationBlackoilCPR.cpp
+++ b/opm/autodiff/NewtonIterationBlackoilCPR.cpp
@ -279,7 +279,9 @@ namespace Opm
                    continue;
                }
                // solve Du = C
-                const M u = Di * Jn[var]; // solver.solve(Jn[var]);
+                // const M u = Di * Jn[var]; // solver.solve(Jn[var]);
                M u;
                fastSparseProduct(Di, Jn[var], u); // solver.solve(Jn[var]);
                for (int eq = 0; eq < num_eq; ++eq) {
                    if (eq == n) {
                        continue;
@ -292,7 +294,9 @@ namespace Opm
                    jacs[eq].push_back(Je[var]);
                    M& J = jacs[eq].back();
                    // Subtract Bu (B*inv(D)*C)
-                    J -= B * u;
+                    M Bu;
                    fastSparseProduct(B, u, Bu);
                    J -= Bu;
                }
            }
@ -397,6 +401,7 @@ namespace Opm
        void formEllipticSystem(const int num_phases,
                                const std::vector<ADB>& eqs_in,
                                Eigen::SparseMatrix<double, Eigen::RowMajor>& A,
                                // M& A,
                                V& b)
        {
            if (num_phases != 3) {
--- a/opm/autodiff/fastSparseProduct.hpp
+++ b/opm/autodiff/fastSparseProduct.hpp
@ -0,0 +1,185 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 // This file has been modified for use in the OPM project codebase.
 #ifndef OPM_FASTSPARSEPRODUCT_HEADER_INCLUDED
 #define OPM_FASTSPARSEPRODUCT_HEADER_INCLUDED
 #include <Eigen/Sparse>
 #include <algorithm>
 #include <iterator>
 #include <functional>
 #include <limits>
 #include <vector>
 #include <Eigen/Core>
 namespace Opm {
 /// Depth-limited quicksort: partitions the range around its first element at
 /// most `depth` times and hands every remaining sub-range to std::sort
 /// (through the QuickSort<0> specialization below).
 template < unsigned int depth >
 struct QuickSort
 {
  /// Sort [begin, end) into ascending order as defined by std::less.
  /// T must be an iterator type accepted by std::partition and std::sort,
  /// and its value type must be copyable.
  template <typename T>
  static inline void sort(T begin, T end)
  {
    if (begin != end)
    {
      typedef typename std::iterator_traits<T>::value_type value_type;
      // Copy the pivot up front: std::partition moves elements around, so
      // *begin is not guaranteed to keep its value.
      const value_type pivot = *begin;
      const std::less<value_type> less;
      // A lambda is used instead of std::bind2nd, which was removed in C++17.
      T middle = std::partition(begin, end,
                                [&pivot, &less](const value_type& x) { return less(x, pivot); });
      // [begin, middle) now holds everything below the pivot and
      // [middle, end) everything else, so sorting the two parts
      // independently sorts the whole range. This does not rely on where
      // std::partition happens to leave the pivot element.
      QuickSort< depth-1 >::sort(begin, middle);
      QuickSort< depth-1 >::sort(middle, end);
    }
  }
 };
 /// Recursion end point: no further partitioning, just std::sort.
 template <>
 struct QuickSort< 0 >
 {
  /// Sort [begin, end) into ascending order with std::sort.
  template <typename T>
  static inline void sort(T begin, T end)
  {
    std::sort( begin, end );
  }
 };
 template<typename Lhs, typename Rhs, typename ResultType>
 void fastSparseProduct(const Lhs& lhs, const Rhs& rhs, ResultType& res)
 {
    using namespace Eigen;
    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
    res = ColMajorMatrix(lhs.rows(), rhs.cols());
  // if one of the matrices does not contain non zero elements
  // the result will only contain an empty matrix
  if( lhs.nonZeros() == 0 || rhs.nonZeros() == 0 )
    return;
  typedef typename Eigen::internal::remove_all<Lhs>::type::Scalar Scalar;
  typedef typename Eigen::internal::remove_all<Lhs>::type::Index Index;
  // make sure to call innerSize/outerSize since we fake the storage order.
  Index rows = lhs.innerSize();
  Index cols = rhs.outerSize();
  eigen_assert(lhs.outerSize() == rhs.innerSize());
  std::vector<bool> mask(rows,false);
  Matrix<Scalar,Dynamic,1> values(rows);
  Matrix<Index,Dynamic,1>  indices(rows);
  // estimate the number of non zero entries
  // given a rhs column containing Y non zeros, we assume that the respective Y columns
  // of the lhs differs in average of one non zeros, thus the number of non zeros for
  // the product of a rhs column with the lhs is X+Y where X is the average number of non zero
  // per column of the lhs.
  // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
  Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros();
  res.setZero();
  res.reserve(Index(estimated_nnz_prod));
  //const Scalar epsilon = std::numeric_limits< Scalar >::epsilon();
  const Scalar epsilon = 0.0;
  // we compute each column of the result, one after the other
  for (Index j=0; j<cols; ++j)
  {
    Index nnz = 0;
    for (typename Rhs::InnerIterator rhsIt(rhs, j); rhsIt; ++rhsIt)
    {
      const Scalar y = rhsIt.value();
      for (typename Lhs::InnerIterator lhsIt(lhs, rhsIt.index()); lhsIt; ++lhsIt)
      {
        const Scalar val = lhsIt.value() * y;
        if( std::abs( val ) > epsilon )
        {
          const Index i = lhsIt.index();
          if(!mask[i])
          {
            mask[i] = true;
            values[i] = val;
            indices[nnz] = i;
            ++nnz;
          }
          else
            values[i] += val;
        }
      }
    }
    if( nnz > 1 )
    {
      // sort indices for sorted insertion to avoid later copying
        // QuickSort< 1 >::sort( indices.data(), indices.data()+nnz );
      std::sort( indices.data(), indices.data()+nnz );
    }
    res.startVec(j);
    // ordered insertion
    // still using insertBackByOuterInnerUnordered since we know what we are doing
    for(Index k=0; k<nnz; ++k)
    {
      const Index i = indices[k];
      res.insertBackByOuterInnerUnordered(j,i) = values[i];
      mask[i] = false;
    }
 #if 0
    // alternative ordered insertion code:
    Index t200 = rows/(log2(200)*1.39);
    Index t = (rows*100)/139;
    // FIXME reserve nnz non zeros
    // FIXME implement fast sort algorithms for very small nnz
    // if the result is sparse enough => use a quick sort
    // otherwise => loop through the entire vector
    // In order to avoid to perform an expensive log2 when the
    // result is clearly very sparse we use a linear bound up to 200.
    //if((nnz<200 && nnz<t200) || nnz * log2(nnz) < t)
    //res.startVec(j);
    if(true)
    {
      if(nnz>1) std::sort(indices.data(),indices.data()+nnz);
      for(Index k=0; k<nnz; ++k)
      {
        Index i = indices[k];
        res.insertBackByOuterInner(j,i) = values[i];
        mask[i] = false;
      }
    }
    else
    {
      // dense path
      for(Index i=0; i<rows; ++i)
      {
        if(mask[i])
        {
          mask[i] = false;
          res.insertBackByOuterInner(j,i) = values[i];
        }
      }
    }
 #endif
  }
  res.finalize();
 }
 } // end namespace Opm
 #endif // OPM_FASTSPARSEPRODUCT_HEADER_INCLUDED