Several OpenMP improvements

Several optimizations based on profiling of a 20M grid model. These changes address the largest performance issues, but more operations remain that could be refactored.

* OpenMP: Use in fault geometry generator
* OpenMP: Use when computing statistics for result values
* OpenMP: Use multithreading for fault detection
* Add RiaOpenMPTools (see the sketch below)
* VizFwk: Use OpenMP for texture generation
Commit a423ecf95f by Magne Sjaastad, committed via GitHub on 2022-12-19 13:49:03 +01:00
Parent commit: 254c74be13
12 changed files with 244 additions and 110 deletions
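The hunks below call two small helpers from the new RiaOpenMPTools class, RiaOpenMPTools::availableThreadCount() and RiaOpenMPTools::currentThreadIndex(), whose implementation is not shown in this view. The following is only a minimal sketch of such a wrapper, reusing the USE_OPENMP guard seen elsewhere in this commit; the fallback values and the exact implementation are assumptions, not the committed code.

// Sketch only: a thin wrapper so call sites need no #ifdef USE_OPENMP blocks.
// The real RiaOpenMPTools added by this commit may differ in detail.
#pragma once

#ifdef USE_OPENMP
#include <omp.h>
#endif

class RiaOpenMPTools
{
public:
    // Number of threads a parallel region can use (1 when built without OpenMP)
    static int availableThreadCount()
    {
#ifdef USE_OPENMP
        return omp_get_max_threads();
#else
        return 1;
#endif
    }

    // Zero-based index of the calling thread inside a parallel region
    // (always 0 when built without OpenMP)
    static int currentThreadIndex()
    {
#ifdef USE_OPENMP
        return omp_get_thread_num();
#else
        return 0;
#endif
    }
};

Both helpers are also safe to call outside a parallel region, which keeps the call sites in the diffs below uniform.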


@@ -41,7 +41,14 @@ RigEclipseNativeStatCalc::RigEclipseNativeStatCalc( RigCaseCellResultsData*
void RigEclipseNativeStatCalc::minMaxCellScalarValues( size_t timeStepIndex, double& min, double& max )
{
MinMaxAccumulator acc( min, max );
traverseCells( acc, timeStepIndex );
std::vector<MinMaxAccumulator> threadAccumulators;
threadTraverseCells( threadAccumulators, timeStepIndex );
for ( const auto& threadAcc : threadAccumulators )
{
acc.addValue( threadAcc.min );
acc.addValue( threadAcc.max );
}
min = acc.min;
max = acc.max;
}
@@ -63,7 +70,13 @@ void RigEclipseNativeStatCalc::p10p90CellScalarValues( double& p10, double& p90
for ( size_t timeStepIndex = 0; timeStepIndex < timeStepCount(); timeStepIndex++ )
{
traverseCells( acc, timeStepIndex );
std::vector<PercentilAccumulator> threadAccumulators;
threadTraverseCells( threadAccumulators, timeStepIndex );
for ( const auto& threadAcc : threadAccumulators )
{
acc.addData( threadAcc.values );
}
}
acc.computep10p90( p10, p90 );
@@ -75,7 +88,15 @@ void RigEclipseNativeStatCalc::p10p90CellScalarValues( double& p10, double& p90
void RigEclipseNativeStatCalc::p10p90CellScalarValues( size_t timeStepIndex, double& p10, double& p90 )
{
PercentilAccumulator acc;
traverseCells( acc, timeStepIndex );
std::vector<PercentilAccumulator> threadAccumulators;
threadTraverseCells( threadAccumulators, timeStepIndex );
for ( const auto& threadAcc : threadAccumulators )
{
acc.addData( threadAcc.values );
}
acc.computep10p90( p10, p90 );
}
@@ -85,7 +106,15 @@ void RigEclipseNativeStatCalc::p10p90CellScalarValues( size_t timeStepIndex, dou
void RigEclipseNativeStatCalc::posNegClosestToZero( size_t timeStepIndex, double& pos, double& neg )
{
PosNegAccumulator acc( pos, neg );
traverseCells( acc, timeStepIndex );
std::vector<PosNegAccumulator> threadAccumulators;
threadTraverseCells( threadAccumulators, timeStepIndex );
for ( const auto& threadAcc : threadAccumulators )
{
acc.addValue( threadAcc.pos );
acc.addValue( threadAcc.neg );
}
pos = acc.pos;
neg = acc.neg;
}
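The four statistics functions above all follow the same shape: threadTraverseCells fills one accumulator per thread, and the small per-thread results are merged serially afterwards. The sketch below restates that pattern as a self-contained function with a simplified stand-in accumulator; SimpleMinMaxAccumulator and computeMinMax are made-up names for illustration and are not part of the commit, which uses the ResInsight classes MinMaxAccumulator, PercentilAccumulator and PosNegAccumulator.

// Illustration of the per-thread accumulation + serial merge pattern.
#include <algorithm>
#include <cfloat>
#include <vector>

#ifdef USE_OPENMP
#include <omp.h>
#endif

struct SimpleMinMaxAccumulator
{
    double min = DBL_MAX;
    double max = -DBL_MAX;

    void addValue( double v )
    {
        min = std::min( min, v );
        max = std::max( max, v );
    }
};

void computeMinMax( const std::vector<double>& values, double& min, double& max )
{
    int numberOfThreads = 1;
#ifdef USE_OPENMP
    numberOfThreads = omp_get_max_threads();
#endif

    // One private accumulator per thread, so the hot loop needs no locking
    std::vector<SimpleMinMaxAccumulator> threadAccumulators( numberOfThreads );

#pragma omp parallel
    {
        int myThread = 0;
#ifdef USE_OPENMP
        myThread = omp_get_thread_num();
#endif

        // Plain "omp for": iterations are split among the threads of the
        // enclosing parallel region
#pragma omp for
        for ( long long i = 0; i < (long long)values.size(); i++ )
        {
            threadAccumulators[myThread].addValue( values[i] );
        }
    }

    // Merging the handful of per-thread results serially is cheap
    SimpleMinMaxAccumulator acc;
    for ( const auto& threadAcc : threadAccumulators )
    {
        // Skip accumulators of threads that received no values
        if ( threadAcc.min > threadAcc.max ) continue;
        acc.addValue( threadAcc.min );
        acc.addValue( threadAcc.max );
    }
    min = acc.min;
    max = acc.max;
}

The same merge step is what the loops over threadAccumulators do in the functions above; only the accumulator type and the merged fields differ.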


@@ -19,10 +19,11 @@
#pragma once
#include "RigStatisticsCalculator.h"
#include "RiaOpenMPTools.h"
#include "RigActiveCellInfo.h"
#include "RigCaseCellResultsData.h"
#include "RigStatisticsCalculator.h"
class RigHistogramCalculator;
@@ -50,6 +51,7 @@ private:
RigCaseCellResultsData* m_resultsData;
RigEclipseResultAddress m_eclipseResultAddress;
// Add all result values to an accumulator for a given time step
template <typename StatisticsAccumulator>
void traverseCells( StatisticsAccumulator& accumulator, size_t timeStepIndex )
{
@@ -59,31 +61,34 @@ private:
}
const std::vector<double>& values = m_resultsData->cellScalarResults( m_eclipseResultAddress, timeStepIndex );
if ( values.empty() )
for ( const auto& val : values )
{
accumulator.addValue( val );
}
}
// Create one accumulator for each available thread, and add values to accumulator for a given time step
template <typename StatisticsAccumulator>
void threadTraverseCells( std::vector<StatisticsAccumulator>& accumulators, size_t timeStepIndex )
{
if ( timeStepIndex >= m_resultsData->cellScalarResults( m_eclipseResultAddress ).size() )
{
// Can happen if values do not exist for the current time step index.
return;
}
const RigActiveCellInfo* actCellInfo = m_resultsData->activeCellInfo();
size_t cellCount = actCellInfo->reservoirCellCount();
const std::vector<double>& values = m_resultsData->cellScalarResults( m_eclipseResultAddress, timeStepIndex );
bool isUsingGlobalActiveIndex = m_resultsData->isUsingGlobalActiveIndex( m_eclipseResultAddress );
for ( size_t cIdx = 0; cIdx < cellCount; ++cIdx )
int numberOfThreads = RiaOpenMPTools::availableThreadCount();
accumulators.resize( numberOfThreads );
#pragma omp parallel
{
// Filter out inactive cells
if ( !actCellInfo->isActive( cIdx ) ) continue;
int myThread = RiaOpenMPTools::currentThreadIndex();
size_t cellResultIndex = cIdx;
if ( isUsingGlobalActiveIndex )
#pragma omp for
for ( long long i = 0; i < (long long)values.size(); i++ )
{
cellResultIndex = actCellInfo->cellResultIndex( cIdx );
}
if ( cellResultIndex != cvf::UNDEFINED_SIZE_T && cellResultIndex < values.size() )
{
accumulator.addValue( values[cellResultIndex] );
accumulators[myThread].addValue( values[i] );
}
}
}
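Two details of threadTraverseCells above are worth spelling out: the loop index is cast to a signed long long because OpenMP worksharing loops at the OpenMP 2.0 level supported by MSVC require a signed loop variable, and the loop uses a plain "#pragma omp for" because the code is already inside a "#pragma omp parallel" region. The stand-alone program below illustrates only the second point; it is not code from the commit, and the names n and iterationsExecuted are made up.

// Why "omp for" and not "parallel for" inside an open parallel region.
#include <atomic>
#include <cstdio>

int main()
{
    const long long        n = 1000;
    std::atomic<long long> iterationsExecuted{ 0 };

#pragma omp parallel
    {
        // Correct: this worksharing loop divides the n iterations among the
        // threads of the enclosing parallel region, so the body runs n times
        // in total across all threads.
#pragma omp for
        for ( long long i = 0; i < n; i++ )
        {
            iterationsExecuted++;
        }

        // Using "#pragma omp parallel for" here instead would open a nested
        // parallel region. With nested parallelism disabled (the default),
        // each outer thread would then run the complete loop by itself, and
        // the body would execute n * numberOfThreads times.
    }

    std::printf( "iterations executed: %lld (expected %lld)\n",
                 static_cast<long long>( iterationsExecuted ),
                 n );
    return 0;
}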


@@ -21,6 +21,7 @@
#include "RigMainGrid.h"
#include "RiaLogging.h"
#include "RiaOpenMPTools.h"
#include "RiaResultNames.h"
#include "RigActiveCellInfo.h"
@@ -30,10 +31,6 @@
#include "cvfAssert.h"
#include "cvfBoundingBoxTree.h"
#ifdef USE_OPENMP
#include <omp.h>
#endif
RigMainGrid::RigMainGrid()
: RigGridBase( this )
{
@@ -453,18 +450,39 @@ void RigMainGrid::calculateFaults( const RigActiveCellInfo* activeCellInfo )
const std::vector<cvf::Vec3d>& vxs = m_mainGrid->nodes();
int numberOfThreads = RiaOpenMPTools::availableThreadCount();
std::vector<std::vector<RigFault::FaultFace>> threadFaultFaces( numberOfThreads );
std::vector<std::vector<RigFault::FaultFace>> threadInactiveFaultFaces( numberOfThreads );
#pragma omp parallel
{
int myThread = RiaOpenMPTools::currentThreadIndex();
// NB! We are inside a parallel section, do not use "parallel for" here
#pragma omp for
for ( int gcIdx = 0; gcIdx < static_cast<int>( m_cells.size() ); ++gcIdx )
{
addUnNamedFaultFaces( gcIdx,
activeCellInfo,
vxs,
unNamedFaultIdx,
unNamedFaultWithInactiveIdx,
threadFaultFaces[myThread],
threadInactiveFaultFaces[myThread],
m_faultsPrCellAcc.p() );
}
}
std::vector<RigFault::FaultFace>& unNamedFaultFaces = unNamedFault->faultFaces();
std::vector<RigFault::FaultFace>& unNamedFaultFacesInactive = unNamedFaultWithInactive->faultFaces();
for ( int gcIdx = 0; gcIdx < static_cast<int>( m_cells.size() ); ++gcIdx )
for ( int i = 0; i < numberOfThreads; i++ )
{
addUnNamedFaultFaces( gcIdx,
activeCellInfo,
vxs,
unNamedFaultIdx,
unNamedFaultWithInactiveIdx,
unNamedFaultFaces,
unNamedFaultFacesInactive,
m_faultsPrCellAcc.p() );
unNamedFaultFaces.insert( unNamedFaultFaces.end(), threadFaultFaces[i].begin(), threadFaultFaces[i].end() );
unNamedFaultFacesInactive.insert( unNamedFaultFacesInactive.end(),
threadInactiveFaultFaces[i].begin(),
threadInactiveFaultFaces[i].end() );
}
}
@@ -557,8 +575,16 @@ void RigMainGrid::addUnNamedFaultFaces( int gcIdx,
int faultIdx = unNamedFaultIdx;
if ( !( isCellActive && isNeighborCellActive ) ) faultIdx = unNamedFaultWithInactiveIdx;
faultsPrCellAcc->setFaultIdx( gcIdx, face, faultIdx );
faultsPrCellAcc->setFaultIdx( neighborReservoirCellIdx, StructGridInterface::oppositeFace( face ), faultIdx );
#pragma omp critical( faultsPrCellAcc_modification )
{
// Best practice is to avoid critical sections. The number of cells related to a fault is usually very
// small compared to the total number of cells, so the performance of this function should be good. The
// main computation is related to the 'pointDistance' functions above. The refactoring of this structure
// to avoid critical section is considered too much compared to the gain.
faultsPrCellAcc->setFaultIdx( gcIdx, face, faultIdx );
faultsPrCellAcc->setFaultIdx( neighborReservoirCellIdx, StructGridInterface::oppositeFace( face ), faultIdx );
}
// Add as fault face only if the grid index is less than the neighbors
@@ -574,15 +600,6 @@ void RigMainGrid::addUnNamedFaultFaces( int gcIdx,
unNamedFaultFacesInactive.push_back( ff );
}
}
else
{
CVF_FAIL_MSG( "Found fault with global neighbor index less than the native index. " ); // Should never occur, because we flag the opposite face in the faultsPrCellAcc
}
}
}
}
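The new critical section above serializes only the writes to the shared faultsPrCellAcc structure, and the accompanying comment explains why that is acceptable: fault faces are rare compared to the total cell count, so almost all iterations never enter the critical section. The sketch below shows the same idea in isolation; Hit, findRareHits, the threshold test and the hits_modification name are all hypothetical and not from the commit.

// A named critical section guarding rare updates to shared state, while the
// expensive per-element work runs fully in parallel.
#include <cmath>
#include <vector>

struct Hit
{
    long long index;
    double    value;
};

void findRareHits( const std::vector<double>& values, double threshold, std::vector<Hit>& hits )
{
#pragma omp parallel for
    for ( long long i = 0; i < (long long)values.size(); i++ )
    {
        // The bulk of the runtime is spent here, outside any lock
        double v = std::sqrt( std::fabs( values[i] ) );

        if ( v > threshold )
        {
            // Only the rare matches take the lock. Naming the critical section
            // keeps it independent of unrelated critical sections elsewhere.
#pragma omp critical( hits_modification )
            {
                hits.push_back( { i, v } );
            }
        }
    }
}

If the matches were frequent, per-thread vectors merged after the loop (as in calculateFaults above) would be the better choice; the commit keeps the critical section in addUnNamedFaultFaces because, as its comment says, refactoring it away is not worth the gain.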
@@ -745,10 +762,8 @@ void RigMainGrid::buildCellSearchTree()
#pragma omp parallel
{
size_t threadCellCount = cellCount;
#ifdef USE_OPENMP
threadCellCount = std::ceil( cellCount / static_cast<double>( omp_get_num_threads() ) );
#endif
int numberOfThreads = RiaOpenMPTools::availableThreadCount();
size_t threadCellCount = std::ceil( cellCount / static_cast<double>( numberOfThreads ) );
std::vector<size_t> threadIndicesForBoundingBoxes;
std::vector<cvf::BoundingBox> threadBoundingBoxes;
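The buildCellSearchTree hunk is cut off right after the per-thread chunk size is computed with std::ceil, so how threadCellCount is used afterwards is not visible here; presumably it sizes the reserve of the per-thread vectors declared on the last two lines. The fragment below is a sketch under that assumption, with reservePerThreadStorage and threadIndices as invented names.

// Assumed usage of the ceil-based chunk size: pre-allocate per-thread storage
// so the parallel loop does not reallocate. Integer equivalent of the ceil:
// (cellCount + numberOfThreads - 1) / numberOfThreads.
#include <cmath>
#include <cstddef>
#include <vector>

#ifdef USE_OPENMP
#include <omp.h>
#endif

void reservePerThreadStorage( size_t cellCount )
{
#pragma omp parallel
    {
        int numberOfThreads = 1;
#ifdef USE_OPENMP
        numberOfThreads = omp_get_num_threads();
#endif

        // Round up so the estimate is never smaller than the largest share a
        // single thread can be handed by the worksharing loop
        size_t threadCellCount = static_cast<size_t>( std::ceil( cellCount / static_cast<double>( numberOfThreads ) ) );

        std::vector<size_t> threadIndices;
        threadIndices.reserve( threadCellCount );

        // ... each thread then fills threadIndices for its share of the cells ...
    }
}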