Moved the statistical calculation algorithm code into a separate file.

Added unit test to these calculations, and fixed a calculation error.
p4#: 21140
This commit is contained in:
Jacob Støren 2013-04-05 10:49:39 +02:00
parent db62f01e0a
commit 0fed66deb2
11 changed files with 404 additions and 187 deletions

View File

@ -32,9 +32,9 @@ namespace caf {
template<>
void caf::AppEnum<RimStatisticsCase::PercentileCalcType>::setUp()
{
addItem(RimStatisticsCase::EXACT, "ExactPercentile", "Exact");
addItem(RimStatisticsCase::HISTOGRAM_ESTIMATED, "HistogramEstimatedPercentile", "Estimated (Faster for large case counts)");
setDefault(RimStatisticsCase::EXACT);
addItem(RimStatisticsCase::NEAREST_OBSERVATION, "NearestObservationPercentile", "Nearest Observation");
addItem(RimStatisticsCase::HISTOGRAM_ESTIMATED, "HistogramEstimatedPercentile", "Histogram based estimate");
setDefault(RimStatisticsCase::NEAREST_OBSERVATION);
}
}

View File

@ -53,7 +53,7 @@ public:
enum PercentileCalcType
{
EXACT,
NEAREST_OBSERVATION,
HISTOGRAM_ESTIMATED
};

View File

@ -22,106 +22,12 @@
#include "RimReservoirView.h"
#include "RimCase.h"
#include "RigCaseData.h"
#include "RigStatisticsMath.h"
//#include "RigCaseData.h"
#include <QDebug>
#include "cafProgressInfo.h"
//--------------------------------------------------------------------------------------------------
/// An internal class to do the actual computations
//--------------------------------------------------------------------------------------------------
void calculateBasicStatistics(const std::vector<double>& values, double* min, double* max, double* range, double* mean, double* dev)
{
double m_min(HUGE_VAL);
double m_max(-HUGE_VAL);
double m_mean(HUGE_VAL);
double m_dev(HUGE_VAL);
double sum = 0.0;
double sumSquared = 0.0;
size_t validValueCount = 0;
for (size_t i = 0; i < values.size(); i++)
{
double val = values[i];
if (val == HUGE_VAL) continue;
validValueCount++;
if (val < m_min) m_min = val;
if (val > m_max) m_max = val;
sum += val;
sumSquared += (val * val);
}
if (validValueCount > 0)
{
m_mean = sum / validValueCount;
// http://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods
// Running standard deviation
double s0 = validValueCount;
double s1 = sum;
double s2 = sumSquared;
m_dev = cvf::Math::sqrt( (s0 * s2) - (s1 * s1) ) / s0;
}
if (min) *min = m_min;
if (max) *max = m_max;
if (range) *range = m_max - m_min;
if (mean) *mean = m_mean;
if (dev) *dev = m_dev;
}
//--------------------------------------------------------------------------------------------------
/// Calculate the percentiles of /a inputValues at the pValPosition percentages using the "Nearest Rank"
/// method. This method treats HUGE_VAL as "undefined" values, and ignores these. Will return HUGE_VAL if
/// the inputValues does not contain any valid values
//--------------------------------------------------------------------------------------------------
std::vector<double> calculateNearestRankPercentiles(const std::vector<double> & inputValues, const std::vector<double>& pValPositions)
{
std::vector<double> sortedValues;
sortedValues.reserve(inputValues.size());
for (size_t i = 0; i < inputValues.size(); ++i)
{
if (inputValues[i] != HUGE_VAL)
{
sortedValues.push_back(inputValues[i]);
}
}
std::sort(sortedValues.begin(), sortedValues.end());
std::vector<double> percentiles(pValPositions.size(), HUGE_VAL);
if (sortedValues.size())
{
for (size_t i = 0; i < pValPositions.size(); ++i)
{
double pVal = HUGE_VAL;
size_t pValIndex = static_cast<size_t>(sortedValues.size() * abs(pValPositions[i]) / 100);
pValIndex += 1;
if (pValIndex >= sortedValues.size() ) pValIndex = sortedValues.size() - 1;
pVal = sortedValues[pValIndex];
percentiles[i] = pVal;
}
}
return percentiles;
};
//--------------------------------------------------------------------------------------------------
///
//--------------------------------------------------------------------------------------------------
@ -353,18 +259,18 @@ void RimStatisticsCaseEvaluator::evaluateForResults(const QList<ResSpec>& result
if (foundAnyValidValues)
{
calculateBasicStatistics(values, &statParams[MIN], &statParams[MAX], &statParams[RANGE], &statParams[MEAN], &statParams[STDEV]);
RigStatisticsMath::calculateBasicStatistics(values, &statParams[MIN], &statParams[MAX], &statParams[RANGE], &statParams[MEAN], &statParams[STDEV]);
// Calculate percentiles
if (m_statisticsConfig.m_calculatePercentiles )
{
if (m_statisticsConfig.m_pValMethod == RimStatisticsCase::EXACT)
if (m_statisticsConfig.m_pValMethod == RimStatisticsCase::NEAREST_OBSERVATION)
{
std::vector<double> pValPoss;
pValPoss.push_back(m_statisticsConfig.m_pMinPos);
pValPoss.push_back(m_statisticsConfig.m_pMidPos);
pValPoss.push_back(m_statisticsConfig.m_pMaxPos);
std::vector<double> pVals = calculateNearestRankPercentiles(values, pValPoss);
std::vector<double> pVals = RigStatisticsMath::calculateNearestRankPercentiles(values, pValPoss);
statParams[PMIN] = pVals[0];
statParams[PMID] = pVals[1];
statParams[PMAX] = pVals[2];

View File

@ -41,7 +41,7 @@ public:
m_pMinPos(10.0),
m_pMidPos(50.0),
m_pMaxPos(90.0),
m_pValMethod(RimStatisticsCase::EXACT)
m_pValMethod(RimStatisticsCase::NEAREST_OBSERVATION)
{
}

View File

@ -17,6 +17,8 @@ ${CEE_CURRENT_LIST_DIR}RigMainGrid.h
${CEE_CURRENT_LIST_DIR}RigReservoirBuilderMock.h
${CEE_CURRENT_LIST_DIR}RigCaseCellResultsData.h
${CEE_CURRENT_LIST_DIR}RigSingleWellResultsData.h
${CEE_CURRENT_LIST_DIR}RigStatisticsMath.h
)
list(APPEND CODE_SOURCE_FILES
@ -31,6 +33,7 @@ ${CEE_CURRENT_LIST_DIR}RigMainGrid.cpp
${CEE_CURRENT_LIST_DIR}RigReservoirBuilderMock.cpp
${CEE_CURRENT_LIST_DIR}RigCaseCellResultsData.cpp
${CEE_CURRENT_LIST_DIR}RigSingleWellResultsData.cpp
${CEE_CURRENT_LIST_DIR}RigStatisticsMath.cpp
)

View File

@ -24,7 +24,7 @@ include_directories(
${ResInsight_SOURCE_DIR}/CommonCode
#Remove when RigStatistics is out
${ResInsight_SOURCE_DIR}/ApplicationCode/ModelVisualization
#${ResInsight_SOURCE_DIR}/ApplicationCode/ModelVisualization
)
# Populate the filenames into variable lists
@ -38,6 +38,7 @@ set( UNIT_TEST_CPP_SOURCES
main.cpp
RigActiveCellInfo-Test.cpp
RigReservoir-Test.cpp
RigStatisticsMath-Test.cpp
)

View File

@ -0,0 +1,143 @@
/////////////////////////////////////////////////////////////////////////////////
//
// Copyright (C) 2011-2012 Statoil ASA, Ceetron AS
//
// ResInsight is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// ResInsight is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.
//
// See the GNU General Public License at <http://www.gnu.org/licenses/gpl.html>
// for more details.
//
/////////////////////////////////////////////////////////////////////////////////
#include "RiaStdInclude.h"
#include "gtest/gtest.h"
#include "RigStatisticsMath.h"
//--------------------------------------------------------------------------------------------------
///
//--------------------------------------------------------------------------------------------------
TEST(RigStatisticsMath, BasicTest)
{
std::vector<double> values;
values.push_back(HUGE_VAL);
values.push_back(2788.2723335651900);
values.push_back(-22481.0927881701000);
values.push_back(68778.6851686236000);
values.push_back(-76092.8157632591000);
values.push_back(6391.97999909729003);
values.push_back(65930.1200169780000);
values.push_back(-27696.2320267235000);
values.push_back(HUGE_VAL);
values.push_back(HUGE_VAL);
values.push_back(96161.7546348456000);
values.push_back(73875.6716288563000);
values.push_back(80720.4378655615000);
values.push_back(-98649.8109937874000);
values.push_back(99372.9362079615000);
values.push_back(HUGE_VAL);
values.push_back(-57020.4389966513000);
double min, max, range, mean, stdev;
RigStatisticsMath::calculateBasicStatistics(values, &min, &max, &range, &mean, &stdev);
EXPECT_DOUBLE_EQ(-98649.8109937874000, min );
EXPECT_DOUBLE_EQ(99372.9362079615000 , max );
EXPECT_DOUBLE_EQ(198022.7472017490000, range );
EXPECT_DOUBLE_EQ(16313.8051759152000 , mean );
EXPECT_DOUBLE_EQ(66104.391542887200 , stdev );
}
//--------------------------------------------------------------------------------------------------
///
//--------------------------------------------------------------------------------------------------
TEST(RigStatisticsMath, RankPercentiles)
{
std::vector<double> values;
values.push_back(HUGE_VAL);
values.push_back(2788.2723335651900);
values.push_back(-22481.0927881701000);
values.push_back(68778.6851686236000);
values.push_back(-76092.8157632591000);
values.push_back(6391.97999909729003);
values.push_back(65930.1200169780000);
values.push_back(-27696.2320267235000);
values.push_back(HUGE_VAL);
values.push_back(HUGE_VAL);
values.push_back(96161.7546348456000);
values.push_back(73875.6716288563000);
values.push_back(80720.4378655615000);
values.push_back(-98649.8109937874000);
values.push_back(99372.9362079615000);
values.push_back(HUGE_VAL);
values.push_back(-57020.4389966513000);
std::vector<double> pValPos;
pValPos.push_back(10);
pValPos.push_back(40);
pValPos.push_back(50);
pValPos.push_back(90);
std::vector<double> pVals = RigStatisticsMath::calculateNearestRankPercentiles(values, pValPos);
EXPECT_DOUBLE_EQ( -76092.8157632591000, pVals[0]);
EXPECT_DOUBLE_EQ( 2788.2723335651900 , pVals[1]);
EXPECT_DOUBLE_EQ( 6391.979999097290 , pVals[2]);
EXPECT_DOUBLE_EQ( 96161.7546348456000 , pVals[3]);
}
//--------------------------------------------------------------------------------------------------
///
//--------------------------------------------------------------------------------------------------
TEST(RigStatisticsMath, HistogramPercentiles)
{
std::vector<double> values;
values.push_back(HUGE_VAL);
values.push_back(2788.2723335651900);
values.push_back(-22481.0927881701000);
values.push_back(68778.6851686236000);
values.push_back(-76092.8157632591000);
values.push_back(6391.97999909729003);
values.push_back(65930.1200169780000);
values.push_back(-27696.2320267235000);
values.push_back(HUGE_VAL);
values.push_back(HUGE_VAL);
values.push_back(96161.7546348456000);
values.push_back(73875.6716288563000);
values.push_back(80720.4378655615000);
values.push_back(-98649.8109937874000);
values.push_back(99372.9362079615000);
values.push_back(HUGE_VAL);
values.push_back(-57020.4389966513000);
double min, max, range, mean, stdev;
RigStatisticsMath::calculateBasicStatistics(values, &min, &max, &range, &mean, &stdev);
std::vector<size_t> histogram;
RigHistogramCalculator histCalc(min, max, 100, &histogram);
histCalc.addData(values);
std::vector<double> pVals;
double p10, p50, p90;
p10 = histCalc.calculatePercentil(0.1);
p50 = histCalc.calculatePercentil(0.5);
p90 = histCalc.calculatePercentil(0.9);
EXPECT_DOUBLE_EQ( -76273.240559989776, p10);
EXPECT_DOUBLE_EQ( 5312.1312871307755 , p50);
EXPECT_DOUBLE_EQ( 94818.413022321271 , p90);
}

View File

@ -20,6 +20,8 @@
#include "RifReaderInterface.h"
#include "RigMainGrid.h"
#include "RigStatisticsMath.h"
#include <QDateTime>
#include <math.h>

View File

@ -116,86 +116,3 @@ private:
RigMainGrid* m_ownerMainGrid;
};
class RigHistogramCalculator
{
public:
RigHistogramCalculator(double min, double max, size_t nBins, std::vector<size_t>* histogram)
{
CVF_ASSERT(histogram);
CVF_ASSERT(nBins > 0);
if (max == min) { nBins = 1; } // Avoid dividing on 0 range
m_histogram = histogram;
m_min = min;
m_observationCount = 0;
// Initialize bins
m_histogram->resize(nBins);
for (size_t i = 0; i < m_histogram->size(); ++i) (*m_histogram)[i] = 0;
m_range = max - min;
maxIndex = nBins-1;
}
void addData(const std::vector<double>& data)
{
CVF_ASSERT(m_histogram);
for (size_t i = 0; i < data.size(); ++i)
{
if (data[i] == HUGE_VAL)
{
continue;
}
size_t index = 0;
if (maxIndex > 0) index = (size_t)(maxIndex*(data[i] - m_min)/m_range);
if(index < m_histogram->size()) // Just clip to the max min range (-index will overflow to positive )
{
(*m_histogram)[index]++;
m_observationCount++;
}
}
}
/// Calculates the estimated percentile from the histogram.
/// the percentile is the domain value at which pVal of the observations are below it.
/// Will only consider observed values between min and max, as all other values are discarded from the histogram
double calculatePercentil(double pVal)
{
CVF_ASSERT(m_histogram);
CVF_ASSERT(m_histogram->size());
CVF_ASSERT( 0.0 <= pVal && pVal <= 1.0);
double pValObservationCount = pVal*m_observationCount;
if (pValObservationCount == 0.0) return m_min;
size_t accObsCount = 0;
double binWidth = m_range/m_histogram->size();
for (size_t binIdx = 0; binIdx < m_histogram->size(); ++binIdx)
{
size_t binObsCount = (*m_histogram)[binIdx];
accObsCount += binObsCount;
if (accObsCount >= pValObservationCount)
{
double domainValueAtEndOfBin = m_min + (binIdx+1) * binWidth;
double unusedFractionOfLastBin = (double)(accObsCount - pValObservationCount)/binObsCount;
return domainValueAtEndOfBin - unusedFractionOfLastBin*binWidth;
}
}
CVF_ASSERT(false);
return HUGE_VAL;
}
private:
size_t maxIndex;
double m_range;
double m_min;
size_t m_observationCount;
std::vector<size_t>* m_histogram;
};

View File

@ -0,0 +1,192 @@
/////////////////////////////////////////////////////////////////////////////////
//
// Copyright (C) 2011-2012 Statoil ASA, Ceetron AS
//
// ResInsight is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// ResInsight is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.
//
// See the GNU General Public License at <http://www.gnu.org/licenses/gpl.html>
// for more details.
//
/////////////////////////////////////////////////////////////////////////////////
#include "RigStatisticsMath.h"
#include <algorithm>
#include <assert.h>
//--------------------------------------------------------------------------------------------------
/// A function to do basic statistical calculations
//--------------------------------------------------------------------------------------------------
void RigStatisticsMath::calculateBasicStatistics(const std::vector<double>& values, double* min, double* max, double* range, double* mean, double* dev)
{
double m_min(HUGE_VAL);
double m_max(-HUGE_VAL);
double m_mean(HUGE_VAL);
double m_dev(HUGE_VAL);
double sum = 0.0;
double sumSquared = 0.0;
size_t validValueCount = 0;
for (size_t i = 0; i < values.size(); i++)
{
double val = values[i];
if (val == HUGE_VAL) continue;
validValueCount++;
if (val < m_min) m_min = val;
if (val > m_max) m_max = val;
sum += val;
sumSquared += (val * val);
}
if (validValueCount > 0)
{
m_mean = sum / validValueCount;
// http://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods
// Running standard deviation
double s0 = static_cast<double>(validValueCount);
double s1 = sum;
double s2 = sumSquared;
m_dev = sqrt( (s0 * s2) - (s1 * s1) ) / s0;
}
if (min) *min = m_min;
if (max) *max = m_max;
if (range) *range = m_max - m_min;
if (mean) *mean = m_mean;
if (dev) *dev = m_dev;
}
//--------------------------------------------------------------------------------------------------
/// Calculate the percentiles of /a inputValues at the pValPosition percentages using the "Nearest Rank"
/// method. This method treats HUGE_VAL as "undefined" values, and ignores these. Will return HUGE_VAL if
/// the inputValues does not contain any valid values
//--------------------------------------------------------------------------------------------------
std::vector<double> RigStatisticsMath::calculateNearestRankPercentiles(const std::vector<double> & inputValues, const std::vector<double>& pValPositions)
{
std::vector<double> sortedValues;
sortedValues.reserve(inputValues.size());
for (size_t i = 0; i < inputValues.size(); ++i)
{
if (inputValues[i] != HUGE_VAL)
{
sortedValues.push_back(inputValues[i]);
}
}
std::sort(sortedValues.begin(), sortedValues.end());
std::vector<double> percentiles(pValPositions.size(), HUGE_VAL);
if (sortedValues.size())
{
for (size_t i = 0; i < pValPositions.size(); ++i)
{
double pVal = HUGE_VAL;
size_t pValIndex = static_cast<size_t>(sortedValues.size() * abs(pValPositions[i]) / 100);
if (pValIndex >= sortedValues.size() ) pValIndex = sortedValues.size() - 1;
pVal = sortedValues[pValIndex];
percentiles[i] = pVal;
}
}
return percentiles;
};
//--------------------------------------------------------------------------------------------------
///
//--------------------------------------------------------------------------------------------------
RigHistogramCalculator::RigHistogramCalculator(double min, double max, size_t nBins, std::vector<size_t>* histogram)
{
assert(histogram);
assert(nBins > 0);
if (max == min) { nBins = 1; } // Avoid dividing on 0 range
m_histogram = histogram;
m_min = min;
m_observationCount = 0;
// Initialize bins
m_histogram->resize(nBins);
for (size_t i = 0; i < m_histogram->size(); ++i) (*m_histogram)[i] = 0;
m_range = max - min;
maxIndex = nBins-1;
}
//--------------------------------------------------------------------------------------------------
///
//--------------------------------------------------------------------------------------------------
void RigHistogramCalculator::addData(const std::vector<double>& data)
{
assert(m_histogram);
for (size_t i = 0; i < data.size(); ++i)
{
if (data[i] == HUGE_VAL)
{
continue;
}
size_t index = 0;
if (maxIndex > 0) index = (size_t)(maxIndex*(data[i] - m_min)/m_range);
if(index < m_histogram->size()) // Just clip to the max min range (-index will overflow to positive )
{
(*m_histogram)[index]++;
m_observationCount++;
}
}
}
//--------------------------------------------------------------------------------------------------
///
//--------------------------------------------------------------------------------------------------
double RigHistogramCalculator::calculatePercentil(double pVal)
{
assert(m_histogram);
assert(m_histogram->size());
assert( 0.0 <= pVal && pVal <= 1.0);
double pValObservationCount = pVal*m_observationCount;
if (pValObservationCount == 0.0) return m_min;
size_t accObsCount = 0;
double binWidth = m_range/m_histogram->size();
for (size_t binIdx = 0; binIdx < m_histogram->size(); ++binIdx)
{
size_t binObsCount = (*m_histogram)[binIdx];
accObsCount += binObsCount;
if (accObsCount >= pValObservationCount)
{
double domainValueAtEndOfBin = m_min + (binIdx+1) * binWidth;
double unusedFractionOfLastBin = (double)(accObsCount - pValObservationCount)/binObsCount;
return domainValueAtEndOfBin - unusedFractionOfLastBin*binWidth;
}
}
assert(false);
return HUGE_VAL;
}

View File

@ -0,0 +1,53 @@
/////////////////////////////////////////////////////////////////////////////////
//
// Copyright (C) 2011-2012 Statoil ASA, Ceetron AS
//
// ResInsight is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// ResInsight is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.
//
// See the GNU General Public License at <http://www.gnu.org/licenses/gpl.html>
// for more details.
//
/////////////////////////////////////////////////////////////////////////////////
#pragma once
#include <vector>
#include <cmath>
class RigStatisticsMath
{
public:
static void calculateBasicStatistics(const std::vector<double>& values, double* min, double* max, double* range, double* mean, double* dev);
static std::vector<double> calculateNearestRankPercentiles(const std::vector<double> & inputValues, const std::vector<double>& pValPositions);
};
//==================================================================================================
/// Class to calculate a histogram, and histogram based p-value estimates
//==================================================================================================
class RigHistogramCalculator
{
public:
RigHistogramCalculator(double min, double max, size_t nBins, std::vector<size_t>* histogram);
void addData(const std::vector<double>& data);
/// Calculates the estimated percentile from the histogram.
/// the percentile is the domain value at which pVal of the observations are below it.
/// Will only consider observed values between min and max, as all other values are discarded from the histogram
double calculatePercentil(double pVal);
private:
size_t maxIndex;
double m_range;
double m_min;
size_t m_observationCount;
std::vector<size_t>* m_histogram;
};