#2066 Observed Data : Major rewrite of varying column width parsing

Add TableInfo to hold a table with several ColumnInfo objects. Improved detection of RifEclipseSummaryAddress from all variants.
2025-02-25 18:55:39 -06:00 · 2017-11-02 19:00:27 +01:00
parent 02cd42aa54
commit fee00918e9
14 changed files with 1146 additions and 495 deletions
--- a/ApplicationCode/FileInterface/RifEclipseUserDataParserTools.cpp
+++ b/ApplicationCode/FileInterface/RifEclipseUserDataParserTools.cpp
@@ -18,8 +18,11 @@

 #include "RifEclipseUserDataParserTools.h"

+#include "RiaDateStringParser.h"
 #include "RiaLogging.h"

+#include "RifEclipseUserDataKeywordTools.h"
+
 #include "cvfAssert.h"

 #include <QString>
@@ -80,7 +83,7 @@ bool RifEclipseUserDataParserTools::isLineSkippable(const std::string& line)
 //--------------------------------------------------------------------------------------------------
 bool RifEclipseUserDataParserTools::isAComment(const std::string& word)
 {
-    if (word.size() > 1 && word.substr(0, 2) == "--")
+    if (word.find("--") != std::string::npos)
    {
        return true;
    }
@@ -161,77 +164,6 @@ size_t RifEclipseUserDataParserTools::findFirstNonEmptyEntryIndex(std::vector<st
    return list.size();
 }

-//--------------------------------------------------------------------------------------------------
-/// 
-//--------------------------------------------------------------------------------------------------
-RifEclipseSummaryAddress RifEclipseUserDataParserTools::makeAndFillAddress(std::string quantityName, std::vector< std::string > headerColumn)
-{
-    int         regionNumber = -1;
-    int         regionNumber2 = -1;
-    std::string wellGroupName = "";
-    std::string wellName = "";
-    int         wellSegmentNumber = -1;
-    std::string lgrName = "";
-    int         cellI = -1;
-    int         cellJ = -1;
-    int         cellK = -1;
-
-    RifEclipseSummaryAddress::SummaryVarCategory category = identifyCategory(quantityName);
-
-    switch (category) //TODO: More categories
-    {
-    case (RifEclipseSummaryAddress::SUMMARY_INVALID):
-    {
-        break;
-    }
-    case (RifEclipseSummaryAddress::SUMMARY_WELL):
-    {
-        size_t index = findFirstNonEmptyEntryIndex(headerColumn);
-        if (index < headerColumn.size())
-        {
-            wellName = headerColumn[index];
-        }
-	    break;
-    }
-    case (RifEclipseSummaryAddress::SUMMARY_WELL_GROUP):
-    {
-        size_t index = findFirstNonEmptyEntryIndex(headerColumn);
-        if (index < headerColumn.size())
-        {
-            wellGroupName = headerColumn[index];
-        }
-        break;
-    }
-    case (RifEclipseSummaryAddress::SUMMARY_REGION):
-    {
-        size_t index = findFirstNonEmptyEntryIndex(headerColumn);
-        if (index < headerColumn.size())
-        {
-            try
-            {
-                regionNumber = std::stoi(headerColumn[index]);
-            }
-            catch (...){}
-        }
-        break;
-    }
-    default:
-        break;
-    }
-
-    return RifEclipseSummaryAddress(category,
-        quantityName,
-        regionNumber,
-        regionNumber2,
-        wellGroupName,
-        wellName,
-        wellSegmentNumber,
-        lgrName,
-        cellI,
-        cellJ,
-        cellK);
-}
-
 //--------------------------------------------------------------------------------------------------
 /// 
 //--------------------------------------------------------------------------------------------------
@@ -248,7 +180,19 @@ bool RifEclipseUserDataParserTools::keywordParser(const std::string& line, std::
    else if (words[0] == "STARTDATE")
    {
        words.erase(words.begin());
-        startDate = std::accumulate(words.begin(), words.end(), std::string(""));
+
+        for (size_t i = 0; i < words.size(); i++)
+        {
+            std::string s = words[i];
+
+            startDate += s;
+
+            if (i < words.size() - 1)
+            {
+                startDate += " ";
+            }
+        }
+
        return true;
    }
    else if (words[0] == "DATEFORMAT")
@@ -259,162 +203,6 @@ bool RifEclipseUserDataParserTools::keywordParser(const std::string& line, std::
    return false;
 }

-//--------------------------------------------------------------------------------------------------
-/// 
-//--------------------------------------------------------------------------------------------------
-std::vector<ColumnInfo> RifEclipseUserDataParserTools::columnInfoForTable(std::stringstream& streamData)
-{
-    std::vector<ColumnInfo> table;
-
-    std::string origin = "";
-    std::string dateFormat = "";
-    std::string startDate  = "";
-
-    std::string line;
-    std::getline(streamData, line);
-
-    while (isLineSkippable(line) || keywordParser(line, origin, dateFormat, startDate))
-    {
-        if (!streamData.good()) return table;
-        
-        std::getline(streamData, line);
-    }
-
-    std::vector<std::string> quantityNames = splitLineAndRemoveComments(line);
-    size_t columnCount = quantityNames.size();
-
-    std::vector< std::vector< std::string > > allHeaderRows;
-
-    {
-        std::stringstream::pos_type posAtStartOfLine = streamData.tellg();
-
-        std::string secondLine;
-        std::getline(streamData, line);
-    
-        std::stringstream::pos_type posAtStartOfSecondLine = streamData.tellg();
-        std::getline(streamData, secondLine);
-
-        bool header = true;
-        while (header)
-        {
-            std::vector<std::string> words = splitLineAndRemoveComments(line);
-            std::vector<std::string> wordsSecondLine = splitLineAndRemoveComments(secondLine);
-
-            if (words.size() == columnCount &&
-                wordsSecondLine.size() == columnCount &&
-                hasOnlyValidDoubleValues(words) &&
-                hasOnlyValidDoubleValues(wordsSecondLine))
-            {
-                header = false;
-                break;
-            }
-            else
-            {
-                if (words.size() > columnCount) break;
-
-                size_t diff = columnCount - words.size();
-
-                if (diff == columnCount)
-                {
-                    std::vector< std::string > vectorOfEmptyStrings(columnCount, "");
-                    allHeaderRows.push_back(vectorOfEmptyStrings);
-                }
-                else
-                {
-                    words.insert(words.begin(), diff, "");
-                    allHeaderRows.push_back(words);
-                }
-            }
-
-            posAtStartOfLine = posAtStartOfSecondLine;
-            line = secondLine;
-
-            posAtStartOfSecondLine = streamData.tellg();
-            std::getline(streamData, secondLine);
-        }
-
-        streamData.seekg(posAtStartOfLine);
-    }
-
-    std::vector<std::string> unitNames;
-    std::vector<double> scaleFactors;
-    std::vector< std::vector< std::string > > restOfHeaderRows;
-
-    for (const auto& wordsForRow : allHeaderRows)
-    {
-        bool excludeFromHeader = false;
-        if (unitNames.size() == 0)
-        {
-            for (const std::string& word : wordsForRow)
-            {
-                if (hasTimeUnit(word))
-                {
-                    unitNames = wordsForRow;
-                    excludeFromHeader = true;
-                }
-            }
-        }
-
-        if (scaleFactors.size() == 0)
-        {
-            std::vector<double> values;
-
-            if (hasOnlyValidDoubleValues(wordsForRow, &values))
-            {
-                scaleFactors = values;
-                excludeFromHeader = true;
-            }
-        }
-
-        if (!excludeFromHeader)
-        {
-            restOfHeaderRows.push_back(wordsForRow);
-        }
-    }
-
-    for (const std::string& unit : unitNames)
-    {
-        ColumnInfo columnInfo;
-        columnInfo.unitName = unit;
-        columnInfo.origin = origin;
-        columnInfo.dateFormatString = dateFormat;
-        columnInfo.startDateString = startDate;
-        table.push_back(columnInfo);
-    }
-
-    for (size_t i = 0; i < table.size(); i++)
-    {
-        if (scaleFactors.size() == table.size())
-        {
-            table[i].scaleFactor = scaleFactors[i];
-        }
-        else
-        {
-            table[i].scaleFactor = 1.0;
-        }
-    }
-    
-    for (size_t i = 0; i < table.size(); i++)
-    {
-        std::vector< std::string > restOfHeaderColumn;
-        for (std::vector< std::string > restOfHeaderRow : restOfHeaderRows)
-        {
-            restOfHeaderColumn.push_back(restOfHeaderRow.at(i));
-        }
-        table[i].summaryAddress = makeAndFillAddress(quantityNames.at(i), restOfHeaderColumn);
-    }
-
-    for (ColumnInfo& column : table)
-    {
-        if (column.summaryAddress.category() != RifEclipseSummaryAddress::SUMMARY_INVALID)
-        {
-            column.isAVector = true;
-        }
-    }
-
-    return table;
-}
-
 //--------------------------------------------------------------------------------------------------
 /// 
 //--------------------------------------------------------------------------------------------------
@@ -486,19 +274,234 @@ bool RifEclipseUserDataParserTools::hasOnlyValidDoubleValues(const std::vector<s
 {
    char* end;

+    bool onlyValidValues = true;
+
    for (const auto& word : words)
    {
-        double doubleVal = strtod(word.data(), &end);
-        if (end == word.data())
+        if (word.find_first_not_of("0123456789.eE-") != std::string::npos)
        {
-            return false;
+            onlyValidValues = false;
        }
-
-        if (doubleValues)
+        else
        {
+            double doubleVal = strtod(word.data(), &end);
            doubleValues->push_back(doubleVal);
        }
    }

-    return true;
+    return onlyValidValues;
+}
+
+//--------------------------------------------------------------------------------------------------
+/// Returns true when the text "DATE" occurs anywhere in word (case-sensitive substring search)
+//--------------------------------------------------------------------------------------------------
+bool RifEclipseUserDataParserTools::hasDateUnit(const std::string& word)
+{
+    if (word.find("DATE") != std::string::npos) return true;
+
+    return false;
+}
+
+//--------------------------------------------------------------------------------------------------
+/// Returns true when line splits into exactly columnCount words that look like one row of table data
+//--------------------------------------------------------------------------------------------------
+bool RifEclipseUserDataParserTools::isValidTableData(size_t columnCount, const std::string& line)
+{
+    std::vector<std::string> words = splitLineAndRemoveComments(line);
+
+    if (words.size() != columnCount) return false; // wrong number of words, cannot be a data row
+
+    std::vector<double> doubleValues;
+    RifEclipseUserDataParserTools::hasOnlyValidDoubleValues(words, &doubleValues); // return value ignored, only the collected values are counted
+    if (doubleValues.size() == columnCount) return true; // every word was accepted as a number
+
+    size_t columnsWithDate = 0; // number of words that RiaDateStringParser accepts as a date
+    for (auto w : words)
+    {
+        if (RiaDateStringParser::parseDateString(w).isValid())
+        {
+            columnsWithDate++;
+        }
+    }
+
+    if (columnsWithDate == 1 && doubleValues.size() == columnCount - 1) // exactly one date word and one word short of all-numeric
+    {
+        return true;
+    }
+
+    return false;
+}
+
+//--------------------------------------------------------------------------------------------------
+/// Parses one table header from streamData into a TableData; returns a default TableData and appends a message to errorText (if non-null) when parsing fails
+//--------------------------------------------------------------------------------------------------
+TableData RifEclipseUserDataParserTools::tableDataFromText(std::stringstream& streamData, std::vector<std::string>* errorText)
+{
+    TableData emptyTable; // returned on every failure path
+
+    std::string origin = "";
+    std::string dateFormat = "";
+    std::string startDate = "";
+
+    std::string firstLine;
+    std::getline(streamData, firstLine);
+
+    while (isLineSkippable(firstLine) || keywordParser(firstLine, origin, dateFormat, startDate)) // advance to the first line that is neither skippable nor a keyword line
+    {
+        if (!streamData.good())
+        {
+            if (errorText) errorText->push_back("Failed to detect start of table header");
+
+            return emptyTable;
+        }
+
+        std::getline(streamData, firstLine);
+    }
+
+    std::vector<std::string> quantityNames = splitLineAndRemoveComments(firstLine); // this line is treated as the row of quantity names
+    size_t columnCount = quantityNames.size();
+
+    if (columnCount == 0)
+    {
+        if (errorText) errorText->push_back("No quantities detected in table");
+
+        return emptyTable;
+    }
+
+    std::vector< std::vector< std::string > > allHeaderRows; // every non-empty line between the quantity row and the first data rows
+
+    {
+        std::stringstream::pos_type posAtStartOfFirstLine = streamData.tellg(); // stream position where the current firstLine begins
+
+        std::string secondLine;
+        std::getline(streamData, firstLine);
+
+        std::stringstream::pos_type posAtStartOfSecondLine = streamData.tellg();
+        std::getline(streamData, secondLine);
+
+        bool header = true;
+        while (header)
+        {
+            if (isValidTableData(columnCount, firstLine) &&
+                isValidTableData(columnCount, secondLine)) // two consecutive lines that both look like data rows end the header
+            {
+                header = false;
+                break;
+            }
+            else
+            {
+                std::vector<std::string> words = splitLineAndRemoveComments(firstLine);
+                if (words.size() > 0)
+                {
+                    allHeaderRows.push_back(words);
+                }
+            }
+
+            posAtStartOfFirstLine = posAtStartOfSecondLine; // slide the two-line window one line forward
+            firstLine = secondLine;
+
+            posAtStartOfSecondLine = streamData.tellg();
+            std::getline(streamData, secondLine);
+
+            if (!streamData.good()) // stop scanning once the stream can no longer be read
+            {
+                header = false;
+            }
+        }
+
+        streamData.seekg(posAtStartOfFirstLine); // rewind to the start of firstLine; NOTE(review): stream may be in a failed state after EOF, confirm the seek takes effect
+    }
+
+    std::vector<std::string> unitNames;
+    std::vector<double> scaleFactors; // NOTE(review): collected below but not read again in this function
+    std::vector< std::vector< std::string > > headerRows; // header rows remaining after unit and scale factor rows are removed
+
+    for (const auto& rowWords : allHeaderRows)
+    {
+        bool excludeFromHeader = false;
+
+        if (rowWords.size() == columnCount) // only full-width rows are candidates for units or scale factors
+        {
+            if (unitNames.size() == 0)
+            {
+                for (const std::string& word : rowWords)
+                {
+                    if (hasTimeUnit(word)) // a row containing a word accepted by hasTimeUnit is taken as the unit row
+                    {
+                        unitNames = rowWords;
+                        excludeFromHeader = true;
+                    }
+                }
+            }
+
+            if (scaleFactors.size() == 0)
+            {
+                std::vector<double> values;
+
+                if (hasOnlyValidDoubleValues(rowWords, &values)) // first full-width all-numeric row is taken as scale factors
+                {
+                    scaleFactors = values;
+                    excludeFromHeader = true;
+                }
+            }
+        }
+
+        if (!excludeFromHeader)
+        {
+            headerRows.push_back(rowWords);
+        }
+    }
+
+    if (columnCount != unitNames.size()) // also true when no unit row was found at all
+    {
+        if (errorText) errorText->push_back("Number of quantities is different from number of units");
+
+        return emptyTable;
+    }
+
+
+    std::vector<ColumnInfo> columnInfos;
+
+    // Create string vectors for each column
+    {
+        std::vector<std::string> parserErrors;
+        std::vector<std::vector<std::string>> tableHeaderText = RifEclipseUserDataKeywordTools::buildColumnHeaderText(quantityNames, headerRows, &parserErrors);
+        if (parserErrors.size() > 0)
+        {
+            if (errorText) errorText->insert(errorText->end(), parserErrors.begin(), parserErrors.end());
+            
+            return emptyTable;
+        }
+
+
+        // For each column header, create the summary address and the ColumnInfo
+        for (size_t i = 0; i < tableHeaderText.size(); i++)
+        {
+            auto columnText = tableHeaderText[i];
+            if (columnText.size() == 0)
+            {
+                if (errorText) errorText->push_back("Detected column with no content");
+                continue;
+            }
+
+            std::string quantity = columnText[0]; // first entry is the quantity name, the rest is passed on as the column header
+            std::string unit = unitNames[i]; // NOTE(review): assumes tableHeaderText has no more entries than unitNames, confirm against buildColumnHeaderText
+
+            std::vector<std::string> columnHeader;
+
+            if (columnText.size() > 1) columnHeader.insert(columnHeader.begin(), columnText.begin() + 1, columnText.end());
+
+            RifEclipseSummaryAddress adr = RifEclipseUserDataKeywordTools::makeAndFillAddress(quantity, columnHeader);
+
+            ColumnInfo ci(adr, unit);
+            if (quantity == "DATE") // the DATE column is flagged as string data
+            {
+                ci.isStringData = true;
+            }
+
+            columnInfos.push_back(ci);
+        }
+    }
+
+    return TableData(origin, dateFormat, startDate, columnInfos);
 }