ResInsight/ApplicationLibCode/FileInterface/RifEclipseUserDataParserTools.cpp

/////////////////////////////////////////////////////////////////////////////////
//
//  Copyright (C) 2017-  Statoil ASA
//
//  ResInsight is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  ResInsight is distributed in the hope that it will be useful, but WITHOUT ANY
//  WARRANTY; without even the implied warranty of MERCHANTABILITY or
//  FITNESS FOR A PARTICULAR PURPOSE.
//
//  See the GNU General Public License at <http://www.gnu.org/licenses/gpl.html>
//  for more details.
//
/////////////////////////////////////////////////////////////////////////////////

#include "RifEclipseUserDataParserTools.h"

#include "RiaDateStringParser.h"
#include "RiaLogging.h"
#include "RiaQDateTimeTools.h"
#include "RiaStdStringTools.h"

#include "RifEclipseUserDataKeywordTools.h"

#include "cvfAssert.h"

#include <QString>
#include <QStringList>
#include <QTextStream>

#include <algorithm>
#include <cmath>
#include <numeric>

//--------------------------------------------------------------------------------------------------
/// Returns true for a line that should be ignored by the parser:
///  - a line that is empty or holds only space characters
///  - a line whose first non-space character is '-'
///  - a line starting with '1' in the first column and otherwise holding only '1' and spaces
///  - a line containing one of the words SUMMARY, PAGE or NULL
//--------------------------------------------------------------------------------------------------
bool RifEclipseUserDataParserTools::isLineSkippable( const std::string& line )
{
    const std::size_t firstNonSpace = line.find_first_not_of( " " );

    // Line with only spaces
    if ( firstNonSpace == std::string::npos ) return true;

    const char firstChar = line[firstNonSpace];

    // Comments start with -
    if ( firstChar == '-' ) return true;

    // Single 1 at start of file
    const bool hasOneInFirstColumn = ( firstChar == '1' ) && ( firstNonSpace == 0 );
    if ( hasOneInFirstColumn && line.find_first_not_of( "1 ", 1 ) == std::string::npos ) return true;

    // Any line containing one of these words is skipped
    for ( const char* marker : { "SUMMARY", "PAGE", "NULL" } )
    {
        if ( line.find( marker ) != std::string::npos ) return true;
    }

    return false;
}

//--------------------------------------------------------------------------------------------------
/// Returns true if the word contains the comment marker "--" at any position
//--------------------------------------------------------------------------------------------------
bool RifEclipseUserDataParserTools::isAComment( const std::string& word )
{
    const bool containsCommentMarker = ( std::string::npos != word.find( "--" ) );

    return containsCommentMarker;
}

//--------------------------------------------------------------------------------------------------
///
//--------------------------------------------------------------------------------------------------
std::vector<std::string> RifEclipseUserDataParserTools::splitLineAndRemoveComments( const std::string& line )
{
    std::istringstream       iss( line );
    std::vector<std::string> words{ std::istream_iterator<std::string>{ iss }, std::istream_iterator<std::string>{} };

    for ( auto wordsIterator = words.begin(); wordsIterator != words.end(); ++wordsIterator )
    {
        if ( isAComment( *wordsIterator ) )
        {
            words.erase( wordsIterator, words.end() );
            break;
        }
    }

    return words;
}

//--------------------------------------------------------------------------------------------------
/// Recognizes the keyword lines ORIGIN, STARTDATE and DATEFORMAT.
///
/// \param line        Text line to inspect, comments are removed before the line is interpreted
/// \param origin      Receives the word following ORIGIN
/// \param dateFormat  Receives the word following DATEFORMAT
/// \param startDate   Receives all words following STARTDATE, joined by single spaces
/// \return true if the line starts with one of the keywords and holds at least one more word. The
///         output parameter for a keyword is only modified when that keyword is found.
//--------------------------------------------------------------------------------------------------
bool RifEclipseUserDataParserTools::keywordParser( const std::string& line, std::string& origin, std::string& dateFormat, std::string& startDate )
{
    const std::vector<std::string> words = splitLineAndRemoveComments( line );
    if ( words.size() < 2 ) return false;

    const std::string& keyword = words[0];

    if ( keyword == "ORIGIN" )
    {
        origin = words[1];
        return true;
    }

    if ( keyword == "STARTDATE" )
    {
        // Build the complete date text first and assign it, in line with ORIGIN and DATEFORMAT.
        // Appending to the output would glue two dates together if the caller reuses the same
        // string for several STARTDATE lines.
        std::string joinedDate = words[1];
        for ( size_t i = 2; i < words.size(); i++ )
        {
            joinedDate += " ";
            joinedDate += words[i];
        }

        startDate = joinedDate;
        return true;
    }

    if ( keyword == "DATEFORMAT" )
    {
        dateFormat = words[1];
        return true;
    }

    return false;
}

//--------------------------------------------------------------------------------------------------
///
//--------------------------------------------------------------------------------------------------
std::vector<double> RifEclipseUserDataParserTools::splitLineToDoubles( const std::string& line )
{
    std::vector<double> values;

    QString s = QString::fromStdString( line );

    QStringList words = s.split( " " );

    bool ok = false;
    for ( auto w : words )
    {
        double val = w.toDouble( &ok );
        if ( ok )
        {
            values.push_back( val );
        }
    }

    return values;
}

//--------------------------------------------------------------------------------------------------
/// Returns true if std::stod is able to convert the start of the text to a finite number.
///
/// Note that std::stod converts the leading part of the text, so a line of several numbers, or a
/// number followed by other text, is reported as a number. Text that cannot be converted, text
/// that is out of range for a double, and inf/nan all give false.
//--------------------------------------------------------------------------------------------------
bool RifEclipseUserDataParserTools::isANumber( const std::string& line )
{
    try
    {
        return std::isfinite( std::stod( line ) );
    }
    catch ( ... )
    {
        // std::stod reports both "no conversion" and "out of range" by throwing
        return false;
    }
}

//--------------------------------------------------------------------------------------------------
/// Collects lines as header text, starting with the incoming content of 'line', until a line
/// accepted by isANumber is found or the stream reports end of file.
///
/// \param streamData  Stream to read the following lines from
/// \param line        In: first candidate line. Out: the last line read from the stream
/// \return The collected header lines
//--------------------------------------------------------------------------------------------------
std::vector<std::string> RifEclipseUserDataParserTools::headerReader( std::stringstream& streamData, std::string& line )
{
    std::vector<std::string> collectedLines;

    for ( ;; )
    {
        if ( isANumber( line ) || streamData.eof() ) break;

        collectedLines.push_back( line );
        std::getline( streamData, line );
    }

    return collectedLines;
}

//--------------------------------------------------------------------------------------------------
/// Returns true if the word is exactly one of DAYS, DAY, YEARS, YEAR, DATE or DATES
//--------------------------------------------------------------------------------------------------
bool RifEclipseUserDataParserTools::hasTimeUnit( const std::string& word )
{
    static const char* const timeWords[] = { "DAYS", "DAY", "YEARS", "YEAR", "DATE", "DATES" };

    for ( const char* candidate : timeWords )
    {
        if ( word == candidate ) return true;
    }

    return false;
}

//--------------------------------------------------------------------------------------------------
/// Checks if all words are built solely from the characters used to write floating point numbers
/// (digits, '.', 'e', 'E', '-' and '+').
///
/// \param words         Words to check
/// \param doubleValues  Optional. Receives one converted value for every word passing the character
///                      check, in word order. Words failing the check add nothing, so the number of
///                      appended values can be smaller than the number of words. May be nullptr
///                      when only the check is of interest.
/// \return true if every word passed the character check
//--------------------------------------------------------------------------------------------------
bool RifEclipseUserDataParserTools::hasOnlyValidDoubleValues( const std::vector<std::string>& words, std::vector<double>* doubleValues )
{
    bool onlyValidValues = true;

    for ( const auto& word : words )
    {
        if ( word.find_first_not_of( "0123456789.eE-+" ) != std::string::npos )
        {
            onlyValidValues = false;
            continue;
        }

        // Nothing to store the value in, the character check above is all the caller asked for
        if ( !doubleValues ) continue;

        // NOTE(review): the outcome of the conversion is not inspected, so a word such as "-" or
        // "e" that passes the character check is still stored - confirm if that is intended
        double doubleVal = 0.0;
        RiaStdStringTools::toDouble( word, doubleVal );
        doubleValues->push_back( doubleVal );
    }

    return onlyValidValues;
}

//--------------------------------------------------------------------------------------------------
/// Returns true if the line, with comments removed, holds exactly 'columnCount' words and either
/// all of them are numeric, or exactly one of them is parsed as a valid date and all the others
/// are numeric.
//--------------------------------------------------------------------------------------------------
bool RifEclipseUserDataParserTools::isValidTableData( size_t columnCount, const std::string& line )
{
    const std::vector<std::string> tokens = splitLineAndRemoveComments( line );
    if ( tokens.size() != columnCount ) return false;

    // Only the number of numeric words is of interest here, not the all-or-nothing return value
    std::vector<double> numericValues;
    RifEclipseUserDataParserTools::hasOnlyValidDoubleValues( tokens, &numericValues );
    if ( numericValues.size() == columnCount ) return true;

    const auto dateTokenCount = std::count_if( tokens.begin(),
                                               tokens.end(),
                                               []( const std::string& token )
                                               { return RiaDateStringParser::parseDateString( token ).isValid(); } );

    return dateTokenCount == 1 && numericValues.size() == columnCount - 1;
}

//--------------------------------------------------------------------------------------------------
/// Parses the header of one table from the stream and builds a TableData describing its columns.
///
/// The steps are:
///  1. Skip ignorable lines and keyword lines (ORIGIN / DATEFORMAT / STARTDATE)
///  2. Use the words of the first remaining line as quantity names, one per column
///  3. Treat the following lines as header rows until two consecutive lines are valid table data,
///     then seek the stream back to the first of these two lines
///  4. Pick the unit row and the scale factor row out of the header rows
///  5. Create one Column per column header
///
/// The data rows are not read by this function.
///
/// \param streamData  Stream to parse
/// \param errorText   Optional. Receives descriptions of detected problems when not nullptr
/// \return The table description, or a default constructed TableData if the stream ends before a
///         quantity line is found or if an error is detected
//--------------------------------------------------------------------------------------------------
TableData RifEclipseUserDataParserTools::tableDataFromText( std::stringstream& streamData, std::vector<std::string>* errorText )
{
    TableData emptyTable;

    // NOTE: dateFormat is filled in by keywordParser, but is not used further in this function
    std::string origin     = "";
    std::string dateFormat = "";
    std::string startDate  = "";

    std::string firstLine;
    std::getline( streamData, firstLine );

    // Step 1: consume skippable lines and keyword lines
    while ( isLineSkippable( firstLine ) || keywordParser( firstLine, origin, dateFormat, startDate ) )
    {
        if ( !streamData.good() )
        {
            // End of file
            return emptyTable;
        }

        std::getline( streamData, firstLine );
    }

    // Step 2: the first line that is neither skippable nor a keyword defines the columns
    std::vector<std::string> quantityNames = splitLineAndRemoveComments( firstLine );
    size_t                   columnCount   = quantityNames.size();

    if ( columnCount == 0 )
    {
        if ( errorText ) errorText->push_back( "No quantities detected in table" );

        return emptyTable;
    }

    std::vector<std::vector<std::string>> allHeaderRows;

    // Step 3: slide a window of two lines over the text. The stream position at the start of each
    // line in the window is remembered, making it possible to rewind to the first data line
    {
        std::stringstream::pos_type posAtStartOfFirstLine = streamData.tellg();

        // firstLine is reused, from here on it holds the line following the quantity names
        std::string secondLine;
        std::getline( streamData, firstLine );

        std::stringstream::pos_type posAtStartOfSecondLine = streamData.tellg();
        std::getline( streamData, secondLine );

        bool header = true;
        while ( header )
        {
            if ( isValidTableData( columnCount, firstLine ) && isValidTableData( columnCount, secondLine ) )
            {
                // Two consecutive data lines, the header ended before firstLine
                header = false;
                break;
            }
            else
            {
                // Lines without any words after comment removal are not stored as header rows
                std::vector<std::string> words = splitLineAndRemoveComments( firstLine );
                if ( !words.empty() )
                {
                    allHeaderRows.push_back( words );
                }
            }

            // Advance the window one line
            posAtStartOfFirstLine = posAtStartOfSecondLine;
            firstLine             = secondLine;

            posAtStartOfSecondLine = streamData.tellg();
            std::getline( streamData, secondLine );

            if ( !streamData.good() )
            {
                header = false;
            }
        }

        // NOTE(review): if the loop ended because the stream is no longer good, this seek is
        // issued on a stream with error flags set and might not take effect - confirm that
        // callers handle a table without two consecutive data lines
        streamData.seekg( posAtStartOfFirstLine );
    }

    // NOTE: scaleFactors is only used to make sure a single row is taken as the scale factor row,
    // the values are not used further in this function
    std::vector<std::string>              unitNames;
    std::vector<double>                   scaleFactors;
    std::vector<std::vector<std::string>> headerRows;

    // Step 4: only rows with exactly one word per column are candidates for units or scale factors
    for ( const auto& rowWords : allHeaderRows )
    {
        bool excludeFromHeader = false;

        if ( rowWords.size() == columnCount )
        {
            if ( unitNames.empty() )
            {
                // The first row holding a time unit word is the unit row
                for ( const std::string& word : rowWords )
                {
                    if ( hasTimeUnit( word ) )
                    {
                        unitNames         = rowWords;
                        excludeFromHeader = true;
                    }
                }
            }

            if ( scaleFactors.empty() )
            {
                std::vector<double> values;

                // The first row passing the numeric check is the scale factor row
                if ( hasOnlyValidDoubleValues( rowWords, &values ) )
                {
                    scaleFactors      = values;
                    excludeFromHeader = true;
                }
            }
        }

        if ( !excludeFromHeader )
        {
            headerRows.push_back( rowWords );
        }
    }

    // This also catches a missing unit row, as unitNames is empty in that case and columnCount is
    // known to be non-zero
    if ( columnCount != unitNames.size() )
    {
        if ( errorText ) errorText->push_back( "Number of quantities is different from number of units" );

        return emptyTable;
    }

    std::vector<Column> columnInfos;

    // Create string vectors for each column
    {
        std::vector<std::string>              parserErrors;
        std::vector<std::vector<std::string>> tableHeaderText =
            RifEclipseUserDataKeywordTools::buildColumnHeaderText( quantityNames, headerRows, &parserErrors );
        if ( !parserErrors.empty() )
        {
            if ( errorText ) errorText->insert( errorText->end(), parserErrors.begin(), parserErrors.end() );

            return emptyTable;
        }

        // For each column header, create rif address and date time
        // NOTE(review): unitNames is indexed by the index into tableHeaderText, which assumes that
        // buildColumnHeaderText returns at most one entry per quantity - confirm
        for ( size_t i = 0; i < tableHeaderText.size(); i++ )
        {
            auto columnText = tableHeaderText[i];
            if ( columnText.empty() )
            {
                // The problem is reported and the column skipped, parsing continues
                if ( errorText ) errorText->push_back( "Detected column with no content" );
                continue;
            }

            // First entry is the vector name, any remaining entries are passed on to the address
            std::string vectorName = columnText[0];
            std::string unit       = unitNames[i];

            std::vector<std::string> columnHeader;

            if ( columnText.size() > 1 ) columnHeader.insert( columnHeader.begin(), columnText.begin() + 1, columnText.end() );

            RifEclipseSummaryAddress adr = RifEclipseUserDataKeywordTools::makeAndFillAddress( vectorName, columnHeader );

            Column ci = Column::createColumnInfoFromRsmData( vectorName, unit, adr );

            columnInfos.push_back( ci );
        }
    }

    return TableData( origin, startDate, columnInfos );
}

//--------------------------------------------------------------------------------------------------
/// Returns true if the text has more than one header line and every word in every header line
/// starts at a character offset where a word also starts in the first header line.
//--------------------------------------------------------------------------------------------------
bool RifEclipseUserDataParserTools::isFixedWidthHeader( const std::string& lines )
{
    std::stringstream streamData( lines );

    const std::vector<std::string> headerLines = RifEclipseUserDataParserTools::findValidHeaderLines( streamData );

    // A single header line gives nothing to compare against
    if ( headerLines.size() <= 1 ) return false;

    const std::vector<size_t> referenceOffsets = RifEclipseUserDataParserTools::columnIndexForWords( headerLines[0] );

    for ( const std::string& headerLine : headerLines )
    {
        for ( const size_t wordOffset : RifEclipseUserDataParserTools::columnIndexForWords( headerLine ) )
        {
            const bool isKnownOffset =
                std::find( referenceOffsets.begin(), referenceOffsets.end(), wordOffset ) != referenceOffsets.end();

            if ( !isKnownOffset ) return false;
        }
    }

    return true;
}

//--------------------------------------------------------------------------------------------------
/// Reads the header lines from the stream, cuts them into per column text based on the word
/// positions in the first header line, and creates one Column for each column.
//--------------------------------------------------------------------------------------------------
std::vector<Column> RifEclipseUserDataParserTools::columnInfoForFixedColumnWidth( std::stringstream& streamData )
{
    const std::vector<std::string> headerLines = RifEclipseUserDataParserTools::findValidHeaderLines( streamData );

    return RifEclipseUserDataParserTools::columnInfoFromColumnHeaders(
        RifEclipseUserDataParserTools::splitIntoColumnHeaders( headerLines ) );
}

//--------------------------------------------------------------------------------------------------
/// Reads lines from the stream and returns the ones considered part of the table header.
/// Skippable lines are ignored. Reading stops at the first line taken to be table data, or when
/// no more lines can be read. Before returning, the stream is asked to seek back to the position
/// recorded just before the last read attempt, i.e. the start of the line that ended the header.
///
/// \param streamData  Stream to read from
/// \return The header lines, unmodified, in the order they were read
//--------------------------------------------------------------------------------------------------
std::vector<std::string> RifEclipseUserDataParserTools::findValidHeaderLines( std::stringstream& streamData )
{
    std::vector<std::string> headerLines;

    std::stringstream::pos_type posAtTableDataStart = streamData.tellg();

    size_t      columnCount = 0;
    std::string line;
    bool        continueParsing                 = true;
    bool        hasStepType                     = false;
    size_t      minimunRequiredExtraHeaderLines = 0;

    while ( continueParsing )
    {
        // Remember where the upcoming line starts, used for the seek when the loop is done
        posAtTableDataStart = streamData.tellg();

        if ( !std::getline( streamData, line ) )
        {
            continueParsing = false;
        }
        else
        {
            if ( !RifEclipseUserDataParserTools::isLineSkippable( line ) )
            {
                auto words = RifEclipseUserDataParserTools::splitLineAndRemoveComments( line );

                // Sticky flag, set when a step type word has been seen in any line so far
                if ( !hasStepType )
                {
                    for ( size_t i = 0; i < words.size(); i++ )
                    {
                        if ( RifEclipseUserDataKeywordTools::isStepType( words[i] ) )
                        {
                            hasStepType = true;
                        }
                    }
                }

                // A line of units or scaling text raises the number of lines taken as header
                // without further checks. For the first header line these increments have no
                // effect, as the counter is overwritten below.
                if ( isUnitText( line ) )
                {
                    minimunRequiredExtraHeaderLines += 1;
                }

                if ( isScalingText( line ) )
                {
                    minimunRequiredExtraHeaderLines += 1;
                }

                if ( columnCount == 0 )
                {
                    // First line with valid header data defines the number of columns

                    columnCount = words.size();

                    minimunRequiredExtraHeaderLines = RifEclipseUserDataKeywordTools::computeRequiredHeaderLineCount( words );

                    headerLines.push_back( line );
                }
                else if ( headerLines.size() < minimunRequiredExtraHeaderLines )
                {
                    // Still short of the required number of header lines, accept the line as is
                    headerLines.push_back( line );
                }
                else
                {
                    std::vector<double> doubleValues = RifEclipseUserDataParserTools::splitLineToDoubles( line );

                    if ( doubleValues.size() < columnCount && words.size() < columnCount )
                    {
                        // One word short of the column count in a table with a step type column
                        // is taken as table data
                        // NOTE(review): presumably a data row where the step type field is blank - confirm
                        if ( hasStepType && ( words.size() + 1 == columnCount ) )
                        {
                            continueParsing = false;
                        }
                        else
                        {
                            // Consider a line with double values less than column count as a table header
                            headerLines.push_back( line );
                        }
                    }
                    else
                    {
                        // Enough values or words to fill every column, this is table data
                        continueParsing = false;
                    }
                }
            }
        }
    }

    // NOTE(review): if the loop ended because getline failed, the stream has error flags set and
    // this seek might not take effect - confirm that callers do not read further in that case
    streamData.seekg( posAtTableDataStart );

    return headerLines;
}

//--------------------------------------------------------------------------------------------------
/// Cuts fixed width header lines into one list of text per column.
///
/// The start of every word in the first header line defines where a column starts. A column ends
/// where the next column starts, the last column runs to the end of the line. Every header line is
/// cut at these positions and the pieces are trimmed for spaces.
///
/// \param headerLines  Header lines, the first line defines the column positions
/// \return One entry per column, each holding one text per header line (empty text if the line has
///         nothing in that column). Empty if there are no header lines or no words in the first.
//--------------------------------------------------------------------------------------------------
std::vector<std::vector<std::string>> RifEclipseUserDataParserTools::splitIntoColumnHeaders( const std::vector<std::string>& headerLines )
{
    std::vector<std::vector<std::string>> headerLinesPerColumn;

    if ( headerLines.empty() ) return headerLinesPerColumn;

    const std::vector<size_t> columnOffsets = RifEclipseUserDataParserTools::columnIndexForWords( headerLines[0] );
    if ( columnOffsets.empty() ) return headerLinesPerColumn;

    headerLinesPerColumn.resize( columnOffsets.size() );

    for ( const std::string& headerLine : headerLines )
    {
        for ( size_t i = 0; i < columnOffsets.size(); i++ )
        {
            const size_t colStart = columnOffsets[i];

            // npos means "to the end of the line", used for the last column
            const size_t columnWidth = ( i + 1 < columnOffsets.size() ) ? columnOffsets[i + 1] - colStart : std::string::npos;

            std::string subString;
            if ( colStart < headerLine.size() )
            {
                // substr() limits the count to the characters available. A line ending inside the
                // column, typically when trailing spaces are stripped, still contributes its text
                // instead of being dropped.
                subString = headerLine.substr( colStart, columnWidth );
            }

            headerLinesPerColumn[i].push_back( trimString( subString ) );
        }
    }

    return headerLinesPerColumn;
}

//--------------------------------------------------------------------------------------------------
/// Creates one Column for every non-empty entry in columnData.
///
/// For each column, the first text is the vector name. If the second text of any column is unit
/// text, the second text of every column is taken as its unit. If the third text of any column is
/// scaling text, the third text of every column is skipped. The remaining texts are passed on to
/// RifEclipseUserDataKeywordTools::makeAndFillAddress.
///
/// \param columnData  Header text per column, see splitIntoColumnHeaders
/// \return The created columns, in the order of columnData
//--------------------------------------------------------------------------------------------------
std::vector<Column> RifEclipseUserDataParserTools::columnInfoFromColumnHeaders( const std::vector<std::vector<std::string>>& columnData )
{
    std::vector<Column> table;

    bool isUnitsDetected   = false;
    bool isScalingDetected = false;

    // Units and scaling are properties of the complete table, one column is enough to enable them
    for ( const auto& columnLines : columnData )
    {
        if ( columnLines.size() > 1 && isUnitText( columnLines[1] ) )
        {
            isUnitsDetected = true;
        }

        if ( columnLines.size() > 2 && isScalingText( columnLines[2] ) )
        {
            isScalingDetected = true;
        }
    }

    for ( const auto& columnLines : columnData )
    {
        if ( columnLines.empty() ) continue;

        const std::string& vectorName = columnLines[0];
        std::string        unit;

        size_t startIndex = 1;

        if ( isUnitsDetected )
        {
            // Columns are not required to hold the same number of lines, a column consisting of
            // the vector name only gets an empty unit
            if ( columnLines.size() > 1 ) unit = columnLines[1];

            startIndex = 2;
        }

        if ( isScalingDetected )
        {
            // The scaling text at index 2 is skipped, it is not used

            startIndex = 3;
        }

        std::vector<std::string> restOfHeader;
        for ( size_t i = startIndex; i < columnLines.size(); i++ )
        {
            restOfHeader.push_back( columnLines[i] );
        }

        RifEclipseSummaryAddress adr = RifEclipseUserDataKeywordTools::makeAndFillAddress( vectorName, restOfHeader );

        Column ci = Column::createColumnInfoFromRsmData( vectorName, unit, adr );

        table.push_back( ci );
    }

    return table;
}

//--------------------------------------------------------------------------------------------------
/// Returns the character offset of the first character of every space separated word in the line,
/// in increasing order
//--------------------------------------------------------------------------------------------------
std::vector<size_t> RifEclipseUserDataParserTools::columnIndexForWords( const std::string& line )
{
    std::vector<size_t> wordStarts;

    for ( std::size_t pos = line.find_first_not_of( ' ' ); pos != std::string::npos; )
    {
        wordStarts.push_back( pos );

        // Step past the current word, then past the spaces following it. Both searches give npos
        // when the end of the line is reached, which ends the loop.
        const std::size_t wordEnd = line.find( ' ', pos );
        pos                       = line.find_first_not_of( ' ', wordEnd );
    }

    return wordStarts;
}

//--------------------------------------------------------------------------------------------------
///
//--------------------------------------------------------------------------------------------------
std::vector<TableData> RifEclipseUserDataParserTools::mergeEqualTimeSteps( const std::vector<TableData>& tables )
{
    if ( tables.size() < 2 )
    {
        return tables;
    }

    if ( tables[0].columnInfos().empty() ) return tables;

    QDateTime firstTableStartTime;
    for ( auto c : tables[0].columnInfos() )
    {
        if ( c.summaryAddress.vectorName() == "DATE" )
        {
            if ( c.itemCount() > 0 )
            {
                firstTableStartTime = RiaDateStringParser::parseDateString( c.textValues[0] );
            }
        }
    }

    if ( !firstTableStartTime.isValid() )
    {
        return tables;
    }

    std::vector<TableData> largeTables;

    largeTables.push_back( tables[0] );

    TableData& firstTable        = largeTables[0];
    size_t     itemsInFirstTable = tables[0].columnInfos()[0].itemCount();

    for ( size_t i = 1; i < tables.size(); i++ )
    {
        bool isDatesEqual = true;

        if ( firstTableStartTime.isValid() )
        {
            QDateTime tableFirstTime;
            for ( auto& c : tables[i].columnInfos() )
            {
                if ( c.summaryAddress.vectorName() == "DATE" )
                {
                    if ( c.itemCount() > 0 )
                    {
                        tableFirstTime = RiaDateStringParser::parseDateString( c.textValues[0] );
                    }
                }
            }

            if ( firstTableStartTime != tableFirstTime )
            {
                isDatesEqual = false;
            }
        }

        if ( !tables[i].columnInfos().empty() && tables[i].columnInfos()[0].itemCount() == itemsInFirstTable && isDatesEqual )
        {
            for ( auto& c : tables[i].columnInfos() )
            {
                if ( c.summaryAddress.vectorName() != "DATE" )
                {
                    firstTable.columnInfos().push_back( c );
                }
            }
        }
        else
        {
            largeTables.push_back( tables[i] );
        }
    }

    return largeTables;
}

//--------------------------------------------------------------------------------------------------
/// Returns a copy of the text with leading and trailing space characters removed. Only the space
/// character is removed, other whitespace is kept.
//--------------------------------------------------------------------------------------------------
std::string RifEclipseUserDataParserTools::trimString( const std::string& s )
{
    const std::size_t firstKept = s.find_first_not_of( ' ' );

    // Empty, or nothing but spaces
    if ( firstKept == std::string::npos ) return std::string();

    const std::size_t lastKept = s.find_last_not_of( ' ' );

    return s.substr( firstKept, lastKept - firstKept + 1 );
}

//--------------------------------------------------------------------------------------------------
/// Returns true if the text is a time unit word (see hasTimeUnit), or contains one of the unit
/// fragments BARSA, SM3 or RM3 at any position
//--------------------------------------------------------------------------------------------------
bool RifEclipseUserDataParserTools::isUnitText( const std::string& word )
{
    if ( hasTimeUnit( word ) ) return true;

    static const char* const unitFragments[] = { "BARSA", "SM3", "RM3" };

    for ( const char* fragment : unitFragments )
    {
        if ( word.find( fragment ) != std::string::npos ) return true;
    }

    return false;
}

//--------------------------------------------------------------------------------------------------
/// Returns true if the text contains the character '*' at any position
//--------------------------------------------------------------------------------------------------
bool RifEclipseUserDataParserTools::isScalingText( const std::string& word )
{
    const bool containsAsterisk = ( word.find( '*' ) != std::string::npos );

    return containsAsterisk;
}

//--------------------------------------------------------------------------------------------------
/// Returns the vector name of the summary address of the column
//--------------------------------------------------------------------------------------------------
std::string Column::columnName() const
{
    std::string name = summaryAddress.vectorName();

    return name;
}

//--------------------------------------------------------------------------------------------------
/// Returns the number of values in the container matching the data type of the column, or 0 for
/// any data type other than NUMERIC, TEXT and DATETIME
//--------------------------------------------------------------------------------------------------
size_t Column::itemCount() const
{
    if ( dataType == NUMERIC ) return values.size();
    if ( dataType == TEXT ) return textValues.size();
    if ( dataType == DATETIME ) return dateTimeValues.size();

    return 0;
}

//--------------------------------------------------------------------------------------------------
/// Creates a column for the given address and unit. The data type is TEXT if the vector name is a
/// date or a step type according to RifEclipseUserDataKeywordTools, otherwise NUMERIC.
//--------------------------------------------------------------------------------------------------
Column Column::createColumnInfoFromRsmData( const std::string& vectorName, const std::string& unit, const RifEclipseSummaryAddress& addr )
{
    const bool isTextColumn = RifEclipseUserDataKeywordTools::isDate( vectorName ) ||
                              RifEclipseUserDataKeywordTools::isStepType( vectorName );

    Column ci( addr, unit );
    ci.dataType = isTextColumn ? TEXT : NUMERIC;

    return ci;
}

//--------------------------------------------------------------------------------------------------
/// Creates a column for the given address and unit, leaving the data type as set by the constructor
//--------------------------------------------------------------------------------------------------
Column Column::createColumnInfoFromCsvData( const RifEclipseSummaryAddress& addr, const std::string& unit )
{
    return Column( addr, unit );
}

//--------------------------------------------------------------------------------------------------
///
//--------------------------------------------------------------------------------------------------
std::vector<QDateTime> Column::qDateTimeValues() const
{
    std::vector<QDateTime> output;
    for ( auto t : dateTimeValues )
        output.push_back( RiaQDateTimeTools::fromTime_t( t ) );
    return output;
}

//--------------------------------------------------------------------------------------------------
/// Returns the stored index of the date time column
//--------------------------------------------------------------------------------------------------
int TableData::dateTimeColumnIndex() const
{
    const int columnIndex = m_dateTimeColumnIndex;

    return columnIndex;
}

//--------------------------------------------------------------------------------------------------
/// Returns the date parsed from the first text value of the date column.
///
/// If several date columns have a first value that parses to a valid date, the last of them is
/// used. If none is found, RiaQDateTimeTools::epoch() is returned.
//--------------------------------------------------------------------------------------------------
QDateTime TableData::findFirstDate() const
{
    QDateTime dt = RiaQDateTimeTools::epoch();

    // Iterate by reference, a copy of a column includes all of its values
    for ( const auto& ci : m_columnInfos )
    {
        if ( !RifEclipseUserDataKeywordTools::isDate( ci.summaryAddress.vectorName() ) ) continue;

        // itemCount() counts the values of the active data type, which is not necessarily the
        // text values. Make sure there is a text to read.
        if ( ci.itemCount() == 0 || ci.textValues.empty() ) continue;

        const QDateTime candidate = RiaDateStringParser::parseDateString( ci.textValues[0] );
        if ( candidate.isValid() )
        {
            dt = candidate;
        }
    }

    return dt;
}