diff --git a/ApplicationLibCode/FileInterface/RifArrowTools.cpp b/ApplicationLibCode/FileInterface/RifArrowTools.cpp index a75d6cefa9..c127ce7872 100644 --- a/ApplicationLibCode/FileInterface/RifArrowTools.cpp +++ b/ApplicationLibCode/FileInterface/RifArrowTools.cpp @@ -18,40 +18,86 @@ #include "RifArrowTools.h" -#include "cafAssert.h" +#include "RifByteArrayArrowRandomAccessFile.h" +#include "RifCsvDataTableFormatter.h" #include //-------------------------------------------------------------------------------------------------- /// //-------------------------------------------------------------------------------------------------- -std::vector RifArrowTools::convertChunkedArrayToStdVector( const std::shared_ptr& column ) +QString RifArrowTools::readFirstRowsOfTable( const QByteArray& contents ) { - auto convertChunkToVector = []( const std::shared_ptr& array ) -> std::vector + arrow::MemoryPool* pool = arrow::default_memory_pool(); + + std::shared_ptr input = std::make_shared( contents ); + + // Open Parquet file reader + std::unique_ptr arrow_reader; + if ( !parquet::arrow::OpenFile( input, pool, &arrow_reader ).ok() ) { - std::vector result; - - auto double_array = std::static_pointer_cast( array ); - result.resize( double_array->length() ); - for ( int64_t i = 0; i < double_array->length(); ++i ) - { - result[i] = double_array->Value( i ); - } - - return result; - }; - - CAF_ASSERT( column->type()->id() == arrow::Type::DOUBLE ); - - std::vector result; - - // Iterate over each chunk in the column - for ( int i = 0; i < column->num_chunks(); ++i ) - { - std::shared_ptr chunk = column->chunk( i ); - std::vector chunk_vector = convertChunkToVector( chunk ); - result.insert( result.end(), chunk_vector.begin(), chunk_vector.end() ); + return {}; } - return result; -}; + // Read entire file as a single Arrow table + std::shared_ptr table; + if ( !arrow_reader->ReadTable( &table ).ok() ) + { + return {}; + } + + QString tableText; + QTextStream stream( &tableText ); + 
RifCsvDataTableFormatter formatter( stream, ";" ); + + std::vector header; + for ( std::string columnName : table->ColumnNames() ) + { + header.push_back( RifTextDataTableColumn( QString::fromStdString( columnName ) ) ); + } + + formatter.header( header ); + + std::vector> columnVectors; + + for ( std::string columnName : table->ColumnNames() ) + { + std::shared_ptr column = table->GetColumnByName( columnName ); + + auto columnType = column->type()->id(); + + if ( columnType == arrow::Type::DOUBLE ) + { + std::vector columnVector = RifArrowTools::chunkedArrayToVector( column ); + columnVectors.push_back( columnVector ); + } + else if ( columnType == arrow::Type::FLOAT ) + { + auto columnVector = RifArrowTools::chunkedArrayToVector( column ); + columnVectors.push_back( columnVector ); + } + else if ( columnType == arrow::Type::TIMESTAMP ) + { + auto columnVector = RifArrowTools::chunkedArrayToVector( column ); + columnVectors.push_back( columnVector ); + } + } + + if ( columnVectors.empty() ) + { + return {}; + } + + for ( int i = 0; i < std::min( 20, int( columnVectors[0].size() ) ); i++ ) + { + for ( int j = 0; j < int( columnVectors.size() ); j++ ) + { + formatter.add( columnVectors[j][i] ); + } + formatter.rowCompleted(); + } + + formatter.tableCompleted(); + + return tableText; +} diff --git a/ApplicationLibCode/FileInterface/RifArrowTools.h b/ApplicationLibCode/FileInterface/RifArrowTools.h index aba2ed4042..6778d26cbc 100644 --- a/ApplicationLibCode/FileInterface/RifArrowTools.h +++ b/ApplicationLibCode/FileInterface/RifArrowTools.h @@ -22,15 +22,50 @@ #include #define signals Q_SIGNALS +#include #include #include +#include +#include + //================================================================================================== // // //================================================================================================== -class RifArrowTools +namespace RifArrowTools { -public: - static std::vector convertChunkedArrayToStdVector( const 
std::shared_ptr& column ); -}; + +// Function template used to handle most of the basic types. Conversion to std::string requires a specialization using chunk->GetString(j). +template +std::vector chunkedArrayToVector( const std::shared_ptr& chunkedArray ) +{ + static_assert( std::is_base_of::value, "ArrowArrayType must be derived from arrow::Array" ); + + std::vector result; + for ( int i = 0; i < chunkedArray->num_chunks(); ++i ) + { + auto chunk = std::static_pointer_cast( chunkedArray->chunk( i ) ); + + // Use auto here instead of CType to allow conversion between different types + // Use raw_values() to get the raw data pointer for best performance + const auto* data = chunk->raw_values(); + + for ( int j = 0; j < chunk->length(); ++j ) + { + if ( !chunk->IsNull( j ) ) + { + result.push_back( data[j] ); + } + else + { + result.push_back( std::numeric_limits::quiet_NaN() ); + } + } + } + + return result; +} + +QString readFirstRowsOfTable( const QByteArray& contents ); +}; // namespace RifArrowTools diff --git a/ApplicationLibCode/FileInterface/RifOsduWellLogReader.cpp b/ApplicationLibCode/FileInterface/RifOsduWellLogReader.cpp index 2442efbc3f..31c98527bf 100644 --- a/ApplicationLibCode/FileInterface/RifOsduWellLogReader.cpp +++ b/ApplicationLibCode/FileInterface/RifOsduWellLogReader.cpp @@ -65,7 +65,7 @@ std::pair, QString> RifOsduWellLogReader::readWellL if ( column->type()->id() == arrow::Type::DOUBLE ) { - std::vector columnVector = RifArrowTools::convertChunkedArrayToStdVector( column ); + std::vector columnVector = RifArrowTools::chunkedArrayToVector( column ); logData->setValues( QString::fromStdString( columnName ), columnVector ); } } diff --git a/ApplicationLibCode/FileInterface/RifOsduWellPathReader.cpp b/ApplicationLibCode/FileInterface/RifOsduWellPathReader.cpp index d9a85681bb..69f2edb0b4 100644 --- a/ApplicationLibCode/FileInterface/RifOsduWellPathReader.cpp +++ b/ApplicationLibCode/FileInterface/RifOsduWellPathReader.cpp @@ -143,7 +143,7 @@ 
std::pair, QString> RifOsduWellPathReader::readWellPathDat if ( column->type()->id() == arrow::Type::DOUBLE ) { - std::vector columnVector = RifArrowTools::convertChunkedArrayToStdVector( column ); + std::vector columnVector = RifArrowTools::chunkedArrayToVector( column ); RiaLogging::debug( QString( "Column name: %1. Size: %2" ).arg( QString::fromStdString( columnName ) ).arg( columnVector.size() ) ); readValues[columnName] = columnVector; } diff --git a/ApplicationLibCode/UnitTests/RifParquetReader-Test.cpp b/ApplicationLibCode/UnitTests/RifParquetReader-Test.cpp index b3341153eb..0a88dee51c 100644 --- a/ApplicationLibCode/UnitTests/RifParquetReader-Test.cpp +++ b/ApplicationLibCode/UnitTests/RifParquetReader-Test.cpp @@ -2,7 +2,10 @@ #include "RiaTestDataDirectory.h" +#include "RifArrowTools.h" + #undef signals +#include #include #include #include @@ -51,3 +54,59 @@ TEST( RifParquetReaderTest, ReadValidFile ) EXPECT_TRUE( scalar->Equals( arrow::Int64Scalar( i ) ) ); } } + +TEST( RifParquetReaderTest, ConvertIntChunkedArrays ) +{ + arrow::Status status; + + arrow::Int32Builder int_builder; + status = int_builder.Append( 1 ); + status = int_builder.Append( 2 ); + status = int_builder.Append( 3 ); + + std::shared_ptr int_array; + status = int_builder.Finish( &int_array ); + + auto int_chunked_array = std::make_shared( int_array ); + + { + auto columnVector = RifArrowTools::chunkedArrayToVector( int_chunked_array ); + EXPECT_EQ( columnVector.size(), 3 ); + } + { + auto columnVector = RifArrowTools::chunkedArrayToVector( int_chunked_array ); + EXPECT_EQ( columnVector.size(), 3 ); + } +} + +TEST( RifParquetReaderTest, ConvertFloatChunkedArrays ) +{ + arrow::Status status; + + // Create an Arrow double array + std::vector values = { 1.0, 2.0, 3.0, 4.0 }; + std::shared_ptr array; + arrow::DoubleBuilder builder; + status = builder.AppendValues( values ); + status = builder.Finish( &array ); + + // Create a chunked array from the Arrow array + std::shared_ptr chunkedArray 
= std::make_shared( array ); + + // Call the function under test + auto resultVector = RifArrowTools::chunkedArrayToVector( chunkedArray ); + + // Assert that the returned vector contains the expected values + ASSERT_EQ( resultVector.size(), values.size() ); + for ( size_t i = 0; i < values.size(); ++i ) + { + EXPECT_DOUBLE_EQ( resultVector[i], values[i] ); + } + + auto floatVector = RifArrowTools::chunkedArrayToVector( chunkedArray ); + ASSERT_EQ( floatVector.size(), values.size() ); + for ( size_t i = 0; i < values.size(); ++i ) + { + EXPECT_DOUBLE_EQ( floatVector[i], values[i] ); + } +}