mirror of
https://github.com/OPM/ResInsight.git
synced 2025-02-25 18:55:39 -06:00
Use template to convert arrow::ChunkedArray
This commit is contained in:
parent
58b3f0b928
commit
f2d79da8be
@ -18,40 +18,86 @@
|
||||
|
||||
#include "RifArrowTools.h"
|
||||
|
||||
#include "cafAssert.h"
|
||||
#include "RifByteArrayArrowRandomAccessFile.h"
|
||||
#include "RifCsvDataTableFormatter.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
//--------------------------------------------------------------------------------------------------
|
||||
///
|
||||
//--------------------------------------------------------------------------------------------------
|
||||
std::vector<double> RifArrowTools::convertChunkedArrayToStdVector( const std::shared_ptr<arrow::ChunkedArray>& column )
|
||||
QString RifArrowTools::readFirstRowsOfTable( const QByteArray& contents )
|
||||
{
|
||||
auto convertChunkToVector = []( const std::shared_ptr<arrow::Array>& array ) -> std::vector<double>
|
||||
{
|
||||
std::vector<double> result;
|
||||
arrow::MemoryPool* pool = arrow::default_memory_pool();
|
||||
|
||||
auto double_array = std::static_pointer_cast<arrow::DoubleArray>( array );
|
||||
result.resize( double_array->length() );
|
||||
for ( int64_t i = 0; i < double_array->length(); ++i )
|
||||
std::shared_ptr<arrow::io::RandomAccessFile> input = std::make_shared<RifByteArrayArrowRandomAccessFile>( contents );
|
||||
|
||||
// Open Parquet file reader
|
||||
std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
|
||||
if ( !parquet::arrow::OpenFile( input, pool, &arrow_reader ).ok() )
|
||||
{
|
||||
result[i] = double_array->Value( i );
|
||||
return {};
|
||||
}
|
||||
|
||||
return result;
|
||||
};
|
||||
|
||||
CAF_ASSERT( column->type()->id() == arrow::Type::DOUBLE );
|
||||
|
||||
std::vector<double> result;
|
||||
|
||||
// Iterate over each chunk in the column
|
||||
for ( int i = 0; i < column->num_chunks(); ++i )
|
||||
// Read entire file as a single Arrow table
|
||||
std::shared_ptr<arrow::Table> table;
|
||||
if ( !arrow_reader->ReadTable( &table ).ok() )
|
||||
{
|
||||
std::shared_ptr<arrow::Array> chunk = column->chunk( i );
|
||||
std::vector<double> chunk_vector = convertChunkToVector( chunk );
|
||||
result.insert( result.end(), chunk_vector.begin(), chunk_vector.end() );
|
||||
return {};
|
||||
}
|
||||
|
||||
return result;
|
||||
};
|
||||
QString tableText;
|
||||
QTextStream stream( &tableText );
|
||||
RifCsvDataTableFormatter formatter( stream, ";" );
|
||||
|
||||
std::vector<RifTextDataTableColumn> header;
|
||||
for ( std::string columnName : table->ColumnNames() )
|
||||
{
|
||||
header.push_back( RifTextDataTableColumn( QString::fromStdString( columnName ) ) );
|
||||
}
|
||||
|
||||
formatter.header( header );
|
||||
|
||||
std::vector<std::vector<double>> columnVectors;
|
||||
|
||||
for ( std::string columnName : table->ColumnNames() )
|
||||
{
|
||||
std::shared_ptr<arrow::ChunkedArray> column = table->GetColumnByName( columnName );
|
||||
|
||||
auto columnType = column->type()->id();
|
||||
|
||||
if ( columnType == arrow::Type::DOUBLE )
|
||||
{
|
||||
std::vector<double> columnVector = RifArrowTools::chunkedArrayToVector<arrow::DoubleArray, double>( column );
|
||||
columnVectors.push_back( columnVector );
|
||||
}
|
||||
else if ( columnType == arrow::Type::FLOAT )
|
||||
{
|
||||
auto columnVector = RifArrowTools::chunkedArrayToVector<arrow::FloatArray, double>( column );
|
||||
columnVectors.push_back( columnVector );
|
||||
}
|
||||
else if ( columnType == arrow::Type::TIMESTAMP )
|
||||
{
|
||||
auto columnVector = RifArrowTools::chunkedArrayToVector<arrow::Int64Array, double>( column );
|
||||
columnVectors.push_back( columnVector );
|
||||
}
|
||||
}
|
||||
|
||||
if ( columnVectors.empty() )
|
||||
{
|
||||
return {};
|
||||
}
|
||||
|
||||
for ( int i = 0; i < std::min( 20, int( columnVectors[0].size() ) ); i++ )
|
||||
{
|
||||
for ( int j = 0; j < int( columnVectors.size() ); j++ )
|
||||
{
|
||||
formatter.add( columnVectors[j][i] );
|
||||
}
|
||||
formatter.rowCompleted();
|
||||
}
|
||||
|
||||
formatter.tableCompleted();
|
||||
|
||||
return tableText;
|
||||
}
|
||||
|
@ -22,15 +22,50 @@
|
||||
#include <arrow/array/array_primitive.h>
|
||||
#define signals Q_SIGNALS
|
||||
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include <QByteArray>
|
||||
#include <QString>
|
||||
|
||||
//==================================================================================================
|
||||
//
|
||||
//
|
||||
//==================================================================================================
|
||||
class RifArrowTools
|
||||
namespace RifArrowTools
|
||||
{
|
||||
public:
|
||||
static std::vector<double> convertChunkedArrayToStdVector( const std::shared_ptr<arrow::ChunkedArray>& column );
|
||||
};
|
||||
|
||||
// Function template used to handle most of the basic types. Conversion to std::string requires a specialization using chunk->GetString(j).
|
||||
template <typename ArrowArrayType, typename CType>
|
||||
std::vector<CType> chunkedArrayToVector( const std::shared_ptr<arrow::ChunkedArray>& chunkedArray )
|
||||
{
|
||||
static_assert( std::is_base_of<arrow::Array, ArrowArrayType>::value, "ArrowArrayType must be derived from arrow::Array" );
|
||||
|
||||
std::vector<CType> result;
|
||||
for ( int i = 0; i < chunkedArray->num_chunks(); ++i )
|
||||
{
|
||||
auto chunk = std::static_pointer_cast<ArrowArrayType>( chunkedArray->chunk( i ) );
|
||||
|
||||
// Use auto here instead of CType to allow conversion between different types
|
||||
// Use raw_values() to get the raw data pointer for best performance
|
||||
const auto* data = chunk->raw_values();
|
||||
|
||||
for ( int j = 0; j < chunk->length(); ++j )
|
||||
{
|
||||
if ( !chunk->IsNull( j ) )
|
||||
{
|
||||
result.push_back( data[j] );
|
||||
}
|
||||
else
|
||||
{
|
||||
result.push_back( std::numeric_limits<CType>::quiet_NaN() );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
QString readFirstRowsOfTable( const QByteArray& contents );
|
||||
}; // namespace RifArrowTools
|
||||
|
@ -65,7 +65,7 @@ std::pair<cvf::ref<RigOsduWellLogData>, QString> RifOsduWellLogReader::readWellL
|
||||
|
||||
if ( column->type()->id() == arrow::Type::DOUBLE )
|
||||
{
|
||||
std::vector<double> columnVector = RifArrowTools::convertChunkedArrayToStdVector( column );
|
||||
std::vector<double> columnVector = RifArrowTools::chunkedArrayToVector<arrow::DoubleArray, double>( column );
|
||||
logData->setValues( QString::fromStdString( columnName ), columnVector );
|
||||
}
|
||||
}
|
||||
|
@ -143,7 +143,7 @@ std::pair<cvf::ref<RigWellPath>, QString> RifOsduWellPathReader::readWellPathDat
|
||||
|
||||
if ( column->type()->id() == arrow::Type::DOUBLE )
|
||||
{
|
||||
std::vector<double> columnVector = RifArrowTools::convertChunkedArrayToStdVector( column );
|
||||
std::vector<double> columnVector = RifArrowTools::chunkedArrayToVector<arrow::DoubleArray, double>( column );
|
||||
RiaLogging::debug( QString( "Column name: %1. Size: %2" ).arg( QString::fromStdString( columnName ) ).arg( columnVector.size() ) );
|
||||
readValues[columnName] = columnVector;
|
||||
}
|
||||
|
@ -2,7 +2,10 @@
|
||||
|
||||
#include "RiaTestDataDirectory.h"
|
||||
|
||||
#include "RifArrowTools.h"
|
||||
|
||||
#undef signals
|
||||
#include <arrow/array/builder_primitive.h>
|
||||
#include <arrow/csv/api.h>
|
||||
#include <arrow/io/api.h>
|
||||
#include <arrow/scalar.h>
|
||||
@ -51,3 +54,59 @@ TEST( RifParquetReaderTest, ReadValidFile )
|
||||
EXPECT_TRUE( scalar->Equals( arrow::Int64Scalar( i ) ) );
|
||||
}
|
||||
}
|
||||
|
||||
// Convert a single-chunk Int32 column to std::vector<double> and std::vector<int> and verify size and values.
TEST( RifParquetReaderTest, ConvertIntChunkedArrays )
{
    // Check every builder status, so a failure while building the fixture is reported here and not as a
    // misleading size mismatch further down
    arrow::Int32Builder int_builder;
    ASSERT_TRUE( int_builder.Append( 1 ).ok() );
    ASSERT_TRUE( int_builder.Append( 2 ).ok() );
    ASSERT_TRUE( int_builder.Append( 3 ).ok() );

    std::shared_ptr<arrow::Array> int_array;
    ASSERT_TRUE( int_builder.Finish( &int_array ).ok() );

    auto int_chunked_array = std::make_shared<arrow::ChunkedArray>( int_array );

    {
        // Read the chunks as the array type they really are (Int32Array) and widen each value to double.
        // Casting the Int32 chunks to arrow::FloatArray would be an invalid downcast.
        auto columnVector = RifArrowTools::chunkedArrayToVector<arrow::Int32Array, double>( int_chunked_array );
        ASSERT_EQ( columnVector.size(), 3u );
        EXPECT_DOUBLE_EQ( columnVector[0], 1.0 );
        EXPECT_DOUBLE_EQ( columnVector[1], 2.0 );
        EXPECT_DOUBLE_EQ( columnVector[2], 3.0 );
    }
    {
        auto columnVector = RifArrowTools::chunkedArrayToVector<arrow::Int32Array, int>( int_chunked_array );
        ASSERT_EQ( columnVector.size(), 3u );
        EXPECT_EQ( columnVector[0], 1 );
        EXPECT_EQ( columnVector[1], 2 );
        EXPECT_EQ( columnVector[2], 3 );
    }
}
|
||||
|
||||
// Convert a single-chunk Double column to std::vector<double> and std::vector<float> and verify size and values.
TEST( RifParquetReaderTest, ConvertFloatChunkedArrays )
{
    // Create an Arrow double array. The builder statuses are checked so that a broken fixture fails the test
    // at the point where it goes wrong.
    const std::vector<double>     values = { 1.0, 2.0, 3.0, 4.0 };
    std::shared_ptr<arrow::Array> array;
    arrow::DoubleBuilder          builder;
    ASSERT_TRUE( builder.AppendValues( values ).ok() );
    ASSERT_TRUE( builder.Finish( &array ).ok() );

    // Create a chunked array from the Arrow array
    auto chunkedArray = std::make_shared<arrow::ChunkedArray>( array );

    // Same element type in and out
    const auto resultVector = RifArrowTools::chunkedArrayToVector<arrow::DoubleArray, double>( chunkedArray );
    ASSERT_EQ( resultVector.size(), values.size() );
    for ( size_t i = 0; i < values.size(); ++i )
    {
        EXPECT_DOUBLE_EQ( resultVector[i], values[i] );
    }

    // Narrowing conversion double -> float, compared at float precision
    const auto floatVector = RifArrowTools::chunkedArrayToVector<arrow::DoubleArray, float>( chunkedArray );
    ASSERT_EQ( floatVector.size(), values.size() );
    for ( size_t i = 0; i < values.size(); ++i )
    {
        EXPECT_FLOAT_EQ( floatVector[i], static_cast<float>( values[i] ) );
    }
}
|
||||
|
Loading…
Reference in New Issue
Block a user