2020-06-22 11:26:49 -05:00
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project ( OPM ) .
OPM is free software : you can redistribute it and / or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation , either version 3 of the License , or
( at your option ) any later version .
OPM is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
GNU General Public License for more details .
You should have received a copy of the GNU General Public License
along with OPM . If not , see < http : //www.gnu.org/licenses/>.
*/
2020-06-25 11:44:49 -05:00
# include <config.h>
2020-06-22 11:26:49 -05:00
# include <cmath>
# include <sstream>
# include <opm/common/OpmLog/OpmLog.hpp>
# include <opm/common/ErrorMacros.hpp>
2020-07-01 07:50:34 -05:00
# include <dune/common/timer.hh>
2020-06-22 11:26:49 -05:00
# define __CL_ENABLE_EXCEPTIONS
# include <CL/cl.hpp>
# include <opm/simulators/linalg/bda/openclKernels.hpp>
# include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
# include <opm/simulators/linalg/bda/BdaResult.hpp>
# include <opm/simulators/linalg/bda/Reorder.hpp>
// iff true, the nonzeroes of the matrix are copied row-by-row into a contiguous, pinned memory array, then a single GPU memcpy is done
// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
# define COPY_ROW_BY_ROW 0
2020-06-25 07:42:35 -05:00
// Level Scheduling respects the depencies in the original matrix
// Graph Coloring is more aggresive and is likely to change the number of linearizations and linear iterations to converge, but can still be faster on GPU because it results in more parallelism
2020-06-22 11:26:49 -05:00
# define LEVEL_SCHEDULING 1
# define GRAPH_COLORING 0
namespace bda
{
using Opm : : OpmLog ;
2020-07-01 07:50:34 -05:00
using Dune : : Timer ;
2020-06-22 11:26:49 -05:00
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
openclSolverBackend < block_size > : : openclSolverBackend ( int verbosity_ , int maxit_ , double tolerance_ ) : BdaSolver < block_size > ( verbosity_ , maxit_ , tolerance_ ) {
2020-06-22 11:26:49 -05:00
prec = new Preconditioner ( LEVEL_SCHEDULING , GRAPH_COLORING , verbosity_ ) ;
}
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
openclSolverBackend < block_size > : : ~ openclSolverBackend ( ) {
2020-06-22 11:26:49 -05:00
finalize ( ) ;
}
// divide A by B, and round up: return (int)ceil(A/B)
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
unsigned int openclSolverBackend < block_size > : : ceilDivision ( const unsigned int A , const unsigned int B )
2020-06-22 11:26:49 -05:00
{
return A / B + ( A % B > 0 ) ;
}
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
double openclSolverBackend < block_size > : : dot_w ( cl : : Buffer in1 , cl : : Buffer in2 , cl : : Buffer out )
2020-06-22 11:26:49 -05:00
{
const unsigned int work_group_size = 1024 ;
const unsigned int num_work_groups = ceilDivision ( N , work_group_size ) ;
const unsigned int total_work_items = num_work_groups * work_group_size ;
const unsigned int lmem_per_work_group = sizeof ( double ) * work_group_size ;
2020-07-01 07:50:34 -05:00
Timer t_dot ;
2020-06-22 11:26:49 -05:00
cl : : Event event = ( * dot_k ) ( cl : : EnqueueArgs ( * queue , cl : : NDRange ( total_work_items ) , cl : : NDRange ( work_group_size ) ) , in1 , in2 , out , N , cl : : Local ( lmem_per_work_group ) ) ;
queue - > enqueueReadBuffer ( out , CL_TRUE , 0 , sizeof ( double ) * num_work_groups , tmp ) ;
double gpu_sum = 0.0 ;
for ( unsigned int i = 0 ; i < num_work_groups ; + + i ) {
gpu_sum + = tmp [ i ] ;
}
if ( verbosity > = 4 ) {
event . wait ( ) ;
std : : ostringstream oss ;
2020-07-01 07:50:34 -05:00
oss < < std : : scientific < < " openclSolver dot_w time: " < < t_dot . stop ( ) < < " s " ;
2020-06-22 11:26:49 -05:00
OpmLog : : info ( oss . str ( ) ) ;
}
return gpu_sum ;
}
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
double openclSolverBackend < block_size > : : norm_w ( cl : : Buffer in , cl : : Buffer out )
2020-06-22 11:26:49 -05:00
{
const unsigned int work_group_size = 1024 ;
const unsigned int num_work_groups = ceilDivision ( N , work_group_size ) ;
const unsigned int total_work_items = num_work_groups * work_group_size ;
const unsigned int lmem_per_work_group = sizeof ( double ) * work_group_size ;
2020-07-01 07:50:34 -05:00
Timer t_norm ;
2020-06-22 11:26:49 -05:00
cl : : Event event = ( * norm_k ) ( cl : : EnqueueArgs ( * queue , cl : : NDRange ( total_work_items ) , cl : : NDRange ( work_group_size ) ) , in , out , N , cl : : Local ( lmem_per_work_group ) ) ;
queue - > enqueueReadBuffer ( out , CL_TRUE , 0 , sizeof ( double ) * num_work_groups , tmp ) ;
double gpu_norm = 0.0 ;
for ( unsigned int i = 0 ; i < num_work_groups ; + + i ) {
gpu_norm + = tmp [ i ] ;
}
gpu_norm = sqrt ( gpu_norm ) ;
if ( verbosity > = 4 ) {
event . wait ( ) ;
std : : ostringstream oss ;
2020-07-01 07:50:34 -05:00
oss < < std : : scientific < < " openclSolver norm_w time: " < < t_norm . stop ( ) < < " s " ;
2020-06-22 11:26:49 -05:00
OpmLog : : info ( oss . str ( ) ) ;
}
return gpu_norm ;
}
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
void openclSolverBackend < block_size > : : axpy_w ( cl : : Buffer in , const double a , cl : : Buffer out )
2020-06-22 11:26:49 -05:00
{
const unsigned int work_group_size = 32 ;
const unsigned int num_work_groups = ceilDivision ( N , work_group_size ) ;
const unsigned int total_work_items = num_work_groups * work_group_size ;
2020-07-01 07:50:34 -05:00
Timer t_axpy ;
2020-06-22 11:26:49 -05:00
cl : : Event event = ( * axpy_k ) ( cl : : EnqueueArgs ( * queue , cl : : NDRange ( total_work_items ) , cl : : NDRange ( work_group_size ) ) , in , a , out , N ) ;
if ( verbosity > = 4 ) {
event . wait ( ) ;
std : : ostringstream oss ;
2020-07-01 07:50:34 -05:00
oss < < std : : scientific < < " openclSolver axpy_w time: " < < t_axpy . stop ( ) < < " s " ;
2020-06-22 11:26:49 -05:00
OpmLog : : info ( oss . str ( ) ) ;
}
}
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
void openclSolverBackend < block_size > : : custom_w ( cl : : Buffer p , cl : : Buffer v , cl : : Buffer r , const double omega , const double beta )
2020-06-22 11:26:49 -05:00
{
const unsigned int work_group_size = 32 ;
const unsigned int num_work_groups = ceilDivision ( N , work_group_size ) ;
const unsigned int total_work_items = num_work_groups * work_group_size ;
2020-07-01 07:50:34 -05:00
Timer t_custom ;
2020-06-22 11:26:49 -05:00
cl : : Event event = ( * custom_k ) ( cl : : EnqueueArgs ( * queue , cl : : NDRange ( total_work_items ) , cl : : NDRange ( work_group_size ) ) , p , v , r , omega , beta , N ) ;
if ( verbosity > = 4 ) {
event . wait ( ) ;
std : : ostringstream oss ;
2020-07-01 07:50:34 -05:00
oss < < std : : scientific < < " openclSolver custom_w time: " < < t_custom . stop ( ) < < " s " ;
2020-06-22 11:26:49 -05:00
OpmLog : : info ( oss . str ( ) ) ;
}
}
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
void openclSolverBackend < block_size > : : spmv_blocked_w ( cl : : Buffer vals , cl : : Buffer cols , cl : : Buffer rows , cl : : Buffer x , cl : : Buffer b )
2020-06-22 11:26:49 -05:00
{
const unsigned int work_group_size = 32 ;
const unsigned int num_work_groups = ceilDivision ( N , work_group_size ) ;
const unsigned int total_work_items = num_work_groups * work_group_size ;
const unsigned int lmem_per_work_group = sizeof ( double ) * work_group_size ;
2020-07-01 07:50:34 -05:00
Timer t_spmv ;
2020-06-22 11:26:49 -05:00
cl : : Event event = ( * spmv_blocked_k ) ( cl : : EnqueueArgs ( * queue , cl : : NDRange ( total_work_items ) , cl : : NDRange ( work_group_size ) ) , vals , cols , rows , Nb , x , b , block_size , cl : : Local ( lmem_per_work_group ) ) ;
if ( verbosity > = 4 ) {
event . wait ( ) ;
std : : ostringstream oss ;
2020-07-01 07:50:34 -05:00
oss < < std : : scientific < < " openclSolver spmv_blocked_w time: " < < t_spmv . stop ( ) < < " s " ;
2020-06-22 11:26:49 -05:00
OpmLog : : info ( oss . str ( ) ) ;
}
}
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
void openclSolverBackend < block_size > : : gpu_pbicgstab ( WellContributions & wellContribs , BdaResult & res ) {
2020-06-22 11:26:49 -05:00
float it ;
double rho , rhop , beta , alpha , omega , tmp1 , tmp2 ;
double norm , norm_0 ;
2020-07-01 07:50:34 -05:00
Timer t_total , t_prec ( false ) , t_spmv ( false ) , t_well ( false ) , t_rest ( false ) ;
2020-06-22 11:26:49 -05:00
wellContribs . setOpenCLQueue ( queue . get ( ) ) ;
wellContribs . setReordering ( toOrder , true ) ;
// set r to the initial residual
// if initial x guess is not 0, must call applyblockedscaleadd(), not implemented
//applyblockedscaleadd(-1.0, mat, x, r);
// set initial values
cl : : Event event ;
queue - > enqueueFillBuffer ( d_p , 0 , 0 , sizeof ( double ) * N ) ;
queue - > enqueueFillBuffer ( d_v , 0 , 0 , sizeof ( double ) * N , nullptr , & event ) ;
event . wait ( ) ;
rho = 1.0 ;
alpha = 1.0 ;
omega = 1.0 ;
queue - > enqueueCopyBuffer ( d_b , d_r , 0 , 0 , sizeof ( double ) * N , nullptr , & event ) ;
event . wait ( ) ;
queue - > enqueueCopyBuffer ( d_r , d_rw , 0 , 0 , sizeof ( double ) * N , nullptr , & event ) ;
event . wait ( ) ;
queue - > enqueueCopyBuffer ( d_r , d_p , 0 , 0 , sizeof ( double ) * N , nullptr , & event ) ;
event . wait ( ) ;
norm = norm_w ( d_r , d_tmp ) ;
norm_0 = norm ;
if ( verbosity > 1 ) {
std : : ostringstream out ;
out < < std : : scientific < < " openclSolver initial norm: " < < norm_0 ;
OpmLog : : info ( out . str ( ) ) ;
}
2020-07-01 07:50:34 -05:00
t_rest . start ( ) ;
2020-06-22 11:26:49 -05:00
for ( it = 0.5 ; it < maxit ; it + = 0.5 ) {
rhop = rho ;
rho = dot_w ( d_rw , d_r , d_tmp ) ;
if ( it > 1 ) {
beta = ( rho / rhop ) * ( alpha / omega ) ;
custom_w ( d_p , d_v , d_r , omega , beta ) ;
}
2020-07-01 07:50:34 -05:00
t_rest . stop ( ) ;
2020-06-22 11:26:49 -05:00
// pw = prec(p)
2020-07-01 07:50:34 -05:00
t_prec . start ( ) ;
2020-06-22 11:26:49 -05:00
prec - > apply ( d_p , d_pw ) ;
2020-07-01 07:50:34 -05:00
t_prec . stop ( ) ;
2020-06-22 11:26:49 -05:00
// v = A * pw
2020-07-01 07:50:34 -05:00
t_spmv . start ( ) ;
2020-06-22 11:26:49 -05:00
spmv_blocked_w ( d_Avals , d_Acols , d_Arows , d_pw , d_v ) ;
2020-07-01 07:50:34 -05:00
t_spmv . stop ( ) ;
2020-06-22 11:26:49 -05:00
// apply wellContributions
if ( wellContribs . getNumWells ( ) > 0 ) {
2020-07-01 07:50:34 -05:00
t_well . start ( ) ;
2020-06-22 11:26:49 -05:00
wellContribs . apply ( d_pw , d_v ) ;
2020-07-01 07:50:34 -05:00
t_well . stop ( ) ;
2020-06-22 11:26:49 -05:00
}
2020-07-01 07:50:34 -05:00
t_rest . start ( ) ;
2020-06-22 11:26:49 -05:00
tmp1 = dot_w ( d_rw , d_v , d_tmp ) ;
alpha = rho / tmp1 ;
axpy_w ( d_v , - alpha , d_r ) ; // r = r - alpha * v
axpy_w ( d_pw , alpha , d_x ) ; // x = x + alpha * pw
norm = norm_w ( d_r , d_tmp ) ;
2020-07-01 07:50:34 -05:00
t_rest . stop ( ) ;
2020-06-22 11:26:49 -05:00
if ( norm < tolerance * norm_0 ) {
break ;
}
it + = 0.5 ;
// s = prec(r)
2020-07-01 07:50:34 -05:00
t_prec . start ( ) ;
2020-06-22 11:26:49 -05:00
prec - > apply ( d_r , d_s ) ;
2020-07-01 07:50:34 -05:00
t_prec . stop ( ) ;
2020-06-22 11:26:49 -05:00
// t = A * s
2020-07-01 07:50:34 -05:00
t_spmv . start ( ) ;
2020-06-22 11:26:49 -05:00
spmv_blocked_w ( d_Avals , d_Acols , d_Arows , d_s , d_t ) ;
2020-07-01 07:50:34 -05:00
t_spmv . stop ( ) ;
2020-06-22 11:26:49 -05:00
// apply wellContributions
if ( wellContribs . getNumWells ( ) > 0 ) {
2020-07-01 07:50:34 -05:00
t_well . start ( ) ;
2020-06-22 11:26:49 -05:00
wellContribs . apply ( d_s , d_t ) ;
2020-07-01 07:50:34 -05:00
t_well . stop ( ) ;
2020-06-22 11:26:49 -05:00
}
2020-07-01 07:50:34 -05:00
t_rest . start ( ) ;
2020-06-22 11:26:49 -05:00
tmp1 = dot_w ( d_t , d_r , d_tmp ) ;
tmp2 = dot_w ( d_t , d_t , d_tmp ) ;
omega = tmp1 / tmp2 ;
axpy_w ( d_s , omega , d_x ) ; // x = x + omega * s
axpy_w ( d_t , - omega , d_r ) ; // r = r - omega * t
norm = norm_w ( d_r , d_tmp ) ;
2020-07-01 07:50:34 -05:00
t_rest . stop ( ) ;
2020-06-22 11:26:49 -05:00
if ( norm < tolerance * norm_0 ) {
break ;
}
if ( verbosity > 1 ) {
std : : ostringstream out ;
out < < " it: " < < it < < std : : scientific < < " , norm: " < < norm ;
OpmLog : : info ( out . str ( ) ) ;
}
}
res . iterations = std : : min ( it , ( float ) maxit ) ;
res . reduction = norm / norm_0 ;
res . conv_rate = static_cast < double > ( pow ( res . reduction , 1.0 / it ) ) ;
2020-07-01 07:50:34 -05:00
res . elapsed = t_total . stop ( ) ;
2020-06-22 11:26:49 -05:00
res . converged = ( it ! = ( maxit + 0.5 ) ) ;
if ( verbosity > 0 ) {
std : : ostringstream out ;
out < < " === converged: " < < res . converged < < " , conv_rate: " < < res . conv_rate < < " , time: " < < res . elapsed < < \
" , time per iteration: " < < res . elapsed / it < < " , iterations: " < < it ;
OpmLog : : info ( out . str ( ) ) ;
}
if ( verbosity > = 4 ) {
std : : ostringstream out ;
out < < " openclSolver::ily_apply: " < < prec_time < < " s \n " ;
out < < " openclSolver::spmv: " < < spmv_time < < " s \n " ;
out < < " openclSolver::rest: " < < rest_time < < " s \n " ;
out < < " openclSolver::total_solve: " < < res . elapsed < < " s \n " ;
OpmLog : : info ( out . str ( ) ) ;
}
}
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
void openclSolverBackend < block_size > : : initialize ( int N_ , int nnz_ , int dim , double * vals , int * rows , int * cols ) {
2020-06-22 11:26:49 -05:00
this - > N = N_ ;
this - > nnz = nnz_ ;
this - > nnzb = nnz_ / block_size / block_size ;
Nb = ( N + dim - 1 ) / dim ;
std : : ostringstream out ;
out < < " Initializing GPU, matrix size: " < < N < < " blocks, nnzb: " < < nnzb < < " \n " ;
out < < " Maxit: " < < maxit < < std : : scientific < < " , tolerance: " < < tolerance < < " \n " ;
OpmLog : : info ( out . str ( ) ) ;
out . str ( " " ) ;
out . clear ( ) ;
int deviceID = 0 ;
cl_int err = CL_SUCCESS ;
try {
std : : vector < cl : : Platform > platforms ;
cl : : Platform : : get ( & platforms ) ;
if ( platforms . size ( ) = = 0 )
{
OPM_THROW ( std : : logic_error , " Error openclSolver is selected but no OpenCL platforms are found " ) ;
}
out < < " Found " < < platforms . size ( ) < < " OpenCL platforms " < < " \n \n " ;
if ( verbosity > = 1 ) {
std : : string platform_info ;
for ( unsigned int platformID = 0 ; platformID < platforms . size ( ) ; + + platformID ) {
platforms [ platformID ] . getInfo ( CL_PLATFORM_NAME , & platform_info ) ;
out < < " Platform name : " < < platform_info < < " \n " ;
platforms [ platformID ] . getInfo ( CL_PLATFORM_VENDOR , & platform_info ) ;
out < < " Platform vendor : " < < platform_info < < " \n " ;
platforms [ platformID ] . getInfo ( CL_PLATFORM_VERSION , & platform_info ) ;
out < < " Platform version : " < < platform_info < < " \n " ;
platforms [ platformID ] . getInfo ( CL_PLATFORM_PROFILE , & platform_info ) ;
out < < " Platform profile : " < < platform_info < < " \n " ;
platforms [ platformID ] . getInfo ( CL_PLATFORM_EXTENSIONS , & platform_info ) ;
out < < " Platform extensions: " < < platform_info < < " \n \n " ;
}
}
OpmLog : : info ( out . str ( ) ) ;
out . str ( " " ) ;
out . clear ( ) ;
cl_context_properties properties [ ] = { CL_CONTEXT_PLATFORM , ( cl_context_properties ) ( platforms [ deviceID ] ) ( ) , 0 } ;
context . reset ( new cl : : Context ( CL_DEVICE_TYPE_GPU , properties ) ) ;
std : : vector < cl : : Device > devices = context - > getInfo < CL_CONTEXT_DEVICES > ( ) ;
if ( devices . size ( ) = = 0 ) {
OPM_THROW ( std : : logic_error , " Error openclSolver is selected but no OpenCL devices are found " ) ;
}
out < < " Found " < < devices . size ( ) < < " OpenCL devices " < < " \n \n " ;
if ( verbosity > = 1 ) {
for ( unsigned int i = 0 ; i < devices . size ( ) ; + + i ) {
std : : string device_info ;
std : : vector < size_t > work_sizes ;
std : : vector < cl_device_partition_property > partitions ;
devices [ i ] . getInfo ( CL_DEVICE_NAME , & device_info ) ;
out < < " CL_DEVICE_NAME : " < < device_info < < " \n " ;
devices [ i ] . getInfo ( CL_DEVICE_VENDOR , & device_info ) ;
out < < " CL_DEVICE_VENDOR : " < < device_info < < " \n " ;
devices [ i ] . getInfo ( CL_DRIVER_VERSION , & device_info ) ;
out < < " CL_DRIVER_VERSION : " < < device_info < < " \n " ;
devices [ i ] . getInfo ( CL_DEVICE_BUILT_IN_KERNELS , & device_info ) ;
out < < " CL_DEVICE_BUILT_IN_KERNELS: " < < device_info < < " \n " ;
devices [ i ] . getInfo ( CL_DEVICE_VERSION , & device_info ) ;
out < < " CL_DEVICE_VERSION : " < < device_info < < " \n " ;
devices [ i ] . getInfo ( CL_DEVICE_PROFILE , & device_info ) ;
out < < " CL_DEVICE_PROFILE : " < < device_info < < " \n " ;
devices [ i ] . getInfo ( CL_DEVICE_OPENCL_C_VERSION , & device_info ) ;
out < < " CL_DEVICE_OPENCL_C_VERSION: " < < device_info < < " \n " ;
devices [ i ] . getInfo ( CL_DEVICE_EXTENSIONS , & device_info ) ;
out < < " CL_DEVICE_EXTENSIONS : " < < device_info < < " \n " ;
devices [ i ] . getInfo ( CL_DEVICE_MAX_WORK_ITEM_SIZES , & work_sizes ) ;
for ( unsigned int j = 0 ; j < work_sizes . size ( ) ; + + j ) {
out < < " CL_DEVICE_MAX_WORK_ITEM_SIZES[ " < < j < < " ]: " < < work_sizes [ j ] < < " \n " ;
}
devices [ i ] . getInfo ( CL_DEVICE_PARTITION_PROPERTIES , & partitions ) ;
for ( unsigned int j = 0 ; j < partitions . size ( ) ; + + j ) {
out < < " CL_DEVICE_PARTITION_PROPERTIES[ " < < j < < " ]: " < < partitions [ j ] < < " \n " ;
}
partitions . clear ( ) ;
devices [ i ] . getInfo ( CL_DEVICE_PARTITION_TYPE , & partitions ) ;
for ( unsigned int j = 0 ; j < partitions . size ( ) ; + + j ) {
out < < " CL_DEVICE_PARTITION_PROPERTIES[ " < < j < < " ]: " < < partitions [ j ] < < " \n " ;
}
// C-style properties
cl_device_id tmp_id = devices [ i ] ( ) ;
cl_ulong size ;
clGetDeviceInfo ( tmp_id , CL_DEVICE_LOCAL_MEM_SIZE , sizeof ( cl_ulong ) , & size , 0 ) ;
out < < " CL_DEVICE_LOCAL_MEM_SIZE : " < < size / 1024 < < " KB \n " ;
clGetDeviceInfo ( tmp_id , CL_DEVICE_GLOBAL_MEM_SIZE , sizeof ( cl_ulong ) , & size , 0 ) ;
out < < " CL_DEVICE_GLOBAL_MEM_SIZE : " < < size / 1024 / 1024 / 1024 < < " GB \n " ;
clGetDeviceInfo ( tmp_id , CL_DEVICE_MAX_COMPUTE_UNITS , sizeof ( cl_ulong ) , & size , 0 ) ;
out < < " CL_DEVICE_MAX_COMPUTE_UNITS : " < < size < < " \n " ;
clGetDeviceInfo ( tmp_id , CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof ( cl_ulong ) , & size , 0 ) ;
out < < " CL_DEVICE_MAX_MEM_ALLOC_SIZE : " < < size / 1024 / 1024 < < " MB \n " ;
clGetDeviceInfo ( tmp_id , CL_DEVICE_MAX_WORK_GROUP_SIZE , sizeof ( cl_ulong ) , & size , 0 ) ;
out < < " CL_DEVICE_MAX_WORK_GROUP_SIZE : " < < size < < " \n " ;
clGetDeviceInfo ( tmp_id , CL_DEVICE_GLOBAL_MEM_SIZE , sizeof ( cl_ulong ) , & size , 0 ) ;
out < < " CL_DEVICE_GLOBAL_MEM_SIZE : " < < size / 1024 / 1024 / 1024 < < " GB \n \n " ;
}
}
OpmLog : : info ( out . str ( ) ) ;
cl : : Program : : Sources source ( 1 , std : : make_pair ( kernel_1 , strlen ( kernel_1 ) ) ) ; // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
source . emplace_back ( std : : make_pair ( kernel_2 , strlen ( kernel_2 ) ) ) ;
source . emplace_back ( std : : make_pair ( axpy_s , strlen ( axpy_s ) ) ) ;
source . emplace_back ( std : : make_pair ( dot_1_s , strlen ( dot_1_s ) ) ) ;
source . emplace_back ( std : : make_pair ( norm_s , strlen ( norm_s ) ) ) ;
source . emplace_back ( std : : make_pair ( custom_s , strlen ( custom_s ) ) ) ;
source . emplace_back ( std : : make_pair ( spmv_blocked_s , strlen ( spmv_blocked_s ) ) ) ;
source . emplace_back ( std : : make_pair ( ILU_apply1_s , strlen ( ILU_apply1_s ) ) ) ;
source . emplace_back ( std : : make_pair ( ILU_apply2_s , strlen ( ILU_apply2_s ) ) ) ;
cl : : Program program_ = cl : : Program ( * context , source ) ;
program_ . build ( devices ) ;
cl : : Event event ;
queue . reset ( new cl : : CommandQueue ( * context , devices [ 0 ] , 0 , & err ) ) ;
prec - > setOpenCLContext ( context . get ( ) ) ;
prec - > setOpenCLQueue ( queue . get ( ) ) ;
rb = new double [ N ] ;
tmp = new double [ N ] ;
# if COPY_ROW_BY_ROW
vals_contiguous = new double [ N ] ;
# endif
2020-06-25 04:51:41 -05:00
mat = new BlockedMatrix ( ) ;
2020-06-22 11:26:49 -05:00
mat - > Nb = Nb ;
mat - > nnzbs = nnzb ;
2020-06-25 04:51:41 -05:00
mat - > nnzValues = vals ;
2020-06-22 11:26:49 -05:00
mat - > colIndices = cols ;
mat - > rowPointers = rows ;
d_x = cl : : Buffer ( * context , CL_MEM_READ_WRITE , sizeof ( double ) * N ) ;
d_b = cl : : Buffer ( * context , CL_MEM_READ_WRITE , sizeof ( double ) * N ) ;
d_rb = cl : : Buffer ( * context , CL_MEM_READ_WRITE , sizeof ( double ) * N ) ;
d_r = cl : : Buffer ( * context , CL_MEM_READ_WRITE , sizeof ( double ) * N ) ;
d_rw = cl : : Buffer ( * context , CL_MEM_READ_WRITE , sizeof ( double ) * N ) ;
d_p = cl : : Buffer ( * context , CL_MEM_READ_WRITE , sizeof ( double ) * N ) ;
d_pw = cl : : Buffer ( * context , CL_MEM_READ_WRITE , sizeof ( double ) * N ) ;
d_s = cl : : Buffer ( * context , CL_MEM_READ_WRITE , sizeof ( double ) * N ) ;
d_t = cl : : Buffer ( * context , CL_MEM_READ_WRITE , sizeof ( double ) * N ) ;
d_v = cl : : Buffer ( * context , CL_MEM_READ_WRITE , sizeof ( double ) * N ) ;
d_tmp = cl : : Buffer ( * context , CL_MEM_READ_WRITE , sizeof ( double ) * N ) ;
d_Avals = cl : : Buffer ( * context , CL_MEM_READ_WRITE , sizeof ( double ) * nnz ) ;
d_Acols = cl : : Buffer ( * context , CL_MEM_READ_WRITE , sizeof ( int ) * nnzb ) ;
d_Arows = cl : : Buffer ( * context , CL_MEM_READ_WRITE , sizeof ( int ) * ( Nb + 1 ) ) ;
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
// actually creating the kernels
dot_k . reset ( new cl : : make_kernel < cl : : Buffer & , cl : : Buffer & , cl : : Buffer & , const unsigned int , cl : : LocalSpaceArg > ( cl : : Kernel ( program_ , " dot_1 " ) ) ) ;
norm_k . reset ( new cl : : make_kernel < cl : : Buffer & , cl : : Buffer & , const unsigned int , cl : : LocalSpaceArg > ( cl : : Kernel ( program_ , " norm " ) ) ) ;
axpy_k . reset ( new cl : : make_kernel < cl : : Buffer & , const double , cl : : Buffer & , const unsigned int > ( cl : : Kernel ( program_ , " axpy " ) ) ) ;
custom_k . reset ( new cl : : make_kernel < cl : : Buffer & , cl : : Buffer & , cl : : Buffer & , const double , const double , const unsigned int > ( cl : : Kernel ( program_ , " custom " ) ) ) ;
spmv_blocked_k . reset ( new cl : : make_kernel < cl : : Buffer & , cl : : Buffer & , cl : : Buffer & , const unsigned int , cl : : Buffer & , cl : : Buffer & , const unsigned int , cl : : LocalSpaceArg > ( cl : : Kernel ( program_ , " spmv_blocked " ) ) ) ;
ILU_apply1_k . reset ( new cl : : make_kernel < cl : : Buffer & , cl : : Buffer & , cl : : Buffer & , const unsigned int , cl : : Buffer & , cl : : Buffer & , cl : : Buffer & , const unsigned int , const unsigned int , cl : : LocalSpaceArg > ( cl : : Kernel ( program_ , " ILU_apply1 " ) ) ) ;
ILU_apply2_k . reset ( new cl : : make_kernel < cl : : Buffer & , cl : : Buffer & , cl : : Buffer & , const unsigned int , cl : : Buffer & , cl : : Buffer & , cl : : Buffer & , const unsigned int , const unsigned int , cl : : LocalSpaceArg > ( cl : : Kernel ( program_ , " ILU_apply2 " ) ) ) ;
prec - > setKernels ( ILU_apply1_k . get ( ) , ILU_apply2_k . get ( ) ) ;
} catch ( cl : : Error error ) {
std : : ostringstream oss ;
oss < < " OpenCL Error: " < < error . what ( ) < < " ( " < < error . err ( ) < < " ) " ;
OpmLog : : error ( oss . str ( ) ) ;
}
initialized = true ;
} // end initialize()
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
void openclSolverBackend < block_size > : : finalize ( ) {
2020-06-25 04:51:41 -05:00
delete mat ;
2020-06-22 11:26:49 -05:00
delete [ ] rb ;
delete [ ] tmp ;
# if COPY_ROW_BY_ROW
delete [ ] vals_contiguous ;
# endif
} // end finalize()
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
void openclSolverBackend < block_size > : : copy_system_to_gpu ( ) {
2020-07-01 07:50:34 -05:00
Timer t ;
2020-06-22 11:26:49 -05:00
cl : : Event event ;
# if COPY_ROW_BY_ROW
int sum = 0 ;
for ( int i = 0 ; i < Nb ; + + i ) {
int size_row = rmat - > rowPointers [ i + 1 ] - rmat - > rowPointers [ i ] ;
memcpy ( vals_contiguous + sum , reinterpret_cast < double * > ( rmat - > nnzValues ) + sum , size_row * sizeof ( double ) * block_size * block_size ) ;
sum + = size_row * block_size * block_size ;
}
queue - > enqueueWriteBuffer ( d_Avals , CL_TRUE , 0 , sizeof ( double ) * nnz , vals_contiguous ) ;
# else
queue - > enqueueWriteBuffer ( d_Avals , CL_TRUE , 0 , sizeof ( double ) * nnz , rmat - > nnzValues ) ;
# endif
queue - > enqueueWriteBuffer ( d_Acols , CL_TRUE , 0 , sizeof ( int ) * nnzb , rmat - > colIndices ) ;
queue - > enqueueWriteBuffer ( d_Arows , CL_TRUE , 0 , sizeof ( int ) * ( Nb + 1 ) , rmat - > rowPointers ) ;
queue - > enqueueWriteBuffer ( d_b , CL_TRUE , 0 , sizeof ( double ) * N , rb ) ;
queue - > enqueueFillBuffer ( d_x , 0 , 0 , sizeof ( double ) * N , nullptr , & event ) ;
event . wait ( ) ;
if ( verbosity > 2 ) {
std : : ostringstream out ;
2020-07-01 07:50:34 -05:00
out < < " openclSolver::copy_system_to_gpu(): " < < t . stop ( ) < < " s " ;
2020-06-22 11:26:49 -05:00
OpmLog : : info ( out . str ( ) ) ;
}
} // end copy_system_to_gpu()
// don't copy rowpointers and colindices, they stay the same
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
void openclSolverBackend < block_size > : : update_system_on_gpu ( ) {
2020-07-01 07:50:34 -05:00
Timer t ;
2020-06-22 11:26:49 -05:00
cl : : Event event ;
# if COPY_ROW_BY_ROW
int sum = 0 ;
for ( int i = 0 ; i < Nb ; + + i ) {
int size_row = rmat - > rowPointers [ i + 1 ] - rmat - > rowPointers [ i ] ;
memcpy ( vals_contiguous + sum , reinterpret_cast < double * > ( rmat - > nnzValues ) + sum , size_row * sizeof ( double ) * block_size * block_size ) ;
sum + = size_row * block_size * block_size ;
}
queue - > enqueueWriteBuffer ( d_Avals , CL_TRUE , 0 , sizeof ( double ) * nnz , vals_contiguous ) ;
# else
queue - > enqueueWriteBuffer ( d_Avals , CL_TRUE , 0 , sizeof ( double ) * nnz , rmat - > nnzValues ) ;
# endif
queue - > enqueueWriteBuffer ( d_b , CL_TRUE , 0 , sizeof ( double ) * N , rb ) ;
queue - > enqueueFillBuffer ( d_x , 0 , 0 , sizeof ( double ) * N , nullptr , & event ) ;
event . wait ( ) ;
if ( verbosity > 2 ) {
std : : ostringstream out ;
2020-07-01 07:50:34 -05:00
out < < " openclSolver::update_system_on_gpu(): " < < t . stop ( ) < < " s " ;
2020-06-22 11:26:49 -05:00
OpmLog : : info ( out . str ( ) ) ;
}
} // end update_system_on_gpu()
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
bool openclSolverBackend < block_size > : : analyse_matrix ( ) {
2020-07-01 07:50:34 -05:00
Timer t ;
2020-06-22 11:26:49 -05:00
2020-06-24 13:09:14 -05:00
bool success = prec - > init ( mat ) ;
2020-06-22 11:26:49 -05:00
int work_group_size = 32 ;
int num_work_groups = ceilDivision ( N , work_group_size ) ;
int total_work_items = num_work_groups * work_group_size ;
int lmem_per_work_group = work_group_size * sizeof ( double ) ;
prec - > setKernelParameters ( work_group_size , total_work_items , lmem_per_work_group ) ;
toOrder = prec - > getToOrder ( ) ;
fromOrder = prec - > getFromOrder ( ) ;
rmat = prec - > getRMat ( ) ;
if ( verbosity > 2 ) {
std : : ostringstream out ;
2020-07-01 07:50:34 -05:00
out < < " openclSolver::analyse_matrix(): " < < t . stop ( ) < < " s " ;
2020-06-22 11:26:49 -05:00
OpmLog : : info ( out . str ( ) ) ;
}
analysis_done = true ;
return success ;
} // end analyse_matrix()
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
void openclSolverBackend < block_size > : : update_system ( double * vals , double * b ) {
2020-07-01 07:50:34 -05:00
Timer t ;
2020-06-22 11:26:49 -05:00
2020-06-25 04:51:41 -05:00
mat - > nnzValues = vals ;
blocked_reorder_vector_by_pattern < block_size > ( mat - > Nb , b , fromOrder , rb ) ;
2020-06-22 11:26:49 -05:00
if ( verbosity > 2 ) {
std : : ostringstream out ;
2020-07-01 07:50:34 -05:00
out < < " openclSolver::update_system(): " < < t . stop ( ) < < " s " ;
2020-06-22 11:26:49 -05:00
OpmLog : : info ( out . str ( ) ) ;
}
} // end update_system()
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
bool openclSolverBackend < block_size > : : create_preconditioner ( ) {
2020-07-01 07:50:34 -05:00
Timer t ;
2020-06-22 11:26:49 -05:00
bool result = prec - > create_preconditioner ( mat ) ;
if ( verbosity > 2 ) {
std : : ostringstream out ;
2020-07-01 07:50:34 -05:00
out < < " openclSolver::create_preconditioner(): " < < t . stop ( ) < < " s " ;
2020-06-22 11:26:49 -05:00
OpmLog : : info ( out . str ( ) ) ;
}
return result ;
} // end create_preconditioner()
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
void openclSolverBackend < block_size > : : solve_system ( WellContributions & wellContribs , BdaResult & res ) {
2020-07-01 07:50:34 -05:00
Timer t ;
2020-06-22 11:26:49 -05:00
2020-07-01 07:50:34 -05:00
// actually solve
2020-06-22 11:26:49 -05:00
gpu_pbicgstab ( wellContribs , res ) ;
if ( verbosity > 2 ) {
std : : ostringstream out ;
2020-07-01 07:50:34 -05:00
out < < " openclSolver::solve_system(): " < < t . stop ( ) < < " s " ;
2020-06-22 11:26:49 -05:00
OpmLog : : info ( out . str ( ) ) ;
}
} // end solve_system()
// copy result to host memory
// caller must be sure that x is a valid array
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
void openclSolverBackend < block_size > : : get_result ( double * x ) {
2020-07-01 07:50:34 -05:00
Timer t ;
2020-06-22 11:26:49 -05:00
queue - > enqueueReadBuffer ( d_x , CL_TRUE , 0 , sizeof ( double ) * N , rb ) ;
2020-06-25 04:51:41 -05:00
blocked_reorder_vector_by_pattern < block_size > ( mat - > Nb , rb , toOrder , x ) ;
2020-06-22 11:26:49 -05:00
if ( verbosity > 2 ) {
std : : ostringstream out ;
2020-07-01 07:50:34 -05:00
out < < " openclSolver::get_result(): " < < t . stop ( ) < < " s " ;
2020-06-22 11:26:49 -05:00
OpmLog : : info ( out . str ( ) ) ;
}
} // end get_result()
2020-06-24 13:09:14 -05:00
typedef BdaSolverStatus : : Status Status ;
2020-06-22 11:26:49 -05:00
2020-06-24 13:09:14 -05:00
template < unsigned int block_size >
Status openclSolverBackend < block_size > : : solve_system ( int N_ , int nnz_ , int dim , double * vals , int * rows , int * cols , double * b , WellContributions & wellContribs , BdaResult & res ) {
2020-06-22 11:26:49 -05:00
if ( initialized = = false ) {
initialize ( N_ , nnz_ , dim , vals , rows , cols ) ;
if ( analysis_done = = false ) {
if ( ! analyse_matrix ( ) ) {
2020-06-24 13:09:14 -05:00
return Status : : BDA_SOLVER_ANALYSIS_FAILED ;
2020-06-22 11:26:49 -05:00
}
}
update_system ( vals , b ) ;
if ( ! create_preconditioner ( ) ) {
2020-06-24 13:09:14 -05:00
return Status : : BDA_SOLVER_CREATE_PRECONDITIONER_FAILED ;
2020-06-22 11:26:49 -05:00
}
copy_system_to_gpu ( ) ;
} else {
update_system ( vals , b ) ;
if ( ! create_preconditioner ( ) ) {
2020-06-24 13:09:14 -05:00
return Status : : BDA_SOLVER_CREATE_PRECONDITIONER_FAILED ;
2020-06-22 11:26:49 -05:00
}
update_system_on_gpu ( ) ;
}
solve_system ( wellContribs , res ) ;
2020-06-24 13:09:14 -05:00
return Status : : BDA_SOLVER_SUCCESS ;
2020-06-22 11:26:49 -05:00
}
2020-06-24 13:09:14 -05:00
# define INSTANTIATE_BDA_FUNCTIONS(n) \
template openclSolverBackend < n > : : openclSolverBackend ( int , int , double ) ; \
INSTANTIATE_BDA_FUNCTIONS ( 1 ) ;
INSTANTIATE_BDA_FUNCTIONS ( 2 ) ;
INSTANTIATE_BDA_FUNCTIONS ( 3 ) ;
INSTANTIATE_BDA_FUNCTIONS ( 4 ) ;
# undef INSTANTIATE_BDA_FUNCTIONS
} // namespace bda
2020-06-22 11:26:49 -05:00