/*
  Copyright 2020 Equinor ASA

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/

#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>

#include <dune/common/timer.hh>

#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>

namespace bda
{
using Opm::OpmLog;

// if PARALLEL is 0:
//   Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially.
//   Each block in a row gets 1 workitem, all blocks are expected to be processed simultaneously,
//   except when the number of blocks in that row exceeds the number of workitems per workgroup.
//   In that case some workitems will process multiple blocks sequentially.
// else:
//   Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially.
//   Each block in a row gets a warp of 32 workitems, of which 9 are always active.
//   Multiple blocks can be processed in parallel if a workgroup contains multiple warps.
//   If the number of blocks exceeds the number of warps, some warps will process multiple blocks sequentially.
// Notes:
//   PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32.
//   PARALLEL 1 should be run with at least 32 workitems per workgroup.
//   The recommended number of workgroups for both options is Nb, which gives every row its own workgroup.
//   PARALLEL 0 is generally faster, even though it does not parallelize the work within a block.
// Only 3x3 blocks are supported.
// An example launch configuration is sketched after the define below.

#define PARALLEL 0
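// Illustrative launch sketch only; these values mirror the ones chosen in ChowPatelIlu::decomposition() below:
//   PARALLEL 0: Nb workgroups of 16 workitems -> global size Nb * 16, local size 16
//   PARALLEL 1: Nb workgroups of 32 workitems -> global size Nb * 32, local size 32
// i.e. the kernel is enqueued with
//   cl::EnqueueArgs(queue, cl::NDRange(Nb * work_group_size), cl::NDRange(work_group_size))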
#if PARALLEL

inline const char* chow_patel_ilu_sweep_s = R"(

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

// subtract blocks: a = a - b * c
// the output block has 9 entries, each entry is calculated by 1 thread
void blockMultSub(
    __local double *restrict a,
    __global const double *restrict b,
    __global const double *restrict c)
{
    const unsigned int block_size = 3;
    const unsigned int warp_size = 32;
    const unsigned int idx_t = get_local_id(0);                 // thread id in work group
    const unsigned int thread_id_in_warp = idx_t % warp_size;   // thread id in warp (32 threads)
    if (thread_id_in_warp < block_size * block_size) {
        const unsigned int row = thread_id_in_warp / block_size;
        const unsigned int col = thread_id_in_warp % block_size;
        double temp = 0.0;
        for (unsigned int k = 0; k < block_size; k++) {
            temp += b[block_size * row + k] * c[block_size * k + col];
        }
        a[block_size * row + col] -= temp;
    }
}

// multiply blocks: resMat = mat1 * mat2
// the output block has 9 entries, each entry is calculated by 1 thread
void blockMult(
    __local const double *restrict mat1,
    __local const double *restrict mat2,
    __global double *restrict resMat)
{
    const unsigned int block_size = 3;
    const unsigned int warp_size = 32;
    const unsigned int idx_t = get_local_id(0);                 // thread id in work group
    const unsigned int thread_id_in_warp = idx_t % warp_size;   // thread id in warp (32 threads)
    if (thread_id_in_warp < block_size * block_size) {
        const unsigned int row = thread_id_in_warp / block_size;
        const unsigned int col = thread_id_in_warp % block_size;
        double temp = 0.0;
        for (unsigned int k = 0; k < block_size; k++) {
            temp += mat1[block_size * row + k] * mat2[block_size * k + col];
        }
        resMat[block_size * row + col] = temp;
    }
}

// invert block: inverse = matrix^{-1}
// the output block has 9 entries, each entry is calculated by 1 thread
void invert(
    __global const double *restrict matrix,
    __local double *restrict inverse)
{
    const unsigned int block_size = 3;
    const unsigned int bs = block_size;                         // rename to shorter name
    const unsigned int warp_size = 32;
    const unsigned int idx_t = get_local_id(0);                 // thread id in work group
    const unsigned int thread_id_in_warp = idx_t % warp_size;   // thread id in warp (32 threads)
    if (thread_id_in_warp < block_size * block_size) {
        // code generated by maple, copied from Dune::DenseMatrix
        double t4  = matrix[0] * matrix[4];
        double t6  = matrix[0] * matrix[5];
        double t8  = matrix[1] * matrix[3];
        double t10 = matrix[2] * matrix[3];
        double t12 = matrix[1] * matrix[6];
        double t14 = matrix[2] * matrix[6];
        double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
                      t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
        double t17 = 1.0 / det;
        const unsigned int r = thread_id_in_warp / block_size;
        const unsigned int c = thread_id_in_warp % block_size;
        const unsigned int r1 = (r + 1) % bs;
        const unsigned int c1 = (c + 1) % bs;
        const unsigned int r2 = (r + bs - 1) % bs;
        const unsigned int c2 = (c + bs - 1) % bs;
        inverse[c * bs + r] = ((matrix[r1 * bs + c1] * matrix[r2 * bs + c2]) - (matrix[r1 * bs + c2] * matrix[r2 * bs + c1])) * t17;
    }
}
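// Note on invert() above: each of the 9 active workitems computes one cofactor
// C(r,c) = matrix(r1,c1)*matrix(r2,c2) - matrix(r1,c2)*matrix(r2,c1), where the cyclic
// offsets (r+1, r+2, c+1, c+2 mod 3) make the sign come out right, and stores it at
// inverse(c,r), i.e. transposed, so that inverse = adj(matrix) / det.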
// perform the fixed-point iteration
// all entries in L and U are updated once
// output is written to [LU]tmp
// aij and ujj are local arrays whose size is specified before kernel launch
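// One sweep updates each nonzero block once, reading only values from the previous sweep:
//   Uij_new = Aij - sum_{k<i} Lik * Ukj
//   Lij_new = (Aij - sum_{k<j} Lik * Ukj) * inverse(Ujj)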
__kernel void chow_patel_ilu_sweep(
    __global const double *restrict Ut_vals,
    __global const double *restrict L_vals,
    __global const double *restrict LU_vals,
    __global const int *restrict Ut_rows,
    __global const int *restrict L_rows,
    __global const int *restrict LU_rows,
    __global const int *restrict Ut_cols,
    __global const int *restrict L_cols,
    __global const int *restrict LU_cols,
    __global double *restrict Ltmp,
    __global double *restrict Utmp,
    const int Nb,
    __local double *aij,
    __local double *ujj)
{
    const int bs = 3;
    const unsigned int warp_size = 32;
    const unsigned int work_group_size = get_local_size(0);
    const unsigned int idx_b = get_global_id(0) / work_group_size;
    const unsigned int num_groups = get_num_groups(0);
    const unsigned int warps_per_group = work_group_size / warp_size;
    const unsigned int idx_t = get_local_id(0);                 // thread id in work group
    const unsigned int thread_id_in_warp = idx_t % warp_size;   // thread id in warp (32 threads)
    const unsigned int warp_id_in_group = idx_t / warp_size;
    const unsigned int lmem_offset = warp_id_in_group * bs * bs;   // each workgroup gets some lmem, but the workitems have to share it
                                                                    // every workitem in a warp has the same lmem_offset

    // for every row of L or every col of U
    for (int row = idx_b; row < Nb; row += num_groups) {

        // Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
        int jColStart = Ut_rows[row];       // actually colPointers to U
        int jColEnd = Ut_rows[row + 1];

        // for every block on this column
        for (int ij = jColStart + warp_id_in_group; ij < jColEnd; ij += warps_per_group) {
            int rowU1 = Ut_cols[ij];        // actually rowIndices for U

            // refine Uij element (or diagonal)
            int i1 = LU_rows[rowU1];
            int i2 = LU_rows[rowU1 + 1];
            int kk = 0;

            // LUmat->nnzValues[kk] is block Aij
            for (kk = i1; kk < i2; ++kk) {
                int c = LU_cols[kk];
                if (c >= row) {
                    break;
                }
            }

            // copy block Aij so operations can be done on it without affecting LUmat
            if (thread_id_in_warp < bs * bs) {
                aij[lmem_offset + thread_id_in_warp] = LU_vals[kk * bs * bs + thread_id_in_warp];
            }

            int jk = L_rows[rowU1];         // points to row rowU1 in L
            // if row rowU1 is empty: skip row. The whole warp looks at the same row, so no divergence
            if (jk < L_rows[rowU1 + 1]) {
                int colL = L_cols[jk];
                // only check until block U(i,j) is reached
                for (int k = jColStart; k < ij; ++k) {
                    int rowU2 = Ut_cols[k]; // actually rowIndices for U
                    while (colL < rowU2) {
                        ++jk;               // check next block on row rowU1 of L
                        colL = L_cols[jk];
                    }
                    if (colL == rowU2) {
                        // Aij -= (Lik * Ukj)
                        blockMultSub(aij + lmem_offset, L_vals + jk * bs * bs, Ut_vals + k * bs * bs);
                    }
                }
            }

            // Uij_new = Aij - sum
            // write result of this sweep
            if (thread_id_in_warp < bs * bs) {
                Utmp[ij * bs * bs + thread_id_in_warp] = aij[lmem_offset + thread_id_in_warp];
            }
        }

        // update L
        // Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
        int iRowStart = L_rows[row];
        int iRowEnd = L_rows[row + 1];

        for (int ij = iRowStart + warp_id_in_group; ij < iRowEnd; ij += warps_per_group) {
            int j = L_cols[ij];

            // refine Lij element
            int i1 = LU_rows[row];
            int i2 = LU_rows[row + 1];
            int kk = 0;

            // LUmat->nnzValues[kk] is block Aij
            for (kk = i1; kk < i2; ++kk) {
                int c = LU_cols[kk];
                if (c >= j) {
                    break;
                }
            }

            // copy block Aij so operations can be done on it without affecting LUmat
            if (thread_id_in_warp < bs * bs) {
                aij[lmem_offset + thread_id_in_warp] = LU_vals[kk * bs * bs + thread_id_in_warp];
            }

            int jk = Ut_rows[j];            // actually colPointers, jk points to col j in U
            int rowU = Ut_cols[jk];         // actually rowIndices, rowU is the row of block jk

            // only check until block L(i,j) is reached
            for (int k = iRowStart; k < ij; ++k) {
                int colL = L_cols[k];
                while (rowU < colL) {
                    ++jk;                   // check next block on col j of U
                    rowU = Ut_cols[jk];
                }
                if (rowU == colL) {
                    // Aij -= (Lik * Ukj)
                    blockMultSub(aij + lmem_offset, L_vals + k * bs * bs, Ut_vals + jk * bs * bs);
                }
            }

            // calculate 1 / Ujj
            invert(Ut_vals + (Ut_rows[j + 1] - 1) * bs * bs, ujj + lmem_offset);

            // Lij_new = (Aij - sum) / Ujj
            // write result of this sweep
            blockMult(aij + lmem_offset, ujj + lmem_offset, Ltmp + ij * bs * bs);
        }
    }
}
)";
#else

inline const char* chow_patel_ilu_sweep_s = R"(

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

// subtract blocks: a = a - b * c
// only one workitem performs this action
void blockMultSub(
    __local double *restrict a,
    __global const double *restrict b,
    __global const double *restrict c)
{
    const unsigned int block_size = 3;
    for (unsigned int row = 0; row < block_size; row++) {
        for (unsigned int col = 0; col < block_size; col++) {
            double temp = 0.0;
            for (unsigned int k = 0; k < block_size; k++) {
                temp += b[block_size * row + k] * c[block_size * k + col];
            }
            a[block_size * row + col] -= temp;
        }
    }
}

// multiply blocks: resMat = mat1 * mat2
// only one workitem performs this action
void blockMult(
    __local const double *restrict mat1,
    __local const double *restrict mat2,
    __global double *restrict resMat)
{
    const unsigned int block_size = 3;
    for (unsigned int row = 0; row < block_size; row++) {
        for (unsigned int col = 0; col < block_size; col++) {
            double temp = 0.0;
            for (unsigned int k = 0; k < block_size; k++) {
                temp += mat1[block_size * row + k] * mat2[block_size * k + col];
            }
            resMat[block_size * row + col] = temp;
        }
    }
}

// invert block: inverse = matrix^{-1}
// only one workitem performs this action
__kernel void inverter(
    __global const double *restrict matrix,
    __local double *restrict inverse)
{
    // code generated by maple, copied from Dune::DenseMatrix
    double t4  = matrix[0] * matrix[4];
    double t6  = matrix[0] * matrix[5];
    double t8  = matrix[1] * matrix[3];
    double t10 = matrix[2] * matrix[3];
    double t12 = matrix[1] * matrix[6];
    double t14 = matrix[2] * matrix[6];
    double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
                  t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
    double t17 = 1.0 / det;
    inverse[0] =  (matrix[4] * matrix[8] - matrix[5] * matrix[7]) * t17;
    inverse[1] = -(matrix[1] * matrix[8] - matrix[2] * matrix[7]) * t17;
    inverse[2] =  (matrix[1] * matrix[5] - matrix[2] * matrix[4]) * t17;
    inverse[3] = -(matrix[3] * matrix[8] - matrix[5] * matrix[6]) * t17;
    inverse[4] =  (matrix[0] * matrix[8] - t14) * t17;
    inverse[5] = -(t6 - t10) * t17;
    inverse[6] =  (matrix[3] * matrix[7] - matrix[4] * matrix[6]) * t17;
    inverse[7] = -(matrix[0] * matrix[7] - t12) * t17;
    inverse[8] =  (t4 - t8) * t17;
}
// perform the fixed-point iteration
// all entries in L and U are updated once
// output is written to [LU]tmp
// aij and ujj are local arrays whose size is specified before kernel launch
__kernel void chow_patel_ilu_sweep(
    __global const double *restrict Ut_vals,
    __global const double *restrict L_vals,
    __global const double *restrict LU_vals,
    __global const int *restrict Ut_rows,
    __global const int *restrict L_rows,
    __global const int *restrict LU_rows,
    __global const int *restrict Ut_cols,
    __global const int *restrict L_cols,
    __global const int *restrict LU_cols,
    __global double *restrict Ltmp,
    __global double *restrict Utmp,
    const int Nb,
    __local double *aij,
    __local double *ujj)
{
    const int bs = 3;
    const unsigned int warp_size = 32;
    const unsigned int work_group_size = get_local_size(0);
    const unsigned int idx_b = get_global_id(0) / work_group_size;
    const unsigned int num_groups = get_num_groups(0);
    const unsigned int warps_per_group = work_group_size / warp_size;
    const unsigned int idx_t = get_local_id(0);                 // thread id in work group
    const unsigned int thread_id_in_warp = idx_t % warp_size;   // thread id in warp (32 threads)
    const unsigned int warp_id_in_group = idx_t / warp_size;

    // for every row of L or every col of U
    for (int row = idx_b; row < Nb; row += num_groups) {

        // Uij = (Aij - sum k=1 to i-1 {Lik*Ukj})
        int jColStart = Ut_rows[row];       // actually colPointers to U
        int jColEnd = Ut_rows[row + 1];

        // for every block on this column
        for (int ij = jColStart + idx_t; ij < jColEnd; ij += work_group_size) {
            int rowU1 = Ut_cols[ij];        // actually rowIndices for U

            // refine Uij element (or diagonal)
            int i1 = LU_rows[rowU1];
            int i2 = LU_rows[rowU1 + 1];
            int kk = 0;

            // LUmat->nnzValues[kk] is block Aij
            for (kk = i1; kk < i2; ++kk) {
                int c = LU_cols[kk];
                if (c >= row) {
                    break;
                }
            }

            // copy block Aij so operations can be done on it without affecting LUmat
            for (int z = 0; z < bs * bs; ++z) {
                aij[idx_t * bs * bs + z] = LU_vals[kk * bs * bs + z];
            }

            int jk = L_rows[rowU1];
            // if row rowU1 is empty: do not sum. The workitems have different rowU1 values, divergence is possible
            int colL = (jk < L_rows[rowU1 + 1]) ? L_cols[jk] : Nb;

            // only check until block U(i,j) is reached
            for (int k = jColStart; k < ij; ++k) {
                int rowU2 = Ut_cols[k];     // actually rowIndices for U
                while (colL < rowU2) {
                    ++jk;                   // check next block on row rowU1 of L
                    colL = L_cols[jk];
                }
                if (colL == rowU2) {
                    // Aij -= (Lik * Ukj)
                    blockMultSub(aij + idx_t * bs * bs, L_vals + jk * bs * bs, Ut_vals + k * bs * bs);
                }
            }

            // Uij_new = Aij - sum
            // write result of this sweep
            for (int z = 0; z < bs * bs; ++z) {
                Utmp[ij * bs * bs + z] = aij[idx_t * bs * bs + z];
            }
        }

        // update L
        // Lij = (Aij - sum k=1 to j-1 {Lik*Ukj}) / Ujj
        int iRowStart = L_rows[row];
        int iRowEnd = L_rows[row + 1];

        for (int ij = iRowStart + idx_t; ij < iRowEnd; ij += work_group_size) {
            int j = L_cols[ij];

            // refine Lij element
            int i1 = LU_rows[row];
            int i2 = LU_rows[row + 1];
            int kk = 0;

            // LUmat->nnzValues[kk] is block Aij
            for (kk = i1; kk < i2; ++kk) {
                int c = LU_cols[kk];
                if (c >= j) {
                    break;
                }
            }

            // copy block Aij so operations can be done on it without affecting LUmat
            for (int z = 0; z < bs * bs; ++z) {
                aij[idx_t * bs * bs + z] = LU_vals[kk * bs * bs + z];
            }

            int jk = Ut_rows[j];            // actually colPointers, jk points to col j in U
            int rowU = Ut_cols[jk];         // actually rowIndices, rowU is the row of block jk

            // only check until block L(i,j) is reached
            for (int k = iRowStart; k < ij; ++k) {
                int colL = L_cols[k];
                while (rowU < colL) {
                    ++jk;                   // check next block on col j of U
                    rowU = Ut_cols[jk];
                }
                if (rowU == colL) {
                    // Aij -= (Lik * Ukj)
                    blockMultSub(aij + idx_t * bs * bs, L_vals + k * bs * bs, Ut_vals + jk * bs * bs);
                }
            }

            // calculate 1 / Ujj
            inverter(Ut_vals + (Ut_rows[j + 1] - 1) * bs * bs, ujj + idx_t * bs * bs);

            // Lij_new = (Aij - sum) / Ujj
            // write result of this sweep
            blockMult(aij + idx_t * bs * bs, ujj + idx_t * bs * bs, Ltmp + ij * bs * bs);
        }
    }
}
)";

#endif
void ChowPatelIlu::decomposition(
    cl::CommandQueue *queue, cl::Context *context,
    int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs,
    int *L_rows, int *L_cols, double *L_vals, int L_nnzbs,
    int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs,
    int Nb, int num_sweeps, int verbosity)
{
    const int block_size = 3;

    try {
        // just put everything in the capture list
        std::call_once(initialize_flag, [&]() {
            // cl::Program::Sources is a std::vector<std::pair<const char*, size_t>>;
            // the '1' constructs it with a single (source string, length) entry
            cl::Program::Sources source(1, std::make_pair(chow_patel_ilu_sweep_s, strlen(chow_patel_ilu_sweep_s)));
            cl::Program program = cl::Program(*context, source, &err);
            if (err != CL_SUCCESS) {
                OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL could not create Program");
            }

            std::vector<cl::Device> devices = context->getInfo<CL_CONTEXT_DEVICES>();
            program.build(devices);

            chow_patel_ilu_sweep_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                             cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                             cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                             cl::Buffer&, cl::Buffer&,
                                                             const int, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "chow_patel_ilu_sweep", &err)));
            if (err != CL_SUCCESS) {
                OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL could not create Kernel");
            }

            // allocate GPU memory
            d_Ut_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size);
            d_L_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * L_nnzbs * block_size * block_size);
            d_LU_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * LU_nnzbs * block_size * block_size);
            d_Ut_ptrs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
            d_L_rows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
            d_LU_rows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
            d_Ut_idxs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Ut_nnzbs);
            d_L_cols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * L_nnzbs);
            d_LU_cols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LU_nnzbs);
            d_Ltmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * L_nnzbs * block_size * block_size);
            d_Utmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size);

            // copy sparsity pattern to GPU (done only once)
            Dune::Timer t_copy_pattern;
            events.resize(6);
            err |= queue->enqueueWriteBuffer(d_Ut_ptrs, CL_FALSE, 0, sizeof(int) * (Nb + 1), Ut_ptrs, nullptr, &events[0]);
            err |= queue->enqueueWriteBuffer(d_L_rows, CL_FALSE, 0, sizeof(int) * (Nb + 1), L_rows, nullptr, &events[1]);
            err |= queue->enqueueWriteBuffer(d_LU_rows, CL_FALSE, 0, sizeof(int) * (Nb + 1), LU_rows, nullptr, &events[2]);
            err |= queue->enqueueWriteBuffer(d_Ut_idxs, CL_FALSE, 0, sizeof(int) * Ut_nnzbs, Ut_idxs, nullptr, &events[3]);
            err |= queue->enqueueWriteBuffer(d_L_cols, CL_FALSE, 0, sizeof(int) * L_nnzbs, L_cols, nullptr, &events[4]);
            err |= queue->enqueueWriteBuffer(d_LU_cols, CL_FALSE, 0, sizeof(int) * LU_nnzbs, LU_cols, nullptr, &events[5]);
            cl::WaitForEvents(events);
            events.clear();

            if (verbosity >= 4) {
                std::ostringstream out;
                out << "ChowPatelIlu copy sparsity pattern time: " << t_copy_pattern.stop() << " s";
                OpmLog::info(out.str());
            }

            std::ostringstream out;
            out << "ChowPatelIlu PARALLEL: " << PARALLEL;
            OpmLog::info(out.str());
        });
        // copy to GPU
        Dune::Timer t_copy1;
        events.resize(3);
        err = queue->enqueueWriteBuffer(d_Ut_vals, CL_FALSE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals, nullptr, &events[0]);
        err |= queue->enqueueWriteBuffer(d_L_vals, CL_FALSE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals, nullptr, &events[1]);
        err |= queue->enqueueWriteBuffer(d_LU_vals, CL_FALSE, 0, sizeof(double) * LU_nnzbs * block_size * block_size, LU_vals, nullptr, &events[2]);
        cl::WaitForEvents(events);
        events.clear();

        if (verbosity >= 4) {
            std::ostringstream out;
            out << "ChowPatelIlu copy1 time: " << t_copy1.stop() << " s";
            OpmLog::info(out.str());
        }
        if (err != CL_SUCCESS) {
            // enqueueWriteBuffer returns a C-style error code instead of throwing like most of the C++ OpenCL API
            OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL enqueueWriteBuffer error");
        }

        // call kernel
        for (int sweep = 0; sweep < num_sweeps; ++sweep) {
            // normally, L_vals and Ltmp are swapped after the sweep is done
            // these conditionals implement that without actually swapping pointers
            // 1st sweep reads X_vals, writes to Xtmp
            // 2nd sweep reads Xtmp, writes to X_vals
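            // example with num_sweeps == 2: sweep 0 reads d_L_vals/d_Ut_vals and writes d_Ltmp/d_Utmp,
            // sweep 1 reads d_Ltmp/d_Utmp and writes d_L_vals/d_Ut_vals, so the result is read back
            // from d_L_vals/d_Ut_vals (the even-num_sweeps branch of the copy-back below)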
            auto *Larg1 = (sweep % 2 == 0) ? &d_L_vals : &d_Ltmp;
            auto *Larg2 = (sweep % 2 == 0) ? &d_Ltmp : &d_L_vals;
            auto *Uarg1 = (sweep % 2 == 0) ? &d_Ut_vals : &d_Utmp;
            auto *Uarg2 = (sweep % 2 == 0) ? &d_Utmp : &d_Ut_vals;
            int num_work_groups = Nb;
#if PARALLEL
            int work_group_size = 32;
#else
            int work_group_size = 16;
#endif
            int total_work_items = num_work_groups * work_group_size;
            int lmem_per_work_group = work_group_size * block_size * block_size * sizeof(double);
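            // e.g. for PARALLEL 0: 16 * 3 * 3 * 8 = 1152 bytes per workgroup, room for one 3x3 block
            // of doubles per workitem (PARALLEL 1: 32 * 3 * 3 * 8 = 2304 bytes)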
            Dune::Timer t_kernel;
            event = (*chow_patel_ilu_sweep_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
                                              *Uarg1, *Larg1, d_LU_vals,
                                              d_Ut_ptrs, d_L_rows, d_LU_rows,
                                              d_Ut_idxs, d_L_cols, d_LU_cols,
                                              *Larg2, *Uarg2, Nb, cl::Local(lmem_per_work_group), cl::Local(lmem_per_work_group));
            event.wait();

            if (verbosity >= 4) {
                std::ostringstream out;
                out << "ChowPatelIlu sweep kernel time: " << t_kernel.stop() << " s";
                OpmLog::info(out.str());
            }
        }

        // copy back
        Dune::Timer t_copy2;
        events.resize(2);
        if (num_sweeps % 2 == 0) {
            err = queue->enqueueReadBuffer(d_Ut_vals, CL_FALSE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals, nullptr, &events[0]);
            err |= queue->enqueueReadBuffer(d_L_vals, CL_FALSE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals, nullptr, &events[1]);
        } else {
            err = queue->enqueueReadBuffer(d_Utmp, CL_FALSE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals, nullptr, &events[0]);
            err |= queue->enqueueReadBuffer(d_Ltmp, CL_FALSE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals, nullptr, &events[1]);
        }
        cl::WaitForEvents(events);
        events.clear();

        if (verbosity >= 4) {
            std::ostringstream out;
            out << "ChowPatelIlu copy2 time: " << t_copy2.stop() << " s";
            OpmLog::info(out.str());
        }
        if (err != CL_SUCCESS) {
            // enqueueReadBuffer returns a C-style error code instead of throwing like most of the C++ OpenCL API
            OPM_THROW(std::logic_error, "ChowPatelIlu OpenCL enqueueReadBuffer error");
        }
    } catch (const cl::Error& error) {
        std::ostringstream oss;
        oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
        oss << getErrorString(error.err()) << std::endl;
        // rethrow exception
        OPM_THROW(std::logic_error, oss.str());
    } catch (const std::logic_error& error) {
        // rethrow exceptions thrown by OPM_THROW in the try block
        throw error;
    }
}
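// Hypothetical call sketch (argument names mirror the parameters above and are illustrative only,
// not taken from the actual callers):
//
//   ChowPatelIlu decomp;
//   decomp.decomposition(&queue, &context,
//                        Ut_ptrs, Ut_idxs, Ut_vals, Ut_nnzbs,   // U, stored transposed (column-wise)
//                        L_rows, L_cols, L_vals, L_nnzbs,       // L, stored row-wise
//                        LU_rows, LU_cols, LU_vals, LU_nnzbs,   // A on the combined L+U sparsity pattern
//                        Nb, num_sweeps, verbosity);
//
// On return, Ut_vals and L_vals hold the factors produced by the last sweep.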
} // end namespace bda