mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
571 lines
22 KiB
C++
571 lines
22 KiB
C++
|
/*
|
||
|
Copyright 2020 Equinor ASA
|
||
|
|
||
|
This file is part of the Open Porous Media project (OPM).
|
||
|
|
||
|
OPM is free software: you can redistribute it and/or modify
|
||
|
it under the terms of the GNU General Public License as published by
|
||
|
the Free Software Foundation, either version 3 of the License, or
|
||
|
(at your option) any later version.
|
||
|
|
||
|
OPM is distributed in the hope that it will be useful,
|
||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
GNU General Public License for more details.
|
||
|
|
||
|
You should have received a copy of the GNU General Public License
|
||
|
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||
|
*/
|
||
|
|
||
|
|
||
|
#include <opm/common/OpmLog/OpmLog.hpp>
|
||
|
#include <opm/common/ErrorMacros.hpp>
|
||
|
#include <dune/common/timer.hh>
|
||
|
|
||
|
#include <opm/simulators/linalg/bda/fgpilu.hpp>
|
||
|
|
||
|
namespace bda
|
||
|
{
|
||
|
|
||
|
using Opm::OpmLog;
|
||
|
|
||
|
// if PARALLEL is 0:
|
||
|
// Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially.
|
||
|
// Each block in a row gets 1 workitem, all blocks are expected to be processed simultaneously,
|
||
|
// except when the number of blocks in that row exceeds the number of workitems per workgroup.
|
||
|
// In that case some workitems will process multiple blocks sequentially.
|
||
|
// else:
|
||
|
// Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially
|
||
|
// Each block in a row gets a warp of 32 workitems, of which 9 are always active.
|
||
|
// Multiple blocks can be processed in parallel if a workgroup contains multiple warps.
|
||
|
// If the number of blocks exceeds the number of warps, some warps will process multiple blocks sequentially.
|
||
|
|
||
|
// Notes:
|
||
|
// PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32.
|
||
|
// PARALLEL 1 should be run with at least 32 workitems per workgroup.
|
||
|
// The recommended number of workgroups for both options is Nb, which gives every row their own workgroup.
|
||
|
// PARALLEL 0 is generally faster, despite not having parallelization.
|
||
|
|
||
|
#define PARALLEL 0
|
||
|
|
||
|
#if PARALLEL
|
||
|
|
||
|
inline const char* fgpilu_sweep_s = R"(
|
||
|
|
||
|
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||
|
|
||
|
void blockMultSub(
|
||
|
__local double * restrict a,
|
||
|
__global const double * restrict b,
|
||
|
__global const double * restrict c)
|
||
|
{
|
||
|
const unsigned int block_size = 3;
|
||
|
const unsigned int warp_size = 32;
|
||
|
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||
|
const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads)
|
||
|
if(thread_id_in_warp < block_size * block_size){
|
||
|
const unsigned int row = thread_id_in_warp / block_size;
|
||
|
const unsigned int col = thread_id_in_warp % block_size;
|
||
|
double temp = 0.0;
|
||
|
for (unsigned int k = 0; k < block_size; k++) {
|
||
|
temp += b[block_size * row + k] * c[block_size * k + col];
|
||
|
}
|
||
|
a[block_size * row + col] -= temp;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
void blockMult(
|
||
|
__local const double * restrict mat1,
|
||
|
__local const double * restrict mat2,
|
||
|
__global double * restrict resMat)
|
||
|
{
|
||
|
const unsigned int block_size = 3;
|
||
|
const unsigned int warp_size = 32;
|
||
|
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||
|
const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads)
|
||
|
if(thread_id_in_warp < block_size * block_size){
|
||
|
const unsigned int row = thread_id_in_warp / block_size;
|
||
|
const unsigned int col = thread_id_in_warp % block_size;
|
||
|
double temp = 0.0;
|
||
|
for (unsigned int k = 0; k < block_size; k++) {
|
||
|
temp += mat1[block_size * row + k] * mat2[block_size * k + col];
|
||
|
}
|
||
|
resMat[block_size * row + col] = temp;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
void invert(
|
||
|
__global const double * restrict matrix,
|
||
|
__local double * restrict inverse)
|
||
|
{
|
||
|
const unsigned int block_size = 3;
|
||
|
const unsigned int bs = block_size;
|
||
|
const unsigned int warp_size = 32;
|
||
|
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||
|
const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads)
|
||
|
if(thread_id_in_warp < block_size * block_size){
|
||
|
// code generated by maple, copied from Dune::DenseMatrix
|
||
|
double t4 = matrix[0] * matrix[4];
|
||
|
double t6 = matrix[0] * matrix[5];
|
||
|
double t8 = matrix[1] * matrix[3];
|
||
|
double t10 = matrix[2] * matrix[3];
|
||
|
double t12 = matrix[1] * matrix[6];
|
||
|
double t14 = matrix[2] * matrix[6];
|
||
|
|
||
|
double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
|
||
|
t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
|
||
|
double t17 = 1.0 / det;
|
||
|
|
||
|
const unsigned int r = thread_id_in_warp / block_size;
|
||
|
const unsigned int c = thread_id_in_warp % block_size;
|
||
|
const unsigned int r1 = (r+1) % bs;
|
||
|
const unsigned int c1 = (c+1) % bs;
|
||
|
const unsigned int r2 = (r+bs-1) % bs;
|
||
|
const unsigned int c2 = (c+bs-1) % bs;
|
||
|
inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
__kernel void fgpilu_sweep(
|
||
|
__global const double * restrict Ut_vals,
|
||
|
__global const double * restrict L_vals,
|
||
|
__global const double * restrict LU_vals,
|
||
|
__global const int * restrict Ut_rows,
|
||
|
__global const int * restrict L_rows,
|
||
|
__global const int * restrict LU_rows,
|
||
|
__global const int * restrict Ut_cols,
|
||
|
__global const int * restrict L_cols,
|
||
|
__global const int * restrict LU_cols,
|
||
|
__global double * restrict Ltmp,
|
||
|
__global double * restrict Utmp,
|
||
|
const int Nb,
|
||
|
__local double *aij,
|
||
|
__local double *ujj)
|
||
|
{
|
||
|
const int bs = 3;
|
||
|
|
||
|
// for every row
|
||
|
const unsigned int warp_size = 32;
|
||
|
const unsigned int bsize = get_local_size(0);
|
||
|
const unsigned int idx_b = get_global_id(0) / bsize;
|
||
|
const unsigned int num_groups = get_num_groups(0);
|
||
|
const unsigned int warps_per_group = bsize / warp_size;
|
||
|
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||
|
const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads)
|
||
|
const unsigned int warp_id_in_group = idx_t / warp_size;
|
||
|
const unsigned int lmem_offset = warp_id_in_group * bs * bs; // each workgroup gets some lmem, but the workitems have to share it
|
||
|
for (int row = idx_b; row < Nb; row+=num_groups) {
|
||
|
int jColStart = Ut_rows[row];
|
||
|
int jColEnd = Ut_rows[row + 1];
|
||
|
for (int ij = jColStart + warp_id_in_group; ij < jColEnd; ij+=warps_per_group) {
|
||
|
int col = Ut_cols[ij];
|
||
|
// refine Uij element (or diagonal)
|
||
|
int i1 = LU_rows[col];
|
||
|
int i2 = LU_rows[col+1];
|
||
|
int kk = 0;
|
||
|
for(kk = i1; kk < i2; ++kk) {
|
||
|
int c = LU_cols[kk];
|
||
|
if (c >= row) {
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(thread_id_in_warp < bs*bs){
|
||
|
aij[lmem_offset+thread_id_in_warp] = LU_vals[kk*bs*bs + thread_id_in_warp];
|
||
|
}
|
||
|
|
||
|
int jk = L_rows[col];
|
||
|
int ik = (jk < L_rows[col+1]) ? L_cols[jk] : Nb;
|
||
|
|
||
|
for (int k = jColStart; k < ij; ++k) {
|
||
|
int ki = Ut_cols[k];
|
||
|
while (ik < ki) {
|
||
|
++jk;
|
||
|
ik = L_cols[jk];
|
||
|
}
|
||
|
if (ik == ki) {
|
||
|
blockMultSub(aij+lmem_offset, L_vals + jk * bs * bs, Ut_vals + k * bs * bs);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(thread_id_in_warp < bs*bs){
|
||
|
Utmp[ij*bs*bs + thread_id_in_warp] = aij[lmem_offset + thread_id_in_warp];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// update L
|
||
|
int iRowStart = L_rows[row];
|
||
|
int iRowEnd = L_rows[row + 1];
|
||
|
|
||
|
for (int ij = iRowStart + warp_id_in_group; ij < iRowEnd; ij+=warps_per_group) {
|
||
|
// for (int ij = iRowStart + idx_t; ij < iRowEnd; ij+=bsize) {
|
||
|
int j = L_cols[ij];
|
||
|
// // refine Lij element
|
||
|
int i1 = LU_rows[row];
|
||
|
int i2 = LU_rows[row+1];
|
||
|
int kk = 0;
|
||
|
for(kk = i1; kk < i2; ++kk) {
|
||
|
int c = LU_cols[kk];
|
||
|
if (c >= j) {
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(thread_id_in_warp < bs*bs){
|
||
|
aij[lmem_offset+thread_id_in_warp] = LU_vals[kk*bs*bs + thread_id_in_warp];
|
||
|
}
|
||
|
|
||
|
int jk = Ut_rows[j];
|
||
|
int ik = Ut_cols[jk];
|
||
|
for (int k = iRowStart; k < ij; ++k) {
|
||
|
int ki = L_cols[k];
|
||
|
while(ik < ki) {
|
||
|
++jk;
|
||
|
ik = Ut_cols[jk];
|
||
|
}
|
||
|
|
||
|
if(ik == ki) {
|
||
|
blockMultSub(aij+lmem_offset, L_vals + k * bs * bs , Ut_vals + jk * bs * bs);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// calculate aij / ujj
|
||
|
invert(Ut_vals + (Ut_rows[j+1] - 1) * bs * bs, ujj+lmem_offset);
|
||
|
|
||
|
// lij = aij / ujj
|
||
|
blockMult(aij+lmem_offset, ujj+lmem_offset, Ltmp + ij * bs * bs);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
)";
|
||
|
|
||
|
#else
|
||
|
|
||
|
inline const char* fgpilu_sweep_s = R"(
|
||
|
|
||
|
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||
|
|
||
|
void blockMultSub(
|
||
|
__local double * restrict a,
|
||
|
__global const double * restrict b,
|
||
|
__global const double * restrict c)
|
||
|
{
|
||
|
const unsigned int block_size = 3;
|
||
|
for (unsigned int row = 0; row < block_size; row++) {
|
||
|
for (unsigned int col = 0; col < block_size; col++) {
|
||
|
double temp = 0.0;
|
||
|
for (unsigned int k = 0; k < block_size; k++) {
|
||
|
temp += b[block_size * row + k] * c[block_size * k + col];
|
||
|
}
|
||
|
a[block_size * row + col] -= temp;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
void blockMult(
|
||
|
__local const double * restrict mat1,
|
||
|
__local const double * restrict mat2,
|
||
|
__global double * restrict resMat)
|
||
|
{
|
||
|
const unsigned int block_size = 3;
|
||
|
for (unsigned int row = 0; row < block_size; row++) {
|
||
|
for (unsigned int col = 0; col < block_size; col++) {
|
||
|
double temp = 0.0;
|
||
|
for (unsigned int k = 0; k < block_size; k++) {
|
||
|
temp += mat1[block_size * row + k] * mat2[block_size * k + col];
|
||
|
}
|
||
|
resMat[block_size * row + col] = temp;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
__kernel void inverter(
|
||
|
__global const double * restrict matrix,
|
||
|
__local double * restrict inverse)
|
||
|
{
|
||
|
// code generated by maple, copied from Dune::DenseMatrix
|
||
|
double t4 = matrix[0] * matrix[4];
|
||
|
double t6 = matrix[0] * matrix[5];
|
||
|
double t8 = matrix[1] * matrix[3];
|
||
|
double t10 = matrix[2] * matrix[3];
|
||
|
double t12 = matrix[1] * matrix[6];
|
||
|
double t14 = matrix[2] * matrix[6];
|
||
|
|
||
|
double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
|
||
|
t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
|
||
|
double t17 = 1.0 / det;
|
||
|
|
||
|
inverse[0] = (matrix[4] * matrix[8] - matrix[5] * matrix[7]) * t17;
|
||
|
inverse[1] = -(matrix[1] * matrix[8] - matrix[2] * matrix[7]) * t17;
|
||
|
inverse[2] = (matrix[1] * matrix[5] - matrix[2] * matrix[4]) * t17;
|
||
|
inverse[3] = -(matrix[3] * matrix[8] - matrix[5] * matrix[6]) * t17;
|
||
|
inverse[4] = (matrix[0] * matrix[8] - t14) * t17;
|
||
|
inverse[5] = -(t6 - t10) * t17;
|
||
|
inverse[6] = (matrix[3] * matrix[7] - matrix[4] * matrix[6]) * t17;
|
||
|
inverse[7] = -(matrix[0] * matrix[7] - t12) * t17;
|
||
|
inverse[8] = (t4 - t8) * t17;
|
||
|
}
|
||
|
|
||
|
__kernel void fgpilu_sweep(
|
||
|
__global const double * restrict Ut_vals,
|
||
|
__global const double * restrict L_vals,
|
||
|
__global const double * restrict LU_vals,
|
||
|
__global const int * restrict Ut_rows,
|
||
|
__global const int * restrict L_rows,
|
||
|
__global const int * restrict LU_rows,
|
||
|
__global const int * restrict Ut_cols,
|
||
|
__global const int * restrict L_cols,
|
||
|
__global const int * restrict LU_cols,
|
||
|
__global double * restrict Ltmp,
|
||
|
__global double * restrict Utmp,
|
||
|
const int Nb,
|
||
|
__local double *aij,
|
||
|
__local double *ujj)
|
||
|
{
|
||
|
const int bs = 3;
|
||
|
|
||
|
// for every row
|
||
|
const unsigned int warp_size = 32;
|
||
|
const unsigned int bsize = get_local_size(0);
|
||
|
const unsigned int idx_b = get_global_id(0) / bsize;
|
||
|
const unsigned int num_groups = get_num_groups(0);
|
||
|
const unsigned int warps_per_group = bsize / warp_size;
|
||
|
const unsigned int idx_t = get_local_id(0); // thread id in work group
|
||
|
const unsigned int thread_id_in_warp = idx_t % warp_size; // thread id in warp (32 threads)
|
||
|
const unsigned int warp_id_in_group = idx_t / warp_size;
|
||
|
for (int row = idx_b; row < Nb; row+=num_groups) {
|
||
|
int jColStart = Ut_rows[row];
|
||
|
int jColEnd = Ut_rows[row + 1];
|
||
|
for (int ij = jColStart + idx_t; ij < jColEnd; ij+=bsize) {
|
||
|
int col = Ut_cols[ij];
|
||
|
// refine Uij element (or diagonal)
|
||
|
int i1 = LU_rows[col];
|
||
|
int i2 = LU_rows[col+1];
|
||
|
int kk = 0;
|
||
|
for(kk = i1; kk < i2; ++kk) {
|
||
|
int c = LU_cols[kk];
|
||
|
if (c >= row) {
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
for(int z = 0; z < bs*bs; ++z){
|
||
|
aij[idx_t*bs*bs+z] = LU_vals[kk*bs*bs + z];
|
||
|
}
|
||
|
|
||
|
int jk = L_rows[col];
|
||
|
int ik = (jk < L_rows[col+1]) ? L_cols[jk] : Nb;
|
||
|
|
||
|
for (int k = jColStart; k < ij; ++k) {
|
||
|
int ki = Ut_cols[k];
|
||
|
while (ik < ki) {
|
||
|
++jk;
|
||
|
ik = L_cols[jk];
|
||
|
}
|
||
|
if (ik == ki) {
|
||
|
blockMultSub(aij+idx_t*bs*bs, L_vals + jk * bs * bs, Ut_vals + k * bs * bs);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
for(int z = 0; z < bs*bs; ++z){
|
||
|
Utmp[ij*bs*bs + z] = aij[idx_t*bs*bs+z];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// update L
|
||
|
int iRowStart = L_rows[row];
|
||
|
int iRowEnd = L_rows[row + 1];
|
||
|
|
||
|
for (int ij = iRowStart + idx_t; ij < iRowEnd; ij+=bsize) {
|
||
|
int j = L_cols[ij];
|
||
|
// // refine Lij element
|
||
|
int i1 = LU_rows[row];
|
||
|
int i2 = LU_rows[row+1];
|
||
|
int kk = 0;
|
||
|
for(kk = i1; kk < i2; ++kk) {
|
||
|
int c = LU_cols[kk];
|
||
|
if (c >= j) {
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
for(int z = 0; z < bs*bs; ++z){
|
||
|
aij[idx_t*bs*bs+z] = LU_vals[kk*bs*bs + z];
|
||
|
}
|
||
|
|
||
|
int jk = Ut_rows[j];
|
||
|
int ik = Ut_cols[jk];
|
||
|
for (int k = iRowStart; k < ij; ++k) {
|
||
|
int ki = L_cols[k];
|
||
|
while(ik < ki) {
|
||
|
++jk;
|
||
|
ik = Ut_cols[jk];
|
||
|
}
|
||
|
|
||
|
if(ik == ki) {
|
||
|
blockMultSub(aij+idx_t*bs*bs, L_vals + k * bs * bs , Ut_vals + jk * bs * bs);
|
||
|
}
|
||
|
}
|
||
|
// calculate aij / ujj
|
||
|
inverter(Ut_vals + (Ut_rows[j+1] - 1) * bs * bs, ujj+idx_t*bs*bs);
|
||
|
|
||
|
// lij = aij / ujj
|
||
|
blockMult(aij+idx_t*bs*bs, ujj+idx_t*bs*bs, Ltmp + ij * bs * bs);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
)";
|
||
|
|
||
|
#endif
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
void FGPILU::decomposition(
|
||
|
cl::CommandQueue *queue, cl::Context *context,
|
||
|
int *Ut_ptrs, int *Ut_idxs, double *Ut_vals, int Ut_nnzbs,
|
||
|
int *L_rows, int *L_cols, double *L_vals, int L_nnzbs,
|
||
|
int *LU_rows, int *LU_cols, double *LU_vals, int LU_nnzbs,
|
||
|
int Nb, int num_sweeps, int verbosity)
|
||
|
{
|
||
|
const int block_size = 3;
|
||
|
|
||
|
try {
|
||
|
if (initialized == false) {
|
||
|
cl::Program::Sources source(1, std::make_pair(fgpilu_sweep_s, strlen(fgpilu_sweep_s))); // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
|
||
|
cl::Program program = cl::Program(*context, source, &err);
|
||
|
|
||
|
std::vector<cl::Device> devices = context->getInfo<CL_CONTEXT_DEVICES>();
|
||
|
program.build(devices);
|
||
|
|
||
|
fgpilu_sweep_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||
|
cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||
|
cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||
|
cl::Buffer&, cl::Buffer&,
|
||
|
const int, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "fgpilu_sweep", &err)));
|
||
|
|
||
|
// allocate GPU memory
|
||
|
d_Ut_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size);
|
||
|
d_L_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * L_nnzbs * block_size * block_size);
|
||
|
d_LU_vals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * LU_nnzbs * block_size * block_size);
|
||
|
d_Ut_ptrs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb+1));
|
||
|
d_L_rows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb+1));
|
||
|
d_LU_rows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb+1));
|
||
|
d_Ut_idxs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Ut_nnzbs);
|
||
|
d_L_cols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * L_nnzbs);
|
||
|
d_LU_cols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LU_nnzbs);
|
||
|
d_Ltmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * L_nnzbs * block_size * block_size);
|
||
|
d_Utmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Ut_nnzbs * block_size * block_size);
|
||
|
|
||
|
Dune::Timer t_copy_pattern;
|
||
|
err |= queue->enqueueWriteBuffer(d_Ut_ptrs, CL_TRUE, 0, sizeof(int) * (Nb+1), Ut_ptrs);
|
||
|
err |= queue->enqueueWriteBuffer(d_L_rows, CL_TRUE, 0, sizeof(int) * (Nb+1), L_rows);
|
||
|
err |= queue->enqueueWriteBuffer(d_LU_rows, CL_TRUE, 0, sizeof(int) * (Nb+1), LU_rows);
|
||
|
err |= queue->enqueueWriteBuffer(d_Ut_idxs, CL_TRUE, 0, sizeof(int) * Ut_nnzbs, Ut_idxs);
|
||
|
err |= queue->enqueueWriteBuffer(d_L_cols, CL_TRUE, 0, sizeof(int) * L_nnzbs, L_cols);
|
||
|
err |= queue->enqueueWriteBuffer(d_LU_cols, CL_TRUE, 0, sizeof(int) * LU_nnzbs, LU_cols);
|
||
|
if (verbosity >= 3){
|
||
|
std::ostringstream out;
|
||
|
out << "FGPILU copy sparsity pattern time: " << t_copy_pattern.stop() << " s";
|
||
|
OpmLog::info(out.str());
|
||
|
}
|
||
|
if (verbosity >= 2){
|
||
|
std::ostringstream out;
|
||
|
out << "FGPILU PARALLEL: " << PARALLEL;
|
||
|
OpmLog::info(out.str());
|
||
|
}
|
||
|
|
||
|
initialized = true;
|
||
|
}
|
||
|
|
||
|
|
||
|
// copy to GPU
|
||
|
Dune::Timer t_copy1;
|
||
|
err = queue->enqueueWriteBuffer(d_Ut_vals, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals);
|
||
|
err |= queue->enqueueWriteBuffer(d_L_vals, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals);
|
||
|
err |= queue->enqueueWriteBuffer(d_LU_vals, CL_TRUE, 0, sizeof(double) * LU_nnzbs * block_size * block_size, LU_vals);
|
||
|
if (verbosity >= 3){
|
||
|
std::ostringstream out;
|
||
|
out << "FGPILU copy1 time: " << t_copy1.stop() << " s";
|
||
|
OpmLog::info(out.str());
|
||
|
}
|
||
|
if (err != CL_SUCCESS) {
|
||
|
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||
|
OPM_THROW(std::logic_error, "FGPILU OpenCL enqueueWriteBuffer error");
|
||
|
}
|
||
|
|
||
|
// call kernel
|
||
|
for (int sweep = 0; sweep < num_sweeps; ++sweep) {
|
||
|
// normally, L_vals and Ltmp are swapped after the sweep is done
|
||
|
// these conditionals implement that without actually swapping pointers
|
||
|
// 1st sweep reads X_vals, writes to Xtmp
|
||
|
// 2nd sweep reads Xtmp, writes to X_vals
|
||
|
auto *Larg1 = (sweep % 2 == 0) ? &d_L_vals : &d_Ltmp;
|
||
|
auto *Larg2 = (sweep % 2 == 0) ? &d_Ltmp : &d_L_vals;
|
||
|
auto *Uarg1 = (sweep % 2 == 0) ? &d_Ut_vals : &d_Utmp;
|
||
|
auto *Uarg2 = (sweep % 2 == 0) ? &d_Utmp : &d_Ut_vals;
|
||
|
int num_work_groups = Nb;
|
||
|
#if PARALLEL
|
||
|
int work_group_size = 32;
|
||
|
#else
|
||
|
int work_group_size = 16;
|
||
|
#endif
|
||
|
int total_work_items = num_work_groups * work_group_size;
|
||
|
int lmem_per_work_group = work_group_size * block_size * block_size * sizeof(double);
|
||
|
Dune::Timer t_kernel;
|
||
|
event = (*fgpilu_sweep_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||
|
*Uarg1, *Larg1, d_LU_vals,
|
||
|
d_Ut_ptrs, d_L_rows, d_LU_rows,
|
||
|
d_Ut_idxs, d_L_cols, d_LU_cols,
|
||
|
*Larg2, *Uarg2, Nb, cl::Local(lmem_per_work_group), cl::Local(lmem_per_work_group));
|
||
|
event.wait();
|
||
|
if (verbosity >= 3){
|
||
|
std::ostringstream out;
|
||
|
out << "FGPILU sweep kernel time: " << t_copy1.stop() << " s";
|
||
|
OpmLog::info(out.str());
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// copy back
|
||
|
Dune::Timer t_copy2;
|
||
|
if (num_sweeps % 2 == 0) {
|
||
|
err = queue->enqueueReadBuffer(d_Ut_vals, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals);
|
||
|
err |= queue->enqueueReadBuffer(d_L_vals, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals);
|
||
|
} else {
|
||
|
err = queue->enqueueReadBuffer(d_Utmp, CL_TRUE, 0, sizeof(double) * Ut_nnzbs * block_size * block_size, Ut_vals);
|
||
|
err |= queue->enqueueReadBuffer(d_Ltmp, CL_TRUE, 0, sizeof(double) * L_nnzbs * block_size * block_size, L_vals);
|
||
|
}
|
||
|
err |= queue->enqueueBarrierWithWaitList();
|
||
|
if (verbosity >= 3){
|
||
|
std::ostringstream out;
|
||
|
out << "FGPILU copy2 time: " << t_copy2.stop() << " s";
|
||
|
OpmLog::info(out.str());
|
||
|
}
|
||
|
if (err != CL_SUCCESS) {
|
||
|
// enqueueReadBuffer is C and does not throw exceptions like C++ OpenCL
|
||
|
OPM_THROW(std::logic_error, "FGPILU OpenCL enqueueReadBuffer error");
|
||
|
}
|
||
|
|
||
|
} catch (const cl::Error& error) {
|
||
|
std::ostringstream oss;
|
||
|
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
|
||
|
oss << getErrorString(error.err()) << std::endl;
|
||
|
// rethrow exception
|
||
|
OPM_THROW(std::logic_error, oss.str());
|
||
|
} catch (const std::logic_error& error) {
|
||
|
// rethrow exception by OPM_THROW in the try{}
|
||
|
throw error;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
} // end namespace bda
|
||
|
|