rocsparseWellContributions: template Scalar type

Arne Morten Kvarving 2024-04-15 16:57:54 +02:00
parent ddcfcf681d
commit d2637e53ea
5 changed files with 125 additions and 85 deletions

View File

@@ -60,10 +60,10 @@ WellContributions<Scalar>::create(const std::string& accelerator_mode, bool useW
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: OpenCL is not enabled");
#endif
}
else if(accelerator_mode.compare("rocsparse") == 0){
else if (accelerator_mode.compare("rocsparse") == 0) {
if (!useWellConn) {
#if HAVE_ROCSPARSE
return std::make_unique<WellContributionsRocsparse>();
return std::make_unique<WellContributionsRocsparse<Scalar>>();
#else
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: rocsparse is not enabled");
#endif

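For orientation: the hunk above is the factory in WellContributions<Scalar>::create() that now constructs the Scalar-templated rocsparse backend. A minimal sketch of the same dispatch idea, with illustrative class and function names rather than the real OPM API, could look like this:

#include <memory>
#include <stdexcept>
#include <string>

template<class Scalar>
struct WellContribsBase {
    virtual ~WellContribsBase() = default;
};

template<class Scalar>
struct WellContribsRocsparse : WellContribsBase<Scalar> {};

// Pick a backend by name and hand it back through the Scalar-templated base type.
template<class Scalar>
std::unique_ptr<WellContribsBase<Scalar>>
createWellContribs(const std::string& accelerator_mode)
{
    if (accelerator_mode == "rocsparse") {
        return std::make_unique<WellContribsRocsparse<Scalar>>();
    }
    throw std::runtime_error("unknown accelerator mode: " + accelerator_mode);
}

// usage: auto wells = createWellContribs<double>("rocsparse");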
View File

@@ -308,10 +308,10 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
}
// apply wellContributions
if (wellContribs.getNumWells() > 0){
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsOCL<double>&>(wellContribs).apply(d_pw, d_v);
}
if(verbosity >= 3) {
if (verbosity >= 3) {
queue->finish();
t_well.stop();
t_rest.start();

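The OpenCL solver above holds only the type-erased WellContributions<double>& and downcasts to the concrete backend before calling its apply(). A small, self-contained illustration of that base-reference/static_cast pattern (stand-in types, not the real OPM classes):

#include <iostream>

template<class Scalar>
struct Base {
    virtual ~Base() = default;
    unsigned getNumWells() const { return numWells; }
    unsigned numWells = 0;
};

template<class Scalar>
struct Backend : Base<Scalar> {
    // Backend-specific apply(); here just y[i] += x[i] as a placeholder.
    void apply(const Scalar* x, Scalar* y, int n) const {
        for (int i = 0; i < n; ++i) { y[i] += x[i]; }
    }
};

int main()
{
    Backend<double> impl;
    impl.numWells = 1;
    Base<double>& wells = impl;   // the solver only sees the base reference

    double x[3] = {1.0, 2.0, 3.0};
    double y[3] = {0.0, 0.0, 0.0};
    if (wells.getNumWells() > 0) {
        // Safe: the chosen accelerator mode guarantees the dynamic type.
        static_cast<Backend<double>&>(wells).apply(x, y, 3);
    }
    std::cout << y[0] << ' ' << y[1] << ' ' << y[2] << '\n';   // prints 1 2 3
}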
View File

@@ -164,8 +164,8 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<double>& wellContribs,
// set stream here, the WellContributions object is destroyed every linear solve
// the number of wells can change every linear solve
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsRocsparse&>(wellContribs).setStream(stream);
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsRocsparse<double>&>(wellContribs).setStream(stream);
}
// HIP_VERSION is defined as (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
@@ -254,8 +254,8 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<double>& wellContribs,
}
// apply wellContributions
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsRocsparse&>(wellContribs).apply(d_pw, d_v);
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsRocsparse<double>&>(wellContribs).apply(d_pw, d_v);
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
@@ -313,15 +313,15 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<double>& wellContribs,
d_Avals, d_Arows, d_Acols, block_size,
d_s, &zero, d_t));
#endif
if(verbosity >= 3){
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
t_spmv.stop();
t_well.start();
}
// apply wellContributions
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsRocsparse&>(wellContribs).apply(d_s, d_t);
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsRocsparse<double>&>(wellContribs).apply(d_s, d_t);
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));

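Taken together, the hunks above follow one pattern per linear solve: the freshly created well-contributions object is handed the solver's HIP stream, and after each sparse matrix-vector product the well terms are applied to the result on that same stream, synchronizing only when per-phase timing is requested (verbosity >= 3). A condensed sketch of that flow against the WellContributionsRocsparse interface shown below, with the rocSPARSE calls and timers omitted:

template<class Scalar>
void solveIteration(WellContributionsRocsparse<Scalar>& wellContribs,
                    hipStream_t stream,
                    Scalar* d_pw, Scalar* d_v, Scalar* d_s, Scalar* d_t)
{
    // The WellContributions object is destroyed after every linear solve,
    // so the stream has to be handed over again each time.
    if (wellContribs.getNumWells() > 0) {
        wellContribs.setStream(stream);
    }

    // ... rocsparse SpMV: d_v = A * d_pw ...
    if (wellContribs.getNumWells() > 0) {
        wellContribs.apply(d_pw, d_v);   // fold the well terms into the SpMV result
    }

    // ... rocsparse SpMV: d_t = A * d_s ...
    if (wellContribs.getNumWells() > 0) {
        wellContribs.apply(d_s, d_t);
    }
}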
View File

@@ -56,17 +56,17 @@ namespace Opm
#ifdef __HIP__
/// HIP kernel to apply the standard wellcontributions
__global__ void stdwell_apply(
const double *Cnnzs,
const double *Dnnzs,
const double *Bnnzs,
const unsigned *Ccols,
const unsigned *Bcols,
const double *x,
double *y,
const unsigned dim,
const unsigned dim_wells,
const unsigned *val_pointers)
template<class Scalar>
__global__ void stdwell_apply(const Scalar* Cnnzs,
const Scalar* Dnnzs,
const Scalar* Bnnzs,
const unsigned* Ccols,
const unsigned* Bcols,
const Scalar* x,
Scalar* y,
const unsigned dim,
const unsigned dim_wells,
const unsigned *val_pointers)
{
unsigned wgId = blockIdx.x;
unsigned wiId = threadIdx.x;
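The hunk above changes the kernel itself from hard-coded double parameters to a Scalar template parameter. As a self-contained toy showing what a Scalar-templated HIP kernel and its launch look like in isolation (an independent example, not the stdwell_apply kernel):

#include <hip/hip_runtime.h>

// Toy Scalar-templated kernel: y[i] += a * x[i].
template<class Scalar>
__global__ void axpy(Scalar a, const Scalar* x, Scalar* y, unsigned n)
{
    unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] += a * x[i];
    }
}

int main()
{
    const unsigned n = 256;
    double *d_x, *d_y;
    hipMalloc((void**)&d_x, n * sizeof(double));
    hipMalloc((void**)&d_y, n * sizeof(double));
    hipMemset(d_x, 0, n * sizeof(double));
    hipMemset(d_y, 0, n * sizeof(double));

    hipStream_t stream;
    hipStreamCreate(&stream);

    // Same launch form as in the diff: <<<grid, block, dynamic shared memory, stream>>>.
    axpy<double><<<dim3((n + 63) / 64), dim3(64), 0, stream>>>(2.0, d_x, d_y, n);
    hipStreamSynchronize(stream);

    hipStreamDestroy(stream);
    hipFree(d_x);
    hipFree(d_y);
    return 0;
}
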
@@ -76,16 +76,16 @@ __global__ void stdwell_apply(
unsigned numBlocksPerWarp = blockDim.x/valsPerBlock;
unsigned c = wiId % dim;
unsigned r = (wiId/dim) % dim_wells;
double temp;
Scalar temp;
extern __shared__ double localSum[];
double *z1 = localSum + gridDim.x;
double *z2 = z1 + dim_wells;
extern __shared__ Scalar localSum[];
Scalar* z1 = localSum + gridDim.x;
Scalar* z2 = z1 + dim_wells;
localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
if (wiId < numActiveWorkItems) {
unsigned b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
while (b < valSize + val_pointers[wgId]) {
int colIdx = Bcols[b];
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
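The dynamic shared-memory scratch space is now declared with the templated element type as well, and the host side (further down in this diff) sizes it as (blockDim + 2 * dim_wells) * sizeof(Scalar); for Scalar = double with blockDim = 64 and, say, dim_wells = 4 that is (64 + 8) * 8 = 576 bytes per block. A side note, since only the double instantiation exists after this commit: an extern __shared__ array whose element type depends on a template parameter is known to trigger redeclaration errors on some CUDA/HIP compilers once the kernel is instantiated for more than one type in a translation unit. A commonly used workaround is to declare the raw bytes once and reinterpret them, roughly:

// Hypothetical helper: type-safe view of the dynamic shared-memory block.
template<class Scalar>
__device__ Scalar* sharedMemAs()
{
    extern __shared__ unsigned char rawShared[];
    return reinterpret_cast<Scalar*>(rawShared);
}

// Inside the kernel one would then write, e.g.:
//   Scalar* localSum = sharedMemAs<Scalar>();
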
@@ -99,14 +99,14 @@ __global__ void stdwell_apply(
// 6 7 8 18 19 20
// 9 10 11 21 22 23
// workitem i will hold the sum of workitems i and i + valsPerBlock
if(wiId < valsPerBlock){
if (wiId < valsPerBlock){
for (unsigned i = 1; i < numBlocksPerWarp; ++i) {
localSum[wiId] += localSum[wiId + i*valsPerBlock];
}
}
if(c == 0 && wiId < valsPerBlock){
for(unsigned i = dim - 1; i > 0; --i){
if (c == 0 && wiId < valsPerBlock){
for (unsigned i = dim - 1; i > 0; --i) {
localSum[wiId] += localSum[wiId + i];
}
z1[r] = localSum[wiId];
@@ -117,7 +117,7 @@ __global__ void stdwell_apply(
if(wiId < dim_wells){
temp = 0.0;
for(unsigned i = 0; i < dim_wells; ++i){
for (unsigned i = 0; i < dim_wells; ++i) {
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
@@ -125,10 +125,10 @@ __global__ void stdwell_apply(
__syncthreads();
if(wiId < dim*valSize){
if (wiId < dim*valSize){
temp = 0.0;
unsigned bb = wiId/dim + val_pointers[wgId];
for (unsigned j = 0; j < dim_wells; ++j){
for (unsigned j = 0; j < dim_wells; ++j) {
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
@@ -138,17 +138,26 @@ __global__ void stdwell_apply(
}
#endif
void WellContributionsRocsparse::apply_stdwells([[maybe_unused]] double *d_x,
[[maybe_unused]] double *d_y){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
apply_stdwells([[maybe_unused]] Scalar* d_x,
[[maybe_unused]] Scalar* d_y)
{
#ifdef __HIP__
unsigned gridDim = num_std_wells;
unsigned blockDim = 64;
unsigned shared_mem_size = (blockDim + 2 * dim_wells) * sizeof(double); // shared memory for localSum, z1 and z2
unsigned shared_mem_size = (blockDim + 2 * dim_wells) * sizeof(Scalar); // shared memory for localSum, z1 and z2
// dim3(N) will create a vector {N, 1, 1}
stdwell_apply<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(
d_Cnnzs_hip, d_Dnnzs_hip, d_Bnnzs_hip, d_Ccols_hip, d_Bcols_hip,
d_x, d_y, dim, dim_wells, d_val_pointers_hip
stdwell_apply<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(d_Cnnzs_hip,
d_Dnnzs_hip,
d_Bnnzs_hip,
d_Ccols_hip,
d_Bcols_hip,
d_x,
d_y,
dim,
dim_wells,
d_val_pointers_hip
);
HIP_CHECK(hipStreamSynchronize(stream));
#else
@@ -156,67 +165,89 @@ void WellContributionsRocsparse::apply_stdwells([[maybe_unused]] double *d_x,
#endif
}
void WellContributionsRocsparse::apply_mswells(double *d_x, double *d_y){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
apply_mswells(Scalar* d_x, Scalar* d_y)
{
if (h_x.empty()) {
h_x.resize(N);
h_y.resize(N);
h_x.resize(this->N);
h_y.resize(this->N);
}
HIP_CHECK(hipMemcpyAsync(h_x.data(), d_x, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(h_y.data(), d_y, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(h_x.data(), d_x, sizeof(Scalar) * this->N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(h_y.data(), d_y, sizeof(Scalar) * this->N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipStreamSynchronize(stream));
// actually apply MultisegmentWells
for (auto& well : multisegments) {
for (auto& well : this->multisegments) {
well->apply(h_x.data(), h_y.data());
}
// copy vector y from CPU to GPU
HIP_CHECK(hipMemcpyAsync(d_y, h_y.data(), sizeof(double) * N, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_y, h_y.data(), sizeof(Scalar) * this->N, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipStreamSynchronize(stream));
}
void WellContributionsRocsparse::apply(double *d_x, double *d_y){
if(num_std_wells > 0){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
apply(Scalar* d_x, Scalar* d_y)
{
if (this->num_std_wells > 0) {
apply_stdwells(d_x, d_y);
}
if(num_ms_wells > 0){
if (this->num_ms_wells > 0) {
apply_mswells(d_x, d_y);
}
}
void WellContributionsRocsparse::setStream(hipStream_t stream_){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::setStream(hipStream_t stream_)
{
stream = stream_;
}
void WellContributionsRocsparse::APIaddMatrix(MatrixType type,
int* colIndices,
double* values,
unsigned int val_size)
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
APIaddMatrix(MatrixType type,
int* colIndices,
Scalar* values,
unsigned int val_size)
{
if (!allocated) {
if (!this->allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}
switch (type) {
case MatrixType::C:
HIP_CHECK(hipMemcpyAsync(d_Cnnzs_hip + num_blocks_so_far * dim * dim_wells, values, sizeof(d_Cnnzs_hip) * val_size * dim * dim_wells, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Ccols_hip + num_blocks_so_far, colIndices, sizeof(d_Ccols_hip) * val_size, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Cnnzs_hip + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(d_Cnnzs_hip) * val_size * this->dim * this->dim_wells,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Ccols_hip + this->num_blocks_so_far, colIndices,
sizeof(d_Ccols_hip) * val_size,
hipMemcpyHostToDevice, stream));
break;
case MatrixType::D:
HIP_CHECK(hipMemcpyAsync(d_Dnnzs_hip + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(d_Dnnzs_hip) * dim_wells * dim_wells, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Dnnzs_hip + this->num_std_wells_so_far * this->dim_wells * this->dim_wells,
values, sizeof(d_Dnnzs_hip) * this->dim_wells * this->dim_wells,
hipMemcpyHostToDevice, stream));
break;
case MatrixType::B:
HIP_CHECK(hipMemcpyAsync(d_Bnnzs_hip + num_blocks_so_far * dim * dim_wells, values, sizeof(d_Bnnzs_hip) * val_size * dim * dim_wells, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Bcols_hip + num_blocks_so_far, colIndices, sizeof(d_Bcols_hip) * val_size, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Bnnzs_hip + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(d_Bnnzs_hip) * val_size * this->dim * this->dim_wells,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Bcols_hip + this->num_blocks_so_far, colIndices,
sizeof(d_Bcols_hip) * val_size,
hipMemcpyHostToDevice, stream));
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
HIP_CHECK(hipMemcpyAsync(d_val_pointers_hip, val_pointers.data(), sizeof(d_val_pointers_hip) * (num_std_wells + 1), hipMemcpyHostToDevice, stream));
this->val_pointers[this->num_std_wells_so_far] = this->num_blocks_so_far;
if (this->num_std_wells_so_far == this->num_std_wells - 1) {
this->val_pointers[this->num_std_wells] = this->num_blocks;
HIP_CHECK(hipMemcpyAsync(d_val_pointers_hip, this->val_pointers.data(),
sizeof(d_val_pointers_hip) * (this->num_std_wells + 1),
hipMemcpyHostToDevice, stream));
}
break;
@@ -226,14 +257,21 @@ void WellContributionsRocsparse::APIaddMatrix(MatrixType type,
HIP_CHECK(hipStreamSynchronize(stream));
}
void WellContributionsRocsparse::APIalloc()
template<class Scalar>
void WellContributionsRocsparse<Scalar>::APIalloc()
{
HIP_CHECK(hipMalloc((void**)&d_Cnnzs_hip, sizeof(d_Cnnzs_hip) * num_blocks * dim * dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Dnnzs_hip, sizeof(d_Dnnzs_hip) * num_std_wells * dim_wells * dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Bnnzs_hip, sizeof(d_Bnnzs_hip) * num_blocks * dim * dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Ccols_hip, sizeof(d_Ccols_hip) * num_blocks));
HIP_CHECK(hipMalloc((void**)&d_Bcols_hip, sizeof(d_Bcols_hip) * num_blocks));
HIP_CHECK(hipMalloc((void**)&d_val_pointers_hip, sizeof(d_val_pointers_hip) * (num_std_wells + 1)));
HIP_CHECK(hipMalloc((void**)&d_Cnnzs_hip,
sizeof(d_Cnnzs_hip) * this->num_blocks * this->dim * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Dnnzs_hip,
sizeof(d_Dnnzs_hip) * this->num_std_wells * this->dim_wells * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Bnnzs_hip,
sizeof(d_Bnnzs_hip) * this->num_blocks * this->dim * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Ccols_hip, sizeof(d_Ccols_hip) * this->num_blocks));
HIP_CHECK(hipMalloc((void**)&d_Bcols_hip, sizeof(d_Bcols_hip) * this->num_blocks));
HIP_CHECK(hipMalloc((void**)&d_val_pointers_hip,
sizeof(d_val_pointers_hip) * (this->num_std_wells + 1)));
}
} //namespace Opm
template class WellContributionsRocsparse<double>;
} // namespace Opm
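Two consequences of templating the class are visible throughout this file. First, members inherited from the now-dependent base class WellContributions<Scalar> must be reached through this-> (hence N becoming this->N, num_std_wells becoming this->num_std_wells, and so on), because unqualified names are not looked up in a dependent base during the first phase of template name lookup. Second, since the member definitions stay in the .cpp file, it ends with an explicit instantiation for double. A stripped-down illustration of both points:

template<class Scalar>
struct Base {
    unsigned N = 0;   // member of what becomes a dependent base class
};

template<class Scalar>
struct Derived : Base<Scalar> {
    Scalar sum(const Scalar* x) const
    {
        Scalar s = 0;
        // Plain 'N' would not be found here: the base depends on Scalar,
        // so 'this->N' defers the lookup to instantiation time.
        for (unsigned i = 0; i < this->N; ++i) {
            s += x[i];
        }
        return s;
    }
};

// With the definitions in a .cpp file, an explicit instantiation makes the
// double version available to other translation units, as done above.
template struct Derived<double>;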

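apply_mswells, in the large hunk above, also shows the host round-trip pattern used for multisegment wells: x and y are copied device-to-host with hipMemcpyAsync on the object's stream, the wells are applied on the CPU, and only y is copied back, with hipStreamSynchronize guarding each direction. A condensed sketch of that flow (applyOnHost and the staging buffers are illustrative stand-ins, not the OPM code):

#include <hip/hip_runtime.h>
#include <cstddef>
#include <vector>

// Sketch: run a host-only operation in the middle of a device pipeline.
template<class Scalar>
void hostRoundTrip(Scalar* d_x, Scalar* d_y, std::size_t n, hipStream_t stream,
                   void (*applyOnHost)(const Scalar*, Scalar*, std::size_t))
{
    std::vector<Scalar> h_x(n), h_y(n);   // host staging buffers

    // device -> host
    hipMemcpyAsync(h_x.data(), d_x, sizeof(Scalar) * n, hipMemcpyDeviceToHost, stream);
    hipMemcpyAsync(h_y.data(), d_y, sizeof(Scalar) * n, hipMemcpyDeviceToHost, stream);
    hipStreamSynchronize(stream);          // data must have arrived before the CPU touches it

    applyOnHost(h_x.data(), h_y.data(), n);   // CPU-side apply (hypothetical)

    // host -> device: only y was modified
    hipMemcpyAsync(d_y, h_y.data(), sizeof(Scalar) * n, hipMemcpyHostToDevice, stream);
    hipStreamSynchronize(stream);
}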
View File

@@ -26,33 +26,35 @@
#include <vector>
namespace Opm {
namespace Opm
{
class WellContributionsRocsparse : public WellContributions<double>
template<class Scalar>
class WellContributionsRocsparse : public WellContributions<Scalar>
{
private:
hipStream_t stream;
public:
void apply_stdwells(double *d_x, double *d_y);
void apply_mswells(double *d_x, double *d_y);
void apply(double *d_x, double *d_y);
void apply_stdwells(Scalar* d_x, Scalar* d_y);
void apply_mswells(Scalar* d_x, Scalar* d_y);
void apply(Scalar* d_x, Scalar* d_y);
void setStream(hipStream_t stream);
protected:
/// Allocate memory for the StandardWells
void APIalloc() override;
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
using MatrixType = typename WellContributions<Scalar>::MatrixType;
double *d_Cnnzs_hip, *d_Dnnzs_hip, *d_Bnnzs_hip;
void APIaddMatrix(MatrixType type, int* colIndices,
Scalar* values, unsigned int val_size) override;
Scalar *d_Cnnzs_hip, *d_Dnnzs_hip, *d_Bnnzs_hip;
unsigned *d_Ccols_hip, *d_Bcols_hip;
unsigned *d_val_pointers_hip;
std::vector<double> h_x;
std::vector<double> h_y;
std::vector<Scalar> h_x;
std::vector<Scalar> h_y;
};
} //namespace Opm
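In the header, the nested enum MatrixType likewise becomes a dependent name, which is why the class pulls it in with using MatrixType = typename WellContributions<Scalar>::MatrixType before using it in the APIaddMatrix override, and the device pointers and host staging vectors switch from double to Scalar. A minimal illustration of the dependent-type alias (toy types, not the OPM classes):

template<class Scalar>
struct BaseWithEnum {
    enum class MatrixType { C, D, B };
};

template<class Scalar>
struct DerivedUser : BaseWithEnum<Scalar> {
    // The nested name depends on Scalar; 'typename' tells the compiler it is a
    // type, and the alias keeps the member declarations below short.
    using MatrixType = typename BaseWithEnum<Scalar>::MatrixType;

    void addMatrix(MatrixType type, const Scalar* values, unsigned n);
};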