mirror of
https://github.com/OPM/opm-simulators.git
synced 2024-07-07 04:53:03 -05:00
rocsparseWellContributions: template Scalar type
This commit is contained in:
parent
ddcfcf681d
commit
d2637e53ea
|
@ -60,10 +60,10 @@ WellContributions<Scalar>::create(const std::string& accelerator_mode, bool useW
|
|||
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: OpenCL is not enabled");
|
||||
#endif
|
||||
}
|
||||
else if(accelerator_mode.compare("rocsparse") == 0){
|
||||
else if (accelerator_mode.compare("rocsparse") == 0) {
|
||||
if (!useWellConn) {
|
||||
#if HAVE_ROCSPARSE
|
||||
return std::make_unique<WellContributionsRocsparse>();
|
||||
return std::make_unique<WellContributionsRocsparse<Scalar>>();
|
||||
#else
|
||||
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: rocsparse is not enabled");
|
||||
#endif
|
||||
|
|
|
@ -308,10 +308,10 @@ gpu_pbicgstab(WellContributions<double>& wellContribs, BdaResult& res)
|
|||
}
|
||||
|
||||
// apply wellContributions
|
||||
if (wellContribs.getNumWells() > 0){
|
||||
if (wellContribs.getNumWells() > 0) {
|
||||
static_cast<WellContributionsOCL<double>&>(wellContribs).apply(d_pw, d_v);
|
||||
}
|
||||
if(verbosity >= 3) {
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
t_well.stop();
|
||||
t_rest.start();
|
||||
|
|
|
@ -164,8 +164,8 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<double>& wellContribs,
|
|||
|
||||
// set stream here, the WellContributions object is destroyed every linear solve
|
||||
// the number of wells can change every linear solve
|
||||
if(wellContribs.getNumWells() > 0){
|
||||
static_cast<WellContributionsRocsparse&>(wellContribs).setStream(stream);
|
||||
if (wellContribs.getNumWells() > 0) {
|
||||
static_cast<WellContributionsRocsparse<double>&>(wellContribs).setStream(stream);
|
||||
}
|
||||
|
||||
// HIP_VERSION is defined as (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
|
||||
|
@ -254,8 +254,8 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<double>& wellContribs,
|
|||
}
|
||||
|
||||
// apply wellContributions
|
||||
if(wellContribs.getNumWells() > 0){
|
||||
static_cast<WellContributionsRocsparse&>(wellContribs).apply(d_pw, d_v);
|
||||
if (wellContribs.getNumWells() > 0) {
|
||||
static_cast<WellContributionsRocsparse<double>&>(wellContribs).apply(d_pw, d_v);
|
||||
}
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
@ -313,15 +313,15 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<double>& wellContribs,
|
|||
d_Avals, d_Arows, d_Acols, block_size,
|
||||
d_s, &zero, d_t));
|
||||
#endif
|
||||
if(verbosity >= 3){
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
t_spmv.stop();
|
||||
t_well.start();
|
||||
}
|
||||
|
||||
// apply wellContributions
|
||||
if(wellContribs.getNumWells() > 0){
|
||||
static_cast<WellContributionsRocsparse&>(wellContribs).apply(d_s, d_t);
|
||||
if (wellContribs.getNumWells() > 0) {
|
||||
static_cast<WellContributionsRocsparse<double>&>(wellContribs).apply(d_s, d_t);
|
||||
}
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
|
|
@ -56,17 +56,17 @@ namespace Opm
|
|||
|
||||
#ifdef __HIP__
|
||||
/// HIP kernel to apply the standard wellcontributions
|
||||
__global__ void stdwell_apply(
|
||||
const double *Cnnzs,
|
||||
const double *Dnnzs,
|
||||
const double *Bnnzs,
|
||||
const unsigned *Ccols,
|
||||
const unsigned *Bcols,
|
||||
const double *x,
|
||||
double *y,
|
||||
const unsigned dim,
|
||||
const unsigned dim_wells,
|
||||
const unsigned *val_pointers)
|
||||
template<class Scalar>
|
||||
__global__ void stdwell_apply(const Scalar* Cnnzs,
|
||||
const Scalar* Dnnzs,
|
||||
const Scalar* Bnnzs,
|
||||
const unsigned* Ccols,
|
||||
const unsigned* Bcols,
|
||||
const Scalar* x,
|
||||
Scalar* y,
|
||||
const unsigned dim,
|
||||
const unsigned dim_wells,
|
||||
const unsigned *val_pointers)
|
||||
{
|
||||
unsigned wgId = blockIdx.x;
|
||||
unsigned wiId = threadIdx.x;
|
||||
|
@ -76,16 +76,16 @@ __global__ void stdwell_apply(
|
|||
unsigned numBlocksPerWarp = blockDim.x/valsPerBlock;
|
||||
unsigned c = wiId % dim;
|
||||
unsigned r = (wiId/dim) % dim_wells;
|
||||
double temp;
|
||||
Scalar temp;
|
||||
|
||||
extern __shared__ double localSum[];
|
||||
double *z1 = localSum + gridDim.x;
|
||||
double *z2 = z1 + dim_wells;
|
||||
extern __shared__ Scalar localSum[];
|
||||
Scalar* z1 = localSum + gridDim.x;
|
||||
Scalar* z2 = z1 + dim_wells;
|
||||
|
||||
localSum[wiId] = 0;
|
||||
if(wiId < numActiveWorkItems){
|
||||
if (wiId < numActiveWorkItems) {
|
||||
unsigned b = wiId/valsPerBlock + val_pointers[wgId];
|
||||
while(b < valSize + val_pointers[wgId]){
|
||||
while (b < valSize + val_pointers[wgId]) {
|
||||
int colIdx = Bcols[b];
|
||||
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
|
||||
b += numBlocksPerWarp;
|
||||
|
@ -99,14 +99,14 @@ __global__ void stdwell_apply(
|
|||
// 6 7 8 18 19 20
|
||||
// 9 10 11 21 22 23
|
||||
// workitem i will hold the sum of workitems i and i + valsPerBlock
|
||||
if(wiId < valsPerBlock){
|
||||
if (wiId < valsPerBlock){
|
||||
for (unsigned i = 1; i < numBlocksPerWarp; ++i) {
|
||||
localSum[wiId] += localSum[wiId + i*valsPerBlock];
|
||||
}
|
||||
}
|
||||
|
||||
if(c == 0 && wiId < valsPerBlock){
|
||||
for(unsigned i = dim - 1; i > 0; --i){
|
||||
if (c == 0 && wiId < valsPerBlock){
|
||||
for (unsigned i = dim - 1; i > 0; --i) {
|
||||
localSum[wiId] += localSum[wiId + i];
|
||||
}
|
||||
z1[r] = localSum[wiId];
|
||||
|
@ -117,7 +117,7 @@ __global__ void stdwell_apply(
|
|||
|
||||
if(wiId < dim_wells){
|
||||
temp = 0.0;
|
||||
for(unsigned i = 0; i < dim_wells; ++i){
|
||||
for (unsigned i = 0; i < dim_wells; ++i) {
|
||||
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
|
||||
}
|
||||
z2[wiId] = temp;
|
||||
|
@ -125,10 +125,10 @@ __global__ void stdwell_apply(
|
|||
|
||||
__syncthreads();
|
||||
|
||||
if(wiId < dim*valSize){
|
||||
if (wiId < dim*valSize){
|
||||
temp = 0.0;
|
||||
unsigned bb = wiId/dim + val_pointers[wgId];
|
||||
for (unsigned j = 0; j < dim_wells; ++j){
|
||||
for (unsigned j = 0; j < dim_wells; ++j) {
|
||||
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
|
||||
}
|
||||
|
||||
|
@ -138,17 +138,26 @@ __global__ void stdwell_apply(
|
|||
}
|
||||
#endif
|
||||
|
||||
|
||||
void WellContributionsRocsparse::apply_stdwells([[maybe_unused]] double *d_x,
|
||||
[[maybe_unused]] double *d_y){
|
||||
template<class Scalar>
|
||||
void WellContributionsRocsparse<Scalar>::
|
||||
apply_stdwells([[maybe_unused]] Scalar* d_x,
|
||||
[[maybe_unused]] Scalar* d_y)
|
||||
{
|
||||
#ifdef __HIP__
|
||||
unsigned gridDim = num_std_wells;
|
||||
unsigned blockDim = 64;
|
||||
unsigned shared_mem_size = (blockDim + 2 * dim_wells) * sizeof(double); // shared memory for localSum, z1 and z2
|
||||
unsigned shared_mem_size = (blockDim + 2 * dim_wells) * sizeof(Scalar); // shared memory for localSum, z1 and z2
|
||||
// dim3(N) will create a vector {N, 1, 1}
|
||||
stdwell_apply<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(
|
||||
d_Cnnzs_hip, d_Dnnzs_hip, d_Bnnzs_hip, d_Ccols_hip, d_Bcols_hip,
|
||||
d_x, d_y, dim, dim_wells, d_val_pointers_hip
|
||||
stdwell_apply<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(d_Cnnzs_hip,
|
||||
d_Dnnzs_hip,
|
||||
d_Bnnzs_hip,
|
||||
d_Ccols_hip,
|
||||
d_Bcols_hip,
|
||||
d_x,
|
||||
d_y,
|
||||
dim,
|
||||
dim_wells,
|
||||
d_val_pointers_hip
|
||||
);
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
#else
|
||||
|
@ -156,67 +165,89 @@ void WellContributionsRocsparse::apply_stdwells([[maybe_unused]] double *d_x,
|
|||
#endif
|
||||
}
|
||||
|
||||
void WellContributionsRocsparse::apply_mswells(double *d_x, double *d_y){
|
||||
template<class Scalar>
|
||||
void WellContributionsRocsparse<Scalar>::
|
||||
apply_mswells(Scalar* d_x, Scalar* d_y)
|
||||
{
|
||||
if (h_x.empty()) {
|
||||
h_x.resize(N);
|
||||
h_y.resize(N);
|
||||
h_x.resize(this->N);
|
||||
h_y.resize(this->N);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(h_x.data(), d_x, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(h_y.data(), d_y, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(h_x.data(), d_x, sizeof(Scalar) * this->N, hipMemcpyDeviceToHost, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(h_y.data(), d_y, sizeof(Scalar) * this->N, hipMemcpyDeviceToHost, stream));
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
// actually apply MultisegmentWells
|
||||
for (auto& well : multisegments) {
|
||||
for (auto& well : this->multisegments) {
|
||||
well->apply(h_x.data(), h_y.data());
|
||||
}
|
||||
|
||||
// copy vector y from CPU to GPU
|
||||
HIP_CHECK(hipMemcpyAsync(d_y, h_y.data(), sizeof(double) * N, hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_y, h_y.data(), sizeof(Scalar) * this->N, hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
void WellContributionsRocsparse::apply(double *d_x, double *d_y){
|
||||
if(num_std_wells > 0){
|
||||
template<class Scalar>
|
||||
void WellContributionsRocsparse<Scalar>::
|
||||
apply(Scalar* d_x, Scalar* d_y)
|
||||
{
|
||||
if (this->num_std_wells > 0) {
|
||||
apply_stdwells(d_x, d_y);
|
||||
}
|
||||
|
||||
if(num_ms_wells > 0){
|
||||
if (this->num_ms_wells > 0) {
|
||||
apply_mswells(d_x, d_y);
|
||||
}
|
||||
}
|
||||
|
||||
void WellContributionsRocsparse::setStream(hipStream_t stream_){
|
||||
template<class Scalar>
|
||||
void WellContributionsRocsparse<Scalar>::setStream(hipStream_t stream_)
|
||||
{
|
||||
stream = stream_;
|
||||
}
|
||||
|
||||
void WellContributionsRocsparse::APIaddMatrix(MatrixType type,
|
||||
int* colIndices,
|
||||
double* values,
|
||||
unsigned int val_size)
|
||||
template<class Scalar>
|
||||
void WellContributionsRocsparse<Scalar>::
|
||||
APIaddMatrix(MatrixType type,
|
||||
int* colIndices,
|
||||
Scalar* values,
|
||||
unsigned int val_size)
|
||||
{
|
||||
if (!allocated) {
|
||||
if (!this->allocated) {
|
||||
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
case MatrixType::C:
|
||||
HIP_CHECK(hipMemcpyAsync(d_Cnnzs_hip + num_blocks_so_far * dim * dim_wells, values, sizeof(d_Cnnzs_hip) * val_size * dim * dim_wells, hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Ccols_hip + num_blocks_so_far, colIndices, sizeof(d_Ccols_hip) * val_size, hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Cnnzs_hip + this->num_blocks_so_far * this->dim * this->dim_wells,
|
||||
values, sizeof(d_Cnnzs_hip) * val_size * this->dim * this->dim_wells,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Ccols_hip + this->num_blocks_so_far, colIndices,
|
||||
sizeof(d_Ccols_hip) * val_size,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
break;
|
||||
|
||||
case MatrixType::D:
|
||||
HIP_CHECK(hipMemcpyAsync(d_Dnnzs_hip + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(d_Dnnzs_hip) * dim_wells * dim_wells, hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Dnnzs_hip + this->num_std_wells_so_far * this->dim_wells * this->dim_wells,
|
||||
values, sizeof(d_Dnnzs_hip) * this->dim_wells * this->dim_wells,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
break;
|
||||
|
||||
case MatrixType::B:
|
||||
HIP_CHECK(hipMemcpyAsync(d_Bnnzs_hip + num_blocks_so_far * dim * dim_wells, values, sizeof(d_Bnnzs_hip) * val_size * dim * dim_wells, hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Bcols_hip + num_blocks_so_far, colIndices, sizeof(d_Bcols_hip) * val_size, hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Bnnzs_hip + this->num_blocks_so_far * this->dim * this->dim_wells,
|
||||
values, sizeof(d_Bnnzs_hip) * val_size * this->dim * this->dim_wells,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Bcols_hip + this->num_blocks_so_far, colIndices,
|
||||
sizeof(d_Bcols_hip) * val_size,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
|
||||
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
|
||||
if (num_std_wells_so_far == num_std_wells - 1) {
|
||||
val_pointers[num_std_wells] = num_blocks;
|
||||
HIP_CHECK(hipMemcpyAsync(d_val_pointers_hip, val_pointers.data(), sizeof(d_val_pointers_hip) * (num_std_wells + 1), hipMemcpyHostToDevice, stream));
|
||||
this->val_pointers[this->num_std_wells_so_far] = this->num_blocks_so_far;
|
||||
if (this->num_std_wells_so_far == this->num_std_wells - 1) {
|
||||
this->val_pointers[this->num_std_wells] = this->num_blocks;
|
||||
HIP_CHECK(hipMemcpyAsync(d_val_pointers_hip, this->val_pointers.data(),
|
||||
sizeof(d_val_pointers_hip) * (this->num_std_wells + 1),
|
||||
hipMemcpyHostToDevice, stream));
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -226,14 +257,21 @@ void WellContributionsRocsparse::APIaddMatrix(MatrixType type,
|
|||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
void WellContributionsRocsparse::APIalloc()
|
||||
template<class Scalar>
|
||||
void WellContributionsRocsparse<Scalar>::APIalloc()
|
||||
{
|
||||
HIP_CHECK(hipMalloc((void**)&d_Cnnzs_hip, sizeof(d_Cnnzs_hip) * num_blocks * dim * dim_wells));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Dnnzs_hip, sizeof(d_Dnnzs_hip) * num_std_wells * dim_wells * dim_wells));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Bnnzs_hip, sizeof(d_Bnnzs_hip) * num_blocks * dim * dim_wells));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Ccols_hip, sizeof(d_Ccols_hip) * num_blocks));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Bcols_hip, sizeof(d_Bcols_hip) * num_blocks));
|
||||
HIP_CHECK(hipMalloc((void**)&d_val_pointers_hip, sizeof(d_val_pointers_hip) * (num_std_wells + 1)));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Cnnzs_hip,
|
||||
sizeof(d_Cnnzs_hip) * this->num_blocks * this->dim * this->dim_wells));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Dnnzs_hip,
|
||||
sizeof(d_Dnnzs_hip) * this->num_std_wells * this->dim_wells * this->dim_wells));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Bnnzs_hip,
|
||||
sizeof(d_Bnnzs_hip) * this->num_blocks * this->dim * this->dim_wells));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Ccols_hip, sizeof(d_Ccols_hip) * this->num_blocks));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Bcols_hip, sizeof(d_Bcols_hip) * this->num_blocks));
|
||||
HIP_CHECK(hipMalloc((void**)&d_val_pointers_hip,
|
||||
sizeof(d_val_pointers_hip) * (this->num_std_wells + 1)));
|
||||
}
|
||||
|
||||
} //namespace Opm
|
||||
template class WellContributionsRocsparse<double>;
|
||||
|
||||
} // namespace Opm
|
||||
|
|
|
@ -26,33 +26,35 @@
|
|||
|
||||
#include <vector>
|
||||
|
||||
namespace Opm {
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
class WellContributionsRocsparse : public WellContributions<double>
|
||||
template<class Scalar>
|
||||
class WellContributionsRocsparse : public WellContributions<Scalar>
|
||||
{
|
||||
private:
|
||||
hipStream_t stream;
|
||||
|
||||
public:
|
||||
void apply_stdwells(double *d_x, double *d_y);
|
||||
void apply_mswells(double *d_x, double *d_y);
|
||||
void apply(double *d_x, double *d_y);
|
||||
void apply_stdwells(Scalar* d_x, Scalar* d_y);
|
||||
void apply_mswells(Scalar* d_x, Scalar* d_y);
|
||||
void apply(Scalar* d_x, Scalar* d_y);
|
||||
void setStream(hipStream_t stream);
|
||||
|
||||
protected:
|
||||
/// Allocate memory for the StandardWells
|
||||
void APIalloc() override;
|
||||
|
||||
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
|
||||
using MatrixType = typename WellContributions<Scalar>::MatrixType;
|
||||
|
||||
double *d_Cnnzs_hip, *d_Dnnzs_hip, *d_Bnnzs_hip;
|
||||
void APIaddMatrix(MatrixType type, int* colIndices,
|
||||
Scalar* values, unsigned int val_size) override;
|
||||
|
||||
Scalar *d_Cnnzs_hip, *d_Dnnzs_hip, *d_Bnnzs_hip;
|
||||
unsigned *d_Ccols_hip, *d_Bcols_hip;
|
||||
unsigned *d_val_pointers_hip;
|
||||
|
||||
std::vector<double> h_x;
|
||||
std::vector<double> h_y;
|
||||
std::vector<Scalar> h_x;
|
||||
std::vector<Scalar> h_y;
|
||||
};
|
||||
|
||||
} //namespace Opm
|
||||
|
|
Loading…
Reference in New Issue
Block a user