mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Added openclSolver
Usage: --gpu-mode=[cusparse|opencl|none] on command line
This commit is contained in:
parent
e4ea932a9d
commit
f974a5f6db
@ -134,6 +134,14 @@ if(CUDA_FOUND)
|
||||
include_directories(${CUDA_INCLUDE_DIRS})
|
||||
endif()
|
||||
|
||||
|
||||
find_package(OpenCL)
|
||||
|
||||
if(OpenCL_FOUND)
|
||||
set(HAVE_OPENCL 1)
|
||||
include_directories(${OpenCL_INCLUDE_DIRS})
|
||||
endif()
|
||||
|
||||
# read the list of components from this file (in the project directory);
|
||||
# it should set various lists with the names of the files to include
|
||||
include (CMakeLists_files.cmake)
|
||||
@ -386,3 +394,7 @@ if(CUDA_FOUND)
|
||||
target_link_libraries( opmsimulators PUBLIC ${CUDA_cublas_LIBRARY} )
|
||||
endif()
|
||||
|
||||
if(OpenCL_FOUND)
|
||||
target_link_libraries( opmsimulators ${OpenCL_LIBRARIES} )
|
||||
endif()
|
||||
|
||||
|
@ -46,6 +46,13 @@ if(CUDA_FOUND)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
|
||||
endif()
|
||||
if(OPENCL_FOUND)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BlockedMatrix.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BILU0.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
|
||||
endif()
|
||||
if(MPI_FOUND)
|
||||
list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp
|
||||
opm/simulators/utils/ParallelSerialization.cpp)
|
||||
@ -142,8 +149,14 @@ list (APPEND PUBLIC_HEADER_FILES
|
||||
opm/simulators/aquifers/BlackoilAquiferModel_impl.hpp
|
||||
opm/simulators/linalg/bda/BdaBridge.hpp
|
||||
opm/simulators/linalg/bda/BdaResult.hpp
|
||||
opm/simulators/linalg/bda/BdaSolver.hpp
|
||||
opm/simulators/linalg/bda/BILU0.hpp
|
||||
opm/simulators/linalg/bda/BlockedMatrix.hpp
|
||||
opm/simulators/linalg/bda/cuda_header.hpp
|
||||
opm/simulators/linalg/bda/cusparseSolverBackend.hpp
|
||||
opm/simulators/linalg/bda/Reorder.hpp
|
||||
opm/simulators/linalg/bda/openclKernels.hpp
|
||||
opm/simulators/linalg/bda/openclSolverBackend.hpp
|
||||
opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
|
||||
opm/simulators/linalg/bda/WellContributions.hpp
|
||||
opm/simulators/linalg/BlackoilAmg.hpp
|
||||
|
@ -6,6 +6,7 @@ set (opm-simulators_CONFIG_VAR
|
||||
HAVE_MPI
|
||||
HAVE_PETSC
|
||||
HAVE_CUDA
|
||||
HAVE_OPENCL
|
||||
HAVE_SUITESPARSE_UMFPACK_H
|
||||
HAVE_DUNE_ISTL
|
||||
DUNE_ISTL_VERSION_MAJOR
|
||||
|
@ -67,7 +67,7 @@ NEW_PROP_TAG(CprEllSolvetype);
|
||||
NEW_PROP_TAG(CprReuseSetup);
|
||||
NEW_PROP_TAG(LinearSolverConfiguration);
|
||||
NEW_PROP_TAG(LinearSolverConfigurationJsonFile);
|
||||
NEW_PROP_TAG(UseGpu);
|
||||
NEW_PROP_TAG(GpuMode);
|
||||
|
||||
SET_SCALAR_PROP(FlowIstlSolverParams, LinearSolverReduction, 1e-2);
|
||||
SET_SCALAR_PROP(FlowIstlSolverParams, IluRelaxation, 0.9);
|
||||
@ -94,7 +94,7 @@ SET_INT_PROP(FlowIstlSolverParams, CprEllSolvetype, 0);
|
||||
SET_INT_PROP(FlowIstlSolverParams, CprReuseSetup, 3);
|
||||
SET_STRING_PROP(FlowIstlSolverParams, LinearSolverConfiguration, "ilu0");
|
||||
SET_STRING_PROP(FlowIstlSolverParams, LinearSolverConfigurationJsonFile, "none");
|
||||
SET_BOOL_PROP(FlowIstlSolverParams, UseGpu, false);
|
||||
SET_STRING_PROP(FlowIstlSolverParams, GpuMode, "none");
|
||||
|
||||
|
||||
|
||||
@ -167,7 +167,7 @@ namespace Opm
|
||||
bool scale_linear_system_;
|
||||
std::string linear_solver_configuration_;
|
||||
std::string linear_solver_configuration_json_file_;
|
||||
bool use_gpu_;
|
||||
std::string gpu_mode_;
|
||||
|
||||
template <class TypeTag>
|
||||
void init()
|
||||
@ -196,7 +196,7 @@ namespace Opm
|
||||
cpr_reuse_setup_ = EWOMS_GET_PARAM(TypeTag, int, CprReuseSetup);
|
||||
linear_solver_configuration_ = EWOMS_GET_PARAM(TypeTag, std::string, LinearSolverConfiguration);
|
||||
linear_solver_configuration_json_file_ = EWOMS_GET_PARAM(TypeTag, std::string, LinearSolverConfigurationJsonFile);
|
||||
use_gpu_ = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
|
||||
gpu_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
|
||||
}
|
||||
|
||||
template <class TypeTag>
|
||||
@ -225,7 +225,7 @@ namespace Opm
|
||||
EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse Amg Setup");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, std::string, LinearSolverConfiguration, "Configuration of solver valid is: ilu0 (default), cpr_quasiimpes, cpr_trueimpes or file (specified in LinearSolverConfigurationJsonFile) ");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, std::string, LinearSolverConfigurationJsonFile, "Filename of JSON configuration for flexible linear solver system.");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, bool, UseGpu, "Use GPU cusparseSolver as the linear solver");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, std::string, GpuMode, "Use GPU cusparseSolver or openclSolver as the linear solver");
|
||||
}
|
||||
|
||||
FlowLinearSolverParameters() { reset(); }
|
||||
@ -247,7 +247,7 @@ namespace Opm
|
||||
ilu_milu_ = MILU_VARIANT::ILU;
|
||||
ilu_redblack_ = false;
|
||||
ilu_reorder_sphere_ = true;
|
||||
use_gpu_ = false;
|
||||
gpu_mode_ = "none";
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -317,18 +317,18 @@ protected:
|
||||
}
|
||||
const auto& gridForConn = simulator_.vanguard().grid();
|
||||
#if HAVE_CUDA
|
||||
bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
|
||||
if (gridForConn.comm().size() > 1 && use_gpu) {
|
||||
std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
|
||||
if (gridForConn.comm().size() > 1 && "none" != gpu_mode) {
|
||||
OpmLog::warning("Warning cannot use GPU with MPI, GPU is disabled");
|
||||
use_gpu = false;
|
||||
gpu_mode = "none";
|
||||
}
|
||||
const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter);
|
||||
const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
|
||||
const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
|
||||
bdaBridge.reset(new BdaBridge(use_gpu, linear_solver_verbosity, maxit, tolerance));
|
||||
bdaBridge.reset(new BdaBridge(gpu_mode, linear_solver_verbosity, maxit, tolerance));
|
||||
#else
|
||||
const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
|
||||
if (use_gpu) {
|
||||
const bool gpu_mode = EWOMS_GET_PARAM(TypeTag, bool, GpuMode);
|
||||
if ("none" != gpu_mode) {
|
||||
OPM_THROW(std::logic_error,"Error cannot use GPU solver since CUDA was not found during compilation");
|
||||
}
|
||||
#endif
|
||||
|
352
opm/simulators/linalg/bda/BILU0.cpp
Normal file
352
opm/simulators/linalg/bda/BILU0.cpp
Normal file
@ -0,0 +1,352 @@
|
||||
/*
|
||||
Copyright 2019 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
|
||||
#include <opm/simulators/linalg/bda/BILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/Reorder.hpp>
|
||||
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
using Opm::OpmLog;
|
||||
|
||||
BILU0::BILU0(bool level_scheduling_, bool graph_coloring_, int verbosity_) :
|
||||
level_scheduling(level_scheduling_), graph_coloring(graph_coloring_), verbosity(verbosity_)
|
||||
{
|
||||
if (level_scheduling == graph_coloring) {
|
||||
OPM_THROW(std::logic_error, "Error, either level_scheduling or graph_coloring must be true, not both\n");
|
||||
}
|
||||
}
|
||||
|
||||
BILU0::~BILU0()
|
||||
{
|
||||
delete[] invDiagVals;
|
||||
delete[] diagIndex;
|
||||
delete[] rowsPerColor;
|
||||
freeBlockedMatrix(&LUMat);
|
||||
freeBlockedMatrix(&LMat);
|
||||
freeBlockedMatrix(&UMat);
|
||||
delete[] toOrder;
|
||||
delete[] fromOrder;
|
||||
freeBlockedMatrix(&rMat);
|
||||
}
|
||||
|
||||
bool BILU0::init(BlockedMatrix *mat, unsigned int block_size)
|
||||
{
|
||||
double t1 = 0.0, t2 = 0.0;
|
||||
BlockedMatrix *CSCmat = nullptr;
|
||||
|
||||
this->N = mat->Nb * block_size;
|
||||
this->Nb = mat->Nb;
|
||||
this->nnz = mat->nnzbs * block_size * block_size;
|
||||
this->nnzbs = mat->nnzbs;
|
||||
this->block_size = block_size;
|
||||
|
||||
toOrder = new int[N];
|
||||
fromOrder = new int[N];
|
||||
if (level_scheduling) {
|
||||
CSCmat = new BlockedMatrix();
|
||||
CSCmat->Nb = Nb;
|
||||
CSCmat->nnzbs = nnzbs;
|
||||
CSCmat->nnzValues = new Block[nnzbs];
|
||||
CSCmat->colIndices = new int[nnzbs];
|
||||
CSCmat->rowPointers = new int[Nb + 1];
|
||||
if(verbosity >= 3){
|
||||
t1 = BdaSolver::second();
|
||||
}
|
||||
bcsr_to_bcsc(mat->nnzValues, mat->colIndices, mat->rowPointers, CSCmat->nnzValues, CSCmat->colIndices, CSCmat->rowPointers, mat->Nb);
|
||||
if(verbosity >= 3){
|
||||
t2 = BdaSolver::second();
|
||||
std::ostringstream out;
|
||||
out << "BILU0 convert CSR to CSC: " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
if(verbosity >= 3){
|
||||
t1 = BdaSolver::second();
|
||||
}
|
||||
rMat = allocateBlockedMatrix(mat->Nb, mat->nnzbs);
|
||||
LUMat = soft_copyBlockedMatrix(rMat);
|
||||
if (level_scheduling) {
|
||||
rowsPerColor = findLevelScheduling(mat->colIndices, mat->rowPointers, CSCmat->colIndices, CSCmat->rowPointers, mat->Nb, &numColors, toOrder, fromOrder);
|
||||
} else if (graph_coloring) {
|
||||
rowsPerColor = findGraphColoring(mat->colIndices, mat->rowPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder);
|
||||
}
|
||||
if (rowsPerColor == nullptr) {
|
||||
return false;
|
||||
}
|
||||
if(verbosity >= 3){
|
||||
t2 = BdaSolver::second();
|
||||
std::ostringstream out;
|
||||
out << "BILU0 analysis took: " << t2 - t1 << " s, " << numColors << " colors";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
diagIndex = new int[mat->Nb];
|
||||
invDiagVals = new Block[mat->Nb];
|
||||
|
||||
LMat = allocateBlockedMatrix(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
UMat = allocateBlockedMatrix(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
|
||||
LUMat->nnzValues = new Block[mat->nnzbs];
|
||||
|
||||
if (level_scheduling) {
|
||||
delete[] CSCmat->nnzValues;
|
||||
delete[] CSCmat->colIndices;
|
||||
delete[] CSCmat->rowPointers;
|
||||
delete CSCmat;
|
||||
}
|
||||
|
||||
|
||||
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Block) * LMat->nnzbs);
|
||||
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Block) * UMat->nnzbs);
|
||||
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LMat->nnzbs);
|
||||
s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * UMat->nnzbs);
|
||||
s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LMat->Nb + 1));
|
||||
s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (UMat->Nb + 1));
|
||||
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Block) * mat->Nb);
|
||||
s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
|
||||
|
||||
queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, LMat->nnzbs * sizeof(Block), LMat->nnzValues);
|
||||
queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, UMat->nnzbs * sizeof(Block), UMat->nnzValues);
|
||||
queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, LMat->nnzbs * sizeof(int), LMat->colIndices);
|
||||
queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, UMat->nnzbs * sizeof(int), UMat->colIndices);
|
||||
queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (LMat->Nb + 1) * sizeof(int), LMat->rowPointers);
|
||||
queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (UMat->Nb + 1) * sizeof(int), UMat->rowPointers);
|
||||
queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, mat->Nb * sizeof(Block), invDiagVals);
|
||||
|
||||
int *rowsPerColorPrefix = new int[numColors + 1];
|
||||
int prefix = 0;
|
||||
for (int i = 0; i < numColors + 1; ++i) {
|
||||
rowsPerColorPrefix[i] = prefix;
|
||||
prefix += rowsPerColor[i];
|
||||
}
|
||||
queue->enqueueWriteBuffer(s.rowsPerColor, CL_TRUE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix);
|
||||
delete[] rowsPerColorPrefix;
|
||||
|
||||
return true;
|
||||
} // end init()
|
||||
|
||||
|
||||
bool BILU0::create_preconditioner(BlockedMatrix *mat)
|
||||
{
|
||||
double t1 = 0.0, t2 = 0.0;
|
||||
if (verbosity >= 3){
|
||||
t1 = BdaSolver::second();
|
||||
}
|
||||
blocked_reorder_matrix_by_pattern(mat, toOrder, fromOrder, rMat);
|
||||
if (verbosity >= 3){
|
||||
t2 = BdaSolver::second();
|
||||
std::ostringstream out;
|
||||
out << "BILU0 reorder matrix: " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
|
||||
if (verbosity >= 3){
|
||||
t1 = BdaSolver::second();
|
||||
}
|
||||
memcpy(LUMat->nnzValues, rMat->nnzValues, sizeof(Block) * rMat->nnzbs);
|
||||
if (verbosity >= 3){
|
||||
t2 = BdaSolver::second();
|
||||
std::ostringstream out;
|
||||
out << "BILU0 memcpy: " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
int i, j, ij, ik, jk;
|
||||
int iRowStart, iRowEnd, jRowEnd;
|
||||
Block pivot;
|
||||
|
||||
int LSize = 0;
|
||||
const int blockSquare = block_size * block_size;
|
||||
|
||||
if (verbosity >= 3){
|
||||
t1 = BdaSolver::second();
|
||||
}
|
||||
// go through all rows
|
||||
for (i = 0; i < LUMat->Nb; i++) {
|
||||
iRowStart = LUMat->rowPointers[i];
|
||||
iRowEnd = LUMat->rowPointers[i + 1];
|
||||
|
||||
// go through all elements of the row
|
||||
for (ij = iRowStart; ij < iRowEnd; ij++) {
|
||||
j = LUMat->colIndices[ij];
|
||||
// if the element is the diagonal, store the index and go to next row
|
||||
if (j == i) {
|
||||
diagIndex[i] = ij;
|
||||
break;
|
||||
}
|
||||
// if an element beyond the diagonal is reach, no diagonal was found
|
||||
// throw an error now. TODO: perform reordering earlier to prevent this
|
||||
if (j > i) {
|
||||
std::ostringstream out;
|
||||
out << "BILU0 Error could not find diagonal value in row: " << i;
|
||||
OpmLog::error(out.str());
|
||||
return false;
|
||||
}
|
||||
|
||||
LSize++;
|
||||
// calculate the pivot of this row
|
||||
blockMult(LUMat->nnzValues[ij], invDiagVals[j], pivot);
|
||||
|
||||
memcpy(LUMat->nnzValues[ij], &pivot, sizeof(double)*block_size * block_size);
|
||||
|
||||
jRowEnd = LUMat->rowPointers[j + 1];
|
||||
jk = diagIndex[j] + 1;
|
||||
ik = ij + 1;
|
||||
// substract that row scaled by the pivot from this row.
|
||||
while (ik < iRowEnd && jk < jRowEnd) {
|
||||
if (LUMat->colIndices[ik] == LUMat->colIndices[jk]) {
|
||||
blockMultSub(LUMat->nnzValues[ik], pivot, LUMat->nnzValues[jk]);
|
||||
ik++;
|
||||
jk++;
|
||||
} else {
|
||||
if (LUMat->colIndices[ik] < LUMat->colIndices[jk])
|
||||
{ ik++; }
|
||||
else
|
||||
{ jk++; }
|
||||
}
|
||||
}
|
||||
}
|
||||
// store the inverse in the diagonal!
|
||||
blockInvert3x3(LUMat->nnzValues[ij], invDiagVals[i]);
|
||||
|
||||
memcpy(LUMat->nnzValues[ij], invDiagVals[i], sizeof(double)*block_size * block_size);
|
||||
}
|
||||
|
||||
LMat->rowPointers[0] = 0;
|
||||
UMat->rowPointers[0] = 0;
|
||||
|
||||
// Split the LU matrix into two by comparing column indices to diagonal indices
|
||||
for (i = 0; i < LUMat->Nb; i++) {
|
||||
int offsetL = LMat->rowPointers[i];
|
||||
int rowSize = diagIndex[i] - LUMat->rowPointers[i];
|
||||
int offsetLU = LUMat->rowPointers[i];
|
||||
memcpy(LMat->nnzValues[offsetL], LUMat->nnzValues[offsetLU], sizeof(double) * blockSquare * rowSize);
|
||||
memcpy(LMat->colIndices + offsetL, LUMat->colIndices + offsetLU, sizeof(int) * rowSize);
|
||||
offsetL += rowSize;
|
||||
LMat->rowPointers[i + 1] = offsetL;
|
||||
}
|
||||
// Reverse the order or the (blocked) rows for the U matrix,
|
||||
// because the rows are accessed in reverse order when applying the ILU0
|
||||
int URowIndex = 0;
|
||||
for (i = LUMat->Nb - 1; i >= 0; i--) {
|
||||
int offsetU = UMat->rowPointers[URowIndex];
|
||||
int rowSize = LUMat->rowPointers[i + 1] - diagIndex[i] - 1;
|
||||
int offsetLU = diagIndex[i] + 1;
|
||||
memcpy(UMat->nnzValues[offsetU], LUMat->nnzValues[offsetLU], sizeof(double) * blockSquare * rowSize);
|
||||
memcpy(UMat->colIndices + offsetU, LUMat->colIndices + offsetLU, sizeof(int) * rowSize);
|
||||
offsetU += rowSize;
|
||||
UMat->rowPointers[URowIndex + 1] = offsetU;
|
||||
URowIndex++;
|
||||
}
|
||||
if (verbosity >= 3) {
|
||||
t2 = BdaSolver::second();
|
||||
std::ostringstream out;
|
||||
out << "BILU0 decomposition: " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
t1 = BdaSolver::second();
|
||||
}
|
||||
if (pattern_uploaded == false) {
|
||||
queue->enqueueWriteBuffer(s.Lcols, CL_TRUE, 0, LMat->nnzbs * sizeof(int), LMat->colIndices);
|
||||
queue->enqueueWriteBuffer(s.Ucols, CL_TRUE, 0, UMat->nnzbs * sizeof(int), UMat->colIndices);
|
||||
queue->enqueueWriteBuffer(s.Lrows, CL_TRUE, 0, (LMat->Nb + 1) * sizeof(int), LMat->rowPointers);
|
||||
queue->enqueueWriteBuffer(s.Urows, CL_TRUE, 0, (UMat->Nb + 1) * sizeof(int), UMat->rowPointers);
|
||||
pattern_uploaded = true;
|
||||
}
|
||||
queue->enqueueWriteBuffer(s.Lvals, CL_TRUE, 0, LMat->nnzbs * sizeof(Block), LMat->nnzValues);
|
||||
queue->enqueueWriteBuffer(s.Uvals, CL_TRUE, 0, UMat->nnzbs * sizeof(Block), UMat->nnzValues);
|
||||
queue->enqueueWriteBuffer(s.invDiagVals, CL_TRUE, 0, Nb * sizeof(Block), invDiagVals);
|
||||
if (verbosity >= 3) {
|
||||
t2 = BdaSolver::second();
|
||||
std::ostringstream out;
|
||||
out << "BILU0 copy to GPU: " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
return true;
|
||||
} // end create_preconditioner()
|
||||
|
||||
// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
|
||||
// however, if individual kernel calls are timed, waiting for events is needed
|
||||
// behavior on other GPUs is untested
|
||||
void BILU0::apply(cl::Buffer& x, cl::Buffer& y)
|
||||
{
|
||||
double t1 = 0.0, t2 = 0.0;
|
||||
if (verbosity >= 3) {
|
||||
t1 = BdaSolver::second();
|
||||
}
|
||||
const unsigned int block_size = 3;
|
||||
cl::Event event;
|
||||
|
||||
for(unsigned int color = 0; color < numColors; ++color){
|
||||
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, (unsigned int)Nb, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
// event.wait();
|
||||
}
|
||||
for(int color = numColors-1; color >= 0; --color){
|
||||
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, (unsigned int)Nb, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
// event.wait();
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
event.wait();
|
||||
t2 = BdaSolver::second();
|
||||
std::ostringstream out;
|
||||
out << "BILU0 apply: " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void BILU0::setOpenCLContext(cl::Context *context){
|
||||
this->context = context;
|
||||
}
|
||||
void BILU0::setOpenCLQueue(cl::CommandQueue *queue){
|
||||
this->queue = queue;
|
||||
}
|
||||
void BILU0::setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group){
|
||||
this->work_group_size = work_group_size;
|
||||
this->total_work_items = total_work_items;
|
||||
this->lmem_per_work_group = lmem_per_work_group;
|
||||
}
|
||||
void BILU0::setKernels(
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2
|
||||
){
|
||||
this->ILU_apply1 = ILU_apply1;
|
||||
this->ILU_apply2 = ILU_apply2;
|
||||
}
|
||||
|
||||
} // end namespace bda
|
||||
|
||||
|
118
opm/simulators/linalg/bda/BILU0.hpp
Normal file
118
opm/simulators/linalg/bda/BILU0.hpp
Normal file
@ -0,0 +1,118 @@
|
||||
/*
|
||||
Copyright 2019 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef BILU0_HPP
|
||||
#define BILU0_HPP
|
||||
|
||||
#include <config.h> // CMake
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
|
||||
#if HAVE_OPENCL
|
||||
#define __CL_ENABLE_EXCEPTIONS
|
||||
#include <CL/cl.hpp> // up to OpenCL 1.2
|
||||
#endif
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
/// This class implementa a Blocked ILU0 preconditioner
|
||||
/// The decomposition is done on CPU, and reorders the rows of the matrix
|
||||
class BILU0
|
||||
{
|
||||
|
||||
private:
|
||||
int N; // number of rows of the matrix
|
||||
int Nb; // number of blockrows of the matrix
|
||||
int nnz; // number of nonzeroes of the matrix (scalar)
|
||||
int nnzbs; // number of blocks of the matrix
|
||||
unsigned int block_size;
|
||||
BlockedMatrix *LMat, *UMat, *LUMat;
|
||||
BlockedMatrix *rMat = nullptr; // only used with PAR_SIM
|
||||
Block *invDiagVals;
|
||||
int *diagIndex, *rowsPerColor;
|
||||
int *toOrder, *fromOrder;
|
||||
int numColors;
|
||||
int verbosity;
|
||||
|
||||
bool level_scheduling, graph_coloring;
|
||||
|
||||
typedef struct {
|
||||
cl::Buffer Lvals, Uvals, invDiagVals;
|
||||
cl::Buffer Lcols, Lrows;
|
||||
cl::Buffer Ucols, Urows;
|
||||
cl::Buffer rowsPerColor;
|
||||
} GPU_storage;
|
||||
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1;
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2;
|
||||
GPU_storage s;
|
||||
cl::Context *context;
|
||||
cl::CommandQueue *queue;
|
||||
int work_group_size = 0;
|
||||
int total_work_items = 0;
|
||||
int lmem_per_work_group = 0;
|
||||
bool pattern_uploaded = false;
|
||||
|
||||
public:
|
||||
|
||||
BILU0(bool level_scheduling, bool graph_coloring, int verbosity);
|
||||
|
||||
~BILU0();
|
||||
|
||||
// analysis
|
||||
bool init(BlockedMatrix *mat, unsigned int block_size);
|
||||
|
||||
// ilu_decomposition
|
||||
bool create_preconditioner(BlockedMatrix *mat);
|
||||
|
||||
// apply preconditioner, y = prec(x)
|
||||
void apply(cl::Buffer& x, cl::Buffer& y);
|
||||
|
||||
void setOpenCLContext(cl::Context *context);
|
||||
void setOpenCLQueue(cl::CommandQueue *queue);
|
||||
void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group);
|
||||
void setKernels(
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2
|
||||
);
|
||||
|
||||
int* getToOrder()
|
||||
{
|
||||
return toOrder;
|
||||
}
|
||||
|
||||
int* getFromOrder()
|
||||
{
|
||||
return fromOrder;
|
||||
}
|
||||
|
||||
BlockedMatrix* getRMat()
|
||||
{
|
||||
return rMat;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // end namespace bda
|
||||
|
||||
#endif
|
||||
|
@ -35,11 +35,24 @@ typedef Dune::InverseOperatorResult InverseOperatorResult;
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity, int maxit, double tolerance)
|
||||
: use_gpu(use_gpu_)
|
||||
using bda::BdaResult;
|
||||
using bda::BdaSolver;
|
||||
|
||||
BdaBridge::BdaBridge(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance)
|
||||
: gpu_mode(gpu_mode_)
|
||||
{
|
||||
if (use_gpu) {
|
||||
backend.reset(new cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
|
||||
if (gpu_mode == "cusparse") {
|
||||
use_gpu = true;
|
||||
backend.reset(new bda::cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
|
||||
} else if (gpu_mode == "opencl") {
|
||||
#if HAVE_OPENCL
|
||||
use_gpu = true;
|
||||
backend.reset(new bda::openclSolverBackend(linear_solver_verbosity, maxit, tolerance));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
|
||||
#endif
|
||||
} else if (gpu_mode != "none") {
|
||||
OPM_THROW(std::logic_error, "Error unknown value for parameter 'GpuMode', should be passed like '--gpu-mode=[none|cusparse|opencl]");
|
||||
}
|
||||
}
|
||||
|
||||
@ -159,21 +172,21 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
|
||||
/////////////////////////
|
||||
// actually solve
|
||||
|
||||
typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
|
||||
typedef BdaSolver::BdaSolverStatus BdaSolverStatus;
|
||||
// assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour
|
||||
cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
|
||||
BdaSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
|
||||
switch(status) {
|
||||
case cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS:
|
||||
case BdaSolverStatus::BDA_SOLVER_SUCCESS:
|
||||
//OpmLog::info("cusparseSolver converged");
|
||||
break;
|
||||
case cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED:
|
||||
OpmLog::warning("cusparseSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
|
||||
case BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED:
|
||||
OpmLog::warning("BdaSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
|
||||
break;
|
||||
case cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED:
|
||||
OpmLog::warning("cusparseSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
|
||||
case BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED:
|
||||
OpmLog::warning("BdaSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
|
||||
break;
|
||||
default:
|
||||
OpmLog::warning("cusparseSolver returned unknown status code");
|
||||
OpmLog::warning("BdaSolver returned unknown status code");
|
||||
}
|
||||
|
||||
res.iterations = result.iterations;
|
||||
@ -190,7 +203,7 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
|
||||
template <class BridgeVector>
|
||||
void BdaBridge::get_result(BridgeVector &x OPM_UNUSED) {
|
||||
if (use_gpu) {
|
||||
backend->post_process(static_cast<double*>(&(x[0][0])));
|
||||
backend->get_result(static_cast<double*>(&(x[0][0])));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -22,10 +22,6 @@
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#if ! HAVE_CUDA
|
||||
#error "This file should only be included if CUDA is found"
|
||||
#endif
|
||||
|
||||
#include "dune/istl/solver.hh" // for struct InverseOperatorResult
|
||||
|
||||
#include "dune/istl/bcrsmatrix.hh"
|
||||
@ -33,7 +29,13 @@
|
||||
|
||||
#include <opm/simulators/linalg/bda/WellContributions.hpp>
|
||||
|
||||
#if HAVE_CUDA
|
||||
#include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
|
||||
#endif
|
||||
|
||||
#if HAVE_OPENCL
|
||||
#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
|
||||
#endif
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
@ -45,16 +47,17 @@ typedef Dune::InverseOperatorResult InverseOperatorResult;
|
||||
class BdaBridge
|
||||
{
|
||||
private:
|
||||
std::unique_ptr<cusparseSolverBackend> backend;
|
||||
std::unique_ptr<bda::BdaSolver> backend;
|
||||
std::string gpu_mode;
|
||||
bool use_gpu;
|
||||
|
||||
public:
|
||||
/// Construct a BdaBridge
|
||||
/// \param[in] use_gpu true iff the cusparseSolver is used, is passed via command-line: '--use-gpu=[true|false]'
|
||||
/// \param[in] linear_solver_verbosity verbosity of cusparseSolver
|
||||
/// \param[in] maxit maximum number of iterations for cusparseSolver
|
||||
/// \param[in] tolerance required relative tolerance for cusparseSolver
|
||||
BdaBridge(bool use_gpu, int linear_solver_verbosity, int maxit, double tolerance);
|
||||
/// \param[in] gpu_mode to select if a gpu solver is used, is passed via command-line: '--gpu-mode=[none|cusparse|opencl]'
|
||||
/// \param[in] linear_solver_verbosity verbosity of BdaSolver
|
||||
/// \param[in] maxit maximum number of iterations for BdaSolver
|
||||
/// \param[in] tolerance required relative tolerance for BdaSolver
|
||||
BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance);
|
||||
|
||||
|
||||
/// Solve linear system, A*x = b
|
||||
|
@ -20,7 +20,7 @@
|
||||
#ifndef BDARESULT_HEADER_INCLUDED
|
||||
#define BDARESULT_HEADER_INCLUDED
|
||||
|
||||
namespace Opm
|
||||
namespace bda
|
||||
{
|
||||
|
||||
/// This class is based on InverseOperatorResult struct from dune/istl/solver.hh
|
||||
|
94
opm/simulators/linalg/bda/BdaSolver.hpp
Normal file
94
opm/simulators/linalg/bda/BdaSolver.hpp
Normal file
@ -0,0 +1,94 @@
|
||||
/*
|
||||
Copyright 2019 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_BDASOLVER_BACKEND_HEADER_INCLUDED
|
||||
#define OPM_BDASOLVER_BACKEND_HEADER_INCLUDED
|
||||
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaResult.hpp>
|
||||
#include <opm/simulators/linalg/bda/WellContributions.hpp>
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
using Opm::WellContributions;
|
||||
|
||||
/// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver
|
||||
/// This class is abstract, no instantiations can of it can be made, only of its children
|
||||
class BdaSolver
|
||||
{
|
||||
|
||||
protected:
|
||||
|
||||
// verbosity
|
||||
// 0: print nothing during solves, only when initializing
|
||||
// 1: print number of iterations and final norm
|
||||
// 2: also print norm each iteration
|
||||
// 3: also print timings of different backend functions
|
||||
|
||||
int verbosity = 0;
|
||||
|
||||
int maxit = 200;
|
||||
double tolerance = 1e-2;
|
||||
|
||||
int N; // number of rows
|
||||
int Nb; // number of blocked rows (Nb*block_size == N)
|
||||
int nnz; // number of nonzeroes (scalars)
|
||||
int nnzb; // number of nonzero blocks (nnzb*block_size*block_size == nnz)
|
||||
int block_size; // size of block
|
||||
|
||||
bool initialized = false;
|
||||
|
||||
public:
|
||||
|
||||
enum class BdaSolverStatus {
|
||||
BDA_SOLVER_SUCCESS,
|
||||
BDA_SOLVER_ANALYSIS_FAILED,
|
||||
BDA_SOLVER_CREATE_PRECONDITIONER_FAILED,
|
||||
BDA_SOLVER_UNKNOWN_ERROR
|
||||
};
|
||||
|
||||
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {};
|
||||
|
||||
/// Define virtual destructor, so that the derivedclass destructor will be called
|
||||
virtual ~BdaSolver() {};
|
||||
|
||||
/// Define as pure virtual functions, so derivedclass must implement them
|
||||
virtual BdaSolverStatus solve_system(int N, int nnz, int dim,
|
||||
double *vals, int *rows, int *cols,
|
||||
double *b, WellContributions& wellContribs, BdaResult &res) = 0;
|
||||
|
||||
virtual void get_result(double *x) = 0;
|
||||
|
||||
/// Different implementations of BdaSolver can use this function for timing
|
||||
static double second(void) {
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, nullptr);
|
||||
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
|
||||
}
|
||||
|
||||
}; // end class BdaSolver
|
||||
|
||||
} // end namespace bda
|
||||
|
||||
#endif
|
156
opm/simulators/linalg/bda/BlockedMatrix.cpp
Normal file
156
opm/simulators/linalg/bda/BlockedMatrix.cpp
Normal file
@ -0,0 +1,156 @@
|
||||
/*
|
||||
Copyright 2019 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <sys/time.h>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
|
||||
using bda::BlockedMatrix;
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
BlockedMatrix *allocateBlockedMatrix(int Nb, int nnzbs) {
|
||||
BlockedMatrix *mat = new BlockedMatrix();
|
||||
|
||||
mat->nnzValues = new Block[nnzbs];
|
||||
mat->colIndices = new int[nnzbs];
|
||||
mat->rowPointers = new int[Nb + 1];
|
||||
mat->Nb = Nb;
|
||||
mat->nnzbs = nnzbs;
|
||||
|
||||
return mat;
|
||||
}
|
||||
|
||||
void freeBlockedMatrix(BlockedMatrix **mat) {
|
||||
if (*mat) {
|
||||
delete[] (*mat)->nnzValues;
|
||||
delete[] (*mat)->colIndices;
|
||||
delete[] (*mat)->rowPointers;
|
||||
delete (*mat);
|
||||
*mat = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
BlockedMatrix *soft_copyBlockedMatrix(BlockedMatrix *mat) {
|
||||
BlockedMatrix *res = new BlockedMatrix();
|
||||
res->nnzValues = mat->nnzValues;
|
||||
res->colIndices = mat->colIndices;
|
||||
res->rowPointers = mat->rowPointers;
|
||||
res->Nb = mat->Nb;
|
||||
res->nnzbs = mat->nnzbs;
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
/*Sort a row of matrix elements from a blocked CSR-format.*/
|
||||
|
||||
void sortBlockedRow(int *colIndices, Block *data, int left, int right) {
|
||||
int l = left;
|
||||
int r = right;
|
||||
int middle = colIndices[(l + r) >> 1];
|
||||
double lDatum[BLOCK_SIZE * BLOCK_SIZE];
|
||||
do {
|
||||
while (colIndices[l] < middle)
|
||||
l++;
|
||||
while (colIndices[r] > middle)
|
||||
r--;
|
||||
if (l <= r) {
|
||||
int lColIndex = colIndices[l];
|
||||
colIndices[l] = colIndices[r];
|
||||
colIndices[r] = lColIndex;
|
||||
memcpy(lDatum, data + l, sizeof(double) * BLOCK_SIZE * BLOCK_SIZE);
|
||||
memcpy(data + l, data + r, sizeof(double) * BLOCK_SIZE * BLOCK_SIZE);
|
||||
memcpy(data + r, lDatum, sizeof(double) * BLOCK_SIZE * BLOCK_SIZE);
|
||||
|
||||
l++;
|
||||
r--;
|
||||
}
|
||||
} while (l < r);
|
||||
|
||||
if (left < r)
|
||||
sortBlockedRow(colIndices, data, left, r);
|
||||
|
||||
if (right > l)
|
||||
sortBlockedRow(colIndices, data, l, right);
|
||||
}
|
||||
|
||||
|
||||
// LUMat->nnzValues[ik] = LUMat->nnzValues[ik] - (pivot * LUMat->nnzValues[jk]) in ilu decomposition
|
||||
// a = a - (b * c)
|
||||
void blockMultSub(Block a, Block b, Block c)
|
||||
{
|
||||
for (int row = 0; row < BLOCK_SIZE; row++) {
|
||||
for (int col = 0; col < BLOCK_SIZE; col++) {
|
||||
double temp = 0.0;
|
||||
for (int k = 0; k < BLOCK_SIZE; k++) {
|
||||
temp += b[BLOCK_SIZE * row + k] * c[BLOCK_SIZE * k + col];
|
||||
}
|
||||
a[BLOCK_SIZE * row + col] -= temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*Perform a 3x3 matrix-matrix multiplicationj on two blocks*/
|
||||
|
||||
void blockMult(Block mat1, Block mat2, Block resMat) {
|
||||
for (int row = 0; row < BLOCK_SIZE; row++) {
|
||||
for (int col = 0; col < BLOCK_SIZE; col++) {
|
||||
double temp = 0;
|
||||
for (int k = 0; k < BLOCK_SIZE; k++) {
|
||||
temp += mat1[BLOCK_SIZE * row + k] * mat2[BLOCK_SIZE * k + col];
|
||||
}
|
||||
resMat[BLOCK_SIZE * row + col] = temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Calculate the inverse of a block. This function is specific for only 3x3 block size.*/
|
||||
|
||||
void blockInvert3x3(Block mat, Block res) {
|
||||
// code generated by maple, copied from DUNE
|
||||
double t4 = mat[0] * mat[4];
|
||||
double t6 = mat[0] * mat[5];
|
||||
double t8 = mat[1] * mat[3];
|
||||
double t10 = mat[2] * mat[3];
|
||||
double t12 = mat[1] * mat[6];
|
||||
double t14 = mat[2] * mat[6];
|
||||
|
||||
double det = (t4 * mat[8] - t6 * mat[7] - t8 * mat[8] +
|
||||
t10 * mat[7] + t12 * mat[5] - t14 * mat[4]);
|
||||
double t17 = 1.0 / det;
|
||||
|
||||
res[0] = (mat[4] * mat[8] - mat[5] * mat[7]) * t17;
|
||||
res[1] = -(mat[1] * mat[8] - mat[2] * mat[7]) * t17;
|
||||
res[2] = (mat[1] * mat[5] - mat[2] * mat[4]) * t17;
|
||||
res[3] = -(mat[3] * mat[8] - mat[5] * mat[6]) * t17;
|
||||
res[4] = (mat[0] * mat[8] - t14) * t17;
|
||||
res[5] = -(t6 - t10) * t17;
|
||||
res[6] = (mat[3] * mat[7] - mat[4] * mat[6]) * t17;
|
||||
res[7] = -(mat[0] * mat[7] - t12) * t17;
|
||||
res[8] = (t4 - t8) * t17;
|
||||
}
|
||||
|
||||
|
||||
} // end namespace bda
|
80
opm/simulators/linalg/bda/BlockedMatrix.hpp
Normal file
80
opm/simulators/linalg/bda/BlockedMatrix.hpp
Normal file
@ -0,0 +1,80 @@
|
||||
/*
|
||||
Copyright 2019 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef BLOCKED_MATRIX_HPP
|
||||
#define BLOCKED_MATRIX_HPP
|
||||
|
||||
#define BLOCK_SIZE 3
|
||||
|
||||
typedef double Block[9];
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
typedef struct {
|
||||
Block *nnzValues;
|
||||
int *colIndices;
|
||||
int *rowPointers;
|
||||
int Nb;
|
||||
int nnzbs;
|
||||
} BlockedMatrix;
|
||||
|
||||
/// Allocate BlockedMatrix and data arrays with given sizes
|
||||
/// \param[in] Nb number of blockrows
|
||||
/// \param[in] nnzbs number of nonzero blocks
|
||||
/// \return pointer to BlockedMatrix
|
||||
BlockedMatrix *allocateBlockedMatrix(int Nb, int nnzbs);
|
||||
|
||||
/// Allocate BlockedMatrix, but copy data pointers instead of allocating new memory
|
||||
/// \param[in] mat matrix to be copied
|
||||
/// \return pointer to BlockedMatrix
|
||||
BlockedMatrix *soft_copyBlockedMatrix(BlockedMatrix *mat);
|
||||
|
||||
/// Free BlockedMatrix and its data
|
||||
/// \param[in] mat matrix to be free'd
|
||||
void freeBlockedMatrix(BlockedMatrix **mat);
|
||||
|
||||
/// Sort a row of matrix elements from a blocked CSR-format
|
||||
/// \param[inout] colIndices
|
||||
/// \param[inout] data
|
||||
/// \param[in] left lower index of data of row
|
||||
/// \param[in] right upper index of data of row
|
||||
void sortBlockedRow(int *colIndices, Block *data, int left, int right);
|
||||
|
||||
/// Multiply and subtract blocks
|
||||
/// a = a - (b * c)
|
||||
/// \param[inout] a block to be subtracted from
|
||||
/// \param[in] b input block
|
||||
/// \param[in] c input block
|
||||
void blockMultSub(Block a, Block b, Block c);
|
||||
|
||||
/// Perform a 3x3 matrix-matrix multiplication on two blocks
|
||||
/// \param[in] mat1 input block 1
|
||||
/// \param[in] mat2 input block 2
|
||||
/// \param[inout] resMat output block
|
||||
void blockMult(Block mat1, Block mat2, Block resMat);
|
||||
|
||||
/// Calculate the inverse of a block. This function is specific for only 3x3 block size.
|
||||
/// \param[in] mat input block
|
||||
/// \param[inout] res output block
|
||||
void blockInvert3x3(Block mat, Block res);
|
||||
|
||||
} // end namespace bda
|
||||
|
||||
#endif
|
@ -30,94 +30,108 @@
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
MultisegmentWellContribution::MultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
|
||||
unsigned int Nb_, unsigned int Mb_,
|
||||
unsigned int BnumBlocks_, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
|
||||
unsigned int DnumBlocks_, double *Dvalues, int *DcolPointers, int *DrowIndices,
|
||||
std::vector<double> &Cvalues)
|
||||
:
|
||||
dim(dim_), // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
|
||||
dim_wells(dim_wells_), // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
|
||||
N(Nb_*dim), // number of rows in vectors x and y, N == dim*Nb
|
||||
Nb(Nb_), // number of blockrows in x and y
|
||||
M(Mb_*dim_wells), // number of rows, M == dim_wells*Mb
|
||||
Mb(Mb_), // number of blockrows in C, D and B
|
||||
DnumBlocks(DnumBlocks_), // number of blocks in D
|
||||
BnumBlocks(BnumBlocks_), // number of blocks in C and B
|
||||
// copy data for matrix D into vectors to prevent it going out of scope
|
||||
Dvals(Dvalues, Dvalues + DnumBlocks*dim_wells*dim_wells),
|
||||
Dcols(DcolPointers, DcolPointers + M + 1),
|
||||
Drows(DrowIndices, DrowIndices + DnumBlocks*dim_wells*dim_wells)
|
||||
{
|
||||
Cvals = std::move(Cvalues);
|
||||
Bvals = std::move(Bvalues);
|
||||
Bcols = std::move(BcolIndices);
|
||||
Brows = std::move(BrowPointers);
|
||||
MultisegmentWellContribution::MultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
|
||||
unsigned int Nb_, unsigned int Mb_,
|
||||
unsigned int BnumBlocks_, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
|
||||
unsigned int DnumBlocks_, double *Dvalues, int *DcolPointers, int *DrowIndices,
|
||||
std::vector<double> &Cvalues)
|
||||
:
|
||||
dim(dim_), // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
|
||||
dim_wells(dim_wells_), // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
|
||||
N(Nb_ * dim), // number of rows in vectors x and y, N == dim*Nb
|
||||
Nb(Nb_), // number of blockrows in x and y
|
||||
M(Mb_ * dim_wells), // number of rows, M == dim_wells*Mb
|
||||
Mb(Mb_), // number of blockrows in C, D and B
|
||||
DnumBlocks(DnumBlocks_), // number of blocks in D
|
||||
BnumBlocks(BnumBlocks_), // number of blocks in C and B
|
||||
// copy data for matrix D into vectors to prevent it going out of scope
|
||||
Dvals(Dvalues, Dvalues + DnumBlocks * dim_wells * dim_wells),
|
||||
Dcols(DcolPointers, DcolPointers + M + 1),
|
||||
Drows(DrowIndices, DrowIndices + DnumBlocks * dim_wells * dim_wells)
|
||||
{
|
||||
Cvals = std::move(Cvalues);
|
||||
Bvals = std::move(Bvalues);
|
||||
Bcols = std::move(BcolIndices);
|
||||
Brows = std::move(BrowPointers);
|
||||
|
||||
z1.resize(Mb * dim_wells);
|
||||
z2.resize(Mb * dim_wells);
|
||||
z1.resize(Mb * dim_wells);
|
||||
z2.resize(Mb * dim_wells);
|
||||
|
||||
umfpack_di_symbolic(M, M, Dcols.data(), Drows.data(), Dvals.data(), &UMFPACK_Symbolic, nullptr, nullptr);
|
||||
umfpack_di_numeric(Dcols.data(), Drows.data(), Dvals.data(), UMFPACK_Symbolic, &UMFPACK_Numeric, nullptr, nullptr);
|
||||
}
|
||||
umfpack_di_symbolic(M, M, Dcols.data(), Drows.data(), Dvals.data(), &UMFPACK_Symbolic, nullptr, nullptr);
|
||||
umfpack_di_numeric(Dcols.data(), Drows.data(), Dvals.data(), UMFPACK_Symbolic, &UMFPACK_Numeric, nullptr, nullptr);
|
||||
}
|
||||
|
||||
MultisegmentWellContribution::~MultisegmentWellContribution()
|
||||
{
|
||||
umfpack_di_free_symbolic(&UMFPACK_Symbolic);
|
||||
umfpack_di_free_numeric(&UMFPACK_Numeric);
|
||||
}
|
||||
MultisegmentWellContribution::~MultisegmentWellContribution()
|
||||
{
|
||||
umfpack_di_free_symbolic(&UMFPACK_Symbolic);
|
||||
umfpack_di_free_numeric(&UMFPACK_Numeric);
|
||||
}
|
||||
|
||||
|
||||
// Apply the MultisegmentWellContribution, similar to MultisegmentWell::apply()
|
||||
// h_x and h_y reside on host
|
||||
// y -= (C^T * (D^-1 * (B * x)))
|
||||
void MultisegmentWellContribution::apply(double *h_x, double *h_y)
|
||||
{
|
||||
// reset z1 and z2
|
||||
std::fill(z1.begin(), z1.end(), 0.0);
|
||||
std::fill(z2.begin(), z2.end(), 0.0);
|
||||
// Apply the MultisegmentWellContribution, similar to MultisegmentWell::apply()
|
||||
// h_x and h_y reside on host
|
||||
// y -= (C^T * (D^-1 * (B * x)))
|
||||
void MultisegmentWellContribution::apply(double *h_x, double *h_y)
|
||||
{
|
||||
// reset z1 and z2
|
||||
std::fill(z1.begin(), z1.end(), 0.0);
|
||||
std::fill(z2.begin(), z2.end(), 0.0);
|
||||
|
||||
// z1 = B * x
|
||||
for (unsigned int row = 0; row < Mb; ++row) {
|
||||
// for every block in the row
|
||||
for (unsigned int blockID = Brows[row]; blockID < Brows[row+1]; ++blockID) {
|
||||
unsigned int colIdx = Bcols[blockID];
|
||||
for (unsigned int j = 0; j < dim_wells; ++j) {
|
||||
double temp = 0.0;
|
||||
for (unsigned int k = 0; k < dim; ++k) {
|
||||
temp += Bvals[blockID * dim * dim_wells + j * dim + k] * h_x[colIdx * dim + k];
|
||||
}
|
||||
z1[row * dim_wells + j] += temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// z2 = D^-1 * (B * x)
|
||||
// umfpack
|
||||
umfpack_di_solve(UMFPACK_A, Dcols.data(), Drows.data(), Dvals.data(), z2.data(), z1.data(), UMFPACK_Numeric, nullptr, nullptr);
|
||||
|
||||
// y -= (C^T * z2)
|
||||
// y -= (C^T * (D^-1 * (B * x)))
|
||||
for (unsigned int row = 0; row < Mb; ++row) {
|
||||
// for every block in the row
|
||||
for (unsigned int blockID = Brows[row]; blockID < Brows[row+1]; ++blockID) {
|
||||
unsigned int colIdx = Bcols[blockID];
|
||||
for (unsigned int j = 0; j < dim; ++j) {
|
||||
double temp = 0.0;
|
||||
for (unsigned int k = 0; k < dim_wells; ++k) {
|
||||
temp += Cvals[blockID * dim * dim_wells + j + k * dim] * z2[row * dim_wells + k];
|
||||
}
|
||||
h_y[colIdx * dim + j] -= temp;
|
||||
}
|
||||
// z1 = B * x
|
||||
for (unsigned int row = 0; row < Mb; ++row) {
|
||||
// for every block in the row
|
||||
for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
|
||||
unsigned int colIdx = getColIdx(Bcols[blockID]);
|
||||
for (unsigned int j = 0; j < dim_wells; ++j) {
|
||||
double temp = 0.0;
|
||||
for (unsigned int k = 0; k < dim; ++k) {
|
||||
temp += Bvals[blockID * dim * dim_wells + j * dim + k] * h_x[colIdx * dim + k];
|
||||
}
|
||||
z1[row * dim_wells + j] += temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MultisegmentWellContribution::setCudaStream(cudaStream_t stream_)
|
||||
{
|
||||
stream = stream_;
|
||||
// z2 = D^-1 * (B * x)
|
||||
// umfpack
|
||||
umfpack_di_solve(UMFPACK_A, Dcols.data(), Drows.data(), Dvals.data(), z2.data(), z1.data(), UMFPACK_Numeric, nullptr, nullptr);
|
||||
|
||||
// y -= (C^T * z2)
|
||||
// y -= (C^T * (D^-1 * (B * x)))
|
||||
for (unsigned int row = 0; row < Mb; ++row) {
|
||||
// for every block in the row
|
||||
for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
|
||||
unsigned int colIdx = getColIdx(Bcols[blockID]);
|
||||
for (unsigned int j = 0; j < dim; ++j) {
|
||||
double temp = 0.0;
|
||||
for (unsigned int k = 0; k < dim_wells; ++k) {
|
||||
temp += Cvals[blockID * dim * dim_wells + j + k * dim] * z2[row * dim_wells + k];
|
||||
}
|
||||
h_y[colIdx * dim + j] -= temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MultisegmentWellContribution::setCudaStream(cudaStream_t stream_)
|
||||
{
|
||||
stream = stream_;
|
||||
}
|
||||
|
||||
unsigned int MultisegmentWellContribution::getColIdx(unsigned int idx)
|
||||
{
|
||||
if (reorder) {
|
||||
return toOrder[idx];
|
||||
} else {
|
||||
return idx;
|
||||
}
|
||||
}
|
||||
|
||||
void MultisegmentWellContribution::setReordering(int *toOrder, bool reorder)
|
||||
{
|
||||
this->toOrder = toOrder;
|
||||
this->reorder = reorder;
|
||||
}
|
||||
|
||||
} //namespace Opm
|
||||
|
||||
|
@ -24,85 +24,101 @@
|
||||
|
||||
#include <vector>
|
||||
|
||||
#if HAVE_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
/// This class serves to duplicate the functionality of the MultisegmentWell
|
||||
/// A MultisegmentWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
|
||||
/// B and C are matrices, with M rows and N columns, where N is the size of the matrix. They contain blocks of MultisegmentWell::numEq by MultisegmentWell::numWellEq.
|
||||
/// D is a MxM matrix, the square blocks have size MultisegmentWell::numWellEq.
|
||||
/// B*x and D*B*x are a vector with M*numWellEq doubles
|
||||
/// C*D*B*x is a vector with N*numEq doubles.
|
||||
|
||||
class MultisegmentWellContribution
|
||||
{
|
||||
/// This class serves to duplicate the functionality of the MultisegmentWell
|
||||
/// A MultisegmentWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
|
||||
/// B and C are matrices, with M rows and N columns, where N is the size of the matrix. They contain blocks of MultisegmentWell::numEq by MultisegmentWell::numWellEq.
|
||||
/// D is a MxM matrix, the square blocks have size MultisegmentWell::numWellEq.
|
||||
/// B*x and D*B*x are a vector with M*numWellEq doubles
|
||||
/// C*D*B*x is a vector with N*numEq doubles.
|
||||
|
||||
private:
|
||||
unsigned int dim; // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
|
||||
unsigned int dim_wells; // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
|
||||
unsigned int N; // number of rows in vectors x and y, N == dim*Nb
|
||||
unsigned int Nb; // number of blockrows in x and y
|
||||
unsigned int M; // number of rows, M == dim_wells*Mb
|
||||
unsigned int Mb; // number of blockrows in C, D and B
|
||||
class MultisegmentWellContribution
|
||||
{
|
||||
|
||||
cudaStream_t stream; // not actually used yet, will be when MultisegmentWellContribution are applied on GPU
|
||||
private:
|
||||
unsigned int dim; // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
|
||||
unsigned int dim_wells; // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
|
||||
unsigned int N; // number of rows in vectors x and y, N == dim*Nb
|
||||
unsigned int Nb; // number of blockrows in x and y
|
||||
unsigned int M; // number of rows, M == dim_wells*Mb
|
||||
unsigned int Mb; // number of blockrows in C, D and B
|
||||
|
||||
// C and B are stored in BCRS format, D is stored in CSC format (Dune::UMFPack)
|
||||
// Sparsity pattern for C is not stored, since it is the same as B
|
||||
unsigned int DnumBlocks; // number of blocks in D
|
||||
unsigned int BnumBlocks; // number of blocks in C and B
|
||||
std::vector<double> Cvals;
|
||||
std::vector<double> Dvals;
|
||||
std::vector<double> Bvals;
|
||||
std::vector<int> Dcols; // Columnpointers, contains M+1 entries
|
||||
std::vector<unsigned int> Bcols;
|
||||
std::vector<int> Drows; // Rowindicies, contains DnumBlocks*dim*dim_wells entries
|
||||
std::vector<unsigned int> Brows;
|
||||
std::vector<double> z1; // z1 = B * x
|
||||
std::vector<double> z2; // z2 = D^-1 * B * x
|
||||
void *UMFPACK_Symbolic, *UMFPACK_Numeric;
|
||||
#if HAVE_CUDA
|
||||
cudaStream_t stream; // not actually used yet, will be when MultisegmentWellContribution are applied on GPU
|
||||
#endif
|
||||
|
||||
// C and B are stored in BCRS format, D is stored in CSC format (Dune::UMFPack)
|
||||
// Sparsity pattern for C is not stored, since it is the same as B
|
||||
unsigned int DnumBlocks; // number of blocks in D
|
||||
unsigned int BnumBlocks; // number of blocks in C and B
|
||||
std::vector<double> Cvals;
|
||||
std::vector<double> Dvals;
|
||||
std::vector<double> Bvals;
|
||||
std::vector<int> Dcols; // Columnpointers, contains M+1 entries
|
||||
std::vector<unsigned int> Bcols;
|
||||
std::vector<int> Drows; // Rowindicies, contains DnumBlocks*dim*dim_wells entries
|
||||
std::vector<unsigned int> Brows;
|
||||
std::vector<double> z1; // z1 = B * x
|
||||
std::vector<double> z2; // z2 = D^-1 * B * x
|
||||
void *UMFPACK_Symbolic, *UMFPACK_Numeric;
|
||||
|
||||
public:
|
||||
int *toOrder = nullptr;
|
||||
bool reorder = false;
|
||||
|
||||
/// Set a cudaStream to be used
|
||||
/// \param[in] stream the cudaStream that is used
|
||||
void setCudaStream(cudaStream_t stream);
|
||||
/// Translate the columnIndex if needed
|
||||
/// Some preconditioners reorder the rows of the matrix, this means the columnIndices of the wellcontributions need to be reordered as well
|
||||
unsigned int getColIdx(unsigned int idx);
|
||||
|
||||
/// Create a new MultisegmentWellContribution
|
||||
/// Matrices C and B are passed in Blocked CSR, matrix D in CSC
|
||||
/// The variables representing C, B and D will go out of scope when MultisegmentWell::addWellContribution() ends
|
||||
/// \param[in] dim size of blocks in blockvectors x and y, equal to MultisegmentWell::numEq
|
||||
/// \param[in] dim_wells size of blocks of C, B and D, equal to MultisegmentWell::numWellEq
|
||||
/// \param[in] Nb number of blocks in vectors x and y
|
||||
/// \param[in] Mb number of blockrows in C, B and D
|
||||
/// \param[in] BnumBlocks number of blocks in C and B
|
||||
/// \param[in] Bvalues nonzero values of matrix B
|
||||
/// \param[in] BcolIndices columnindices of blocks of matrix B
|
||||
/// \param[in] BrowPointers rowpointers of matrix B
|
||||
/// \param[in] DnumBlocks number of blocks in D
|
||||
/// \param[in] Dvalues nonzero values of matrix D
|
||||
/// \param[in] DcolPointers columnpointers of matrix D
|
||||
/// \param[in] DrowIndices rowindices of matrix D
|
||||
/// \param[in] Cvalues nonzero values of matrix C
|
||||
MultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
|
||||
unsigned int Nb, unsigned int Mb,
|
||||
unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
|
||||
unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
|
||||
std::vector<double> &Cvalues);
|
||||
public:
|
||||
|
||||
/// Destroy a MultisegmentWellContribution, and free memory
|
||||
~MultisegmentWellContribution();
|
||||
#if HAVE_CUDA
|
||||
/// Set a cudaStream to be used
|
||||
/// \param[in] stream the cudaStream that is used
|
||||
void setCudaStream(cudaStream_t stream);
|
||||
#endif
|
||||
|
||||
/// Apply the MultisegmentWellContribution on CPU
|
||||
/// performs y -= (C^T * (D^-1 * (B*x))) for MultisegmentWell
|
||||
/// \param[in] h_x vector x, must be on CPU
|
||||
/// \param[inout] h_y vector y, must be on CPU
|
||||
void apply(double *h_x, double *h_y);
|
||||
/// Create a new MultisegmentWellContribution
|
||||
/// Matrices C and B are passed in Blocked CSR, matrix D in CSC
|
||||
/// The variables representing C, B and D will go out of scope when MultisegmentWell::addWellContribution() ends
|
||||
/// \param[in] dim size of blocks in blockvectors x and y, equal to MultisegmentWell::numEq
|
||||
/// \param[in] dim_wells size of blocks of C, B and D, equal to MultisegmentWell::numWellEq
|
||||
/// \param[in] Nb number of blocks in vectors x and y
|
||||
/// \param[in] Mb number of blockrows in C, B and D
|
||||
/// \param[in] BnumBlocks number of blocks in C and B
|
||||
/// \param[in] Bvalues nonzero values of matrix B
|
||||
/// \param[in] BcolIndices columnindices of blocks of matrix B
|
||||
/// \param[in] BrowPointers rowpointers of matrix B
|
||||
/// \param[in] DnumBlocks number of blocks in D
|
||||
/// \param[in] Dvalues nonzero values of matrix D
|
||||
/// \param[in] DcolPointers columnpointers of matrix D
|
||||
/// \param[in] DrowIndices rowindices of matrix D
|
||||
/// \param[in] Cvalues nonzero values of matrix C
|
||||
MultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
|
||||
unsigned int Nb, unsigned int Mb,
|
||||
unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
|
||||
unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
|
||||
std::vector<double> &Cvalues);
|
||||
|
||||
};
|
||||
/// Destroy a MultisegmentWellContribution, and free memory
|
||||
~MultisegmentWellContribution();
|
||||
|
||||
/// Apply the MultisegmentWellContribution on CPU
|
||||
/// performs y -= (C^T * (D^-1 * (B*x))) for MultisegmentWell
|
||||
/// \param[in] h_x vector x, must be on CPU
|
||||
/// \param[inout] h_y vector y, must be on CPU
|
||||
void apply(double *h_x, double *h_y);
|
||||
|
||||
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
|
||||
/// Those indices need to be mapped via toOrder
|
||||
/// \param[in] toOrder array with mappings
|
||||
void setReordering(int *toOrder, bool reorder);
|
||||
};
|
||||
|
||||
} //namespace Opm
|
||||
|
||||
|
343
opm/simulators/linalg/bda/Reorder.cpp
Normal file
343
opm/simulators/linalg/bda/Reorder.cpp
Normal file
@ -0,0 +1,343 @@
|
||||
/*
|
||||
Copyright 2019 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
#include <algorithm> // for fill()
|
||||
#include <sys/time.h>
|
||||
|
||||
#include <opm/simulators/linalg/bda/Reorder.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
|
||||
/* Give every node in the matrix (of which only the sparsity pattern in the
|
||||
* form of row pointers and column indices arrays are in the input), a color
|
||||
* in the colors array. Also return the amount of colors in the return integer. */
|
||||
|
||||
int colorBlockedNodes(int rows, const int *rowPointers, const int *colIndices, std::vector<int>& colors, int maxRowsPerColor, int maxColsPerColor)
|
||||
{
|
||||
int left, c, t, i, j, k;
|
||||
const int max_tries = 100; // since coloring is random, it is possible that a coloring fails. In that case, try again.
|
||||
std::vector<int> randoms;
|
||||
randoms.reserve(rows);
|
||||
|
||||
std::vector<bool> visitedColumns;
|
||||
std::fill(visitedColumns.begin(), visitedColumns.end(), false);
|
||||
int colsInColor;
|
||||
int additionalColsInRow;
|
||||
|
||||
for (t = 0; t < max_tries; t++) {
|
||||
struct timeval tm;
|
||||
gettimeofday(&tm, nullptr);
|
||||
srand(tm.tv_sec + tm.tv_usec * 1000000ul);
|
||||
for (i = 0; i < rows; i++) {
|
||||
randoms[i] = rand();
|
||||
colors[i] = -1;
|
||||
}
|
||||
|
||||
for (c = 0; c < MAX_COLORS; c++) {
|
||||
int rowsInColor = 0;
|
||||
colsInColor = 0;
|
||||
for (i = 0; i < rows; i++)
|
||||
{
|
||||
char f = 1; // true iff you have max random
|
||||
|
||||
// ignore nodes colored earlier
|
||||
if ((colors[i] != -1))
|
||||
continue;
|
||||
|
||||
int ir = randoms[i];
|
||||
|
||||
// look at neighbors to check their random number
|
||||
for (k = rowPointers[i]; k < rowPointers[i + 1]; k++) {
|
||||
|
||||
// ignore nodes colored earlier (and yourself)
|
||||
j = colIndices[k];
|
||||
int jc = colors[j];
|
||||
if (((jc != -1) && (jc != c)) || (i == j)) {
|
||||
continue;
|
||||
}
|
||||
// The if statement below makes it both true graph coloring and no longer guaranteed to converge
|
||||
if (jc == c) {
|
||||
f = 0;
|
||||
break;
|
||||
}
|
||||
int jr = randoms[j];
|
||||
if (ir <= jr) f = 0;
|
||||
}
|
||||
|
||||
// assign color if you have the maximum random number
|
||||
if (f == 1) {
|
||||
additionalColsInRow = 0;
|
||||
for (k = rowPointers[i]; k < rowPointers[i + 1]; k++) {
|
||||
j = colIndices[k];
|
||||
if (!visitedColumns[j]) {
|
||||
visitedColumns[j] = true;
|
||||
additionalColsInRow += 3;
|
||||
}
|
||||
}
|
||||
if ((colsInColor + additionalColsInRow) > maxColsPerColor) {
|
||||
break;
|
||||
}
|
||||
colsInColor += additionalColsInRow;
|
||||
colors[i] = c;
|
||||
rowsInColor += 3;
|
||||
if ((rowsInColor + 2) >= maxRowsPerColor) {
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
// Check if graph coloring is done.
|
||||
left = 0;
|
||||
for (k = 0; k < rows; k++) {
|
||||
if (colors[k] == -1) {
|
||||
left++;
|
||||
}
|
||||
}
|
||||
if (left == 0) {
|
||||
return c + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf("Could not find a graph coloring with %d colors after %d tries.\nAmount of colorless nodes: %d.\n", c, t, left);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
/* Reorder a matrix by a specified input order.
|
||||
* Both a to order array, which contains for every node from the old matrix where it will move in the new matrix,
|
||||
* and the from order, which contains for every node in the new matrix where it came from in the old matrix.*/
|
||||
|
||||
void blocked_reorder_matrix_by_pattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rMat) {
|
||||
int rIndex = 0;
|
||||
int i, j, k;
|
||||
|
||||
rMat->rowPointers[0] = 0;
|
||||
for (i = 0; i < mat->Nb; i++) {
|
||||
int thisRow = fromOrder[i];
|
||||
// put thisRow from the old matrix into row i of the new matrix
|
||||
rMat->rowPointers[i + 1] = rMat->rowPointers[i] + mat->rowPointers[thisRow + 1] - mat->rowPointers[thisRow];
|
||||
for (k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow + 1]; k++) {
|
||||
for (j = 0; j < BLOCK_SIZE * BLOCK_SIZE; j++)
|
||||
rMat->nnzValues[rIndex][j] = mat->nnzValues[k][j];
|
||||
rMat->colIndices[rIndex] = mat->colIndices[k];
|
||||
rIndex++;
|
||||
}
|
||||
}
|
||||
// re-assign column indices according to the new positions of the nodes referenced by the column indices
|
||||
for (i = 0; i < mat->nnzbs; i++) {
|
||||
rMat->colIndices[i] = toOrder[rMat->colIndices[i]];
|
||||
}
|
||||
// re-sort the column indices of every row.
|
||||
for (i = 0; i < mat->Nb; i++) {
|
||||
sortBlockedRow(rMat->colIndices, rMat->nnzValues, rMat->rowPointers[i], rMat->rowPointers[i + 1] - 1);
|
||||
}
|
||||
}
|
||||
|
||||
/* Reorder a matrix according to the colors that every node of the matrix has received*/
|
||||
|
||||
void colorsToReordering(int Nb, std::vector<int>& colors, int *toOrder, int *fromOrder, int *iters) {
|
||||
int reordered = 0;
|
||||
int i, c;
|
||||
for (i = 0; i < MAX_COLORS; i++) {
|
||||
iters[i] = 0;
|
||||
}
|
||||
|
||||
// Find reordering patterns
|
||||
for (c = 0; c < MAX_COLORS; c++) {
|
||||
for (i = 0; i < Nb; i++) {
|
||||
if (colors[i] == c) {
|
||||
iters[c]++;
|
||||
toOrder[i] = reordered;
|
||||
|
||||
fromOrder[reordered] = i;
|
||||
reordered++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reorder a matrix according to a reordering pattern
|
||||
|
||||
void blocked_reorder_vector_by_pattern(int Nb, double *vector, int *fromOrder, double *rVector) {
|
||||
for (int i = 0; i < Nb; i++) {
|
||||
for (int j = 0; j < BLOCK_SIZE; j++) {
|
||||
rVector[BLOCK_SIZE * i + j] = vector[BLOCK_SIZE * fromOrder[i] + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Check is operations on a node in the matrix can be started
|
||||
* A node can only be started if all nodes that it depends on during sequential execution have already completed.*/
|
||||
|
||||
bool canBeStarted(int rowIndex, int *rowPointers, int *colIndices, std::vector<bool>& doneRows) {
|
||||
bool canStart = !doneRows[rowIndex];
|
||||
int i, thisDependency;
|
||||
if (canStart) {
|
||||
for (i = rowPointers[rowIndex]; i < rowPointers[rowIndex + 1]; i++) {
|
||||
thisDependency = colIndices[i];
|
||||
// Only dependencies on rows that should execute before the current one are relevant
|
||||
if (thisDependency >= rowIndex)
|
||||
break;
|
||||
// Check if dependency has been resolved
|
||||
if (!doneRows[thisDependency]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return canStart;
|
||||
}
|
||||
|
||||
/*
|
||||
* The level scheduling of a non-symmetric, blocked matrix requires access to a CSC encoding and a CSR encoding of the same matrix.
|
||||
*/
|
||||
|
||||
int *findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCColIndices, int *CSCRowPointers, int Nb, int *iters, int *toOrder, int* fromOrder) {
|
||||
int activeRowIndex = 0, iterEnd, nextActiveRowIndex = 0;
|
||||
int thisRow;
|
||||
std::vector<bool> doneRows(Nb, false);
|
||||
std::vector<int> rowsPerIter;
|
||||
rowsPerIter.reserve(Nb);
|
||||
int *resRowsPerIter;
|
||||
|
||||
std::vector <int> rowsToStart;
|
||||
|
||||
// find starting rows: rows that are independent from all rows that come before them.
|
||||
for (thisRow = 0; thisRow < Nb; thisRow++) {
|
||||
if (canBeStarted(thisRow, CSRRowPointers, CSCColIndices, doneRows)) {
|
||||
fromOrder[nextActiveRowIndex] = thisRow;
|
||||
toOrder[thisRow] = nextActiveRowIndex;
|
||||
nextActiveRowIndex++;
|
||||
}
|
||||
}
|
||||
// 'do' compute on all active rows
|
||||
for (iterEnd = 0; iterEnd < nextActiveRowIndex; iterEnd++) {
|
||||
doneRows[fromOrder[iterEnd]] = true;
|
||||
}
|
||||
|
||||
rowsPerIter.emplace_back(nextActiveRowIndex - activeRowIndex);
|
||||
|
||||
while (iterEnd < Nb) {
|
||||
// Go over all rows active from the last iteration, and check which of their neighbours can be activated this iteration
|
||||
for (; activeRowIndex < iterEnd; activeRowIndex++) {
|
||||
thisRow = fromOrder[activeRowIndex];
|
||||
|
||||
for (int i = CSCRowPointers[thisRow]; i < CSCRowPointers[thisRow + 1]; i++) {
|
||||
int thatRow = CSCColIndices[i];
|
||||
|
||||
if (canBeStarted(thatRow, CSRRowPointers, CSRColIndices, doneRows)) {
|
||||
rowsToStart.emplace_back(thatRow);
|
||||
}
|
||||
}
|
||||
}
|
||||
// 'do' compute on all active rows
|
||||
for (unsigned int i = 0; i < rowsToStart.size(); i++) {
|
||||
thisRow = rowsToStart[i];
|
||||
if (!doneRows[thisRow]) {
|
||||
doneRows[thisRow] = true;
|
||||
fromOrder[nextActiveRowIndex] = thisRow;
|
||||
toOrder[thisRow] = nextActiveRowIndex;
|
||||
nextActiveRowIndex++;
|
||||
}
|
||||
}
|
||||
iterEnd = nextActiveRowIndex;
|
||||
rowsPerIter.emplace_back(nextActiveRowIndex - activeRowIndex);
|
||||
}
|
||||
// Crop the rowsPerIter array to it minimum size.
|
||||
int numColors = rowsPerIter.size();
|
||||
resRowsPerIter = new int[numColors];
|
||||
for (int i = 0; i < numColors; i++) {
|
||||
resRowsPerIter[i] = rowsPerIter[i];
|
||||
}
|
||||
|
||||
*iters = rowsPerIter.size();
|
||||
|
||||
return resRowsPerIter;
|
||||
}
|
||||
|
||||
/* Perform the complete graph coloring algorithm on a matrix. Return an array with the amount of nodes per color.*/
|
||||
|
||||
int* findGraphColoring(int *colIndices, int *rowPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int* fromOrder) {
|
||||
std::vector<int> rowColor;
|
||||
rowColor.reserve(Nb);
|
||||
int *rowsPerColor = new int[MAX_COLORS];
|
||||
|
||||
if (colorBlockedNodes(Nb, rowPointers, colIndices, rowColor, maxRowsPerColor, maxColsPerColor) == -1) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
colorsToReordering(Nb, rowColor, toOrder, fromOrder, rowsPerColor);
|
||||
|
||||
*numColors = MAX_COLORS;
|
||||
while (rowsPerColor[*numColors - 1] == 0) {
|
||||
*numColors = *numColors - 1;
|
||||
}
|
||||
|
||||
return rowsPerColor;
|
||||
}
|
||||
|
||||
// based on the scipy package from python, scipy/sparse/sparsetools/csr.h on github
|
||||
// input : matrix A via Avals, Acols, Arows, Nb
|
||||
// output: matrix B via Bvals, Bcols, Brows
|
||||
// arrays for B must be preallocated
|
||||
void bcsr_to_bcsc(Block *Avals, int *Acols, int *Arows, Block *Bvals, int *Bcols, int *Brows, int Nb) {
|
||||
|
||||
int nnz = Arows[Nb];
|
||||
|
||||
// compute number of nnzs per column
|
||||
std::fill(Brows, Brows + Nb, 0);
|
||||
|
||||
for (int n = 0; n < nnz; ++n) {
|
||||
Brows[Acols[n]]++;
|
||||
}
|
||||
|
||||
// cumsum the nnz per col to get Brows
|
||||
for (int col = 0, cumsum = 0; col < Nb; ++col) {
|
||||
int temp = Brows[col];
|
||||
Brows[col] = cumsum;
|
||||
cumsum += temp;
|
||||
}
|
||||
Brows[Nb] = nnz;
|
||||
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
for (int j = Arows[row]; j < Arows[row + 1]; ++j) {
|
||||
int col = Acols[j];
|
||||
int dest = Brows[col];
|
||||
Bcols[dest] = row;
|
||||
memcpy(Bvals + dest, Avals + dest, sizeof(Block));
|
||||
Brows[col]++;
|
||||
}
|
||||
}
|
||||
|
||||
for (int col = 0, last = 0; col <= Nb; ++col) {
|
||||
int temp = Brows[col];
|
||||
Brows[col] = last;
|
||||
last = temp;
|
||||
}
|
||||
}
|
||||
|
||||
} //namespace bda
|
116
opm/simulators/linalg/bda/Reorder.hpp
Normal file
116
opm/simulators/linalg/bda/Reorder.hpp
Normal file
@ -0,0 +1,116 @@
|
||||
/*
|
||||
Copyright 2019 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef REORDER_HPP
|
||||
#define REORDER_HPP
|
||||
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
#define MAX_COLORS 256
|
||||
|
||||
/// Give every node in the matrix a color so that no neighbouring nodes share a color
|
||||
/// The color array must be allocated already
|
||||
/// This function does graph coloring based on random numbers
|
||||
/// \param[in] rows number of rows in the matrix
|
||||
/// \param[in] rowPointers array of row pointers
|
||||
/// \param[in] colIndices array of column indices
|
||||
/// \param[inout] colors output array containing the number of the color that each row is assigned to
|
||||
/// \param[in] maxRowsPerColor the maximum number of rows that are allowed in one color (so: the maximum number of nodes per color)
|
||||
/// \param[in] maxColsPerColor the maximum number of columns that the rows in a color are allowed to share (so: the maximum number of nodes that the nodes in one color may be connected to)
|
||||
/// \return the number of colors needed for the coloring, or -1 if no reordering was found with the given restrictions
|
||||
int colorBlockedNodes(int rows, const int *rowPointers, const int *colIndices, std::vector<int>& colors, int maxRowsPerColor, int maxColsPerColor);
|
||||
|
||||
/// Reorder the rows of the matrix according to the mapping in toOrder and fromOrder
|
||||
/// rMat must be allocated already
|
||||
/// \param[in] mat matrix to be reordered
|
||||
/// \param[in] toOrder reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
|
||||
/// \param[in] fromOrder reorder pattern that lists for each index in the new order, from which index in the original order it was moved
|
||||
/// \param[inout] rMat reordered Matrix
|
||||
void blocked_reorder_matrix_by_pattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rMat);
|
||||
|
||||
/// Compute reorder mapping from the color that each node has received
|
||||
/// The toOrder, fromOrder and iters arrays must be allocated already
|
||||
/// \param[in] Nb number of blocks in the vector
|
||||
/// \param[in] colors array containing the number of the color that each row is assigned to
|
||||
/// \param[inout] toOrder reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
|
||||
/// \param[inout] fromOrder reorder pattern that lists for each index in the new order, from which index in the original order it was moved
|
||||
/// \param[inout] iters array containing for each color the number of rows that it contains
|
||||
void colorsToReordering(int Nb, std::vector<int>& colors, int *toOrder, int *fromOrder, int *iters);
|
||||
|
||||
/// Reorder a vector according to the mapping in toOrder and fromOrder
|
||||
/// The rVector array must be allocated already
|
||||
/// \param[in] Nb number of blocks in the vector
|
||||
/// \param[in] vector vector to be reordered
|
||||
/// \param[in] toOrder reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
|
||||
/// \param[in] fromOrder reorder pattern that lists for each index in the new order, from which index in the original order it was moved
|
||||
/// \param[inout] rVector reordered vector
|
||||
void blocked_reorder_vector_by_pattern(int Nb, double *vector, int *fromOrder, double *rVector);
|
||||
|
||||
/// Determine whether all rows that a certain row depends on are done already
|
||||
/// \param[in] rowIndex index of the row that needs to be checked for
|
||||
/// \param[in] rowPointers row pointers of the matrix that the row is in
|
||||
/// \param[in] colIndices column indices of the matrix that the row is in
|
||||
/// \param[in] doneRows array that for each row lists whether it is done or not
|
||||
/// \return true iff all dependencies are done and if the result itself was not done yet
|
||||
bool canBeStarted(int rowIndex, int *rowPointers, int *colIndices, std::vector<bool>& doneRows);
|
||||
|
||||
/// Find a level scheduling reordering for an input matrix
|
||||
/// The toOrder and fromOrder arrays must be allocated already
|
||||
/// \param[in] CSRColIndices column indices array, obtained from storing the input matrix in the CSR format
|
||||
/// \param[in] CSRRowPointers row pointers array, obtained from storing the input matrix in the CSR format
|
||||
/// \param[in] CSCColIndices row indices array, obtained from storing the input matrix in the CSC format
|
||||
/// \param[in] CSCRowPointers column pointers array, obtained from storing the input matrix in the CSC format
|
||||
/// \param[in] Nb number of blockrows in the matrix
|
||||
/// \param[out] iters a pointer to the number of colors needed for the level scheduling
|
||||
/// \param[inout] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
|
||||
/// \param[inout] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
|
||||
/// \return a pointer to an array that contains for each color, the number of rows that that color contains
|
||||
int* findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCColIndices, int *CSCRowPointers, int Nb, int *iters, int *toOrder, int* fromOrder);
|
||||
|
||||
/// Find a graph coloring reordering for an input matrix
|
||||
/// The toOrder and fromOrder arrays must be allocated already
|
||||
/// \param[in] colIndices column indices of the input matrix
|
||||
/// \param[in] rowPointers row pointers of the input matrix
|
||||
/// \param[in] Nb number of blockrows in the matrix
|
||||
/// \param[in] maxRowsPerColor the maximum number of rows that are allowed in one color (so: the maximum number of nodes per color)
|
||||
/// \param[in] maxColsPerColor the maximum number of columns that the rows in a color are allowed to share (so: the maximum number of nodes that the nodes in one color may be connected to)
|
||||
/// \param[out] numColors the number of colors used in the found graph coloring
|
||||
/// \param[inout] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
|
||||
/// \param[inout] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
|
||||
/// \return a pointer to an array that contains for each color, the number of rows that that color contains
|
||||
int* findGraphColoring(int *colIndices, int *rowPointers, int Nb, int maxRowsPerColor, int maxColsPerColor, int *numColors, int *toOrder, int* fromOrder);
|
||||
|
||||
/// Convert BCSR matrix to BCSC
|
||||
/// Arrays for output matrix B must be allocated already
|
||||
/// Based on the csr_tocsc() function from the scipy package from python, https://github.com/scipy/scipy/blob/master/scipy/sparse/sparsetools/csr.h
|
||||
/// \param[in] Avals non-zero values of the BCSR matrix
|
||||
/// \param[in] Acols column indices of the BCSR matrix
|
||||
/// \param[in] Arows row pointers of the BCSR matrix
|
||||
/// \param[inout] Bvals non-zero values of the result BCSC matrix
|
||||
/// \param[inout] Bcols row indices of the result BCSC matrix
|
||||
/// \param[inout] Brows column pointers of the result BCSC matrix
|
||||
/// \param[in] Nb number of blockrows in the matrix
|
||||
void bcsr_to_bcsc(Block *Avals, int *Acols, int *Arows, Block *Bvals, int *Bcols, int *Brows, int Nb);
|
||||
|
||||
}
|
||||
|
||||
#endif
|
@ -34,248 +34,302 @@
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
// apply WellContributions using y -= C^T * (D^-1 * (B * x))
|
||||
__global__ void apply_well_contributions(
|
||||
const double * __restrict__ Cnnzs,
|
||||
const double * __restrict__ Dnnzs,
|
||||
const double * __restrict__ Bnnzs,
|
||||
const int * __restrict__ Ccols,
|
||||
const int * __restrict__ Bcols,
|
||||
const double * __restrict__ x,
|
||||
double * __restrict__ y,
|
||||
const int dim,
|
||||
const int dim_wells,
|
||||
const unsigned int * __restrict__ val_pointers
|
||||
)
|
||||
{
|
||||
const int idx_b = blockIdx.x;
|
||||
const int idx_t = threadIdx.x;
|
||||
const unsigned int val_size = val_pointers[idx_b+1] - val_pointers[idx_b];
|
||||
// apply WellContributions using y -= C^T * (D^-1 * (B * x))
|
||||
__global__ void apply_well_contributions(
|
||||
const double * __restrict__ Cnnzs,
|
||||
const double * __restrict__ Dnnzs,
|
||||
const double * __restrict__ Bnnzs,
|
||||
const int * __restrict__ Ccols,
|
||||
const int * __restrict__ Bcols,
|
||||
const double * __restrict__ x,
|
||||
double * __restrict__ y,
|
||||
const int dim,
|
||||
const int dim_wells,
|
||||
const unsigned int * __restrict__ val_pointers
|
||||
)
|
||||
{
|
||||
const int idx_b = blockIdx.x;
|
||||
const int idx_t = threadIdx.x;
|
||||
const unsigned int val_size = val_pointers[idx_b + 1] - val_pointers[idx_b];
|
||||
|
||||
const int vals_per_block = dim * dim_wells; // 12
|
||||
const int num_active_threads = (32/vals_per_block)*vals_per_block; // 24
|
||||
const int num_blocks_per_warp = 32/vals_per_block; // 2
|
||||
const int lane = idx_t % 32;
|
||||
const int c = lane % dim; // col in block
|
||||
const int r = (lane / dim) % dim_wells; // row in block
|
||||
const int vals_per_block = dim * dim_wells; // 12
|
||||
const int num_active_threads = (32 / vals_per_block) * vals_per_block; // 24
|
||||
const int num_blocks_per_warp = 32 / vals_per_block; // 2
|
||||
const int lane = idx_t % 32;
|
||||
const int c = lane % dim; // col in block
|
||||
const int r = (lane / dim) % dim_wells; // row in block
|
||||
|
||||
extern __shared__ double smem[];
|
||||
double * __restrict__ z1 = smem;
|
||||
double * __restrict__ z2 = z1 + dim_wells;
|
||||
|
||||
if(idx_t < dim_wells){
|
||||
z1[idx_t] = 0.0;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// z1 = B * x
|
||||
if(idx_t < num_active_threads){
|
||||
// multiply all blocks with x
|
||||
double temp = 0.0;
|
||||
int b = idx_t / vals_per_block + val_pointers[idx_b]; // block id, val_size indicates number of blocks
|
||||
while(b < val_size + val_pointers[idx_b]){
|
||||
int colIdx = Bcols[b];
|
||||
temp += Bnnzs[b * dim * dim_wells + r * dim + c] * x[colIdx * dim + c];
|
||||
b += num_blocks_per_warp;
|
||||
}
|
||||
|
||||
// merge all blocks into 1 dim*dim_wells block
|
||||
// since NORNE has only 2 parallel blocks, do not use a loop
|
||||
temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells);
|
||||
|
||||
b = idx_t / vals_per_block + val_pointers[idx_b];
|
||||
|
||||
// merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD
|
||||
if(idx_t < vals_per_block){
|
||||
// should be a loop as well, now only works for dim == 3
|
||||
if(c == 0 || c == 2){temp += __shfl_down_sync(0x00000B6D, temp, 2);} // add col 2 to col 0
|
||||
if(c == 0 || c == 1){temp += __shfl_down_sync(0x000006DB, temp, 1);} // add col 1 to col 0
|
||||
}
|
||||
|
||||
// write 1*dim_wells vector to gmem, could be replaced with shfl broadcast to remove z1 altogether
|
||||
if(c == 0 && idx_t < vals_per_block){
|
||||
z1[r] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// z2 = D^-1 * B * x = D^-1 * z1
|
||||
if(idx_t < dim_wells){
|
||||
double temp = 0.0;
|
||||
for(int c = 0; c < dim_wells; ++c){
|
||||
temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];
|
||||
}
|
||||
z2[idx_t] = temp;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// y -= C^T * D^-1 * B * x
|
||||
// use dim * val_size threads, each block is assigned 'dim' threads
|
||||
if(idx_t < dim * val_size){
|
||||
double temp = 0.0;
|
||||
int b = idx_t / dim + val_pointers[idx_b];
|
||||
int cc = idx_t % dim;
|
||||
int colIdx = Ccols[b];
|
||||
for(unsigned int c = 0; c < dim_wells; ++c){
|
||||
temp += Cnnzs[b * dim * dim_wells + c * dim + cc] * z2[c];
|
||||
}
|
||||
y[colIdx * dim + cc] -= temp;
|
||||
}
|
||||
extern __shared__ double smem[];
|
||||
double * __restrict__ z1 = smem;
|
||||
double * __restrict__ z2 = z1 + dim_wells;
|
||||
|
||||
if (idx_t < dim_wells) {
|
||||
z1[idx_t] = 0.0;
|
||||
}
|
||||
|
||||
void WellContributions::alloc()
|
||||
{
|
||||
if (num_std_wells > 0) {
|
||||
cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
|
||||
cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
|
||||
cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
|
||||
cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
|
||||
cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
|
||||
val_pointers = new unsigned int[num_std_wells + 1];
|
||||
cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_std_wells + 1));
|
||||
cudaCheckLastError("apply_gpu malloc failed");
|
||||
allocated = true;
|
||||
__syncthreads();
|
||||
|
||||
// z1 = B * x
|
||||
if (idx_t < num_active_threads) {
|
||||
// multiply all blocks with x
|
||||
double temp = 0.0;
|
||||
int b = idx_t / vals_per_block + val_pointers[idx_b]; // block id, val_size indicates number of blocks
|
||||
while (b < val_size + val_pointers[idx_b]) {
|
||||
int colIdx = Bcols[b];
|
||||
temp += Bnnzs[b * dim * dim_wells + r * dim + c] * x[colIdx * dim + c];
|
||||
b += num_blocks_per_warp;
|
||||
}
|
||||
|
||||
// merge all blocks into 1 dim*dim_wells block
|
||||
// since NORNE has only 2 parallel blocks, do not use a loop
|
||||
temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells);
|
||||
|
||||
b = idx_t / vals_per_block + val_pointers[idx_b];
|
||||
|
||||
// merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD
|
||||
if (idx_t < vals_per_block) {
|
||||
// should be a loop as well, now only works for dim == 3
|
||||
if (c == 0 || c == 2) {temp += __shfl_down_sync(0x00000B6D, temp, 2);} // add col 2 to col 0
|
||||
if (c == 0 || c == 1) {temp += __shfl_down_sync(0x000006DB, temp, 1);} // add col 1 to col 0
|
||||
}
|
||||
|
||||
// write 1*dim_wells vector to gmem, could be replaced with shfl broadcast to remove z1 altogether
|
||||
if (c == 0 && idx_t < vals_per_block) {
|
||||
z1[r] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
WellContributions::~WellContributions()
|
||||
{
|
||||
// free pinned memory for MultisegmentWellContributions
|
||||
if (h_x) {
|
||||
__syncthreads();
|
||||
|
||||
// z2 = D^-1 * B * x = D^-1 * z1
|
||||
if (idx_t < dim_wells) {
|
||||
double temp = 0.0;
|
||||
for (int c = 0; c < dim_wells; ++c) {
|
||||
temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];
|
||||
}
|
||||
z2[idx_t] = temp;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// y -= C^T * D^-1 * B * x
|
||||
// use dim * val_size threads, each block is assigned 'dim' threads
|
||||
if (idx_t < dim * val_size) {
|
||||
double temp = 0.0;
|
||||
int b = idx_t / dim + val_pointers[idx_b];
|
||||
int cc = idx_t % dim;
|
||||
int colIdx = Ccols[b];
|
||||
for (unsigned int c = 0; c < dim_wells; ++c) {
|
||||
temp += Cnnzs[b * dim * dim_wells + c * dim + cc] * z2[c];
|
||||
}
|
||||
y[colIdx * dim + cc] -= temp;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void WellContributions::alloc()
|
||||
{
|
||||
if (num_std_wells > 0) {
|
||||
cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
|
||||
cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
|
||||
cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
|
||||
cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
|
||||
cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
|
||||
val_pointers = new unsigned int[num_std_wells + 1];
|
||||
cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_std_wells + 1));
|
||||
cudaCheckLastError("apply_gpu malloc failed");
|
||||
allocated = true;
|
||||
}
|
||||
}
|
||||
|
||||
WellContributions::~WellContributions()
|
||||
{
|
||||
// free pinned memory for MultisegmentWellContributions
|
||||
if (h_x) {
|
||||
if (host_mem_cuda) {
|
||||
cudaFreeHost(h_x);
|
||||
cudaFreeHost(h_y);
|
||||
}
|
||||
|
||||
// delete MultisegmentWellContributions
|
||||
for (auto ms : multisegments) {
|
||||
delete ms;
|
||||
}
|
||||
multisegments.clear();
|
||||
|
||||
// delete data for StandardWell
|
||||
if (num_std_wells > 0) {
|
||||
cudaFree(d_Cnnzs);
|
||||
cudaFree(d_Dnnzs);
|
||||
cudaFree(d_Bnnzs);
|
||||
cudaFree(d_Ccols);
|
||||
cudaFree(d_Bcols);
|
||||
delete[] val_pointers;
|
||||
cudaFree(d_val_pointers);
|
||||
} else {
|
||||
delete[] h_x;
|
||||
delete[] h_y;
|
||||
}
|
||||
}
|
||||
|
||||
// delete MultisegmentWellContributions
|
||||
for (auto ms : multisegments) {
|
||||
delete ms;
|
||||
}
|
||||
multisegments.clear();
|
||||
|
||||
// Apply the WellContributions, similar to StandardWell::apply()
|
||||
// y -= (C^T *(D^-1*( B*x)))
|
||||
void WellContributions::apply(double *d_x, double *d_y)
|
||||
{
|
||||
// apply MultisegmentWells
|
||||
// delete data for StandardWell
|
||||
if (num_std_wells > 0) {
|
||||
cudaFree(d_Cnnzs);
|
||||
cudaFree(d_Dnnzs);
|
||||
cudaFree(d_Bnnzs);
|
||||
cudaFree(d_Ccols);
|
||||
cudaFree(d_Bcols);
|
||||
delete[] val_pointers;
|
||||
cudaFree(d_val_pointers);
|
||||
}
|
||||
}
|
||||
|
||||
// make sure the stream is empty if timing measurements are done
|
||||
|
||||
// Apply the WellContributions, similar to StandardWell::apply()
|
||||
// y -= (C^T *(D^-1*( B*x)))
|
||||
void WellContributions::apply(double *d_x, double *d_y)
|
||||
{
|
||||
// apply MultisegmentWells
|
||||
|
||||
// make sure the stream is empty if timing measurements are done
|
||||
cudaStreamSynchronize(stream);
|
||||
|
||||
if (num_ms_wells > 0) {
|
||||
// allocate pinned memory on host if not yet done
|
||||
if (h_x == nullptr) {
|
||||
cudaMallocHost(&h_x, sizeof(double) * N);
|
||||
cudaMallocHost(&h_y, sizeof(double) * N);
|
||||
host_mem_cuda = true;
|
||||
}
|
||||
|
||||
// copy vectors x and y from GPU to CPU
|
||||
cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
|
||||
cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
|
||||
cudaStreamSynchronize(stream);
|
||||
|
||||
if (num_ms_wells > 0) {
|
||||
// allocate pinned memory on host if not yet done
|
||||
if (h_x == nullptr) {
|
||||
cudaMallocHost(&h_x, sizeof(double) * N);
|
||||
cudaMallocHost(&h_y, sizeof(double) * N);
|
||||
}
|
||||
|
||||
|
||||
// copy vectors x and y from GPU to CPU
|
||||
cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
|
||||
cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
|
||||
cudaStreamSynchronize(stream);
|
||||
|
||||
// actually apply MultisegmentWells
|
||||
for (MultisegmentWellContribution *well : multisegments) {
|
||||
well->apply(h_x, h_y);
|
||||
}
|
||||
|
||||
// copy vector y from CPU to GPU
|
||||
cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
|
||||
cudaStreamSynchronize(stream);
|
||||
// actually apply MultisegmentWells
|
||||
for (MultisegmentWellContribution *well : multisegments) {
|
||||
well->apply(h_x, h_y);
|
||||
}
|
||||
|
||||
// apply StandardWells
|
||||
if (num_std_wells > 0) {
|
||||
int smem_size = 2 * sizeof(double) * dim_wells;
|
||||
apply_well_contributions<<<num_std_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
|
||||
// copy vector y from CPU to GPU
|
||||
cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
|
||||
cudaStreamSynchronize(stream);
|
||||
}
|
||||
|
||||
// apply StandardWells
|
||||
if (num_std_wells > 0) {
|
||||
int smem_size = 2 * sizeof(double) * dim_wells;
|
||||
apply_well_contributions <<< num_std_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
|
||||
}
|
||||
}
|
||||
|
||||
#if HAVE_OPENCL
|
||||
void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y) {
|
||||
|
||||
// apply MultisegmentWells
|
||||
if (num_ms_wells > 0) {
|
||||
// allocate pinned memory on host if not yet done
|
||||
if (h_x == nullptr) {
|
||||
h_x = new double[N];
|
||||
h_y = new double[N];
|
||||
}
|
||||
|
||||
// copy vectors x and y from GPU to CPU
|
||||
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x);
|
||||
queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
|
||||
|
||||
// actually apply MultisegmentWells
|
||||
for (MultisegmentWellContribution *well : multisegments) {
|
||||
well->apply(h_x, h_y);
|
||||
}
|
||||
|
||||
// copy vector y from CPU to GPU
|
||||
queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y);
|
||||
}
|
||||
|
||||
|
||||
void WellContributions::addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
|
||||
{
|
||||
if (!allocated) {
|
||||
OPM_THROW(std::logic_error,"Error cannot add wellcontribution before allocating memory in WellContributions");
|
||||
}
|
||||
// apply StandardWells
|
||||
if (num_std_wells > 0) {
|
||||
OPM_THROW(std::logic_error, "Error StandardWells are not supported by openclSolver");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
switch (type) {
|
||||
case MatrixType::C:
|
||||
cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
|
||||
break;
|
||||
case MatrixType::D:
|
||||
cudaMemcpy(d_Dnnzs + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
|
||||
break;
|
||||
case MatrixType::B:
|
||||
cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
|
||||
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
|
||||
if(num_std_wells_so_far == num_std_wells - 1){
|
||||
val_pointers[num_std_wells] = num_blocks;
|
||||
}
|
||||
cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_std_wells+1), cudaMemcpyHostToDevice);
|
||||
break;
|
||||
default:
|
||||
OPM_THROW(std::logic_error,"Error unsupported matrix ID for WellContributions::addMatrix()");
|
||||
}
|
||||
cudaCheckLastError("WellContributions::addMatrix() failed");
|
||||
if (MatrixType::B == type) {
|
||||
num_blocks_so_far += val_size;
|
||||
num_std_wells_so_far++;
|
||||
}
|
||||
|
||||
void WellContributions::addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
|
||||
{
|
||||
if (!allocated) {
|
||||
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
|
||||
}
|
||||
|
||||
void WellContributions::setCudaStream(cudaStream_t stream_)
|
||||
{
|
||||
this->stream = stream_;
|
||||
for(MultisegmentWellContribution *well : multisegments){
|
||||
well->setCudaStream(stream_);
|
||||
switch (type) {
|
||||
case MatrixType::C:
|
||||
cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
|
||||
break;
|
||||
case MatrixType::D:
|
||||
cudaMemcpy(d_Dnnzs + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
|
||||
break;
|
||||
case MatrixType::B:
|
||||
cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
|
||||
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
|
||||
if (num_std_wells_so_far == num_std_wells - 1) {
|
||||
val_pointers[num_std_wells] = num_blocks;
|
||||
}
|
||||
cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_std_wells + 1), cudaMemcpyHostToDevice);
|
||||
break;
|
||||
default:
|
||||
OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
|
||||
}
|
||||
|
||||
void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
|
||||
{
|
||||
dim = dim_;
|
||||
dim_wells = dim_wells_;
|
||||
cudaCheckLastError("WellContributions::addMatrix() failed");
|
||||
if (MatrixType::B == type) {
|
||||
num_blocks_so_far += val_size;
|
||||
num_std_wells_so_far++;
|
||||
}
|
||||
}
|
||||
|
||||
void WellContributions::addNumBlocks(unsigned int nnz)
|
||||
{
|
||||
if (allocated) {
|
||||
OPM_THROW(std::logic_error,"Error cannot add more sizes after allocated in WellContributions");
|
||||
}
|
||||
num_blocks += nnz;
|
||||
num_std_wells++;
|
||||
void WellContributions::setCudaStream(cudaStream_t stream_)
|
||||
{
|
||||
this->stream = stream_;
|
||||
for (MultisegmentWellContribution *well : multisegments) {
|
||||
well->setCudaStream(stream_);
|
||||
}
|
||||
}
|
||||
|
||||
void WellContributions::addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
|
||||
void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
|
||||
{
|
||||
dim = dim_;
|
||||
dim_wells = dim_wells_;
|
||||
}
|
||||
|
||||
void WellContributions::addNumBlocks(unsigned int nnz)
|
||||
{
|
||||
if (allocated) {
|
||||
OPM_THROW(std::logic_error, "Error cannot add more sizes after allocated in WellContributions");
|
||||
}
|
||||
num_blocks += nnz;
|
||||
num_std_wells++;
|
||||
}
|
||||
|
||||
void WellContributions::addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
|
||||
unsigned int Nb, unsigned int Mb,
|
||||
unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
|
||||
unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
|
||||
std::vector<double> &Cvalues)
|
||||
{
|
||||
this->N = Nb * dim;
|
||||
MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
|
||||
multisegments.emplace_back(well);
|
||||
++num_ms_wells;
|
||||
{
|
||||
this->N = Nb * dim;
|
||||
MultisegmentWellContribution *well = new MultisegmentWellContribution(dim, dim_wells, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
|
||||
multisegments.emplace_back(well);
|
||||
++num_ms_wells;
|
||||
}
|
||||
|
||||
|
||||
void WellContributions::setReordering(int *toOrder, bool reorder)
|
||||
{
|
||||
this->toOrder = toOrder;
|
||||
this->reorder = reorder;
|
||||
for (auto& ms : multisegments) {
|
||||
ms->setReordering(toOrder, reorder);
|
||||
}
|
||||
}
|
||||
|
||||
#if HAVE_OPENCL
|
||||
void WellContributions::setOpenCLQueue(cl::CommandQueue *queue)
|
||||
{
|
||||
this->queue = queue;
|
||||
}
|
||||
#endif
|
||||
|
||||
} //namespace Opm
|
||||
|
||||
|
@ -22,137 +22,167 @@
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#if ! HAVE_CUDA
|
||||
#error "This file should only be included if CUDA is found"
|
||||
#if HAVE_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
#if HAVE_OPENCL
|
||||
#define __CL_ENABLE_EXCEPTIONS
|
||||
#include <CL/cl.hpp>
|
||||
#endif
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
/// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver
|
||||
/// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used
|
||||
/// So far, StandardWell and MultisegmentWell are supported
|
||||
/// A single instance (or pointer) of this class is passed to the cusparseSolver.
|
||||
/// For StandardWell, this class contains all the data and handles the computation. For MultisegmentWell, the vector 'multisegments' contains all the data. For more information, check the MultisegmentWellContribution class.
|
||||
/// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver
|
||||
/// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used
|
||||
/// So far, StandardWell and MultisegmentWell are supported
|
||||
/// A single instance (or pointer) of this class is passed to the cusparseSolver.
|
||||
/// For StandardWell, this class contains all the data and handles the computation. For MultisegmentWell, the vector 'multisegments' contains all the data. For more information, check the MultisegmentWellContribution class.
|
||||
|
||||
/// A StandardWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
|
||||
/// B and C are vectors, disguised as matrices and contain blocks of StandardWell::numEq by StandardWell::numStaticWellEq
|
||||
/// D is a block, disguised as matrix, the square block has size StandardWell::numStaticWellEq. D is actually stored as D^-1
|
||||
/// B*x and D*B*x are a vector with numStaticWellEq doubles
|
||||
/// C*D*B*x is a blocked matrix with a symmetric sparsity pattern, contains square blocks with size numEq. For every columnindex i, j in StandardWell::duneB_, there is a block on (i, j) in C*D*B*x.
|
||||
///
|
||||
/// This class is used in 3 phases:
|
||||
/// - get total size of all wellcontributions that must be stored here
|
||||
/// - allocate memory
|
||||
/// - copy data of wellcontributions
|
||||
class WellContributions
|
||||
{
|
||||
/// A StandardWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
|
||||
/// B and C are vectors, disguised as matrices and contain blocks of StandardWell::numEq by StandardWell::numStaticWellEq
|
||||
/// D is a block, disguised as matrix, the square block has size StandardWell::numStaticWellEq. D is actually stored as D^-1
|
||||
/// B*x and D*B*x are a vector with numStaticWellEq doubles
|
||||
/// C*D*B*x is a blocked matrix with a symmetric sparsity pattern, contains square blocks with size numEq. For every columnindex i, j in StandardWell::duneB_, there is a block on (i, j) in C*D*B*x.
|
||||
///
|
||||
/// This class is used in 3 phases:
|
||||
/// - get total size of all wellcontributions that must be stored here
|
||||
/// - allocate memory
|
||||
/// - copy data of wellcontributions
|
||||
class WellContributions
|
||||
{
|
||||
|
||||
private:
|
||||
unsigned int num_blocks = 0; // total number of blocks in all wells
|
||||
unsigned int dim; // number of columns in blocks in B and C, equal to StandardWell::numEq
|
||||
unsigned int dim_wells; // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
|
||||
unsigned int num_std_wells = 0; // number of StandardWells in this object
|
||||
unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size()
|
||||
unsigned int num_blocks_so_far = 0; // keep track of where next data is written
|
||||
unsigned int num_std_wells_so_far = 0; // keep track of where next data is written
|
||||
unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
|
||||
unsigned int N; // number of rows (not blockrows) in vectors x and y
|
||||
bool allocated = false;
|
||||
std::vector<MultisegmentWellContribution*> multisegments;
|
||||
cudaStream_t stream;
|
||||
private:
|
||||
unsigned int num_blocks = 0; // total number of blocks in all wells
|
||||
unsigned int dim; // number of columns in blocks in B and C, equal to StandardWell::numEq
|
||||
unsigned int dim_wells; // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
|
||||
unsigned int num_std_wells = 0; // number of StandardWells in this object
|
||||
unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size()
|
||||
unsigned int num_blocks_so_far = 0; // keep track of where next data is written
|
||||
unsigned int num_std_wells_so_far = 0; // keep track of where next data is written
|
||||
unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
|
||||
unsigned int N; // number of rows (not blockrows) in vectors x and y
|
||||
bool allocated = false;
|
||||
std::vector<MultisegmentWellContribution*> multisegments;
|
||||
|
||||
// data for StandardWells, could remain nullptrs if not used
|
||||
double *d_Cnnzs = nullptr;
|
||||
double *d_Dnnzs = nullptr;
|
||||
double *d_Bnnzs = nullptr;
|
||||
int *d_Ccols = nullptr;
|
||||
int *d_Bcols = nullptr;
|
||||
double *d_z1 = nullptr;
|
||||
double *d_z2 = nullptr;
|
||||
unsigned int *d_val_pointers = nullptr;
|
||||
#if HAVE_CUDA
|
||||
cudaStream_t stream;
|
||||
#endif
|
||||
|
||||
double *h_x = nullptr, *h_y = nullptr; // CUDA pinned memory for GPU memcpy
|
||||
#if HAVE_OPENCL
|
||||
cl::CommandQueue *queue = nullptr;
|
||||
#endif
|
||||
|
||||
// data for StandardWells, could remain nullptrs if not used
|
||||
double *d_Cnnzs = nullptr;
|
||||
double *d_Dnnzs = nullptr;
|
||||
double *d_Bnnzs = nullptr;
|
||||
int *d_Ccols = nullptr;
|
||||
int *d_Bcols = nullptr;
|
||||
double *d_z1 = nullptr;
|
||||
double *d_z2 = nullptr;
|
||||
unsigned int *d_val_pointers = nullptr;
|
||||
|
||||
public:
|
||||
double *h_x = nullptr, *h_y = nullptr; // CUDA pinned memory for GPU memcpy
|
||||
bool host_mem_cuda = false; // true iff h_x and h_y are allocated by cudaMallocHost(), so they need to be freed using cudaFreeHost()
|
||||
|
||||
/// StandardWell has C, D and B matrices that need to be copied
|
||||
enum class MatrixType {
|
||||
C,
|
||||
D,
|
||||
B
|
||||
};
|
||||
int *toOrder = nullptr;
|
||||
bool reorder = false;
|
||||
|
||||
/// Set a cudaStream to be used
|
||||
/// \param[in] stream the cudaStream that is used to launch the kernel in
|
||||
void setCudaStream(cudaStream_t stream);
|
||||
public:
|
||||
|
||||
/// Create a new WellContributions, implementation is empty
|
||||
WellContributions(){};
|
||||
|
||||
/// Destroy a WellContributions, and free memory
|
||||
~WellContributions();
|
||||
|
||||
/// Apply all Wells in this object
|
||||
/// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
|
||||
/// \param[in] d_x vector x, must be on GPU
|
||||
/// \param[inout] d_y vector y, must be on GPU
|
||||
void apply(double *d_x, double *d_y);
|
||||
|
||||
/// Allocate memory for the StandardWells
|
||||
void alloc();
|
||||
|
||||
/// Indicate how large the blocks of the StandardWell (C and B) are
|
||||
/// \param[in] dim number of columns
|
||||
/// \param[in] dim_wells number of rows
|
||||
void setBlockSize(unsigned int dim, unsigned int dim_wells);
|
||||
|
||||
/// Indicate how large the next StandardWell is, this function cannot be called after alloc() is called
|
||||
/// \param[in] numBlocks number of blocks in C and B of next StandardWell
|
||||
void addNumBlocks(unsigned int numBlocks);
|
||||
|
||||
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
|
||||
/// \param[in] type indicate if C, D or B is sent
|
||||
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D
|
||||
/// \param[in] values array of nonzeroes
|
||||
/// \param[in] val_size number of blocks in C or B, ignored for D
|
||||
void addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size);
|
||||
|
||||
/// Add a MultisegmentWellContribution, actually creates an object on heap that is destroyed in the destructor
|
||||
/// Matrices C and B are passed in Blocked CSR, matrix D in CSC
|
||||
/// \param[in] dim size of blocks in vectors x and y, equal to MultisegmentWell::numEq
|
||||
/// \param[in] dim_wells size of blocks of C, B and D, equal to MultisegmentWell::numWellEq
|
||||
/// \param[in] Nb number of blocks in vectors x and y
|
||||
/// \param[in] Mb number of blockrows in C, B and D
|
||||
/// \param[in] BnumBlocks number of blocks in C and B
|
||||
/// \param[in] Bvalues nonzero values of matrix B
|
||||
/// \param[in] BcolIndices columnindices of blocks of matrix B
|
||||
/// \param[in] BrowPointers rowpointers of matrix B
|
||||
/// \param[in] DnumBlocks number of blocks in D
|
||||
/// \param[in] Dvalues nonzero values of matrix D
|
||||
/// \param[in] DcolPointers columnpointers of matrix D
|
||||
/// \param[in] DrowIndices rowindices of matrix D
|
||||
/// \param[in] Cvalues nonzero values of matrix C
|
||||
void addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
|
||||
unsigned int Nb, unsigned int Mb,
|
||||
unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
|
||||
unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
|
||||
std::vector<double> &Cvalues);
|
||||
|
||||
/// Return the number of wells added to this object
|
||||
/// \return the number of wells added to this object
|
||||
unsigned int getNumWells(){
|
||||
return num_std_wells + num_ms_wells;
|
||||
}
|
||||
/// StandardWell has C, D and B matrices that need to be copied
|
||||
enum class MatrixType {
|
||||
C,
|
||||
D,
|
||||
B
|
||||
};
|
||||
|
||||
/// Set a cudaStream to be used
|
||||
/// \param[in] stream the cudaStream that is used to launch the kernel in
|
||||
void setCudaStream(cudaStream_t stream);
|
||||
|
||||
/// Create a new WellContributions, implementation is empty
|
||||
WellContributions() {};
|
||||
|
||||
/// Destroy a WellContributions, and free memory
|
||||
~WellContributions();
|
||||
|
||||
/// Apply all Wells in this object
|
||||
/// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
|
||||
/// \param[in] d_x vector x, must be on GPU
|
||||
/// \param[inout] d_y vector y, must be on GPU
|
||||
void apply(double *d_x, double *d_y);
|
||||
#if HAVE_OPENCL
|
||||
void apply(cl::Buffer& x, cl::Buffer& y);
|
||||
#endif
|
||||
|
||||
/// Allocate memory for the StandardWells
|
||||
void alloc();
|
||||
|
||||
/// Indicate how large the blocks of the StandardWell (C and B) are
|
||||
/// \param[in] dim number of columns
|
||||
/// \param[in] dim_wells number of rows
|
||||
void setBlockSize(unsigned int dim, unsigned int dim_wells);
|
||||
|
||||
/// Indicate how large the next StandardWell is, this function cannot be called after alloc() is called
|
||||
/// \param[in] numBlocks number of blocks in C and B of next StandardWell
|
||||
void addNumBlocks(unsigned int numBlocks);
|
||||
|
||||
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
|
||||
/// \param[in] type indicate if C, D or B is sent
|
||||
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D
|
||||
/// \param[in] values array of nonzeroes
|
||||
/// \param[in] val_size number of blocks in C or B, ignored for D
|
||||
void addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size);
|
||||
|
||||
/// Add a MultisegmentWellContribution, actually creates an object on heap that is destroyed in the destructor
|
||||
/// Matrices C and B are passed in Blocked CSR, matrix D in CSC
|
||||
/// \param[in] dim size of blocks in vectors x and y, equal to MultisegmentWell::numEq
|
||||
/// \param[in] dim_wells size of blocks of C, B and D, equal to MultisegmentWell::numWellEq
|
||||
/// \param[in] Nb number of blocks in vectors x and y
|
||||
/// \param[in] Mb number of blockrows in C, B and D
|
||||
/// \param[in] BnumBlocks number of blocks in C and B
|
||||
/// \param[in] Bvalues nonzero values of matrix B
|
||||
/// \param[in] BcolIndices columnindices of blocks of matrix B
|
||||
/// \param[in] BrowPointers rowpointers of matrix B
|
||||
/// \param[in] DnumBlocks number of blocks in D
|
||||
/// \param[in] Dvalues nonzero values of matrix D
|
||||
/// \param[in] DcolPointers columnpointers of matrix D
|
||||
/// \param[in] DrowIndices rowindices of matrix D
|
||||
/// \param[in] Cvalues nonzero values of matrix C
|
||||
void addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
|
||||
unsigned int Nb, unsigned int Mb,
|
||||
unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
|
||||
unsigned int DnumBlocks, double *Dvalues, int *DcolPointers, int *DrowIndices,
|
||||
std::vector<double> &Cvalues);
|
||||
|
||||
/// Return the number of wells added to this object
|
||||
/// \return the number of wells added to this object
|
||||
unsigned int getNumWells() {
|
||||
return num_std_wells + num_ms_wells;
|
||||
}
|
||||
|
||||
|
||||
/// If the rows of the matrix are reordered, the columnindices of the matrixdata are incorrect
|
||||
/// Those indices need to be mapped via toOrder
|
||||
/// \param[in] toOrder array with mappings
|
||||
/// \param[in] reorder whether the columnindices need to be reordered or not
|
||||
void setReordering(int *toOrder, bool reorder);
|
||||
|
||||
#if HAVE_OPENCL
|
||||
/// This object copies some cl::Buffers, it requires a CommandQueue to do that
|
||||
/// \param[in] queue the opencl commandqueue to be used
|
||||
void setOpenCLQueue(cl::CommandQueue *queue);
|
||||
#endif
|
||||
};
|
||||
|
||||
} //namespace Opm
|
||||
|
||||
#endif
|
||||
|
@ -18,7 +18,7 @@
|
||||
*/
|
||||
|
||||
#ifndef __NVCC__
|
||||
#error "Cannot compile for cusparse: NVIDIA compiler not found"
|
||||
#error "Cannot compile for cusparse: NVIDIA compiler not found"
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
@ -42,488 +42,483 @@
|
||||
// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
|
||||
#define COPY_ROW_BY_ROW 0
|
||||
|
||||
namespace Opm
|
||||
namespace bda
|
||||
{
|
||||
|
||||
const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
|
||||
const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
|
||||
const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
|
||||
using Opm::OpmLog;
|
||||
|
||||
double second(void) {
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, nullptr);
|
||||
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
|
||||
const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
|
||||
const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
|
||||
const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
|
||||
|
||||
cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver(verbosity_, maxit_, tolerance) {}
|
||||
|
||||
cusparseSolverBackend::~cusparseSolverBackend() {
|
||||
finalize();
|
||||
}
|
||||
|
||||
void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
|
||||
double t_total1, t_total2;
|
||||
int n = N;
|
||||
double rho = 1.0, rhop;
|
||||
double alpha, nalpha, beta;
|
||||
double omega, nomega, tmp1, tmp2;
|
||||
double norm, norm_0;
|
||||
double zero = 0.0;
|
||||
double one = 1.0;
|
||||
double mone = -1.0;
|
||||
float it;
|
||||
|
||||
t_total1 = second();
|
||||
|
||||
if (wellContribs.getNumWells() > 0) {
|
||||
wellContribs.setCudaStream(stream);
|
||||
}
|
||||
|
||||
cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : verbosity(verbosity_), maxit(maxit_), tolerance(tolerance_), minit(0) {
|
||||
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
|
||||
|
||||
cublasDscal(cublasHandle, n, &mone, d_r, 1);
|
||||
cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
|
||||
cublasDcopy(cublasHandle, n, d_r, 1, d_rw, 1);
|
||||
cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1);
|
||||
cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0);
|
||||
|
||||
if (verbosity > 1) {
|
||||
std::ostringstream out;
|
||||
out << std::scientific << "cusparseSolver initial norm: " << norm_0;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
cusparseSolverBackend::~cusparseSolverBackend() {
|
||||
finalize();
|
||||
}
|
||||
for (it = 0.5; it < maxit; it += 0.5) {
|
||||
rhop = rho;
|
||||
cublasDdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho);
|
||||
|
||||
void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
|
||||
double t_total1, t_total2;
|
||||
int n = N;
|
||||
double rho = 1.0, rhop;
|
||||
double alpha, nalpha, beta;
|
||||
double omega, nomega, tmp1, tmp2;
|
||||
double norm, norm_0;
|
||||
double zero = 0.0;
|
||||
double one = 1.0;
|
||||
double mone = -1.0;
|
||||
float it;
|
||||
|
||||
t_total1 = second();
|
||||
|
||||
if(wellContribs.getNumWells() > 0){
|
||||
wellContribs.setCudaStream(stream);
|
||||
if (it > 1) {
|
||||
beta = (rho / rhop) * (alpha / omega);
|
||||
nomega = -omega;
|
||||
cublasDaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1);
|
||||
cublasDscal(cublasHandle, n, &beta, d_p, 1);
|
||||
cublasDaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1);
|
||||
}
|
||||
|
||||
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
|
||||
// apply ilu0
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer);
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer);
|
||||
|
||||
cublasDscal(cublasHandle, n, &mone, d_r, 1);
|
||||
cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
|
||||
cublasDcopy(cublasHandle, n, d_r, 1, d_rw, 1);
|
||||
cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1);
|
||||
cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0);
|
||||
// spmv
|
||||
cusparseDbsrmv(cusparseHandle, order, \
|
||||
operation, Nb, Nb, nnzb, \
|
||||
&one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_pw, &zero, d_v);
|
||||
|
||||
// apply wellContributions
|
||||
if (wellContribs.getNumWells() > 0) {
|
||||
wellContribs.apply(d_pw, d_v);
|
||||
}
|
||||
|
||||
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
|
||||
alpha = rho / tmp1;
|
||||
nalpha = -alpha;
|
||||
cublasDaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1);
|
||||
cublasDaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1);
|
||||
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
|
||||
|
||||
if (norm < tolerance * norm_0) {
|
||||
break;
|
||||
}
|
||||
|
||||
it += 0.5;
|
||||
|
||||
// apply ilu0
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer);
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);
|
||||
|
||||
// spmv
|
||||
cusparseDbsrmv(cusparseHandle, order, \
|
||||
operation, Nb, Nb, nnzb, &one, descr_M, \
|
||||
d_bVals, d_bRows, d_bCols, block_size, d_s, &zero, d_t);
|
||||
|
||||
// apply wellContributions
|
||||
if (wellContribs.getNumWells() > 0) {
|
||||
wellContribs.apply(d_s, d_t);
|
||||
}
|
||||
|
||||
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
|
||||
cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
|
||||
omega = tmp1 / tmp2;
|
||||
nomega = -omega;
|
||||
cublasDaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1);
|
||||
cublasDaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1);
|
||||
|
||||
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
|
||||
|
||||
|
||||
if (norm < tolerance * norm_0) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (verbosity > 1) {
|
||||
std::ostringstream out;
|
||||
out << std::scientific << "cusparseSolver initial norm: " << norm_0;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
for (it = 0.5; it < maxit; it+=0.5) {
|
||||
rhop = rho;
|
||||
cublasDdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho);
|
||||
|
||||
if (it > 1) {
|
||||
beta = (rho/rhop) * (alpha/omega);
|
||||
nomega = -omega;
|
||||
cublasDaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1);
|
||||
cublasDscal(cublasHandle, n, &beta, d_p, 1);
|
||||
cublasDaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1);
|
||||
}
|
||||
|
||||
// apply ilu0
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer);
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer);
|
||||
|
||||
// spmv
|
||||
cusparseDbsrmv(cusparseHandle, order, \
|
||||
operation, Nb, Nb, nnzb, \
|
||||
&one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_pw, &zero, d_v);
|
||||
|
||||
// apply wellContributions
|
||||
if(wellContribs.getNumWells() > 0){
|
||||
wellContribs.apply(d_pw, d_v);
|
||||
}
|
||||
|
||||
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
|
||||
alpha = rho / tmp1;
|
||||
nalpha = -alpha;
|
||||
cublasDaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1);
|
||||
cublasDaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1);
|
||||
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
|
||||
|
||||
if (norm < tolerance * norm_0 && it > minit) {
|
||||
break;
|
||||
}
|
||||
|
||||
it += 0.5;
|
||||
|
||||
// apply ilu0
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer);
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);
|
||||
|
||||
// spmv
|
||||
cusparseDbsrmv(cusparseHandle, order, \
|
||||
operation, Nb, Nb, nnzb, &one, descr_M, \
|
||||
d_bVals, d_bRows, d_bCols, block_size, d_s, &zero, d_t);
|
||||
|
||||
// apply wellContributions
|
||||
if(wellContribs.getNumWells() > 0){
|
||||
wellContribs.apply(d_s, d_t);
|
||||
}
|
||||
|
||||
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
|
||||
cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
|
||||
omega = tmp1 / tmp2;
|
||||
nomega = -omega;
|
||||
cublasDaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1);
|
||||
cublasDaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1);
|
||||
|
||||
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
|
||||
|
||||
|
||||
if (norm < tolerance * norm_0 && it > minit) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (verbosity > 1) {
|
||||
std::ostringstream out;
|
||||
out << "it: " << it << std::scientific << ", norm: " << norm;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
t_total2 = second();
|
||||
|
||||
res.iterations = std::min(it, (float)maxit);
|
||||
res.reduction = norm/norm_0;
|
||||
res.conv_rate = static_cast<double>(pow(res.reduction,1.0/it));
|
||||
res.elapsed = t_total2 - t_total1;
|
||||
res.converged = (it != (maxit + 0.5));
|
||||
|
||||
if (verbosity > 0) {
|
||||
std::ostringstream out;
|
||||
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
|
||||
", time per iteration: " << res.elapsed/it << ", iterations: " << it;
|
||||
out << "it: " << it << std::scientific << ", norm: " << norm;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
t_total2 = second();
|
||||
|
||||
void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
|
||||
this->N = N;
|
||||
this->nnz = nnz;
|
||||
this->block_size = dim;
|
||||
this->nnzb = nnz/block_size/block_size;
|
||||
Nb = (N + dim - 1) / dim;
|
||||
res.iterations = std::min(it, (float)maxit);
|
||||
res.reduction = norm / norm_0;
|
||||
res.conv_rate = static_cast<double>(pow(res.reduction, 1.0 / it));
|
||||
res.elapsed = t_total2 - t_total1;
|
||||
res.converged = (it != (maxit + 0.5));
|
||||
|
||||
if (verbosity > 0) {
|
||||
std::ostringstream out;
|
||||
out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
|
||||
OpmLog::info(out.str());
|
||||
out.str("");
|
||||
out.clear();
|
||||
out << "Minit: " << minit << ", maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
|
||||
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
|
||||
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
int deviceID = 0;
|
||||
cudaSetDevice(deviceID);
|
||||
cudaCheckLastError("Could not get device");
|
||||
struct cudaDeviceProp props;
|
||||
cudaGetDeviceProperties(&props, deviceID);
|
||||
cudaCheckLastError("Could not get device properties");
|
||||
out.str("");
|
||||
out.clear();
|
||||
out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
|
||||
OpmLog::info(out.str());
|
||||
|
||||
cudaStreamCreate(&stream);
|
||||
cudaCheckLastError("Could not create stream");
|
||||
void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
|
||||
this->N = N;
|
||||
this->nnz = nnz;
|
||||
this->block_size = dim;
|
||||
this->nnzb = nnz / block_size / block_size;
|
||||
Nb = (N + dim - 1) / dim;
|
||||
std::ostringstream out;
|
||||
out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
|
||||
OpmLog::info(out.str());
|
||||
out.str("");
|
||||
out.clear();
|
||||
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
|
||||
OpmLog::info(out.str());
|
||||
|
||||
cublasCreate(&cublasHandle);
|
||||
cudaCheckLastError("Could not create cublasHandle");
|
||||
int deviceID = 0;
|
||||
cudaSetDevice(deviceID);
|
||||
cudaCheckLastError("Could not get device");
|
||||
struct cudaDeviceProp props;
|
||||
cudaGetDeviceProperties(&props, deviceID);
|
||||
cudaCheckLastError("Could not get device properties");
|
||||
out.str("");
|
||||
out.clear();
|
||||
out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
|
||||
OpmLog::info(out.str());
|
||||
|
||||
cusparseCreate(&cusparseHandle);
|
||||
cudaCheckLastError("Could not create cusparseHandle");
|
||||
cudaStreamCreate(&stream);
|
||||
cudaCheckLastError("Could not create stream");
|
||||
|
||||
cudaMalloc((void**)&d_x, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_b, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_r, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_rw,sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_p, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_pw,sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_s, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_t, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_v, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
|
||||
cudaMalloc((void**)&d_bCols, sizeof(double) * nnz);
|
||||
cudaMalloc((void**)&d_bRows, sizeof(double) * (Nb+1));
|
||||
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
|
||||
cudaCheckLastError("Could not allocate enough memory on GPU");
|
||||
cublasCreate(&cublasHandle);
|
||||
cudaCheckLastError("Could not create cublasHandle");
|
||||
|
||||
cublasSetStream(cublasHandle, stream);
|
||||
cudaCheckLastError("Could not set stream to cublas");
|
||||
cusparseSetStream(cusparseHandle, stream);
|
||||
cudaCheckLastError("Could not set stream to cusparse");
|
||||
cusparseCreate(&cusparseHandle);
|
||||
cudaCheckLastError("Could not create cusparseHandle");
|
||||
|
||||
cudaMalloc((void**)&d_x, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_b, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_r, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_rw, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_p, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_pw, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_s, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_t, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_v, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
|
||||
cudaMalloc((void**)&d_bCols, sizeof(double) * nnz);
|
||||
cudaMalloc((void**)&d_bRows, sizeof(double) * (Nb + 1));
|
||||
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
|
||||
cudaCheckLastError("Could not allocate enough memory on GPU");
|
||||
|
||||
cublasSetStream(cublasHandle, stream);
|
||||
cudaCheckLastError("Could not set stream to cublas");
|
||||
cusparseSetStream(cusparseHandle, stream);
|
||||
cudaCheckLastError("Could not set stream to cusparse");
|
||||
|
||||
#if COPY_ROW_BY_ROW
|
||||
cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
|
||||
cudaCheckLastError("Could not allocate pinned memory");
|
||||
cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
|
||||
cudaCheckLastError("Could not allocate pinned memory");
|
||||
#endif
|
||||
|
||||
initialized = true;
|
||||
} // end initialize()
|
||||
initialized = true;
|
||||
} // end initialize()
|
||||
|
||||
void cusparseSolverBackend::finalize() {
|
||||
if (initialized) {
|
||||
cudaFree(d_x);
|
||||
cudaFree(d_b);
|
||||
cudaFree(d_r);
|
||||
cudaFree(d_rw);
|
||||
cudaFree(d_p);
|
||||
cudaFree(d_pw);
|
||||
cudaFree(d_s);
|
||||
cudaFree(d_t);
|
||||
cudaFree(d_v);
|
||||
cudaFree(d_mVals);
|
||||
cudaFree(d_bVals);
|
||||
cudaFree(d_bCols);
|
||||
cudaFree(d_bRows);
|
||||
cudaFree(d_buffer);
|
||||
cusparseDestroyBsrilu02Info(info_M);
|
||||
cusparseDestroyBsrsv2Info(info_L);
|
||||
cusparseDestroyBsrsv2Info(info_U);
|
||||
cusparseDestroyMatDescr(descr_B);
|
||||
cusparseDestroyMatDescr(descr_M);
|
||||
cusparseDestroyMatDescr(descr_L);
|
||||
cusparseDestroyMatDescr(descr_U);
|
||||
cusparseDestroy(cusparseHandle);
|
||||
cublasDestroy(cublasHandle);
|
||||
void cusparseSolverBackend::finalize() {
|
||||
if (initialized) {
|
||||
cudaFree(d_x);
|
||||
cudaFree(d_b);
|
||||
cudaFree(d_r);
|
||||
cudaFree(d_rw);
|
||||
cudaFree(d_p);
|
||||
cudaFree(d_pw);
|
||||
cudaFree(d_s);
|
||||
cudaFree(d_t);
|
||||
cudaFree(d_v);
|
||||
cudaFree(d_mVals);
|
||||
cudaFree(d_bVals);
|
||||
cudaFree(d_bCols);
|
||||
cudaFree(d_bRows);
|
||||
cudaFree(d_buffer);
|
||||
cusparseDestroyBsrilu02Info(info_M);
|
||||
cusparseDestroyBsrsv2Info(info_L);
|
||||
cusparseDestroyBsrsv2Info(info_U);
|
||||
cusparseDestroyMatDescr(descr_B);
|
||||
cusparseDestroyMatDescr(descr_M);
|
||||
cusparseDestroyMatDescr(descr_L);
|
||||
cusparseDestroyMatDescr(descr_U);
|
||||
cusparseDestroy(cusparseHandle);
|
||||
cublasDestroy(cublasHandle);
|
||||
#if COPY_ROW_BY_ROW
|
||||
cudaFreeHost(vals_contiguous);
|
||||
cudaFreeHost(vals_contiguous);
|
||||
#endif
|
||||
cudaStreamDestroy(stream);
|
||||
}
|
||||
} // end finalize()
|
||||
cudaStreamDestroy(stream);
|
||||
}
|
||||
} // end finalize()
|
||||
|
||||
|
||||
void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
|
||||
void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
|
||||
|
||||
double t1, t2;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
#if COPY_ROW_BY_ROW
|
||||
int sum = 0;
|
||||
for(int i = 0; i < Nb; ++i){
|
||||
int size_row = rows[i+1] - rows[i];
|
||||
memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
sum += size_row * block_size * block_size;
|
||||
}
|
||||
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
#else
|
||||
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
#endif
|
||||
|
||||
cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
|
||||
|
||||
if (verbosity > 2) {
|
||||
cudaStreamSynchronize(stream);
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "cusparseSolver::copy_system_to_gpu(): " << t2-t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end copy_system_to_gpu()
|
||||
|
||||
|
||||
// don't copy rowpointers and colindices, they stay the same
|
||||
void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) {
|
||||
|
||||
double t1, t2;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
#if COPY_ROW_BY_ROW
|
||||
int sum = 0;
|
||||
for(int i = 0; i < Nb; ++i){
|
||||
int size_row = rows[i+1] - rows[i];
|
||||
memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
sum += size_row * block_size * block_size;
|
||||
}
|
||||
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
#else
|
||||
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
#endif
|
||||
|
||||
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
|
||||
|
||||
if (verbosity > 2) {
|
||||
cudaStreamSynchronize(stream);
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "cusparseSolver::update_system_on_gpu(): " << t2-t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end update_system_on_gpu()
|
||||
|
||||
|
||||
void cusparseSolverBackend::reset_prec_on_gpu() {
|
||||
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
|
||||
double t1, t2;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
|
||||
bool cusparseSolverBackend::analyse_matrix() {
|
||||
|
||||
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
|
||||
double t1, t2;
|
||||
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
cusparseCreateMatDescr(&descr_B);
|
||||
cusparseCreateMatDescr(&descr_M);
|
||||
cusparseSetMatType(descr_B, CUSPARSE_MATRIX_TYPE_GENERAL);
|
||||
cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
|
||||
const cusparseIndexBase_t base_type = CUSPARSE_INDEX_BASE_ZERO; // matrices from Flow are base0
|
||||
|
||||
cusparseSetMatIndexBase(descr_B, base_type);
|
||||
cusparseSetMatIndexBase(descr_M, base_type);
|
||||
|
||||
cusparseCreateMatDescr(&descr_L);
|
||||
cusparseSetMatIndexBase(descr_L, base_type);
|
||||
cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
|
||||
cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
|
||||
cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
|
||||
|
||||
cusparseCreateMatDescr(&descr_U);
|
||||
cusparseSetMatIndexBase(descr_U, base_type);
|
||||
cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
|
||||
cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
|
||||
cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
|
||||
cudaCheckLastError("Could not initialize matrix descriptions");
|
||||
|
||||
cusparseCreateBsrilu02Info(&info_M);
|
||||
cusparseCreateBsrsv2Info(&info_L);
|
||||
cusparseCreateBsrsv2Info(&info_U);
|
||||
cudaCheckLastError("Could not create analysis info");
|
||||
|
||||
cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
|
||||
descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M);
|
||||
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
|
||||
descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L);
|
||||
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
|
||||
descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U);
|
||||
cudaCheckLastError();
|
||||
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
|
||||
|
||||
cudaMalloc((void**)&d_buffer, d_bufferSize);
|
||||
|
||||
// analysis of ilu LU decomposition
|
||||
cusparseDbsrilu02_analysis(cusparseHandle, order, \
|
||||
Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
|
||||
block_size, info_M, policy, d_buffer);
|
||||
|
||||
int structural_zero;
|
||||
cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
|
||||
if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// analysis of ilu apply
|
||||
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
|
||||
Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
|
||||
block_size, info_L, policy, d_buffer);
|
||||
|
||||
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
|
||||
Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
|
||||
block_size, info_U, policy, d_buffer);
|
||||
cudaCheckLastError("Could not analyse level information");
|
||||
|
||||
if (verbosity > 2) {
|
||||
cudaStreamSynchronize(stream);
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "cusparseSolver::analyse_matrix(): " << t2-t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
analysis_done = true;
|
||||
|
||||
return true;
|
||||
} // end analyse_matrix()
|
||||
|
||||
bool cusparseSolverBackend::create_preconditioner() {
|
||||
|
||||
double t1, t2;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
d_mCols = d_bCols;
|
||||
d_mRows = d_bRows;
|
||||
cusparseDbsrilu02(cusparseHandle, order, \
|
||||
Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
|
||||
block_size, info_M, policy, d_buffer);
|
||||
cudaCheckLastError("Could not perform ilu decomposition");
|
||||
|
||||
int structural_zero;
|
||||
// cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize()
|
||||
cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
|
||||
if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (verbosity > 2) {
|
||||
cudaStreamSynchronize(stream);
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "cusparseSolver::create_preconditioner(): " << t2-t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
return true;
|
||||
} // end create_preconditioner()
|
||||
|
||||
|
||||
void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) {
|
||||
// actually solve
|
||||
gpu_pbicgstab(wellContribs, res);
|
||||
cudaStreamSynchronize(stream);
|
||||
cudaCheckLastError("Something went wrong during the GPU solve");
|
||||
} // end solve_system()
|
||||
|
||||
|
||||
// copy result to host memory
|
||||
// caller must be sure that x is a valid array
|
||||
void cusparseSolverBackend::post_process(double *x) {
|
||||
|
||||
double t1, t2;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
|
||||
cudaStreamSynchronize(stream);
|
||||
|
||||
if (verbosity > 2) {
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "cusparseSolver::post_process(): " << t2-t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end post_process()
|
||||
|
||||
|
||||
typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
|
||||
|
||||
cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
|
||||
if (initialized == false) {
|
||||
initialize(N, nnz, dim);
|
||||
copy_system_to_gpu(vals, rows, cols, b);
|
||||
}else{
|
||||
update_system_on_gpu(vals, rows, b);
|
||||
}
|
||||
if (analysis_done == false) {
|
||||
if (!analyse_matrix()) {
|
||||
return cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED;
|
||||
}
|
||||
}
|
||||
reset_prec_on_gpu();
|
||||
if (create_preconditioner()) {
|
||||
solve_system(wellContribs, res);
|
||||
}else{
|
||||
return cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
||||
}
|
||||
return cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS;
|
||||
#if COPY_ROW_BY_ROW
|
||||
int sum = 0;
|
||||
for (int i = 0; i < Nb; ++i) {
|
||||
int size_row = rows[i + 1] - rows[i];
|
||||
memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
sum += size_row * block_size * block_size;
|
||||
}
|
||||
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
#else
|
||||
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
#endif
|
||||
|
||||
cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(d_bRows, rows, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
|
||||
|
||||
if (verbosity > 2) {
|
||||
cudaStreamSynchronize(stream);
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "cusparseSolver::copy_system_to_gpu(): " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end copy_system_to_gpu()
|
||||
|
||||
|
||||
// don't copy rowpointers and colindices, they stay the same
|
||||
void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) {
|
||||
|
||||
double t1, t2;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
#if COPY_ROW_BY_ROW
|
||||
int sum = 0;
|
||||
for (int i = 0; i < Nb; ++i) {
|
||||
int size_row = rows[i + 1] - rows[i];
|
||||
memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
sum += size_row * block_size * block_size;
|
||||
}
|
||||
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
#else
|
||||
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
#endif
|
||||
|
||||
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
|
||||
|
||||
if (verbosity > 2) {
|
||||
cudaStreamSynchronize(stream);
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "cusparseSolver::update_system_on_gpu(): " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end update_system_on_gpu()
|
||||
|
||||
|
||||
void cusparseSolverBackend::reset_prec_on_gpu() {
|
||||
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
|
||||
}
|
||||
|
||||
|
||||
bool cusparseSolverBackend::analyse_matrix() {
|
||||
|
||||
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
|
||||
double t1, t2;
|
||||
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
cusparseCreateMatDescr(&descr_B);
|
||||
cusparseCreateMatDescr(&descr_M);
|
||||
cusparseSetMatType(descr_B, CUSPARSE_MATRIX_TYPE_GENERAL);
|
||||
cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
|
||||
const cusparseIndexBase_t base_type = CUSPARSE_INDEX_BASE_ZERO; // matrices from Flow are base0
|
||||
|
||||
cusparseSetMatIndexBase(descr_B, base_type);
|
||||
cusparseSetMatIndexBase(descr_M, base_type);
|
||||
|
||||
cusparseCreateMatDescr(&descr_L);
|
||||
cusparseSetMatIndexBase(descr_L, base_type);
|
||||
cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
|
||||
cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
|
||||
cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
|
||||
|
||||
cusparseCreateMatDescr(&descr_U);
|
||||
cusparseSetMatIndexBase(descr_U, base_type);
|
||||
cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
|
||||
cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
|
||||
cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
|
||||
cudaCheckLastError("Could not initialize matrix descriptions");
|
||||
|
||||
cusparseCreateBsrilu02Info(&info_M);
|
||||
cusparseCreateBsrsv2Info(&info_L);
|
||||
cusparseCreateBsrsv2Info(&info_U);
|
||||
cudaCheckLastError("Could not create analysis info");
|
||||
|
||||
cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
|
||||
descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M);
|
||||
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
|
||||
descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L);
|
||||
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
|
||||
descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U);
|
||||
cudaCheckLastError();
|
||||
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
|
||||
|
||||
cudaMalloc((void**)&d_buffer, d_bufferSize);
|
||||
|
||||
// analysis of ilu LU decomposition
|
||||
cusparseDbsrilu02_analysis(cusparseHandle, order, \
|
||||
Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
|
||||
block_size, info_M, policy, d_buffer);
|
||||
|
||||
int structural_zero;
|
||||
cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
|
||||
if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// analysis of ilu apply
|
||||
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
|
||||
Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
|
||||
block_size, info_L, policy, d_buffer);
|
||||
|
||||
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
|
||||
Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
|
||||
block_size, info_U, policy, d_buffer);
|
||||
cudaCheckLastError("Could not analyse level information");
|
||||
|
||||
if (verbosity > 2) {
|
||||
cudaStreamSynchronize(stream);
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "cusparseSolver::analyse_matrix(): " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
analysis_done = true;
|
||||
|
||||
return true;
|
||||
} // end analyse_matrix()
|
||||
|
||||
bool cusparseSolverBackend::create_preconditioner() {
|
||||
|
||||
double t1, t2;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
d_mCols = d_bCols;
|
||||
d_mRows = d_bRows;
|
||||
cusparseDbsrilu02(cusparseHandle, order, \
|
||||
Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
|
||||
block_size, info_M, policy, d_buffer);
|
||||
cudaCheckLastError("Could not perform ilu decomposition");
|
||||
|
||||
int structural_zero;
|
||||
// cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize()
|
||||
cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
|
||||
if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (verbosity > 2) {
|
||||
cudaStreamSynchronize(stream);
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "cusparseSolver::create_preconditioner(): " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
return true;
|
||||
} // end create_preconditioner()
|
||||
|
||||
|
||||
void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) {
|
||||
// actually solve
|
||||
gpu_pbicgstab(wellContribs, res);
|
||||
cudaStreamSynchronize(stream);
|
||||
cudaCheckLastError("Something went wrong during the GPU solve");
|
||||
} // end solve_system()
|
||||
|
||||
|
||||
// copy result to host memory
|
||||
// caller must be sure that x is a valid array
|
||||
void cusparseSolverBackend::get_result(double *x) {
|
||||
|
||||
double t1, t2;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
|
||||
cudaStreamSynchronize(stream);
|
||||
|
||||
if (verbosity > 2) {
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "cusparseSolver::post_process(): " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end post_process()
|
||||
|
||||
|
||||
typedef BdaSolver::BdaSolverStatus BdaSolverStatus;
|
||||
|
||||
BdaSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
|
||||
if (initialized == false) {
|
||||
initialize(N, nnz, dim);
|
||||
copy_system_to_gpu(vals, rows, cols, b);
|
||||
} else {
|
||||
update_system_on_gpu(vals, rows, b);
|
||||
}
|
||||
if (analysis_done == false) {
|
||||
if (!analyse_matrix()) {
|
||||
return BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
|
||||
}
|
||||
}
|
||||
reset_prec_on_gpu();
|
||||
if (create_preconditioner()) {
|
||||
solve_system(wellContribs, res);
|
||||
} else {
|
||||
return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
||||
}
|
||||
return BdaSolverStatus::BDA_SOLVER_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -24,21 +24,18 @@
|
||||
#include "cublas_v2.h"
|
||||
#include "cusparse_v2.h"
|
||||
|
||||
#include "opm/simulators/linalg/bda/BdaResult.hpp"
|
||||
#include <opm/simulators/linalg/bda/BdaResult.hpp>
|
||||
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
|
||||
#include <opm/simulators/linalg/bda/WellContributions.hpp>
|
||||
|
||||
namespace Opm
|
||||
namespace bda
|
||||
{
|
||||
|
||||
/// This class implements a cusparse-based ilu0-bicgstab solver on GPU
|
||||
class cusparseSolverBackend{
|
||||
class cusparseSolverBackend : public BdaSolver {
|
||||
|
||||
private:
|
||||
|
||||
int minit;
|
||||
int maxit;
|
||||
double tolerance;
|
||||
|
||||
cublasHandle_t cublasHandle;
|
||||
cusparseHandle_t cusparseHandle;
|
||||
cudaStream_t stream;
|
||||
@ -49,24 +46,13 @@ private:
|
||||
double *d_bVals, *d_mVals;
|
||||
int *d_bCols, *d_mCols;
|
||||
int *d_bRows, *d_mRows;
|
||||
double *d_x, *d_b, *d_r, *d_rw, *d_p;
|
||||
double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
|
||||
double *d_pw, *d_s, *d_t, *d_v;
|
||||
void *d_buffer;
|
||||
int N, Nb, nnz, nnzb;
|
||||
double *vals_contiguous;
|
||||
double *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
|
||||
|
||||
int block_size;
|
||||
|
||||
bool initialized = false;
|
||||
bool analysis_done = false;
|
||||
|
||||
// verbosity
|
||||
// 0: print nothing during solves, only when initializing
|
||||
// 1: print number of iterations and final norm
|
||||
// 2: also print norm each iteration
|
||||
// 3: also print timings of different backend functions
|
||||
|
||||
int verbosity = 0;
|
||||
|
||||
/// Solve linear system using ilu0-bicgstab
|
||||
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
@ -113,12 +99,6 @@ private:
|
||||
|
||||
public:
|
||||
|
||||
enum class cusparseSolverStatus {
|
||||
CUSPARSE_SOLVER_SUCCESS,
|
||||
CUSPARSE_SOLVER_ANALYSIS_FAILED,
|
||||
CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED,
|
||||
CUSPARSE_SOLVER_UNKNOWN_ERROR
|
||||
};
|
||||
|
||||
/// Construct a cusparseSolver
|
||||
/// \param[in] linear_solver_verbosity verbosity of cusparseSolver
|
||||
@ -140,15 +120,15 @@ public:
|
||||
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
/// \return status code
|
||||
cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res);
|
||||
BdaSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
|
||||
|
||||
/// Post processing after linear solve, now only copies resulting x vector back
|
||||
/// Get resulting vector x after linear solve, also includes post processing if necessary
|
||||
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
||||
void post_process(double *x);
|
||||
void get_result(double *x) override;
|
||||
|
||||
}; // end class cusparseSolverBackend
|
||||
|
||||
}
|
||||
} // namespace bda
|
||||
|
||||
#endif
|
||||
|
||||
|
401
opm/simulators/linalg/bda/openclKernels.hpp
Normal file
401
opm/simulators/linalg/bda/openclKernels.hpp
Normal file
@ -0,0 +1,401 @@
|
||||
/*
|
||||
Copyright 2020 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPENCL_HPP
|
||||
#define OPENCL_HPP
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
const char* kernel_1 = R"(
|
||||
__kernel void hello1(void)
|
||||
{
|
||||
printf("in the kernel 1\n");
|
||||
}
|
||||
)";
|
||||
|
||||
const char* kernel_2 = R"(
|
||||
__kernel void hello2(
|
||||
__global int *in1,
|
||||
__global int *in2,
|
||||
__global int *out)
|
||||
{
|
||||
printf("in the kernel 2\n");
|
||||
int idx = get_global_id(0);
|
||||
out[idx] = in1[idx];
|
||||
for(int i = 0; i < 20000000; ++i){
|
||||
out[idx] = in1[idx] - in2[idx] * out[idx];
|
||||
if(out[idx] > 100000){
|
||||
out[idx] -= 100000;
|
||||
}
|
||||
if(out[idx] < -100000){
|
||||
out[idx] += 100000;
|
||||
}
|
||||
}
|
||||
}
|
||||
)";
|
||||
|
||||
const char* axpy_s = R"(
|
||||
__kernel void axpy(
|
||||
__global double *in,
|
||||
const double a,
|
||||
__global double *out,
|
||||
const int N)
|
||||
{
|
||||
unsigned int NUM_THREADS = get_global_size(0);
|
||||
int idx = get_global_id(0);
|
||||
|
||||
while(idx < N){
|
||||
out[idx] += a * in[idx];
|
||||
idx += NUM_THREADS;
|
||||
}
|
||||
}
|
||||
)";
|
||||
|
||||
|
||||
// returns partial sums, instead of the final dot product
|
||||
const char* dot_1_s = R"(
|
||||
__kernel void dot_1(
|
||||
__global double *in1,
|
||||
__global double *in2,
|
||||
__global double *out,
|
||||
const unsigned int N,
|
||||
__local double *tmp)
|
||||
{
|
||||
unsigned int tid = get_local_id(0);
|
||||
unsigned int bsize = get_local_size(0);
|
||||
unsigned int bid = get_global_id(0) / bsize;
|
||||
unsigned int i = get_global_id(0);
|
||||
unsigned int NUM_THREADS = get_global_size(0);
|
||||
|
||||
double sum = 0.0;
|
||||
while(i < N){
|
||||
sum += in1[i] * in2[i];
|
||||
i += NUM_THREADS;
|
||||
}
|
||||
tmp[tid] = sum;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// do reduction in shared mem
|
||||
for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
|
||||
{
|
||||
if (tid < s)
|
||||
{
|
||||
tmp[tid] += tmp[tid + s];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
// write result for this block to global mem
|
||||
if (tid == 0) out[get_group_id(0)] = tmp[0];
|
||||
}
|
||||
)";
|
||||
|
||||
|
||||
// returns partial sums, instead of the final norm
|
||||
// the square root must be computed on CPU
|
||||
const char* norm_s = R"(
|
||||
__kernel void norm(
|
||||
__global double *in,
|
||||
__global double *out,
|
||||
const unsigned int N,
|
||||
__local double *tmp)
|
||||
{
|
||||
unsigned int tid = get_local_id(0);
|
||||
unsigned int bsize = get_local_size(0);
|
||||
unsigned int bid = get_global_id(0) / bsize;
|
||||
unsigned int i = get_global_id(0);
|
||||
unsigned int NUM_THREADS = get_global_size(0);
|
||||
|
||||
double local_sum = 0.0;
|
||||
while(i < N){
|
||||
local_sum += in[i] * in[i];
|
||||
i += NUM_THREADS;
|
||||
}
|
||||
tmp[tid] = local_sum;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// do reduction in shared mem
|
||||
for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
|
||||
{
|
||||
if (tid < s)
|
||||
{
|
||||
tmp[tid] += tmp[tid + s];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
// write result for this block to global mem
|
||||
if (tid == 0) out[get_group_id(0)] = tmp[0];
|
||||
}
|
||||
)";
|
||||
|
||||
|
||||
// p = (p - omega * v) * beta + r
|
||||
const char* custom_s = R"(
|
||||
__kernel void custom(
|
||||
__global double *p,
|
||||
__global double *v,
|
||||
__global double *r,
|
||||
const double omega,
|
||||
const double beta,
|
||||
const int N)
|
||||
{
|
||||
const unsigned int NUM_THREADS = get_global_size(0);
|
||||
unsigned int idx = get_global_id(0);
|
||||
|
||||
while(idx < N){
|
||||
double res = p[idx];
|
||||
res -= omega * v[idx];
|
||||
res *= beta;
|
||||
res += r[idx];
|
||||
p[idx] = res;
|
||||
idx += NUM_THREADS;
|
||||
}
|
||||
}
|
||||
)";
|
||||
|
||||
|
||||
// b = mat * x
|
||||
// algorithm based on:
|
||||
// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
|
||||
// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
|
||||
const char* spmv_blocked_s = R"(
|
||||
__kernel void spmv_blocked(
|
||||
__global const double *vals,
|
||||
__global const int *cols,
|
||||
__global const int *rows,
|
||||
const int Nb,
|
||||
__global const double *x,
|
||||
__global double *b,
|
||||
const unsigned int block_size,
|
||||
__local double *tmp)
|
||||
{
|
||||
const unsigned int warpsize = 32;
|
||||
const unsigned int bsize = get_local_size(0);
|
||||
const unsigned int idx_b = get_global_id(0) / bsize;
|
||||
const unsigned int idx_t = get_local_id(0);
|
||||
unsigned int idx = idx_b * bsize + idx_t;
|
||||
const unsigned int bs = block_size;
|
||||
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
||||
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
||||
const unsigned int NUM_THREADS = get_global_size(0);
|
||||
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
||||
unsigned int target_block_row = idx / warpsize;
|
||||
const unsigned int lane = idx_t % warpsize;
|
||||
const unsigned int c = (lane / bs) % bs;
|
||||
const unsigned int r = lane % bs;
|
||||
|
||||
// for 3x3 blocks:
|
||||
// num_active_threads: 27
|
||||
// num_blocks_per_warp: 3
|
||||
|
||||
while(target_block_row < Nb){
|
||||
unsigned int first_block = rows[target_block_row];
|
||||
unsigned int last_block = rows[target_block_row+1];
|
||||
unsigned int block = first_block + lane / (bs*bs);
|
||||
double local_out = 0.0;
|
||||
|
||||
if(lane < num_active_threads){
|
||||
for(; block < last_block; block += num_blocks_per_warp){
|
||||
double x_elem = x[cols[block]*bs + c];
|
||||
double A_elem = vals[block*bs*bs + c + r*bs];
|
||||
local_out += x_elem * A_elem;
|
||||
}
|
||||
}
|
||||
|
||||
// do reduction in shared mem
|
||||
tmp[lane] = local_out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
|
||||
{
|
||||
if (lane + offset < warpsize)
|
||||
{
|
||||
tmp[lane] += tmp[lane + offset];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if(lane < bs){
|
||||
unsigned int row = target_block_row*bs + lane;
|
||||
b[row] = tmp[lane];
|
||||
}
|
||||
target_block_row += num_warps_in_grid;
|
||||
}
|
||||
}
|
||||
)";
|
||||
|
||||
|
||||
|
||||
// ILU apply part 1: forward substitution
|
||||
// solves L*x=y where L is a lower triangular sparse blocked matrix
|
||||
const char* ILU_apply1_s = R"(
|
||||
__kernel void ILU_apply1(
|
||||
__global const double *Lvals,
|
||||
__global const unsigned int *Lcols,
|
||||
__global const unsigned int *Lrows,
|
||||
const unsigned int LrowSize,
|
||||
__global const double *y,
|
||||
__global double *x,
|
||||
__global const unsigned int *nodesPerColorPrefix,
|
||||
const unsigned int color,
|
||||
const unsigned int block_size,
|
||||
__local double *tmp)
|
||||
{
|
||||
const unsigned int warpsize = 32;
|
||||
const unsigned int bs = block_size;
|
||||
const unsigned int idx_t = get_local_id(0);
|
||||
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
||||
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
||||
const unsigned int NUM_THREADS = get_global_size(0);
|
||||
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
||||
unsigned int idx = get_global_id(0);
|
||||
unsigned int target_block_row = idx / warpsize;
|
||||
idx += nodesPerColorPrefix[color];
|
||||
target_block_row += nodesPerColorPrefix[color];
|
||||
const unsigned int lane = idx_t % warpsize;
|
||||
const unsigned int c = (lane / bs) % bs;
|
||||
const unsigned int r = lane % bs;
|
||||
|
||||
while(target_block_row < nodesPerColorPrefix[color+1]){
|
||||
const unsigned int first_block = Lrows[target_block_row];
|
||||
const unsigned int last_block = Lrows[target_block_row+1];
|
||||
unsigned int block = first_block + lane / (bs*bs);
|
||||
double local_out = 0.0;
|
||||
if(lane < num_active_threads){
|
||||
if(lane < bs){
|
||||
local_out = y[target_block_row*bs+lane];
|
||||
}
|
||||
for(; block < last_block; block += num_blocks_per_warp){
|
||||
const double x_elem = x[Lcols[block]*bs + c];
|
||||
const double A_elem = Lvals[block*bs*bs + c + r*bs];
|
||||
local_out -= x_elem * A_elem;
|
||||
}
|
||||
}
|
||||
|
||||
// do reduction in shared mem
|
||||
tmp[lane] = local_out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
|
||||
{
|
||||
if (lane + offset < warpsize)
|
||||
{
|
||||
tmp[lane] += tmp[lane + offset];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if(lane < bs){
|
||||
const unsigned int row = target_block_row*bs + lane;
|
||||
x[row] = tmp[lane];
|
||||
}
|
||||
|
||||
target_block_row += num_warps_in_grid;
|
||||
}
|
||||
}
|
||||
)";
|
||||
|
||||
|
||||
// ILU apply part 2: backward substitution
|
||||
// solves U*x=y where L is a lower triangular sparse blocked matrix
|
||||
const char* ILU_apply2_s = R"(
|
||||
__kernel void ILU_apply2(
|
||||
__global const double *Uvals,
|
||||
__global const int *Ucols,
|
||||
__global const int *Urows,
|
||||
const unsigned int UrowSize,
|
||||
__global const double *invDiagVals,
|
||||
__global double *x,
|
||||
__global const unsigned int *nodesPerColorPrefix,
|
||||
const unsigned int color,
|
||||
const unsigned int block_size,
|
||||
__local double *tmp)
|
||||
{
|
||||
const unsigned int warpsize = 32;
|
||||
const unsigned int bs = block_size;
|
||||
const unsigned int idx_t = get_local_id(0);
|
||||
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
||||
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
||||
const unsigned int NUM_THREADS = get_global_size(0);
|
||||
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
||||
unsigned int idx = get_global_id(0);
|
||||
unsigned int target_block_row = idx / warpsize;
|
||||
idx += nodesPerColorPrefix[color];
|
||||
target_block_row += nodesPerColorPrefix[color];
|
||||
const unsigned int lane = idx_t % warpsize;
|
||||
const unsigned int c = (lane / bs) % bs;
|
||||
const unsigned int r = lane % bs;
|
||||
const double relaxation = 0.9;
|
||||
|
||||
while(target_block_row < nodesPerColorPrefix[color+1]){
|
||||
const unsigned int first_block = Urows[UrowSize-target_block_row-1];
|
||||
const unsigned int last_block = Urows[UrowSize-target_block_row];
|
||||
unsigned int block = first_block + lane / (bs*bs);
|
||||
double local_out = 0.0;
|
||||
if(lane < num_active_threads){
|
||||
if(lane < bs){
|
||||
const unsigned int row = target_block_row*bs+lane;
|
||||
local_out = x[row];
|
||||
}
|
||||
for(; block < last_block; block += num_blocks_per_warp){
|
||||
const double x_elem = x[Ucols[block]*bs + c];
|
||||
const double A_elem = Uvals[block*bs*bs + c + r*bs];
|
||||
local_out -= x_elem * A_elem;
|
||||
}
|
||||
}
|
||||
|
||||
// do reduction in shared mem
|
||||
tmp[lane] = local_out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
|
||||
{
|
||||
if (lane + offset < warpsize)
|
||||
{
|
||||
tmp[lane] += tmp[lane + offset];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
local_out = tmp[lane];
|
||||
|
||||
if(lane < bs){
|
||||
tmp[lane + bs*idx_t/warpsize] = local_out;
|
||||
double sum = 0.0;
|
||||
for(int i = 0; i < bs; ++i){
|
||||
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
|
||||
}
|
||||
|
||||
const unsigned int row = target_block_row*bs + lane;
|
||||
x[row] = relaxation * sum;
|
||||
}
|
||||
|
||||
target_block_row += num_warps_in_grid;
|
||||
}
|
||||
}
|
||||
)";
|
||||
|
||||
|
||||
|
||||
} // end namespace bda
|
||||
|
||||
#endif
|
763
opm/simulators/linalg/bda/openclSolverBackend.cpp
Normal file
763
opm/simulators/linalg/bda/openclSolverBackend.cpp
Normal file
@ -0,0 +1,763 @@
|
||||
/*
|
||||
Copyright 2020 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include <config.h> // CMake
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
|
||||
#define __CL_ENABLE_EXCEPTIONS
|
||||
#include <CL/cl.hpp>
|
||||
#include <opm/simulators/linalg/bda/openclKernels.hpp>
|
||||
|
||||
|
||||
#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
|
||||
#include <opm/simulators/linalg/bda/BdaResult.hpp>
|
||||
#include <opm/simulators/linalg/bda/Reorder.hpp>
|
||||
|
||||
|
||||
// iff true, the nonzeroes of the matrix are copied row-by-row into a contiguous, pinned memory array, then a single GPU memcpy is done
|
||||
// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
|
||||
#define COPY_ROW_BY_ROW 0
|
||||
|
||||
|
||||
#define LEVEL_SCHEDULING 1
|
||||
#define GRAPH_COLORING 0
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
using Opm::OpmLog;
|
||||
|
||||
openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver(verbosity_, maxit_, tolerance_) {
|
||||
prec = new Preconditioner(LEVEL_SCHEDULING, GRAPH_COLORING, verbosity_);
|
||||
}
|
||||
|
||||
|
||||
openclSolverBackend::~openclSolverBackend() {
|
||||
finalize();
|
||||
}
|
||||
|
||||
|
||||
// divide A by B, and round up: return (int)ceil(A/B)
|
||||
unsigned int openclSolverBackend::ceilDivision(const unsigned int A, const unsigned int B)
|
||||
{
|
||||
return A / B + (A % B > 0);
|
||||
}
|
||||
|
||||
// just for verifying and debugging
|
||||
bool equal(float a, float b)
|
||||
{
|
||||
const float tol_abs = 1e-2;
|
||||
const float tol_rel = 1e-2;
|
||||
return std::abs(a - b) <= std::max(tol_rel * std::max(std::abs(a), std::abs(b)), tol_abs);
|
||||
}
|
||||
|
||||
double openclSolverBackend::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
|
||||
{
|
||||
double t1 = 0.0, t2 = 0.0;
|
||||
const unsigned int work_group_size = 1024;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
if (verbosity >= 4) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
|
||||
|
||||
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp);
|
||||
|
||||
double gpu_sum = 0.0;
|
||||
for (unsigned int i = 0; i < num_work_groups; ++i) {
|
||||
gpu_sum += tmp[i];
|
||||
}
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
t2 = second();
|
||||
std::ostringstream oss;
|
||||
oss << "openclSolver dot_w time: " << t2 - t1;
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
|
||||
return gpu_sum;
|
||||
}
|
||||
|
||||
double openclSolverBackend::norm_w(cl::Buffer in, cl::Buffer out)
|
||||
{
|
||||
double t1 = 0.0, t2 = 0.0;
|
||||
const unsigned int work_group_size = 1024;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
if (verbosity >= 4) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
|
||||
|
||||
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp);
|
||||
|
||||
double gpu_norm = 0.0;
|
||||
for (unsigned int i = 0; i < num_work_groups; ++i) {
|
||||
gpu_norm += tmp[i];
|
||||
}
|
||||
gpu_norm = sqrt(gpu_norm);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
t2 = second();
|
||||
std::ostringstream oss;
|
||||
oss << "openclSolver norm_w time: " << t2 - t1;
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
|
||||
return gpu_norm;
|
||||
}
|
||||
|
||||
void openclSolverBackend::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
|
||||
{
|
||||
double t1 = 0.0, t2 = 0.0;
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
if (verbosity >= 4) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
t2 = second();
|
||||
std::ostringstream oss;
|
||||
oss << "openclSolver axpy_w time: " << t2 - t1;
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
void openclSolverBackend::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta)
|
||||
{
|
||||
double t1 = 0.0, t2 = 0.0;
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
if (verbosity >= 4) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
t2 = second();
|
||||
std::ostringstream oss;
|
||||
oss << "openclSolver custom_w time: " << t2 - t1;
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
void openclSolverBackend::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b)
|
||||
{
|
||||
double t1 = 0.0, t2 = 0.0;
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
if (verbosity >= 4) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
t2 = second();
|
||||
std::ostringstream oss;
|
||||
oss << "openclSolver spmv_blocked_w time: " << t2 - t1;
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
|
||||
|
||||
float it;
|
||||
double rho, rhop, beta, alpha, omega, tmp1, tmp2;
|
||||
double norm, norm_0;
|
||||
|
||||
double t_total1, t_total2, t1 = 0.0, t2 = 0.0;
|
||||
double prec_time = 0.0, spmv_time = 0.0, well_time = 0.0, rest_time = 0.0;
|
||||
t_total1 = second();
|
||||
|
||||
wellContribs.setOpenCLQueue(queue.get());
|
||||
wellContribs.setReordering(toOrder, true);
|
||||
|
||||
// set r to the initial residual
|
||||
// if initial x guess is not 0, must call applyblockedscaleadd(), not implemented
|
||||
//applyblockedscaleadd(-1.0, mat, x, r);
|
||||
|
||||
// set initial values
|
||||
cl::Event event;
|
||||
queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N);
|
||||
queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &event);
|
||||
event.wait();
|
||||
rho = 1.0;
|
||||
alpha = 1.0;
|
||||
omega = 1.0;
|
||||
|
||||
queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &event);
|
||||
event.wait();
|
||||
queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &event);
|
||||
event.wait();
|
||||
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &event);
|
||||
event.wait();
|
||||
|
||||
norm = norm_w(d_r, d_tmp);
|
||||
norm_0 = norm;
|
||||
|
||||
if (verbosity > 1) {
|
||||
std::ostringstream out;
|
||||
out << std::scientific << "openclSolver initial norm: " << norm_0;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
t1 = second();
|
||||
for (it = 0.5; it < maxit; it += 0.5) {
|
||||
rhop = rho;
|
||||
rho = dot_w(d_rw, d_r, d_tmp);
|
||||
|
||||
if (it > 1) {
|
||||
beta = (rho / rhop) * (alpha / omega);
|
||||
custom_w(d_p, d_v, d_r, omega, beta);
|
||||
}
|
||||
t2 = second();
|
||||
rest_time += t2 - t1;
|
||||
|
||||
// pw = prec(p)
|
||||
t1 = second();
|
||||
prec->apply(d_p, d_pw);
|
||||
t2 = second();
|
||||
prec_time += t2 - t1;
|
||||
|
||||
// v = A * pw
|
||||
t1 = second();
|
||||
spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_pw, d_v);
|
||||
t2 = second();
|
||||
spmv_time += t2 - t1;
|
||||
|
||||
// apply wellContributions
|
||||
if (wellContribs.getNumWells() > 0) {
|
||||
t1 = second();
|
||||
wellContribs.apply(d_pw, d_v);
|
||||
t2 = second();
|
||||
well_time += t2 - t1;
|
||||
}
|
||||
|
||||
t1 = second();
|
||||
tmp1 = dot_w(d_rw, d_v, d_tmp);
|
||||
alpha = rho / tmp1;
|
||||
axpy_w(d_v, -alpha, d_r); // r = r - alpha * v
|
||||
axpy_w(d_pw, alpha, d_x); // x = x + alpha * pw
|
||||
norm = norm_w(d_r, d_tmp);
|
||||
t2 = second();
|
||||
rest_time += t2 - t1;
|
||||
|
||||
if (norm < tolerance * norm_0) {
|
||||
break;
|
||||
}
|
||||
|
||||
it += 0.5;
|
||||
|
||||
// s = prec(r)
|
||||
t1 = second();
|
||||
prec->apply(d_r, d_s);
|
||||
t2 = second();
|
||||
prec_time += t2 - t1;
|
||||
|
||||
// t = A * s
|
||||
t1 = second();
|
||||
spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_s, d_t);
|
||||
t2 = second();
|
||||
spmv_time += t2 - t1;
|
||||
|
||||
// apply wellContributions
|
||||
if (wellContribs.getNumWells() > 0) {
|
||||
t1 = second();
|
||||
wellContribs.apply(d_s, d_t);
|
||||
t2 = second();
|
||||
well_time += t2 - t1;
|
||||
}
|
||||
|
||||
t1 = second();
|
||||
tmp1 = dot_w(d_t, d_r, d_tmp);
|
||||
tmp2 = dot_w(d_t, d_t, d_tmp);
|
||||
omega = tmp1 / tmp2;
|
||||
axpy_w(d_s, omega, d_x); // x = x + omega * s
|
||||
axpy_w(d_t, -omega, d_r); // r = r - omega * t
|
||||
norm = norm_w(d_r, d_tmp);
|
||||
t2 = second();
|
||||
rest_time += t2 - t1;
|
||||
|
||||
if (norm < tolerance * norm_0) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (verbosity > 1) {
|
||||
std::ostringstream out;
|
||||
out << "it: " << it << std::scientific << ", norm: " << norm;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
t2 = second();
|
||||
t_total2 = second();
|
||||
rest_time += t2 - t1;
|
||||
|
||||
res.iterations = std::min(it, (float)maxit);
|
||||
res.reduction = norm / norm_0;
|
||||
res.conv_rate = static_cast<double>(pow(res.reduction, 1.0 / it));
|
||||
res.elapsed = t_total2 - t_total1;
|
||||
res.converged = (it != (maxit + 0.5));
|
||||
|
||||
if (verbosity > 0) {
|
||||
std::ostringstream out;
|
||||
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
|
||||
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
out << "openclSolver::ily_apply: " << prec_time << "s\n";
|
||||
out << "openclSolver::spmv: " << spmv_time << "s\n";
|
||||
out << "openclSolver::rest: " << rest_time << "s\n";
|
||||
out << "openclSolver::total_solve: " << res.elapsed << "s\n";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void openclSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
|
||||
this->N = N_;
|
||||
this->nnz = nnz_;
|
||||
this->block_size = dim;
|
||||
this->nnzb = nnz_ / block_size / block_size;
|
||||
|
||||
Nb = (N + dim - 1) / dim;
|
||||
std::ostringstream out;
|
||||
out << "Initializing GPU, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n";
|
||||
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
|
||||
OpmLog::info(out.str());
|
||||
out.str("");
|
||||
out.clear();
|
||||
|
||||
int deviceID = 0;
|
||||
|
||||
cl_int err = CL_SUCCESS;
|
||||
try {
|
||||
std::vector<cl::Platform> platforms;
|
||||
cl::Platform::get(&platforms);
|
||||
if (platforms.size() == 0)
|
||||
{
|
||||
OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL platforms are found");
|
||||
}
|
||||
out << "Found " << platforms.size() << " OpenCL platforms" << "\n\n";
|
||||
|
||||
if (verbosity >= 1) {
|
||||
std::string platform_info;
|
||||
for (unsigned int platformID = 0; platformID < platforms.size(); ++platformID) {
|
||||
platforms[platformID].getInfo(CL_PLATFORM_NAME, &platform_info);
|
||||
out << "Platform name : " << platform_info << "\n";
|
||||
platforms[platformID].getInfo(CL_PLATFORM_VENDOR, &platform_info);
|
||||
out << "Platform vendor : " << platform_info << "\n";
|
||||
platforms[platformID].getInfo(CL_PLATFORM_VERSION, &platform_info);
|
||||
out << "Platform version : " << platform_info << "\n";
|
||||
platforms[platformID].getInfo(CL_PLATFORM_PROFILE, &platform_info);
|
||||
out << "Platform profile : " << platform_info << "\n";
|
||||
platforms[platformID].getInfo(CL_PLATFORM_EXTENSIONS, &platform_info);
|
||||
out << "Platform extensions: " << platform_info << "\n\n";
|
||||
}
|
||||
}
|
||||
OpmLog::info(out.str());
|
||||
out.str("");
|
||||
out.clear();
|
||||
|
||||
cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[deviceID])(), 0};
|
||||
context.reset(new cl::Context(CL_DEVICE_TYPE_GPU, properties));
|
||||
|
||||
std::vector<cl::Device> devices = context->getInfo<CL_CONTEXT_DEVICES>();
|
||||
if (devices.size() == 0){
|
||||
OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found");
|
||||
}
|
||||
out << "Found " << devices.size() << " OpenCL devices" << "\n\n";
|
||||
|
||||
if (verbosity >= 1) {
|
||||
for (unsigned int i = 0; i < devices.size(); ++i) {
|
||||
std::string device_info;
|
||||
std::vector<size_t> work_sizes;
|
||||
std::vector<cl_device_partition_property> partitions;
|
||||
|
||||
devices[i].getInfo(CL_DEVICE_NAME, &device_info);
|
||||
out << "CL_DEVICE_NAME : " << device_info << "\n";
|
||||
devices[i].getInfo(CL_DEVICE_VENDOR, &device_info);
|
||||
out << "CL_DEVICE_VENDOR : " << device_info << "\n";
|
||||
devices[i].getInfo(CL_DRIVER_VERSION, &device_info);
|
||||
out << "CL_DRIVER_VERSION : " << device_info << "\n";
|
||||
devices[i].getInfo(CL_DEVICE_BUILT_IN_KERNELS, &device_info);
|
||||
out << "CL_DEVICE_BUILT_IN_KERNELS: " << device_info << "\n";
|
||||
devices[i].getInfo(CL_DEVICE_VERSION, &device_info);
|
||||
out << "CL_DEVICE_VERSION : " << device_info << "\n";
|
||||
devices[i].getInfo(CL_DEVICE_PROFILE, &device_info);
|
||||
out << "CL_DEVICE_PROFILE : " << device_info << "\n";
|
||||
devices[i].getInfo(CL_DEVICE_OPENCL_C_VERSION, &device_info);
|
||||
out << "CL_DEVICE_OPENCL_C_VERSION: " << device_info << "\n";
|
||||
devices[i].getInfo(CL_DEVICE_EXTENSIONS, &device_info);
|
||||
out << "CL_DEVICE_EXTENSIONS : " << device_info << "\n";
|
||||
|
||||
devices[i].getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &work_sizes);
|
||||
for (unsigned int j = 0; j < work_sizes.size(); ++j) {
|
||||
out << "CL_DEVICE_MAX_WORK_ITEM_SIZES[" << j << "]: " << work_sizes[j] << "\n";
|
||||
}
|
||||
devices[i].getInfo(CL_DEVICE_PARTITION_PROPERTIES, &partitions);
|
||||
for (unsigned int j = 0; j < partitions.size(); ++j) {
|
||||
out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n";
|
||||
}
|
||||
partitions.clear();
|
||||
devices[i].getInfo(CL_DEVICE_PARTITION_TYPE, &partitions);
|
||||
for (unsigned int j = 0; j < partitions.size(); ++j) {
|
||||
out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n";
|
||||
}
|
||||
|
||||
// C-style properties
|
||||
cl_device_id tmp_id = devices[i]();
|
||||
cl_ulong size;
|
||||
clGetDeviceInfo(tmp_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
|
||||
out << "CL_DEVICE_LOCAL_MEM_SIZE : " << size / 1024 << " KB\n";
|
||||
clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
|
||||
out << "CL_DEVICE_GLOBAL_MEM_SIZE : " << size / 1024 / 1024 / 1024 << " GB\n";
|
||||
clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_ulong), &size, 0);
|
||||
out << "CL_DEVICE_MAX_COMPUTE_UNITS : " << size << "\n";
|
||||
clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &size, 0);
|
||||
out << "CL_DEVICE_MAX_MEM_ALLOC_SIZE : " << size / 1024 / 1024 << " MB\n";
|
||||
clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(cl_ulong), &size, 0);
|
||||
out << "CL_DEVICE_MAX_WORK_GROUP_SIZE : " << size << "\n";
|
||||
clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
|
||||
out << "CL_DEVICE_GLOBAL_MEM_SIZE : " << size / 1024 / 1024 / 1024 << " GB\n\n";
|
||||
}
|
||||
}
|
||||
OpmLog::info(out.str());
|
||||
|
||||
cl::Program::Sources source(1, std::make_pair(kernel_1, strlen(kernel_1))); // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
|
||||
source.emplace_back(std::make_pair(kernel_2, strlen(kernel_2)));
|
||||
source.emplace_back(std::make_pair(axpy_s, strlen(axpy_s)));
|
||||
source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s)));
|
||||
source.emplace_back(std::make_pair(norm_s, strlen(norm_s)));
|
||||
source.emplace_back(std::make_pair(custom_s, strlen(custom_s)));
|
||||
source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
|
||||
source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
|
||||
source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
|
||||
cl::Program program_ = cl::Program(*context, source);
|
||||
|
||||
program_.build(devices);
|
||||
|
||||
cl::Event event;
|
||||
queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err));
|
||||
|
||||
prec->setOpenCLContext(context.get());
|
||||
prec->setOpenCLQueue(queue.get());
|
||||
|
||||
rb = new double[N];
|
||||
tmp = new double[N];
|
||||
#if COPY_ROW_BY_ROW
|
||||
vals_contiguous = new double[N];
|
||||
#endif
|
||||
|
||||
mat = (BlockedMatrix *)malloc(sizeof(BlockedMatrix));
|
||||
mat->Nb = Nb;
|
||||
mat->nnzbs = nnzb;
|
||||
mat->nnzValues = (Block*)vals;
|
||||
mat->colIndices = cols;
|
||||
mat->rowPointers = rows;
|
||||
|
||||
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
|
||||
d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnz);
|
||||
d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
|
||||
d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
|
||||
|
||||
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
|
||||
// cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
|
||||
|
||||
// actually creating the kernels
|
||||
dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "dot_1")));
|
||||
norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "norm")));
|
||||
axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program_, "axpy")));
|
||||
custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program_, "custom")));
|
||||
spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "spmv_blocked")));
|
||||
ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "ILU_apply1")));
|
||||
ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "ILU_apply2")));
|
||||
|
||||
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
|
||||
|
||||
} catch (cl::Error error) {
|
||||
std::ostringstream oss;
|
||||
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")";
|
||||
OpmLog::error(oss.str());
|
||||
}
|
||||
|
||||
|
||||
initialized = true;
|
||||
} // end initialize()
|
||||
|
||||
void openclSolverBackend::finalize() {
|
||||
delete[] rb;
|
||||
delete[] tmp;
|
||||
#if COPY_ROW_BY_ROW
|
||||
delete[] vals_contiguous;
|
||||
#endif
|
||||
} // end finalize()
|
||||
|
||||
|
||||
void openclSolverBackend::copy_system_to_gpu() {
|
||||
|
||||
double t1 = 0.0, t2 = 0.0;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
cl::Event event;
|
||||
|
||||
#if COPY_ROW_BY_ROW
|
||||
int sum = 0;
|
||||
for (int i = 0; i < Nb; ++i) {
|
||||
int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
|
||||
memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
sum += size_row * block_size * block_size;
|
||||
}
|
||||
queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
|
||||
#else
|
||||
queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
|
||||
#endif
|
||||
|
||||
queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices);
|
||||
queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers);
|
||||
queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
|
||||
queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
|
||||
event.wait();
|
||||
|
||||
if (verbosity > 2) {
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "openclSolver::copy_system_to_gpu(): " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end copy_system_to_gpu()
|
||||
|
||||
|
||||
// don't copy rowpointers and colindices, they stay the same
|
||||
void openclSolverBackend::update_system_on_gpu() {
|
||||
|
||||
double t1 = 0.0, t2 = 0.0;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
cl::Event event;
|
||||
|
||||
#if COPY_ROW_BY_ROW
|
||||
int sum = 0;
|
||||
for (int i = 0; i < Nb; ++i) {
|
||||
int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
|
||||
memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
sum += size_row * block_size * block_size;
|
||||
}
|
||||
queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
|
||||
#else
|
||||
queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
|
||||
#endif
|
||||
|
||||
queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
|
||||
queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
|
||||
event.wait();
|
||||
|
||||
if (verbosity > 2) {
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "openclSolver::update_system_on_gpu(): " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end update_system_on_gpu()
|
||||
|
||||
|
||||
bool openclSolverBackend::analyse_matrix() {
|
||||
|
||||
double t1 = 0.0, t2 = 0.0;
|
||||
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
bool success = prec->init(mat, block_size);
|
||||
int work_group_size = 32;
|
||||
int num_work_groups = ceilDivision(N, work_group_size);
|
||||
int total_work_items = num_work_groups * work_group_size;
|
||||
int lmem_per_work_group = work_group_size * sizeof(double);
|
||||
prec->setKernelParameters(work_group_size, total_work_items, lmem_per_work_group);
|
||||
|
||||
toOrder = prec->getToOrder();
|
||||
fromOrder = prec->getFromOrder();
|
||||
rmat = prec->getRMat();
|
||||
|
||||
if (verbosity > 2) {
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "openclSolver::analyse_matrix(): " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
analysis_done = true;
|
||||
|
||||
return success;
|
||||
} // end analyse_matrix()
|
||||
|
||||
|
||||
void openclSolverBackend::update_system(double *vals, double *b) {
|
||||
double t1 = 0.0, t2 = 0.0;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
mat->nnzValues = (Block*)vals;
|
||||
//mat->nnzValues = static_cast<Block*>(vals);
|
||||
blocked_reorder_vector_by_pattern(mat->Nb, b, fromOrder, rb);
|
||||
|
||||
if (verbosity > 2) {
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "openclSolver::update_system(): " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end update_system()
|
||||
|
||||
|
||||
bool openclSolverBackend::create_preconditioner() {
|
||||
|
||||
double t1 = 0.0, t2 = 0.0;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
bool result = prec->create_preconditioner(mat);
|
||||
|
||||
if (verbosity > 2) {
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "openclSolver::create_preconditioner(): " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
return result;
|
||||
} // end create_preconditioner()
|
||||
|
||||
|
||||
void openclSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) {
|
||||
// actually solve
|
||||
double t1 = 0.0, t2 = 0.0;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
gpu_pbicgstab(wellContribs, res);
|
||||
|
||||
if (verbosity > 2) {
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "openclSolver::solve_system(): " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
} // end solve_system()
|
||||
|
||||
|
||||
// copy result to host memory
|
||||
// caller must be sure that x is a valid array
|
||||
void openclSolverBackend::get_result(double *x) {
|
||||
|
||||
double t1 = 0.0, t2 = 0.0;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb);
|
||||
blocked_reorder_vector_by_pattern(mat->Nb, rb, toOrder, x);
|
||||
|
||||
if (verbosity > 2) {
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "openclSolver::get_result(): " << t2 - t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end get_result()
|
||||
|
||||
|
||||
|
||||
typedef BdaSolver::BdaSolverStatus BdaSolverStatus;
|
||||
|
||||
BdaSolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
|
||||
if (initialized == false) {
|
||||
initialize(N_, nnz_, dim, vals, rows, cols);
|
||||
if (analysis_done == false) {
|
||||
if (!analyse_matrix()) {
|
||||
return BdaSolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
|
||||
}
|
||||
}
|
||||
update_system(vals, b);
|
||||
if (!create_preconditioner()) {
|
||||
return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
||||
}
|
||||
copy_system_to_gpu();
|
||||
} else {
|
||||
update_system(vals, b);
|
||||
if (!create_preconditioner()) {
|
||||
return BdaSolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
||||
}
|
||||
update_system_on_gpu();
|
||||
}
|
||||
solve_system(wellContribs, res);
|
||||
return BdaSolverStatus::BDA_SOLVER_SUCCESS;
|
||||
}
|
||||
|
||||
}
|
||||
|
197
opm/simulators/linalg/bda/openclSolverBackend.hpp
Normal file
197
opm/simulators/linalg/bda/openclSolverBackend.hpp
Normal file
@ -0,0 +1,197 @@
|
||||
/*
|
||||
Copyright 2020 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_OPENCLSOLVER_BACKEND_HEADER_INCLUDED
|
||||
#define OPM_OPENCLSOLVER_BACKEND_HEADER_INCLUDED
|
||||
|
||||
#if HAVE_OPENCL
|
||||
#define __CL_ENABLE_EXCEPTIONS
|
||||
#include <CL/cl.hpp> // up to OpenCL 1.2
|
||||
#endif
|
||||
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaResult.hpp>
|
||||
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
|
||||
#include <opm/simulators/linalg/bda/WellContributions.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BILU0.hpp>
|
||||
typedef bda::BILU0 Preconditioner;
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
/// This class implements a opencl-based ilu0-bicgstab solver on GPU
|
||||
class openclSolverBackend : public BdaSolver
|
||||
{
|
||||
|
||||
private:
|
||||
|
||||
double *rb; // reordered b vector, the matrix is reordered, so b must also be
|
||||
double *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
|
||||
|
||||
bool analysis_done = false;
|
||||
|
||||
// OpenCL variables must be reusable, they are initialized in initialize()
|
||||
cl::Buffer d_Avals, d_Acols, d_Arows; // (reordered) matrix in BSR format on GPU
|
||||
cl::Buffer d_x, d_b, d_rb, d_r, d_rw, d_p; // vectors, used during linear solve
|
||||
cl::Buffer d_pw, d_s, d_t, d_v; // vectors, used during linear solve
|
||||
cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm()
|
||||
double *tmp; // used as tmp CPU buffer for dot() and norm()
|
||||
|
||||
// shared pointers are also passed to BILU0
|
||||
std::shared_ptr<cl::Context> context;
|
||||
std::shared_ptr<cl::CommandQueue> queue;
|
||||
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
|
||||
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
|
||||
std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
|
||||
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
|
||||
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > spmv_blocked_k;
|
||||
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply1_k;
|
||||
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply2_k;
|
||||
|
||||
Preconditioner *prec; // only supported preconditioner is BILU0
|
||||
int *toOrder, *fromOrder; // BILU0 reorders rows of the matrix via these mappings
|
||||
BlockedMatrix *mat, *rmat; // normal and reordered matrix
|
||||
|
||||
|
||||
/// Divide A by B, and round up: return (int)ceil(A/B)
|
||||
/// \param[in] A dividend
|
||||
/// \param[in] B divisor
|
||||
/// \return rounded division result
|
||||
unsigned int ceilDivision(const unsigned int A, const unsigned int B);
|
||||
|
||||
/// Calculate dot product between in1 and in2, partial sums are stored in out, which are summed on CPU
|
||||
/// \param[in] in1 input vector 1
|
||||
/// \param[in] in2 input vector 2
|
||||
/// \param[out] out output vector containing partial sums
|
||||
/// \return dot product
|
||||
double dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out);
|
||||
|
||||
/// Calculate the norm of in, partial sums are stored in out, which are summed on the CPU
|
||||
/// Equal to Dune::DenseVector::two_norm()
|
||||
/// \param[in] in input vector
|
||||
/// \param[out] out output vector containing partial sums
|
||||
/// \return norm
|
||||
double norm_w(cl::Buffer in, cl::Buffer out);
|
||||
|
||||
/// Perform axpy: out += a * in
|
||||
/// \param[in] in input vector
|
||||
/// \param[in] a scalar value to multiply input vector
|
||||
/// \param[inout] out output vector
|
||||
void axpy_w(cl::Buffer in, const double a, cl::Buffer out);
|
||||
|
||||
/// Custom function that combines scale, axpy and add functions in bicgstab
|
||||
/// p = (p - omega * v) * beta + r
|
||||
/// \param[inout] p output vector
|
||||
/// \param[in] v input vector
|
||||
/// \param[in] r input vector
|
||||
/// \param[in] omega scalar value
|
||||
/// \param[in] beta scalar value
|
||||
void custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta);
|
||||
|
||||
/// Sparse matrix-vector multiply, spmv
|
||||
/// b = A * x
|
||||
/// Matrix A, must be in BCRS format
|
||||
/// \param[in] vals nnzs of matrix A
|
||||
/// \param[in] cols columnindices of matrix A
|
||||
/// \param[in] rows rowpointers of matrix A
|
||||
/// \param[in] x input vector
|
||||
/// \param[out] b output vector
|
||||
void spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b);
|
||||
|
||||
/// Solve linear system using ilu0-bicgstab
|
||||
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
|
||||
|
||||
/// Initialize GPU and allocate memory
|
||||
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] dim size of block
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||
/// \param[in] cols array of columnIndices, contains nnz values
|
||||
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
|
||||
|
||||
/// Clean memory
|
||||
void finalize();
|
||||
|
||||
/// Copy linear system to GPU
|
||||
void copy_system_to_gpu();
|
||||
|
||||
/// Reorder the linear system so it corresponds with the coloring
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] b input vectors, contains N values
|
||||
void update_system(double *vals, double *b);
|
||||
|
||||
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
|
||||
void update_system_on_gpu();
|
||||
|
||||
/// Analyse sparsity pattern to extract parallelism
|
||||
/// \return true iff analysis was successful
|
||||
bool analyse_matrix();
|
||||
|
||||
/// Perform ilu0-decomposition
|
||||
/// \return true iff decomposition was successful
|
||||
bool create_preconditioner();
|
||||
|
||||
/// Solve linear system
|
||||
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
void solve_system(WellContributions& wellContribs, BdaResult &res);
|
||||
|
||||
public:
|
||||
|
||||
/// Construct a openclSolver
|
||||
/// \param[in] linear_solver_verbosity verbosity of openclSolver
|
||||
/// \param[in] maxit maximum number of iterations for openclSolver
|
||||
/// \param[in] tolerance required relative tolerance for openclSolver
|
||||
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance);
|
||||
|
||||
/// Destroy a openclSolver, and free memory
|
||||
~openclSolverBackend();
|
||||
|
||||
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
|
||||
/// \param[in] N number of rows, divide by dim to get number of blockrows
|
||||
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] nnz_prec number of nonzeroes of matrix for ILU0, divide by dim*dim to get number of blocks
|
||||
/// \param[in] dim size of block
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||
/// \param[in] cols array of columnIndices, contains nnz values
|
||||
/// \param[in] vals_prec array of nonzeroes for preconditioner
|
||||
/// \param[in] rows_prec array of rowPointers for preconditioner
|
||||
/// \param[in] cols_prec array of columnIndices for preconditioner
|
||||
/// \param[in] b input vector, contains N values
|
||||
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
/// \return status code
|
||||
BdaSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
|
||||
|
||||
/// Get result after linear solve, and peform postprocessing if necessary
|
||||
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
||||
void get_result(double *x) override;
|
||||
|
||||
}; // end class openclSolverBackend
|
||||
|
||||
} // namespace bda
|
||||
|
||||
#endif
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user