mirror of https://github.com/OPM/opm-simulators.git

Generating opencl kernels
commit 5b4deab7e4 (parent 141af23db5)
@@ -220,298 +220,266 @@ namespace bda
// ILU apply part 1: forward substitution
// solves L*x=y where L is a lower triangular sparse blocked matrix
inline const char* ILU_apply1_s = R"(
__kernel void ILU_apply1(
__global const double *Lvals,
__global const unsigned int *Lcols,
__global const unsigned int *Lrows,
__global const int *diagIndex,
__global const double *y,
__global double *x,
__global const unsigned int *nodesPerColorPrefix,
const unsigned int color,
const unsigned int block_size,
__local double *tmp)
{
const unsigned int warpsize = 32;
const unsigned int bs = block_size;
const unsigned int idx_t = get_local_id(0);
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int idx = get_global_id(0);
unsigned int target_block_row = idx / warpsize;
idx += nodesPerColorPrefix[color];
target_block_row += nodesPerColorPrefix[color];
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;

while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = Lrows[target_block_row];
const unsigned int last_block = diagIndex[target_block_row];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
local_out = y[target_block_row*bs+lane];
}
for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[Lcols[block]*bs + c];
const double A_elem = Lvals[block*bs*bs + c + r*bs];
local_out -= x_elem * A_elem;
}
}

// do reduction in shared mem
tmp[lane] = local_out;
barrier(CLK_LOCAL_MEM_FENCE);

for(unsigned int offset = 3; offset <= 24; offset <<= 1)
std::string get_ILU_apply1_string(bool full_matrix) {
std::string s = R"(
__kernel void ILU_apply1(
__global const double *LUvals,
__global const unsigned int *LUcols,
__global const unsigned int *LUrows,
__global const int *diagIndex,
__global const double *y,
__global double *x,
__global const unsigned int *nodesPerColorPrefix,
const unsigned int color,
const unsigned int block_size,
__local double *tmp)
{
if (lane + offset < warpsize)
{
tmp[lane] += tmp[lane + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
const unsigned int warpsize = 32;
const unsigned int bs = block_size;
const unsigned int idx_t = get_local_id(0);
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int idx = get_global_id(0);
unsigned int target_block_row = idx / warpsize;
target_block_row += nodesPerColorPrefix[color];
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;

if(lane < bs){
const unsigned int row = target_block_row*bs + lane;
x[row] = tmp[lane];
}

target_block_row += num_warps_in_grid;
while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = LUrows[target_block_row];
)";
if (full_matrix) {
s += "const unsigned int last_block = diagIndex[target_block_row]; ";
} else {
s += "const unsigned int last_block = LUrows[target_block_row+1]; ";
}
s += R"(
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
local_out = y[target_block_row*bs+lane];
}
for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[LUcols[block]*bs + c];
const double A_elem = LUvals[block*bs*bs + c + r*bs];
local_out -= x_elem * A_elem;
}
}

// do reduction in shared mem
tmp[lane] = local_out;
barrier(CLK_LOCAL_MEM_FENCE);

for(unsigned int offset = 3; offset <= 24; offset <<= 1)
{
if (lane + offset < warpsize)
{
tmp[lane] += tmp[lane + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if(lane < bs){
const unsigned int row = target_block_row*bs + lane;
x[row] = tmp[lane];
}

target_block_row += num_warps_in_grid;
}
}
)";
return s;
}
)";
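For reference only (not part of this patch), the following CPU sketch spells out what one forward-substitution sweep of ILU_apply1 computes for the block rows of a single color; the function name, the fixed block size of 3, and the rowBegin/rowEnd parameters are illustrative assumptions, while the array layout matches the BSR data the kernel reads.

// CPU reference sketch (assumption, not from the patch) of one ILU_apply1 sweep:
// for every block row of one color, x_row = y_row - sum over L-part blocks of A_{row,col} * x_col.
#include <vector>

constexpr int bs = 3;   // block size, illustrative

void forward_sweep_color(const std::vector<double>& Lvals,
                         const std::vector<unsigned>& Lcols,
                         const std::vector<unsigned>& Lrows,
                         const std::vector<int>& diagIndex,
                         const std::vector<double>& y,
                         std::vector<double>& x,
                         unsigned rowBegin, unsigned rowEnd)   // block rows of one color
{
    for (unsigned row = rowBegin; row < rowEnd; ++row) {
        double out[bs];
        for (int r = 0; r < bs; ++r)
            out[r] = y[row * bs + r];
        // only blocks strictly left of the diagonal contribute (the L part)
        for (unsigned blk = Lrows[row]; blk < static_cast<unsigned>(diagIndex[row]); ++blk) {
            const unsigned col = Lcols[blk];
            for (int r = 0; r < bs; ++r)
                for (int c = 0; c < bs; ++c)
                    out[r] -= Lvals[blk * bs * bs + r * bs + c] * x[col * bs + c];
        }
        for (int r = 0; r < bs; ++r)
            x[row * bs + r] = out[r];
    }
}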
// ILU apply part 2: backward substitution
// solves U*x=y where U is an upper triangular sparse blocked matrix
inline const char* ILU_apply2_s = R"(
__kernel void ILU_apply2(
__global const double *LUvals,
__global const int *LUcols,
__global const int *LUrows,
__global const int *diagIndex,
__global const double *invDiagVals,
__global double *x,
__global const unsigned int *nodesPerColorPrefix,
const unsigned int color,
const unsigned int block_size,
__local double *tmp)
{
const unsigned int warpsize = 32;
const unsigned int bs = block_size;
const unsigned int idx_t = get_local_id(0);
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int idx = get_global_id(0);
unsigned int target_block_row = idx / warpsize;
idx += nodesPerColorPrefix[color];
target_block_row += nodesPerColorPrefix[color];
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;
const double relaxation = 0.9;

while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = diagIndex[target_block_row] + 1;
const unsigned int last_block = LUrows[target_block_row+1];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
const unsigned int row = target_block_row*bs+lane;
local_out = x[row];
}
for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[LUcols[block]*bs + c];
const double A_elem = LUvals[block*bs*bs + c + r*bs];
local_out -= x_elem * A_elem;
}
}

// do reduction in shared mem
tmp[lane] = local_out;
barrier(CLK_LOCAL_MEM_FENCE);

for(unsigned int offset = 3; offset <= 24; offset <<= 1)
std::string get_ILU_apply2_string(bool full_matrix) {
std::string s = R"(
__kernel void ILU_apply2(
__global const double *LUvals,
__global const int *LUcols,
__global const int *LUrows,
__global const int *diagIndex,
__global const double *invDiagVals,
__global double *x,
__global const unsigned int *nodesPerColorPrefix,
const unsigned int color,
const unsigned int block_size,
__local double *tmp)
{
if (lane + offset < warpsize)
{
tmp[lane] += tmp[lane + offset];
const unsigned int warpsize = 32;
const unsigned int bs = block_size;
const unsigned int idx_t = get_local_id(0);
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int idx = get_global_id(0);
unsigned int target_block_row = idx / warpsize;
target_block_row += nodesPerColorPrefix[color];
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;
const double relaxation = 0.9;

while(target_block_row < nodesPerColorPrefix[color+1]){
)";
if (full_matrix) {
s += "const unsigned int first_block = diagIndex[target_block_row] + 1; ";
} else {
s += "const unsigned int first_block = LUrows[target_block_row]; ";
}
s += R"(
const unsigned int last_block = LUrows[target_block_row+1];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
const unsigned int row = target_block_row*bs+lane;
local_out = x[row];
}
for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[LUcols[block]*bs + c];
const double A_elem = LUvals[block*bs*bs + c + r*bs];
local_out -= x_elem * A_elem;
}
}

// do reduction in shared mem
tmp[lane] = local_out;
barrier(CLK_LOCAL_MEM_FENCE);

for(unsigned int offset = 3; offset <= 24; offset <<= 1)
{
if (lane + offset < warpsize)
{
tmp[lane] += tmp[lane + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
local_out = tmp[lane];

if(lane < bs){
tmp[lane + bs*idx_t/warpsize] = local_out;
double sum = 0.0;
for(int i = 0; i < bs; ++i){
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
}

const unsigned int row = target_block_row*bs + lane;
x[row] = relaxation * sum;
}

target_block_row += num_warps_in_grid;
}
}
)";
return s;
}
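Again for reference only (not part of this patch), a CPU sketch of what one ILU_apply2 sweep computes for the block rows of a single color: subtract the U-part contributions, then multiply by the stored inverse of the diagonal block and the relaxation factor. Function name, block size 3, and rowBegin/rowEnd are illustrative assumptions; the indexing mirrors the kernel above.

// CPU reference sketch (assumption, not from the patch) of one ILU_apply2 sweep:
// x_row = relaxation * invDiag_row * (x_row - sum over U-part blocks of A_{row,col} * x_col).
#include <vector>

constexpr int bs = 3;   // block size, illustrative

void backward_sweep_color(const std::vector<double>& LUvals,
                          const std::vector<int>& LUcols,
                          const std::vector<int>& LUrows,
                          const std::vector<int>& diagIndex,
                          const std::vector<double>& invDiagVals,
                          std::vector<double>& x,
                          unsigned rowBegin, unsigned rowEnd,   // block rows of one color
                          double relaxation = 0.9)
{
    for (unsigned row = rowBegin; row < rowEnd; ++row) {
        double rhs[bs];
        for (int r = 0; r < bs; ++r)
            rhs[r] = x[row * bs + r];
        // blocks strictly right of the diagonal contribute (the U part)
        for (int blk = diagIndex[row] + 1; blk < LUrows[row + 1]; ++blk) {
            const int col = LUcols[blk];
            for (int r = 0; r < bs; ++r)
                for (int c = 0; c < bs; ++c)
                    rhs[r] -= LUvals[blk * bs * bs + r * bs + c] * x[col * bs + c];
        }
        // apply the precomputed inverse diagonal block and the relaxation factor
        for (int r = 0; r < bs; ++r) {
            double sum = 0.0;
            for (int c = 0; c < bs; ++c)
                sum += invDiagVals[row * bs * bs + r * bs + c] * rhs[c];
            x[row * bs + r] = relaxation * sum;
        }
    }
}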
/// Generate the string for the stdwell_apply kernels
/// If reorder is true, the B/C column indices do not correspond directly to the x/y vectors:
/// the x/y vectors are reordered, so toOrder is used to translate the indices
/// \param[in] reorder whether the matrix is reordered or not
std::string get_stdwell_apply_string(bool reorder) {
std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder";
std::string s = "__kernel void " + kernel_name + R"((
__global const double *Cnnzs,
__global const double *Dnnzs,
__global const double *Bnnzs,
__global const int *Ccols,
__global const int *Bcols,
__global const double *x,
__global double *y,
)";
if (reorder) {
s += R"(__global const int *toOrder,
)";
}
s += R"(const unsigned int dim,
const unsigned int dim_wells,
__global const unsigned int *val_pointers,
__local double *localSum,
__local double *z1,
__local double *z2){
int wgId = get_group_id(0);
int wiId = get_local_id(0);
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
int valsPerBlock = dim*dim_wells;
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
int c = wiId % dim;
int r = (wiId/dim) % dim_wells;
double temp;

barrier(CLK_LOCAL_MEM_FENCE);
}
local_out = tmp[lane];

if(lane < bs){
tmp[lane + bs*idx_t/warpsize] = local_out;
double sum = 0.0;
for(int i = 0; i < bs; ++i){
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
int b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
)";
if (reorder) {
s += "int colIdx = toOrder[Bcols[b]]; ";
} else {
s += "int colIdx = Bcols[b]; ";
}
s += R"(
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
}

if(wiId < valsPerBlock){
localSum[wiId] += localSum[wiId + valsPerBlock];
}

b = wiId/valsPerBlock + val_pointers[wgId];

if(c == 0 && wiId < valsPerBlock){
for(unsigned int stride = 2; stride > 0; stride >>= 1){
localSum[wiId] += localSum[wiId + stride];
}
z1[r] = localSum[wiId];
}
}

const unsigned int row = target_block_row*bs + lane;
x[row] = relaxation * sum;
}
barrier(CLK_LOCAL_MEM_FENCE);

target_block_row += num_warps_in_grid;
}
}
)";

inline const char* stdwell_apply_s = R"(
__kernel void stdwell_apply(__global const double *Cnnzs,
__global const double *Dnnzs,
__global const double *Bnnzs,
__global const int *Ccols,
__global const int *Bcols,
__global const double *x,
__global double *y,
__global const int *toOrder,
const unsigned int dim,
const unsigned int dim_wells,
__global const unsigned int *val_pointers,
__local double *localSum,
__local double *z1,
__local double *z2){
int wgId = get_group_id(0);
int wiId = get_local_id(0);
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
int valsPerBlock = dim*dim_wells;
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
int c = wiId % dim;
int r = (wiId/dim) % dim_wells;
double temp;

barrier(CLK_LOCAL_MEM_FENCE);

localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
int b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
int colIdx = toOrder[Bcols[b]];
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
}

if(wiId < valsPerBlock){
localSum[wiId] += localSum[wiId + valsPerBlock];
}

b = wiId/valsPerBlock + val_pointers[wgId];

if(c == 0 && wiId < valsPerBlock){
for(unsigned int stride = 2; stride > 0; stride >>= 1){
localSum[wiId] += localSum[wiId + stride];
if(wiId < dim_wells){
temp = 0.0;
for(unsigned int i = 0; i < dim_wells; ++i){
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
}
z1[r] = localSum[wiId];
}

barrier(CLK_LOCAL_MEM_FENCE);

if(wiId < dim*valSize){
temp = 0.0;
int bb = wiId/dim + val_pointers[wgId];
for (unsigned int j = 0; j < dim_wells; ++j){
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
)";
if (reorder) {
s += "int colIdx = toOrder[Ccols[bb]]; ";
} else {
s += "int colIdx = Ccols[bb]; ";
}

barrier(CLK_LOCAL_MEM_FENCE);

if(wiId < dim_wells){
temp = 0.0;
for(unsigned int i = 0; i < dim_wells; ++i){
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
}

barrier(CLK_LOCAL_MEM_FENCE);

if(wiId < dim*valSize){
temp = 0.0;
int bb = wiId/dim + val_pointers[wgId];
for (unsigned int j = 0; j < dim_wells; ++j){
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
int colIdx = toOrder[Ccols[bb]];
y[colIdx*dim + c] -= temp;
}
}
)";

inline const char* stdwell_apply_no_reorder_s = R"(
__kernel void stdwell_apply_no_reorder(__global const double *Cnnzs,
__global const double *Dnnzs,
__global const double *Bnnzs,
__global const int *Ccols,
__global const int *Bcols,
__global const double *x,
__global double *y,
const unsigned int dim,
const unsigned int dim_wells,
__global const unsigned int *val_pointers,
__local double *localSum,
__local double *z1,
__local double *z2){
int wgId = get_group_id(0);
int wiId = get_local_id(0);
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
int valsPerBlock = dim*dim_wells;
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
int c = wiId % dim;
int r = (wiId/dim) % dim_wells;
double temp;

barrier(CLK_LOCAL_MEM_FENCE);

localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
int b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
int colIdx = Bcols[b];
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
}

if(wiId < valsPerBlock){
localSum[wiId] += localSum[wiId + valsPerBlock];
}

b = wiId/valsPerBlock + val_pointers[wgId];

if(c == 0 && wiId < valsPerBlock){
for(unsigned int stride = 2; stride > 0; stride >>= 1){
localSum[wiId] += localSum[wiId + stride];
s += R"(
y[colIdx*dim + c] -= temp;
}
z1[r] = localSum[wiId];
}
}

barrier(CLK_LOCAL_MEM_FENCE);

if(wiId < dim_wells){
temp = 0.0;
for(unsigned int i = 0; i < dim_wells; ++i){
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
}

barrier(CLK_LOCAL_MEM_FENCE);

if(wiId < dim*valSize){
temp = 0.0;
int bb = wiId/dim + val_pointers[wgId];
for (unsigned int j = 0; j < dim_wells; ++j){
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
int colIdx = Ccols[bb];
y[colIdx*dim + c] -= temp;
}
)";
return s;
}
)";
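Not part of the patch: a tiny illustration of the toOrder indirection the stdwell kernels use when reorder is true. The well matrices B and C carry column indices in the original matrix ordering, while x and y are stored in the reordered (colored) ordering, so every access goes through toOrder. Names here are illustrative.

// Illustrative sketch (assumption) of the toOrder indirection used in stdwell_apply
#include <vector>

double read_x_for_column(const std::vector<double>& x,      // reordered vector
                         const std::vector<int>& toOrder,   // original index -> reordered index
                         int Bcol,                          // column index in original ordering
                         unsigned dim, unsigned c)
{
    const int colIdx = toOrder[Bcol];   // translate to the reordered index
    return x[colIdx * dim + c];         // same access expression as in the kernel
}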
inline const char* ilu_decomp_s = R"(
@@ -475,19 +475,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
out.clear();

try {
cl::Program::Sources source(1, std::make_pair(axpy_s, strlen(axpy_s))); // cl::Program::Sources is a std::vector<std::pair<const char*, size_t> >; the '1' constructs it with a single element
source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s)));
source.emplace_back(std::make_pair(norm_s, strlen(norm_s)));
source.emplace_back(std::make_pair(custom_s, strlen(custom_s)));
source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s, strlen(stdwell_apply_no_reorder_s)));
source.emplace_back(std::make_pair(ilu_decomp_s, strlen(ilu_decomp_s)));
program = cl::Program(*context, source);
program.build(devices);

prec->setOpenCLContext(context.get());
prec->setOpenCLQueue(queue.get());
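A side note, not in the patch: when program.build(devices) fails, the cl::Error caught below only carries an error code. A sketch for also dumping the compiler output, assuming the standard C++ bindings used here, could look like this (drop-in variant of the existing catch logic):

// Sketch (assumption, not in the patch): surface the OpenCL build log on failure
try {
    program.build(devices);
} catch (const cl::Error& e) {
    std::ostringstream oss;
    oss << "OpenCL build error: " << e.what() << " (" << e.err() << ")\n";
    for (const cl::Device& d : devices) {
        // CL_PROGRAM_BUILD_LOG holds the per-device compiler diagnostics
        oss << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(d) << "\n";
    }
    OPM_THROW(std::logic_error, oss.str());
}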
@@ -519,6 +506,49 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
}

get_opencl_kernels();

prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), ilu_decomp_k.get());

} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
oss << getErrorString(error.err());
// rethrow exception
OPM_THROW(std::logic_error, oss.str());
} catch (const std::logic_error& error) {
// rethrow the exception thrown by OPM_THROW inside the try block; without this, a segfault occurs
throw error;
}

initialized = true;
} // end initialize()

template <unsigned int block_size>
void openclSolverBackend<block_size>::get_opencl_kernels() {

cl::Program::Sources source(1, std::make_pair(axpy_s, strlen(axpy_s))); // cl::Program::Sources is a std::vector<std::pair<const char*, size_t> >; the '1' constructs it with a single element
source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s)));
source.emplace_back(std::make_pair(norm_s, strlen(norm_s)));
source.emplace_back(std::make_pair(custom_s, strlen(custom_s)));
source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
#if CHOW_PATEL
bool full_matrix = false;
#else
bool full_matrix = true;
#endif
std::string ILU_apply1_s = get_ILU_apply1_string(full_matrix);
source.emplace_back(std::make_pair(ILU_apply1_s.c_str(), strlen(ILU_apply1_s.c_str())));
std::string ILU_apply2_s = get_ILU_apply2_string(full_matrix);
source.emplace_back(std::make_pair(ILU_apply2_s.c_str(), strlen(ILU_apply2_s.c_str())));
std::string stdwell_apply_s = get_stdwell_apply_string(true);
std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
source.emplace_back(std::make_pair(stdwell_apply_s.c_str(), strlen(stdwell_apply_s.c_str())));
source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s.c_str(), strlen(stdwell_apply_no_reorder_s.c_str())));
source.emplace_back(std::make_pair(ilu_decomp_s, strlen(ilu_decomp_s)));
cl::Program program = cl::Program(*context, source);
program.build(devices);

// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
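For readers unfamiliar with the cl::make_kernel functors mentioned in the comment above, a sketch of how such a functor is built and invoked follows. It uses the axpy kernel signature declared in the header below; the launch sizes and buffer/scalar arguments (total_work_items, work_group_size, d_x, alpha, d_y, N) are hypothetical placeholders, not names from this patch.

// Sketch (assumption, not in the patch) of invoking a cl::make_kernel functor
cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>
    axpy(cl::Kernel(program, "axpy"));
// the functor call enqueues the kernel with the given launch configuration
cl::Event event = axpy(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
                       d_x, alpha, d_y, N);
event.wait();   // explicit wait; as noted above, some drivers behave synchronously anyway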
@@ -540,22 +570,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply_no_reorder")));
ilu_decomp_k.reset(new cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const int, cl::LocalSpaceArg>(cl::Kernel(program, "ilu_decomp")));

prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), ilu_decomp_k.get());

} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
oss << getErrorString(error.err());
// rethrow exception
OPM_THROW(std::logic_error, oss.str());
} catch (const std::logic_error& error) {
// rethrow the exception thrown by OPM_THROW inside the try block; without this, a segfault occurs
throw error;
}

initialized = true;
} // end initialize()
} // end get_opencl_kernels()

template <unsigned int block_size>
void openclSolverBackend<block_size>::finalize() {

@@ -64,7 +64,6 @@ private:

// shared pointers are also passed to other objects
std::vector<cl::Device> devices;
cl::Program program;
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
@@ -149,6 +148,9 @@ private:
/// \param[in] cols array of columnIndices, contains nnz values
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);

/// Generate and compile OpenCL kernels
void get_opencl_kernels();

/// Clean memory
void finalize();