Generating opencl kernels

This commit is contained in:
Tong Dong Qiu 2021-02-24 15:39:40 +01:00
parent 141af23db5
commit 5b4deab7e4
3 changed files with 287 additions and 302 deletions

View File

@ -220,298 +220,266 @@ namespace bda
// ILU apply part 1: forward substitution
// solves L*x=y where L is a lower triangular sparse blocked matrix
inline const char* ILU_apply1_s = R"(
__kernel void ILU_apply1(
__global const double *Lvals,
__global const unsigned int *Lcols,
__global const unsigned int *Lrows,
__global const int *diagIndex,
__global const double *y,
__global double *x,
__global const unsigned int *nodesPerColorPrefix,
const unsigned int color,
const unsigned int block_size,
__local double *tmp)
{
const unsigned int warpsize = 32;
const unsigned int bs = block_size;
const unsigned int idx_t = get_local_id(0);
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int idx = get_global_id(0);
unsigned int target_block_row = idx / warpsize;
idx += nodesPerColorPrefix[color];
target_block_row += nodesPerColorPrefix[color];
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;
while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = Lrows[target_block_row];
const unsigned int last_block = diagIndex[target_block_row];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
local_out = y[target_block_row*bs+lane];
}
for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[Lcols[block]*bs + c];
const double A_elem = Lvals[block*bs*bs + c + r*bs];
local_out -= x_elem * A_elem;
}
}
// do reduction in shared mem
tmp[lane] = local_out;
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
std::string get_ILU_apply1_string(bool full_matrix) {
std::string s = R"(
__kernel void ILU_apply1(
__global const double *LUvals,
__global const unsigned int *LUcols,
__global const unsigned int *LUrows,
__global const int *diagIndex,
__global const double *y,
__global double *x,
__global const unsigned int *nodesPerColorPrefix,
const unsigned int color,
const unsigned int block_size,
__local double *tmp)
{
if (lane + offset < warpsize)
{
tmp[lane] += tmp[lane + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
const unsigned int warpsize = 32;
const unsigned int bs = block_size;
const unsigned int idx_t = get_local_id(0);
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int idx = get_global_id(0);
unsigned int target_block_row = idx / warpsize;
target_block_row += nodesPerColorPrefix[color];
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;
if(lane < bs){
const unsigned int row = target_block_row*bs + lane;
x[row] = tmp[lane];
}
target_block_row += num_warps_in_grid;
while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = LUrows[target_block_row];
)";
if (full_matrix) {
s += "const unsigned int last_block = diagIndex[target_block_row]; ";
} else {
s += "const unsigned int last_block = LUrows[target_block_row+1]; ";
}
s += R"(
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
local_out = y[target_block_row*bs+lane];
}
for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[LUcols[block]*bs + c];
const double A_elem = LUvals[block*bs*bs + c + r*bs];
local_out -= x_elem * A_elem;
}
}
// do reduction in shared mem
tmp[lane] = local_out;
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
{
if (lane + offset < warpsize)
{
tmp[lane] += tmp[lane + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lane < bs){
const unsigned int row = target_block_row*bs + lane;
x[row] = tmp[lane];
}
target_block_row += num_warps_in_grid;
}
}
)";
return s;
}
)";
// ILU apply part 2: backward substitution
// solves U*x=y where L is a lower triangular sparse blocked matrix
inline const char* ILU_apply2_s = R"(
__kernel void ILU_apply2(
__global const double *LUvals,
__global const int *LUcols,
__global const int *LUrows,
__global const int *diagIndex,
__global const double *invDiagVals,
__global double *x,
__global const unsigned int *nodesPerColorPrefix,
const unsigned int color,
const unsigned int block_size,
__local double *tmp)
{
const unsigned int warpsize = 32;
const unsigned int bs = block_size;
const unsigned int idx_t = get_local_id(0);
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int idx = get_global_id(0);
unsigned int target_block_row = idx / warpsize;
idx += nodesPerColorPrefix[color];
target_block_row += nodesPerColorPrefix[color];
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;
const double relaxation = 0.9;
while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = diagIndex[target_block_row] + 1;
const unsigned int last_block = LUrows[target_block_row+1];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
const unsigned int row = target_block_row*bs+lane;
local_out = x[row];
}
for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[LUcols[block]*bs + c];
const double A_elem = LUvals[block*bs*bs + c + r*bs];
local_out -= x_elem * A_elem;
}
}
// do reduction in shared mem
tmp[lane] = local_out;
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
std::string get_ILU_apply2_string(bool full_matrix) {
std::string s = R"(
__kernel void ILU_apply2(
__global const double *LUvals,
__global const int *LUcols,
__global const int *LUrows,
__global const int *diagIndex,
__global const double *invDiagVals,
__global double *x,
__global const unsigned int *nodesPerColorPrefix,
const unsigned int color,
const unsigned int block_size,
__local double *tmp)
{
if (lane + offset < warpsize)
{
tmp[lane] += tmp[lane + offset];
const unsigned int warpsize = 32;
const unsigned int bs = block_size;
const unsigned int idx_t = get_local_id(0);
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int idx = get_global_id(0);
unsigned int target_block_row = idx / warpsize;
target_block_row += nodesPerColorPrefix[color];
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;
const double relaxation = 0.9;
while(target_block_row < nodesPerColorPrefix[color+1]){
)";
if (full_matrix) {
s += "const unsigned int first_block = diagIndex[target_block_row] + 1; ";
} else {
s += "const unsigned int first_block = LUrows[target_block_row]; ";
}
s += R"(
const unsigned int last_block = LUrows[target_block_row+1];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
const unsigned int row = target_block_row*bs+lane;
local_out = x[row];
}
for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[LUcols[block]*bs + c];
const double A_elem = LUvals[block*bs*bs + c + r*bs];
local_out -= x_elem * A_elem;
}
}
// do reduction in shared mem
tmp[lane] = local_out;
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
{
if (lane + offset < warpsize)
{
tmp[lane] += tmp[lane + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
local_out = tmp[lane];
if(lane < bs){
tmp[lane + bs*idx_t/warpsize] = local_out;
double sum = 0.0;
for(int i = 0; i < bs; ++i){
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
}
const unsigned int row = target_block_row*bs + lane;
x[row] = relaxation * sum;
}
target_block_row += num_warps_in_grid;
}
}
)";
return s;
}
/// Generate string with the stdwell_apply kernels
/// If reorder is true, the B/Ccols do not correspond with the x/y vector
/// the x/y vector is reordered, use toOrder to address that
/// \param[in] reorder whether the matrix is reordered or not
std::string get_stdwell_apply_string(bool reorder) {
std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder";
std::string s = "__kernel void " + kernel_name + R"((
__global const double *Cnnzs,
__global const double *Dnnzs,
__global const double *Bnnzs,
__global const int *Ccols,
__global const int *Bcols,
__global const double *x,
__global double *y,
)";
if (reorder) {
s += R"(__global const int *toOrder,
)";
}
s += R"(const unsigned int dim,
const unsigned int dim_wells,
__global const unsigned int *val_pointers,
__local double *localSum,
__local double *z1,
__local double *z2){
int wgId = get_group_id(0);
int wiId = get_local_id(0);
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
int valsPerBlock = dim*dim_wells;
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
int c = wiId % dim;
int r = (wiId/dim) % dim_wells;
double temp;
barrier(CLK_LOCAL_MEM_FENCE);
}
local_out = tmp[lane];
if(lane < bs){
tmp[lane + bs*idx_t/warpsize] = local_out;
double sum = 0.0;
for(int i = 0; i < bs; ++i){
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
int b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
)";
if (reorder) {
s += "int colIdx = toOrder[Bcols[b]]; ";
} else {
s += "int colIdx = Bcols[b]; ";
}
s += R"(
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
}
if(wiId < valsPerBlock){
localSum[wiId] += localSum[wiId + valsPerBlock];
}
b = wiId/valsPerBlock + val_pointers[wgId];
if(c == 0 && wiId < valsPerBlock){
for(unsigned int stride = 2; stride > 0; stride >>= 1){
localSum[wiId] += localSum[wiId + stride];
}
z1[r] = localSum[wiId];
}
}
const unsigned int row = target_block_row*bs + lane;
x[row] = relaxation * sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
target_block_row += num_warps_in_grid;
}
}
)";
inline const char* stdwell_apply_s = R"(
__kernel void stdwell_apply(__global const double *Cnnzs,
__global const double *Dnnzs,
__global const double *Bnnzs,
__global const int *Ccols,
__global const int *Bcols,
__global const double *x,
__global double *y,
__global const int *toOrder,
const unsigned int dim,
const unsigned int dim_wells,
__global const unsigned int *val_pointers,
__local double *localSum,
__local double *z1,
__local double *z2){
int wgId = get_group_id(0);
int wiId = get_local_id(0);
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
int valsPerBlock = dim*dim_wells;
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
int c = wiId % dim;
int r = (wiId/dim) % dim_wells;
double temp;
barrier(CLK_LOCAL_MEM_FENCE);
localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
int b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
int colIdx = toOrder[Bcols[b]];
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
}
if(wiId < valsPerBlock){
localSum[wiId] += localSum[wiId + valsPerBlock];
}
b = wiId/valsPerBlock + val_pointers[wgId];
if(c == 0 && wiId < valsPerBlock){
for(unsigned int stride = 2; stride > 0; stride >>= 1){
localSum[wiId] += localSum[wiId + stride];
if(wiId < dim_wells){
temp = 0.0;
for(unsigned int i = 0; i < dim_wells; ++i){
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
}
z1[r] = localSum[wiId];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < dim*valSize){
temp = 0.0;
int bb = wiId/dim + val_pointers[wgId];
for (unsigned int j = 0; j < dim_wells; ++j){
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
)";
if (reorder) {
s += "int colIdx = toOrder[Ccols[bb]]; ";
} else {
s += "int colIdx = Ccols[bb]; ";
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < dim_wells){
temp = 0.0;
for(unsigned int i = 0; i < dim_wells; ++i){
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < dim*valSize){
temp = 0.0;
int bb = wiId/dim + val_pointers[wgId];
for (unsigned int j = 0; j < dim_wells; ++j){
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
int colIdx = toOrder[Ccols[bb]];
y[colIdx*dim + c] -= temp;
}
}
)";
inline const char* stdwell_apply_no_reorder_s = R"(
__kernel void stdwell_apply_no_reorder(__global const double *Cnnzs,
__global const double *Dnnzs,
__global const double *Bnnzs,
__global const int *Ccols,
__global const int *Bcols,
__global const double *x,
__global double *y,
const unsigned int dim,
const unsigned int dim_wells,
__global const unsigned int *val_pointers,
__local double *localSum,
__local double *z1,
__local double *z2){
int wgId = get_group_id(0);
int wiId = get_local_id(0);
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
int valsPerBlock = dim*dim_wells;
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
int c = wiId % dim;
int r = (wiId/dim) % dim_wells;
double temp;
barrier(CLK_LOCAL_MEM_FENCE);
localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
int b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
int colIdx = Bcols[b];
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
}
if(wiId < valsPerBlock){
localSum[wiId] += localSum[wiId + valsPerBlock];
}
b = wiId/valsPerBlock + val_pointers[wgId];
if(c == 0 && wiId < valsPerBlock){
for(unsigned int stride = 2; stride > 0; stride >>= 1){
localSum[wiId] += localSum[wiId + stride];
s += R"(
y[colIdx*dim + c] -= temp;
}
z1[r] = localSum[wiId];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < dim_wells){
temp = 0.0;
for(unsigned int i = 0; i < dim_wells; ++i){
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < dim*valSize){
temp = 0.0;
int bb = wiId/dim + val_pointers[wgId];
for (unsigned int j = 0; j < dim_wells; ++j){
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
int colIdx = Ccols[bb];
y[colIdx*dim + c] -= temp;
}
)";
return s;
}
)";
inline const char* ilu_decomp_s = R"(

View File

@ -475,19 +475,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
out.clear();
try {
cl::Program::Sources source(1, std::make_pair(axpy_s, strlen(axpy_s))); // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s)));
source.emplace_back(std::make_pair(norm_s, strlen(norm_s)));
source.emplace_back(std::make_pair(custom_s, strlen(custom_s)));
source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s, strlen(stdwell_apply_no_reorder_s)));
source.emplace_back(std::make_pair(ilu_decomp_s, strlen(ilu_decomp_s)));
program = cl::Program(*context, source);
program.build(devices);
prec->setOpenCLContext(context.get());
prec->setOpenCLQueue(queue.get());
@ -519,6 +506,49 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
}
get_opencl_kernels();
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), ilu_decomp_k.get());
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
oss << getErrorString(error.err());
// rethrow exception
OPM_THROW(std::logic_error, oss.str());
} catch (const std::logic_error& error) {
// rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs
throw error;
}
initialized = true;
} // end initialize()
template <unsigned int block_size>
void openclSolverBackend<block_size>::get_opencl_kernels() {
cl::Program::Sources source(1, std::make_pair(axpy_s, strlen(axpy_s))); // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s)));
source.emplace_back(std::make_pair(norm_s, strlen(norm_s)));
source.emplace_back(std::make_pair(custom_s, strlen(custom_s)));
source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
#if CHOW_PATEL
bool full_matrix = false;
#else
bool full_matrix = true;
#endif
std::string ILU_apply1_s = get_ILU_apply1_string(full_matrix);
source.emplace_back(std::make_pair(ILU_apply1_s.c_str(), strlen(ILU_apply1_s.c_str())));
std::string ILU_apply2_s = get_ILU_apply2_string(full_matrix);
source.emplace_back(std::make_pair(ILU_apply2_s.c_str(), strlen(ILU_apply2_s.c_str())));
std::string stdwell_apply_s = get_stdwell_apply_string(true);
std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
source.emplace_back(std::make_pair(stdwell_apply_s.c_str(), strlen(stdwell_apply_s.c_str())));
source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s.c_str(), strlen(stdwell_apply_no_reorder_s.c_str())));
source.emplace_back(std::make_pair(ilu_decomp_s, strlen(ilu_decomp_s)));
cl::Program program = cl::Program(*context, source);
program.build(devices);
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@ -540,22 +570,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply_no_reorder")));
ilu_decomp_k.reset(new cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const int, cl::LocalSpaceArg>(cl::Kernel(program, "ilu_decomp")));
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), ilu_decomp_k.get());
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
oss << getErrorString(error.err());
// rethrow exception
OPM_THROW(std::logic_error, oss.str());
} catch (const std::logic_error& error) {
// rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs
throw error;
}
initialized = true;
} // end initialize()
} // end get_opencl_kernels()
template <unsigned int block_size>
void openclSolverBackend<block_size>::finalize() {

View File

@ -64,7 +64,6 @@ private:
// shared pointers are also passed to other objects
std::vector<cl::Device> devices;
cl::Program program;
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
@ -149,6 +148,9 @@ private:
/// \param[in] cols array of columnIndices, contains nnz values
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
/// Generate and compile opencl kernels
void get_opencl_kernels();
/// Clean memory
void finalize();