Merge pull request #3754 from blattms/clean-opencl-kernels-rebased

Clean opencl kernels rebased (rebased #3749)
2025-02-25 18:55:30 -06:00 · 2021-12-22 15:52:40 +01:00
parent e970449667 89bc7d9ef0
commit 994260aaea
25 changed files with 1156 additions and 926 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -287,6 +287,10 @@ macro (prereqs_hook)
 endmacro (prereqs_hook)

 macro (sources_hook)
+  if(OPENCL_FOUND)
+    include(opencl-source-provider)
+    list(APPEND opm-simulators_SOURCES ${PROJECT_BINARY_DIR}/clSources.cpp)
+  endif()
 endmacro (sources_hook)

 macro (fortran_hook)
--- a/opencl-source-provider.cmake
+++ b/opencl-source-provider.cmake
@@ -0,0 +1,51 @@
+set(BDA_DIR opm/simulators/linalg/bda)
+set(KERNELS_DIR ${BDA_DIR}/opencl/kernels)
+
+option(DEBUG_OPENCL_KERNELS_INTEL "Run ocloc to check kernel (works only on Intel)" OFF)
+if(DEBUG_OPENCL_KERNELS_INTEL)
+  set(DEBUG_OPENCL_DIR ${KERNELS_DIR}/.debug)
+
+  execute_process(
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${DEBUG_OPENCL_DIR}
+    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+  )
+endif()
+
+set(CL_SRC_FILE ${PROJECT_BINARY_DIR}/clSources.cpp)
+file(WRITE ${CL_SRC_FILE} "// This file is auto-generated. Do not edit!\n\n")
+file(APPEND ${CL_SRC_FILE} "#include \"${BDA_DIR}/openclKernels.hpp\"\n\n")
+file(APPEND ${CL_SRC_FILE} "namespace Opm\{\n\n")
+file(APPEND ${CL_SRC_FILE} "namespace Accelerator\{\n\n")
+
+file(GLOB CL_LIST "${KERNELS_DIR}/*.cl")
+
+if(USE_CHOW_PATEL_ILU)
+  list(REMOVE_ITEM CL_LIST "${PROJECT_SOURCE_DIR}/${KERNELS_DIR}/ILU_apply1_fm.cl")
+  list(REMOVE_ITEM CL_LIST "${PROJECT_SOURCE_DIR}/${KERNELS_DIR}/ILU_apply2_fm.cl")
+else()
+  list(REMOVE_ITEM CL_LIST "${PROJECT_SOURCE_DIR}/${KERNELS_DIR}/ILU_apply1.cl")
+  list(REMOVE_ITEM CL_LIST "${PROJECT_SOURCE_DIR}/${KERNELS_DIR}/ILU_apply2.cl")
+endif()
+
+foreach(CL ${CL_LIST})
+  get_filename_component(FNAME ${CL} NAME_WE)
+
+  file(APPEND ${CL_SRC_FILE} "const std::string OpenclKernels::${FNAME}_str = R\"\( \n")
+  file(READ "${CL}" CL_CONTENT)
+  file(APPEND ${CL_SRC_FILE} "${CL_CONTENT}")
+  file(APPEND ${CL_SRC_FILE} "\)\"; \n\n")
+
+  if(DEBUG_OPENCL_KERNELS_INTEL)
+    execute_process(
+      COMMAND ocloc -file ${CL} -device kbl -out_dir ${DEBUG_OPENCL_DIR}
+      WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+      )
+  endif()
+endforeach()
+
+file(APPEND ${CL_SRC_FILE} "\}\n")
+file(APPEND ${CL_SRC_FILE} "\}")
+
+if(DEBUG_OPENCL_KERNELS_INTEL)
+  file(REMOVE_RECURSE ${DEBUG_DIR})
+endif()
--- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1.cl
@@ -0,0 +1,68 @@
+/// ILU apply part 1: forward substitution.
+/// Solves L*x=y where L is a lower triangular sparse blocked matrix.
+/// Here, L is it's own BSR matrix.
+__kernel void ILU_apply1(
+    __global const double *LUvals,
+    __global const unsigned int *LUcols,
+    __global const unsigned int *LUrows,
+    __global const int *diagIndex,
+    __global const double *y,
+    __global double *x,
+    __global const unsigned int *nodesPerColorPrefix,
+    const unsigned int color,
+    const unsigned int block_size,
+    __local double *tmp)
+{
+    const unsigned int warpsize = 32;
+    const unsigned int bs = block_size;
+    const unsigned int idx_t = get_local_id(0);
+    const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+    const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+    const unsigned int NUM_THREADS = get_global_size(0);
+    const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+    unsigned int idx = get_global_id(0);
+    unsigned int target_block_row = idx / warpsize;
+    const unsigned int lane = idx_t % warpsize;
+    const unsigned int c = (lane / bs) % bs;
+    const unsigned int r = lane % bs;
+
+    target_block_row += nodesPerColorPrefix[color];
+
+    while(target_block_row < nodesPerColorPrefix[color+1]){
+        const unsigned int first_block = LUrows[target_block_row];
+        const unsigned int last_block = LUrows[target_block_row+1];
+        unsigned int block = first_block + lane / (bs*bs);
+        double local_out = 0.0;
+
+        if(lane < num_active_threads){
+            if(lane < bs){
+                local_out = y[target_block_row*bs+lane];
+            }
+            for(; block < last_block; block += num_blocks_per_warp){
+                const double x_elem = x[LUcols[block]*bs + c];
+                const double A_elem = LUvals[block*bs*bs + c + r*bs];
+                local_out -= x_elem * A_elem;
+            }
+        }
+
+        // do reduction in shared mem
+        tmp[lane] = local_out;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+        {
+            if (lane + offset < warpsize)
+            {
+                tmp[lane] += tmp[lane + offset];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        if(lane < bs){
+            const unsigned int row = target_block_row*bs + lane;
+            x[row] = tmp[lane];
+        }
+
+        target_block_row += num_warps_in_grid;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl
@@ -0,0 +1,68 @@
+/// ILU apply part 1: forward substitution.
+/// Solves L*x=y where L is a lower triangular sparse blocked matrix.
+/// Here, L is inside a normal, square matrix.
+/// In this case, diagIndex indicates where the rows of L end.
+__kernel void ILU_apply1(
+    __global const double *LUvals,
+    __global const unsigned int *LUcols,
+    __global const unsigned int *LUrows,
+    __global const int *diagIndex,
+    __global const double *y,
+    __global double *x,
+    __global const unsigned int *nodesPerColorPrefix,
+    const unsigned int color,
+    const unsigned int block_size,
+    __local double *tmp)
+{
+    const unsigned int warpsize = 32;
+    const unsigned int bs = block_size;
+    const unsigned int idx_t = get_local_id(0);
+    const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+    const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+    const unsigned int NUM_THREADS = get_global_size(0);
+    const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+    unsigned int idx = get_global_id(0);
+    unsigned int target_block_row = idx / warpsize;
+    target_block_row += nodesPerColorPrefix[color];
+    const unsigned int lane = idx_t % warpsize;
+    const unsigned int c = (lane / bs) % bs;
+    const unsigned int r = lane % bs;
+
+    while(target_block_row < nodesPerColorPrefix[color+1]){
+        const unsigned int first_block = LUrows[target_block_row];
+        const unsigned int last_block = diagIndex[target_block_row];
+        unsigned int block = first_block + lane / (bs*bs);
+        double local_out = 0.0;
+
+        if(lane < num_active_threads){
+            if(lane < bs){
+                local_out = y[target_block_row*bs+lane];
+            }
+            for(; block < last_block; block += num_blocks_per_warp){
+                const double x_elem = x[LUcols[block]*bs + c];
+                const double A_elem = LUvals[block*bs*bs + c + r*bs];
+                local_out -= x_elem * A_elem;
+            }
+        }
+
+        // do reduction in shared mem
+        tmp[lane] = local_out;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+        {
+            if (lane + offset < warpsize)
+            {
+                tmp[lane] += tmp[lane + offset];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        if(lane < bs){
+            const unsigned int row = target_block_row*bs + lane;
+            x[row] = tmp[lane];
+        }
+
+        target_block_row += num_warps_in_grid;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2.cl
@@ -0,0 +1,75 @@
+/// ILU apply part 2: backward substitution.
+/// Solves U*x=y where U is an upper triangular sparse blocked matrix.
+/// Here, U is it's own BSR matrix.
+__kernel void ILU_apply2(
+    __global const double *LUvals,
+    __global const int *LUcols,
+    __global const int *LUrows,
+    __global const int *diagIndex,
+    __global const double *invDiagVals,
+    __global double *x,
+    __global const unsigned int *nodesPerColorPrefix,
+    const unsigned int color,
+    const unsigned int block_size,
+    __local double *tmp)
+{
+    const unsigned int warpsize = 32;
+    const unsigned int bs = block_size;
+    const unsigned int idx_t = get_local_id(0);
+    const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+    const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+    const unsigned int NUM_THREADS = get_global_size(0);
+    const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+    unsigned int idx_g = get_global_id(0);
+    unsigned int target_block_row = idx_g / warpsize;
+    target_block_row += nodesPerColorPrefix[color];
+    const unsigned int lane = idx_t % warpsize;
+    const unsigned int c = (lane / bs) % bs;
+    const unsigned int r = lane % bs;
+
+    while(target_block_row < nodesPerColorPrefix[color+1]){
+        const unsigned int first_block = LUrows[target_block_row];
+        const unsigned int last_block = LUrows[target_block_row+1];
+        unsigned int block = first_block + lane / (bs*bs);
+        double local_out = 0.0;
+
+        if(lane < num_active_threads){
+            if(lane < bs){
+                const unsigned int row = target_block_row*bs+lane;
+                local_out = x[row];
+            }
+            for(; block < last_block; block += num_blocks_per_warp){
+                const double x_elem = x[LUcols[block]*bs + c];
+                const double A_elem = LUvals[block*bs*bs + c + r*bs];
+                local_out -= x_elem * A_elem;
+            }
+        }
+
+        // do reduction in shared mem
+        tmp[lane] = local_out;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+        {
+            if (lane + offset < warpsize)
+            {
+                tmp[lane] += tmp[lane + offset];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+        local_out = tmp[lane];
+
+        if(lane < bs){
+            tmp[lane + bs*idx_t/warpsize] = local_out;
+            double sum = 0.0;
+            for(int i = 0; i < bs; ++i){
+                sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
+            }
+
+            const unsigned int row = target_block_row*bs + lane;
+            x[row] = sum;
+        }
+
+        target_block_row += num_warps_in_grid;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl
@@ -0,0 +1,77 @@
+/// ILU apply part 2: backward substitution.
+/// Solves U*x=y where U is an upper triangular sparse blocked matrix.
+/// Here, U is inside a normal, square matrix.
+/// In this case diagIndex indicates where the rows of U start.
+__kernel void ILU_apply2(
+    __global const double *LUvals,
+    __global const int *LUcols,
+    __global const int *LUrows,
+    __global const int *diagIndex,
+    __global const double *invDiagVals,
+    __global double *x,
+    __global const unsigned int *nodesPerColorPrefix,
+    const unsigned int color,
+    const unsigned int block_size,
+    __local double *tmp)
+{
+    const unsigned int warpsize = 32;
+    const unsigned int bs = block_size;
+    const unsigned int idx_t = get_local_id(0);
+    const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+    const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+    const unsigned int NUM_THREADS = get_global_size(0);
+    const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+    unsigned int idx_g = get_global_id(0);
+    unsigned int target_block_row = idx_g / warpsize;
+    const unsigned int lane = idx_t % warpsize;
+    const unsigned int c = (lane / bs) % bs;
+    const unsigned int r = lane % bs;
+
+    target_block_row += nodesPerColorPrefix[color];
+
+    while(target_block_row < nodesPerColorPrefix[color+1]){
+        const unsigned int first_block = diagIndex[target_block_row] + 1;
+        const unsigned int last_block = LUrows[target_block_row+1];
+        unsigned int block = first_block + lane / (bs*bs);
+        double local_out = 0.0;
+
+        if(lane < num_active_threads){
+            if(lane < bs){
+                const unsigned int row = target_block_row*bs+lane;
+                local_out = x[row];
+            }
+            for(; block < last_block; block += num_blocks_per_warp){
+                const double x_elem = x[LUcols[block]*bs + c];
+                const double A_elem = LUvals[block*bs*bs + c + r*bs];
+                local_out -= x_elem * A_elem;
+            }
+        }
+
+        // do reduction in shared mem
+        tmp[lane] = local_out;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+        {
+            if (lane + offset < warpsize)
+            {
+                tmp[lane] += tmp[lane + offset];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+        local_out = tmp[lane];
+
+        if(lane < bs){
+            tmp[lane + bs*idx_t/warpsize] = local_out;
+            double sum = 0.0;
+            for(int i = 0; i < bs; ++i){
+                sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
+            }
+
+            const unsigned int row = target_block_row*bs + lane;
+            x[row] = sum;
+        }
+
+        target_block_row += num_warps_in_grid;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/ILU_decomp.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_decomp.cl
@@ -0,0 +1,137 @@
+// a = a - (b * c)
+__kernel void block_mult_sub(__global double *a, __local double *b, __global double *c)
+{
+    const unsigned int block_size = 3;
+    const unsigned int hwarp_size = 16;
+    const unsigned int idx_t = get_local_id(0);                   // thread id in work group
+    const unsigned int thread_id_in_hwarp = idx_t % hwarp_size;   // thread id in warp (16 threads)
+    if(thread_id_in_hwarp < block_size * block_size){
+        const unsigned int row = thread_id_in_hwarp / block_size;
+        const unsigned int col = thread_id_in_hwarp % block_size;
+        double temp = 0.0;
+        for (unsigned int k = 0; k < block_size; k++) {
+            temp += b[block_size * row + k] * c[block_size * k + col];
+        }
+        a[block_size * row + col] -= temp;
+    }
+}
+
+// c = a * b
+__kernel void block_mult(__global double *a, __global double *b, __local double *c)
+{
+    const unsigned int block_size = 3;
+    const unsigned int hwarp_size = 16;
+    const unsigned int idx_t = get_local_id(0);                   // thread id in work group
+    const unsigned int thread_id_in_hwarp = idx_t % hwarp_size;   // thread id in warp (16 threads)
+    if(thread_id_in_hwarp < block_size * block_size){
+        const unsigned int row = thread_id_in_hwarp / block_size;
+        const unsigned int col = thread_id_in_hwarp % block_size;
+        double temp = 0.0;
+        for (unsigned int k = 0; k < block_size; k++) {
+            temp += a[block_size * row + k] * b[block_size * k + col];
+        }
+        c[block_size * row + col] = temp;
+    }
+}
+
+// invert 3x3 matrix
+__kernel void inverter(__global double *matrix, __global double *inverse)
+{
+    const unsigned int block_size = 3;
+    const unsigned int bs = block_size;                           // rename to shorter name
+    const unsigned int hwarp_size = 16;
+    const unsigned int idx_t = get_local_id(0);                   // thread id in work group
+    const unsigned int thread_id_in_hwarp = idx_t % hwarp_size;   // thread id in warp (16 threads)
+    if(thread_id_in_hwarp < bs * bs){
+        double t4  = matrix[0] * matrix[4];
+        double t6  = matrix[0] * matrix[5];
+        double t8  = matrix[1] * matrix[3];
+        double t10 = matrix[2] * matrix[3];
+        double t12 = matrix[1] * matrix[6];
+        double t14 = matrix[2] * matrix[6];
+
+        double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
+                        t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
+        double t17 = 1.0 / det;
+
+        const unsigned int r = thread_id_in_hwarp / bs;
+        const unsigned int c = thread_id_in_hwarp % bs;
+        const unsigned int r1 = (r+1) % bs;
+        const unsigned int c1 = (c+1) % bs;
+        const unsigned int r2 = (r+bs-1) % bs;
+        const unsigned int c2 = (c+bs-1) % bs;
+        inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17;
+    }
+}
+
+/// Exact ilu decomposition kernel
+/// The kernel takes a full BSR matrix and performs inplace ILU decomposition
+__kernel void ilu_decomp(const unsigned int firstRow,
+                         const unsigned int lastRow,
+                         __global double *LUvals,
+                         __global const int *LUcols,
+                         __global const int *LUrows,
+                         __global double *invDiagVals,
+                         __global int *diagIndex,
+                         const unsigned int Nb,
+                         __local double *pivot)
+{
+    const unsigned int bs = 3;
+    const unsigned int hwarp_size = 16;
+    const unsigned int work_group_size = get_local_size(0);
+    const unsigned int work_group_id = get_group_id(0);
+    const unsigned int num_groups = get_num_groups(0);
+    const unsigned int hwarps_per_group = work_group_size / hwarp_size;
+    const unsigned int thread_id_in_group = get_local_id(0);      // thread id in work group
+    const unsigned int thread_id_in_hwarp = thread_id_in_group % hwarp_size;     // thread id in hwarp (16 threads)
+    const unsigned int hwarp_id_in_group = thread_id_in_group / hwarp_size;
+    const unsigned int lmem_offset = hwarp_id_in_group * bs * bs;  // each workgroup gets some lmem, but the workitems have to share it
+                                                                    // every workitem in a hwarp has the same lmem_offset
+
+    // go through all rows
+    for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group)
+    {
+        int iRowStart = LUrows[i];
+        int iRowEnd = LUrows[i + 1];
+
+        // go through all elements of the row
+        for (int ij = iRowStart; ij < iRowEnd; ij++) {
+            int j = LUcols[ij];
+
+            if (j < i) {
+                // calculate the pivot of this row
+                block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset);
+
+                // copy pivot
+                if (thread_id_in_hwarp < bs * bs) {
+                    LUvals[ij * bs * bs + thread_id_in_hwarp] = pivot[lmem_offset + thread_id_in_hwarp];
+                }
+
+                int jRowEnd = LUrows[j + 1];
+                int jk = diagIndex[j] + 1;
+                int ik = ij + 1;
+                // subtract that row scaled by the pivot from this row.
+                while (ik < iRowEnd && jk < jRowEnd) {
+                    if (LUcols[ik] == LUcols[jk]) {
+                        block_mult_sub(LUvals + ik * bs * bs, pivot + lmem_offset, LUvals + jk * bs * bs);
+                        ik++;
+                        jk++;
+                    } else {
+                        if (LUcols[ik] < LUcols[jk])
+                        { ik++; }
+                        else
+                        { jk++; }
+                    }
+                }
+            }
+        }
+
+        // store the inverse in the diagonal
+        inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs);
+
+        // copy inverse
+        if (thread_id_in_hwarp < bs * bs) {
+            LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp];
+        }
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/add_coarse_pressure_correction.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/add_coarse_pressure_correction.cl
@@ -0,0 +1,17 @@
+/// add the coarse pressure solution back to the finer, complete solution
+/// every workitem handles one blockrow
+__kernel void add_coarse_pressure_correction(
+    __global const double *coarse_x,
+    __global double *fine_x,
+    const unsigned int pressure_idx,
+    const unsigned int Nb)
+{
+    const unsigned int NUM_THREADS = get_global_size(0);
+    const unsigned int block_size = 3;
+    unsigned int target_block_row = get_global_id(0);
+
+    while(target_block_row < Nb){
+        fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row];
+        target_block_row += NUM_THREADS;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/axpy.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/axpy.cl
@@ -0,0 +1,15 @@
+/// axpy kernel: a = a + alpha * b
+__kernel void axpy(
+    __global double *in,
+    const double a,
+    __global double *out,
+    const int N)
+{
+    unsigned int NUM_THREADS = get_global_size(0);
+    int idx = get_global_id(0);
+
+    while(idx < N){
+        out[idx] += a * in[idx];
+        idx += NUM_THREADS;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/custom.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/custom.cl
@@ -0,0 +1,22 @@
+/// Custom kernel: combines some bicgstab vector operations into 1
+/// p = (p - omega * v) * beta + r
+__kernel void custom(
+    __global double *p,
+    __global double *v,
+    __global double *r,
+    const double omega,
+    const double beta,
+    const int N)
+{
+    const unsigned int NUM_THREADS = get_global_size(0);
+    unsigned int idx = get_global_id(0);
+
+    while(idx < N){
+        double res = p[idx];
+        res -= omega * v[idx];
+        res *= beta;
+        res += r[idx];
+        p[idx] = res;
+        idx += NUM_THREADS;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/dot_1.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/dot_1.cl
@@ -0,0 +1,37 @@
+/// returns partial sums, instead of the final dot product
+/// partial sums are added on CPU
+__kernel void dot_1(
+    __global double *in1,
+    __global double *in2,
+    __global double *out,
+    const unsigned int N,
+    __local double *tmp)
+{
+    unsigned int tid = get_local_id(0);
+    unsigned int bsize = get_local_size(0);
+    unsigned int bid = get_global_id(0) / bsize;
+    unsigned int i = get_global_id(0);
+    unsigned int NUM_THREADS = get_global_size(0);
+
+    double sum = 0.0;
+    while(i < N){
+        sum += in1[i] * in2[i];
+        i += NUM_THREADS;
+    }
+    tmp[tid] = sum;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
+    {
+        if (tid < s)
+        {
+            tmp[tid] += tmp[tid + s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // write result for this block to global mem
+    if (tid == 0) out[get_group_id(0)] = tmp[0];
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/full_to_pressure_restriction.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/full_to_pressure_restriction.cl
@@ -0,0 +1,22 @@
+/// transform blocked vector to scalar vector using pressure-weights
+/// every workitem handles one blockrow
+__kernel void full_to_pressure_restriction(
+    __global const double *fine_y,
+    __global const double *weights,
+    __global double *coarse_y,
+    const unsigned int Nb)
+{
+    const unsigned int NUM_THREADS = get_global_size(0);
+    const unsigned int block_size = 3;
+    unsigned int target_block_row = get_global_id(0);
+
+    while(target_block_row < Nb){
+        double sum = 0.0;
+        unsigned int idx = block_size * target_block_row;
+        for (unsigned int i = 0; i < block_size; ++i) {
+            sum += fine_y[idx + i] * weights[idx + i];
+        }
+        coarse_y[target_block_row] = sum;
+        target_block_row += NUM_THREADS;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/norm.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/norm.cl
@@ -0,0 +1,36 @@
+/// returns partial sums, instead of the final norm
+/// the square root must be computed on CPU
+__kernel void norm(
+    __global double *in,
+    __global double *out,
+    const unsigned int N,
+    __local double *tmp)
+{
+    unsigned int tid = get_local_id(0);
+    unsigned int bsize = get_local_size(0);
+    unsigned int bid = get_global_id(0) / bsize;
+    unsigned int i = get_global_id(0);
+    unsigned int NUM_THREADS = get_global_size(0);
+
+    double local_sum = 0.0;
+    while(i < N){
+        local_sum += in[i] * in[i];
+        i += NUM_THREADS;
+    }
+    tmp[tid] = local_sum;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
+    {
+        if (tid < s)
+        {
+            tmp[tid] += tmp[tid + s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // write result for this block to global mem
+    if (tid == 0) out[get_group_id(0)] = tmp[0];
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/prolongate_vector.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/prolongate_vector.cl
@@ -0,0 +1,16 @@
+/// prolongate vector during amg cycle
+/// every workitem handles one row
+__kernel void prolongate_vector(
+    __global const double *in,
+    __global double *out,
+    __global const int *cols,
+    const unsigned int N)
+{
+    const unsigned int NUM_THREADS = get_global_size(0);
+    unsigned int row = get_global_id(0);
+
+    while(row < N){
+        out[row] += in[cols[row]];
+        row += NUM_THREADS;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/residual.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/residual.cl
@@ -0,0 +1,50 @@
+/// res = rhs - mat * x
+/// algorithm based on:
+/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
+/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
+__kernel void residual(
+    __global const double *vals,
+    __global const int *cols,
+    __global const int *rows,
+    const int N,
+    __global const double *x,
+    __global const double *rhs,
+    __global double *out,
+    __local double *tmp)
+{
+    const unsigned int bsize = get_local_size(0);
+    const unsigned int idx_b = get_global_id(0) / bsize;
+    const unsigned int idx_t = get_local_id(0);
+    const unsigned int num_workgroups = get_num_groups(0);
+
+    int row = idx_b;
+
+    while (row < N) {
+        int rowStart = rows[row];
+        int rowEnd = rows[row+1];
+        int rowLength = rowEnd - rowStart;
+        double local_sum = 0.0;
+        for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
+            int col = cols[j];
+            local_sum += vals[j] * x[col];
+        }
+
+        tmp[idx_t] = local_sum;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        int offset = bsize / 2;
+        while(offset > 0) {
+            if (idx_t < offset) {
+                tmp[idx_t] += tmp[idx_t + offset];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+            offset = offset / 2;
+        }
+
+        if (idx_t == 0) {
+            out[row] = rhs[row] - tmp[idx_t];
+        }
+
+        row += num_workgroups;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/residual_blocked.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/residual_blocked.cl
@@ -0,0 +1,68 @@
+/// res = rhs - mat * x
+/// algorithm based on:
+/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
+/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
+__kernel void residual_blocked(
+    __global const double *vals,
+    __global const int *cols,
+    __global const int *rows,
+    const int Nb,
+    __global const double *x,
+    __global const double *rhs,
+    __global double *out,
+    const unsigned int block_size,
+    __local double *tmp)
+{
+    const unsigned int warpsize = 32;
+    const unsigned int bsize = get_local_size(0);
+    const unsigned int idx_b = get_global_id(0) / bsize;
+    const unsigned int idx_t = get_local_id(0);
+    unsigned int idx = idx_b * bsize + idx_t;
+    const unsigned int bs = block_size;
+    const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+    const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+    const unsigned int NUM_THREADS = get_global_size(0);
+    const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+    unsigned int target_block_row = idx / warpsize;
+    const unsigned int lane = idx_t % warpsize;
+    const unsigned int c = (lane / bs) % bs;
+    const unsigned int r = lane % bs;
+
+    // for 3x3 blocks:
+    // num_active_threads: 27
+    // num_blocks_per_warp: 3
+
+    while(target_block_row < Nb){
+        unsigned int first_block = rows[target_block_row];
+        unsigned int last_block = rows[target_block_row+1];
+        unsigned int block = first_block + lane / (bs*bs);
+        double local_out = 0.0;
+
+        if(lane < num_active_threads){
+            for(; block < last_block; block += num_blocks_per_warp){
+                double x_elem = x[cols[block]*bs + c];
+                double A_elem = vals[block*bs*bs + c + r*bs];
+                local_out += x_elem * A_elem;
+            }
+        }
+
+        // do reduction in shared mem
+        tmp[lane] = local_out;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+        {
+            if (lane + offset < warpsize)
+            {
+                tmp[lane] += tmp[lane + offset];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        if(lane < bs){
+            unsigned int row = target_block_row*bs + lane;
+            out[row] = rhs[row] - tmp[lane];
+        }
+        target_block_row += num_warps_in_grid;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/scale.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/scale.cl
@@ -0,0 +1,14 @@
+/// scale vector with scalar: a = a * alpha
+__kernel void scale(
+    __global double *vec,
+    const double a,
+    const int N)
+{
+    unsigned int NUM_THREADS = get_global_size(0);
+    int idx = get_global_id(0);
+
+    while(idx < N){
+        vec[idx] *= a;
+        idx += NUM_THREADS;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/spmv.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/spmv.cl
@@ -0,0 +1,49 @@
+/// b = mat * x
+/// algorithm based on:
+/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
+/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
+__kernel void spmv(
+    __global const double *vals,
+    __global const int *cols,
+    __global const int *rows,
+    const int N,
+    __global const double *x,
+    __global double *out,
+    __local double *tmp)
+{
+    const unsigned int bsize = get_local_size(0);
+    const unsigned int idx_b = get_global_id(0) / bsize;
+    const unsigned int idx_t = get_local_id(0);
+    const unsigned int num_workgroups = get_num_groups(0);
+
+    int row = idx_b;
+
+    while (row < N) {
+        int rowStart = rows[row];
+        int rowEnd = rows[row+1];
+        int rowLength = rowEnd - rowStart;
+        double local_sum = 0.0;
+        for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
+            int col = cols[j];
+            local_sum += vals[j] * x[col];
+        }
+
+        tmp[idx_t] = local_sum;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        int offset = bsize / 2;
+        while(offset > 0) {
+            if (idx_t < offset) {
+                tmp[idx_t] += tmp[idx_t + offset];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+            offset = offset / 2;
+        }
+
+        if (idx_t == 0) {
+            out[row] = tmp[idx_t];
+        }
+
+        row += num_workgroups;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/spmv_blocked.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/spmv_blocked.cl
@@ -0,0 +1,67 @@
+/// b = mat * x
+/// algorithm based on:
+/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
+/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
+__kernel void spmv_blocked(
+    __global const double *vals,
+    __global const int *cols,
+    __global const int *rows,
+    const int Nb,
+    __global const double *x,
+    __global double *out,
+    const unsigned int block_size,
+    __local double *tmp)
+{
+    const unsigned int warpsize = 32;
+    const unsigned int bsize = get_local_size(0);
+    const unsigned int idx_b = get_global_id(0) / bsize;
+    const unsigned int idx_t = get_local_id(0);
+    unsigned int idx = idx_b * bsize + idx_t;
+    const unsigned int bs = block_size;
+    const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+    const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+    const unsigned int NUM_THREADS = get_global_size(0);
+    const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+    unsigned int target_block_row = idx / warpsize;
+    const unsigned int lane = idx_t % warpsize;
+    const unsigned int c = (lane / bs) % bs;
+    const unsigned int r = lane % bs;
+
+    // for 3x3 blocks:
+    // num_active_threads: 27
+    // num_blocks_per_warp: 3
+
+    while(target_block_row < Nb){
+        unsigned int first_block = rows[target_block_row];
+        unsigned int last_block = rows[target_block_row+1];
+        unsigned int block = first_block + lane / (bs*bs);
+        double local_out = 0.0;
+
+        if(lane < num_active_threads){
+            for(; block < last_block; block += num_blocks_per_warp){
+                double x_elem = x[cols[block]*bs + c];
+                double A_elem = vals[block*bs*bs + c + r*bs];
+                local_out += x_elem * A_elem;
+            }
+        }
+
+        // do reduction in shared mem
+        tmp[lane] = local_out;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+        {
+            if (lane + offset < warpsize)
+            {
+                tmp[lane] += tmp[lane + offset];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        if(lane < bs){
+            unsigned int row = target_block_row*bs + lane;
+            out[row] = tmp[lane];
+        }
+        target_block_row += num_warps_in_grid;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/spmv_noreset.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/spmv_noreset.cl
@@ -0,0 +1,48 @@
+/// algorithm based on:
+/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
+/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
+__kernel void spmv_noreset(
+    __global const double *vals,
+    __global const int *cols,
+    __global const int *rows,
+    const int N,
+    __global const double *x,
+    __global double *out,
+    __local double *tmp)
+{
+    const unsigned int bsize = get_local_size(0);
+    const unsigned int idx_b = get_global_id(0) / bsize;
+    const unsigned int idx_t = get_local_id(0);
+    const unsigned int num_workgroups = get_num_groups(0);
+
+    int row = idx_b;
+
+    while (row < N) {
+        int rowStart = rows[row];
+        int rowEnd = rows[row+1];
+        int rowLength = rowEnd - rowStart;
+        double local_sum = 0.0;
+        for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
+            int col = cols[j];
+            local_sum += vals[j] * x[col];
+        }
+
+        tmp[idx_t] = local_sum;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        int offset = bsize / 2;
+        while(offset > 0) {
+            if (idx_t < offset) {
+                tmp[idx_t] += tmp[idx_t + offset];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+            offset = offset / 2;
+        }
+
+        if (idx_t == 0) {
+            out[row] += tmp[idx_t];
+        }
+
+        row += num_workgroups;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/stdwell_apply.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/stdwell_apply.cl
@@ -0,0 +1,76 @@
+/// In this kernel there is reordering: the B/Ccols do not correspond with the x/y vector
+/// the x/y vector is reordered, using toOrder to address that
+__kernel void stdwell_apply(
+            __global const double *Cnnzs,
+            __global const double *Dnnzs,
+            __global const double *Bnnzs,
+            __global const int *Ccols,
+            __global const int *Bcols,
+            __global const double *x,
+            __global double *y,
+            __global const int *toOrder,
+            const unsigned int dim,
+            const unsigned int dim_wells,
+            __global const unsigned int *val_pointers,
+            __local double *localSum,
+            __local double *z1,
+            __local double *z2)
+{
+    int wgId = get_group_id(0);
+    int wiId = get_local_id(0);
+    int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
+    int valsPerBlock = dim*dim_wells;
+    int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
+    int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
+    int c = wiId % dim;
+    int r = (wiId/dim) % dim_wells;
+    double temp;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    localSum[wiId] = 0;
+    if(wiId < numActiveWorkItems){
+        int b = wiId/valsPerBlock + val_pointers[wgId];
+        while(b < valSize + val_pointers[wgId]){
+            int colIdx = toOrder[Bcols[b]];
+            localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
+            b += numBlocksPerWarp;
+        }
+
+        if(wiId < valsPerBlock){
+            localSum[wiId] += localSum[wiId + valsPerBlock];
+        }
+
+        b = wiId/valsPerBlock + val_pointers[wgId];
+
+        if(c == 0 && wiId < valsPerBlock){
+            for(unsigned int stride = 2; stride > 0; stride >>= 1){
+                localSum[wiId] += localSum[wiId + stride];
+            }
+            z1[r] = localSum[wiId];
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(wiId < dim_wells){
+        temp = 0.0;
+        for(unsigned int i = 0; i < dim_wells; ++i){
+            temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
+        }
+        z2[wiId] = temp;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(wiId < dim*valSize){
+        temp = 0.0;
+        int bb = wiId/dim + val_pointers[wgId];
+        for (unsigned int j = 0; j < dim_wells; ++j){
+            temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
+        }
+
+        int colIdx = toOrder[Ccols[bb]];
+        y[colIdx*dim + c] -= temp;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/stdwell_apply_no_reorder.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/stdwell_apply_no_reorder.cl
@@ -0,0 +1,74 @@
+/// Applies sdtwells without reordering
+__kernel void stdwell_apply_no_reorder(
+            __global const double *Cnnzs,
+            __global const double *Dnnzs,
+            __global const double *Bnnzs,
+            __global const int *Ccols,
+            __global const int *Bcols,
+            __global const double *x,
+            __global double *y,
+            const unsigned int dim,
+            const unsigned int dim_wells,
+            __global const unsigned int *val_pointers,
+            __local double *localSum,
+            __local double *z1,
+            __local double *z2)
+{
+    int wgId = get_group_id(0);
+    int wiId = get_local_id(0);
+    int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
+    int valsPerBlock = dim*dim_wells;
+    int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
+    int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
+    int c = wiId % dim;
+    int r = (wiId/dim) % dim_wells;
+    double temp;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    localSum[wiId] = 0;
+    if(wiId < numActiveWorkItems){
+        int b = wiId/valsPerBlock + val_pointers[wgId];
+        while(b < valSize + val_pointers[wgId]){
+            int colIdx = Bcols[b];
+            localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
+            b += numBlocksPerWarp;
+        }
+
+        if(wiId < valsPerBlock){
+            localSum[wiId] += localSum[wiId + valsPerBlock];
+        }
+
+        b = wiId/valsPerBlock + val_pointers[wgId];
+
+        if(c == 0 && wiId < valsPerBlock){
+            for(unsigned int stride = 2; stride > 0; stride >>= 1){
+                localSum[wiId] += localSum[wiId + stride];
+            }
+            z1[r] = localSum[wiId];
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(wiId < dim_wells){
+        temp = 0.0;
+        for(unsigned int i = 0; i < dim_wells; ++i){
+            temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
+        }
+        z2[wiId] = temp;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(wiId < dim*valSize){
+        temp = 0.0;
+        int bb = wiId/dim + val_pointers[wgId];
+        for (unsigned int j = 0; j < dim_wells; ++j){
+            temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
+        }
+
+        int colIdx = Ccols[bb];
+        y[colIdx*dim + c] -= temp;
+    }
+}
--- a/opm/simulators/linalg/bda/opencl/kernels/vmul.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/vmul.cl
@@ -0,0 +1,17 @@
+/// multiply vector with another vector and a scalar, element-wise
+/// add result to a third vector
+__kernel void vmul(
+    const double alpha,
+    __global double const *in1,
+    __global double const *in2,
+    __global double *out,
+    const int N)
+{
+    unsigned int NUM_THREADS = get_global_size(0);
+    int idx = get_global_id(0);
+
+    while(idx < N){
+        out[idx] += alpha * in1[idx] * in2[idx];
+        idx += NUM_THREADS;
+    }
+}
--- a/opm/simulators/linalg/bda/openclKernels.cpp
+++ b/opm/simulators/linalg/bda/openclKernels.cpp
@@ -69,12 +69,8 @@ unsigned int ceilDivision(const unsigned int A, const unsigned int B)
    return A / B + (A % B > 0);
 }

-void add_kernel_source(cl::Program::Sources &sources, const std::string &source) {
-    sources.emplace_back(source);
-}
-
-void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector<cl::Device>& devices, int verbosity_) {
-
+void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector<cl::Device>& devices, int verbosity_)
+{
    if (initialized) {
        OpmLog::debug("Warning OpenclKernels is already initialized");
        return;
@@ -84,49 +80,30 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
    verbosity = verbosity_;

    cl::Program::Sources sources;
-    const std::string& axpy_s = get_axpy_source();
-    add_kernel_source(sources, axpy_s);
-    const std::string& scale_s = get_scale_source();
-    add_kernel_source(sources, scale_s);
-    const std::string& vmul_s = get_vmul_source();
-    add_kernel_source(sources, vmul_s);
-    const std::string& dot_1_s = get_dot_1_source();
-    add_kernel_source(sources, dot_1_s);
-    const std::string& norm_s = get_norm_source();
-    add_kernel_source(sources, norm_s);
-    const std::string& custom_s = get_custom_source();
-    add_kernel_source(sources, custom_s);
-    const std::string& full_to_pressure_restriction_s = get_full_to_pressure_restriction_source();
-    add_kernel_source(sources, full_to_pressure_restriction_s);
-    const std::string& add_coarse_pressure_correction_s = get_add_coarse_pressure_correction_source();
-    add_kernel_source(sources, add_coarse_pressure_correction_s);
-    const std::string& prolongate_vector_s = get_prolongate_vector_source();
-    add_kernel_source(sources, prolongate_vector_s);
-    const std::string& spmv_blocked_s = get_blocked_matrix_operation_source(matrix_operation::spmv_op);
-    add_kernel_source(sources, spmv_blocked_s);
-    const std::string& spmv_s = get_matrix_operation_source(matrix_operation::spmv_op, true);
-    add_kernel_source(sources, spmv_s);
-    const std::string& spmv_noreset_s = get_matrix_operation_source(matrix_operation::spmv_op, false);
-    add_kernel_source(sources, spmv_noreset_s);
-    const std::string& residual_blocked_s = get_blocked_matrix_operation_source(matrix_operation::residual_op);
-    add_kernel_source(sources, residual_blocked_s);
-    const std::string& residual_s = get_matrix_operation_source(matrix_operation::residual_op);
-    add_kernel_source(sources, residual_s);
+    sources.emplace_back(axpy_str);
+    sources.emplace_back(scale_str);
+    sources.emplace_back(vmul_str);
+    sources.emplace_back(dot_1_str);
+    sources.emplace_back(norm_str);
+    sources.emplace_back(custom_str);
+    sources.emplace_back(full_to_pressure_restriction_str);
+    sources.emplace_back(add_coarse_pressure_correction_str);
+    sources.emplace_back(prolongate_vector_str);
+    sources.emplace_back(spmv_blocked_str);
+    sources.emplace_back(spmv_str);
+    sources.emplace_back(spmv_noreset_str);
+    sources.emplace_back(residual_blocked_str);
+    sources.emplace_back(residual_str);
 #if CHOW_PATEL
-    bool ilu_operate_on_full_matrix = false;
+    sources.emplace_back(ILU_apply1_str);
+    sources.emplace_back(ILU_apply2_str);
 #else
-    bool ilu_operate_on_full_matrix = true;
+    sources.emplace_back(ILU_apply1_fm_str);
+    sources.emplace_back(ILU_apply2_fm_str);
 #endif
-    const std::string& ILU_apply1_s = get_ILU_apply1_source(ilu_operate_on_full_matrix);
-    add_kernel_source(sources, ILU_apply1_s);
-    const std::string& ILU_apply2_s = get_ILU_apply2_source(ilu_operate_on_full_matrix);
-    add_kernel_source(sources, ILU_apply2_s);
-    const std::string& stdwell_apply_s = get_stdwell_apply_source(true);
-    add_kernel_source(sources, stdwell_apply_s);
-    const std::string& stdwell_apply_no_reorder_s = get_stdwell_apply_source(false);
-    add_kernel_source(sources, stdwell_apply_no_reorder_s);
-    const std::string& ilu_decomp_s = get_ilu_decomp_source();
-    add_kernel_source(sources, ilu_decomp_s);
+    sources.emplace_back(stdwell_apply_str);
+    sources.emplace_back(stdwell_apply_no_reorder_str);
+    sources.emplace_back(ILU_decomp_str);

    cl::Program program = cl::Program(*context, sources);
    program.build(devices);
@@ -158,9 +135,6 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
    initialized = true;
 } // end get_opencl_kernels()

-
-
-
 double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
 {
    const unsigned int work_group_size = 256;
@@ -387,7 +361,6 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row
    }
 }

-
 void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
 {
    const unsigned int work_group_size = 32;
@@ -406,7 +379,6 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& r
    }
 }

-
 void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
 {
    const unsigned int work_group_size = 32;
@@ -488,808 +460,5 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
    }
 }

-
-    std::string OpenclKernels::get_axpy_source() {
-        return R"(
-        __kernel void axpy(
-            __global double *in,
-            const double a,
-            __global double *out,
-            const int N)
-        {
-            unsigned int NUM_THREADS = get_global_size(0);
-            int idx = get_global_id(0);
-
-            while(idx < N){
-                out[idx] += a * in[idx];
-                idx += NUM_THREADS;
-            }
-        }
-        )";
-    }
-
-
-    // scale vector with scalar
-    std::string OpenclKernels::get_scale_source() {
-        return R"(
-        __kernel void scale(
-            __global double *vec,
-            const double a,
-            const int N)
-        {
-            unsigned int NUM_THREADS = get_global_size(0);
-            int idx = get_global_id(0);
-
-            while(idx < N){
-                vec[idx] *= a;
-                idx += NUM_THREADS;
-            }
-        }
-        )";
-    }
-
-    // multiply vector with another vector and a scalar, element-wise
-    // add result to a third vector
-    std::string OpenclKernels::get_vmul_source() {
-        return R"(
-        __kernel void vmul(
-            const double alpha,
-            __global double const *in1,
-            __global double const *in2,
-            __global double *out,
-            const int N)
-        {
-            unsigned int NUM_THREADS = get_global_size(0);
-            int idx = get_global_id(0);
-
-            while(idx < N){
-                out[idx] += alpha * in1[idx] * in2[idx];
-                idx += NUM_THREADS;
-            }
-        }
-        )";
-    }
-
-
-    // returns partial sums, instead of the final dot product
-    std::string OpenclKernels::get_dot_1_source() {
-        return R"(
-        __kernel void dot_1(
-            __global double *in1,
-            __global double *in2,
-            __global double *out,
-            const unsigned int N,
-            __local double *tmp)
-        {
-            unsigned int tid = get_local_id(0);
-            unsigned int bsize = get_local_size(0);
-            unsigned int bid = get_global_id(0) / bsize;
-            unsigned int i = get_global_id(0);
-            unsigned int NUM_THREADS = get_global_size(0);
-
-            double sum = 0.0;
-            while(i < N){
-                sum += in1[i] * in2[i];
-                i += NUM_THREADS;
-            }
-            tmp[tid] = sum;
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            // do reduction in shared mem
-            for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
-            {
-                if (tid < s)
-                {
-                    tmp[tid] += tmp[tid + s];
-                }
-                barrier(CLK_LOCAL_MEM_FENCE);
-            }
-
-            // write result for this block to global mem
-            if (tid == 0) out[get_group_id(0)] = tmp[0];
-        }
-        )";
-    }
-
-
-    // returns partial sums, instead of the final norm
-    // the square root must be computed on CPU
-    std::string OpenclKernels::get_norm_source() {
-        return R"(
-        __kernel void norm(
-            __global double *in,
-            __global double *out,
-            const unsigned int N,
-            __local double *tmp)
-        {
-            unsigned int tid = get_local_id(0);
-            unsigned int bsize = get_local_size(0);
-            unsigned int bid = get_global_id(0) / bsize;
-            unsigned int i = get_global_id(0);
-            unsigned int NUM_THREADS = get_global_size(0);
-
-            double local_sum = 0.0;
-            while(i < N){
-                local_sum += in[i] * in[i];
-                i += NUM_THREADS;
-            }
-            tmp[tid] = local_sum;
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            // do reduction in shared mem
-            for(unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
-            {
-                if (tid < s)
-                {
-                    tmp[tid] += tmp[tid + s];
-                }
-                barrier(CLK_LOCAL_MEM_FENCE);
-            }
-
-            // write result for this block to global mem
-            if (tid == 0) out[get_group_id(0)] = tmp[0];
-        }
-        )";
-    }
-
-
-    // p = (p - omega * v) * beta + r
-    std::string OpenclKernels::get_custom_source() {
-        return R"(
-        __kernel void custom(
-            __global double *p,
-            __global double *v,
-            __global double *r,
-            const double omega,
-            const double beta,
-            const int N)
-        {
-            const unsigned int NUM_THREADS = get_global_size(0);
-            unsigned int idx = get_global_id(0);
-
-            while(idx < N){
-                double res = p[idx];
-                res -= omega * v[idx];
-                res *= beta;
-                res += r[idx];
-                p[idx] = res;
-                idx += NUM_THREADS;
-            }
-        }
-        )";
-    }
-
-
-    // transform blocked vector to scalar vector using pressure-weights
-    // every workitem handles one blockrow
-    std::string OpenclKernels::get_full_to_pressure_restriction_source() {
-        return R"(
-        __kernel void full_to_pressure_restriction(
-            __global const double *fine_y,
-            __global const double *weights,
-            __global double *coarse_y,
-            const unsigned int Nb)
-        {
-            const unsigned int NUM_THREADS = get_global_size(0);
-            const unsigned int block_size = 3;
-            unsigned int target_block_row = get_global_id(0);
-
-            while(target_block_row < Nb){
-                double sum = 0.0;
-                unsigned int idx = block_size * target_block_row;
-                for (unsigned int i = 0; i < block_size; ++i) {
-                    sum += fine_y[idx + i] * weights[idx + i];
-                }
-                coarse_y[target_block_row] = sum;
-                target_block_row += NUM_THREADS;
-            }
-        }
-        )";
-    }
-
-    // add the coarse pressure solution back to the finer, complete solution
-    // every workitem handles one blockrow
-    std::string OpenclKernels::get_add_coarse_pressure_correction_source() {
-        return R"(
-        __kernel void add_coarse_pressure_correction(
-            __global const double *coarse_x,
-            __global double *fine_x,
-            const unsigned int pressure_idx,
-            const unsigned int Nb)
-        {
-            const unsigned int NUM_THREADS = get_global_size(0);
-            const unsigned int block_size = 3;
-            unsigned int target_block_row = get_global_id(0);
-
-            while(target_block_row < Nb){
-                fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row];
-                target_block_row += NUM_THREADS;
-            }
-        }
-        )";
-    }
-
-    // prolongate vector during amg cycle
-    // every workitem handles one row
-    std::string OpenclKernels::get_prolongate_vector_source() {
-        return R"(
-        __kernel void prolongate_vector(
-            __global const double *in,
-            __global double *out,
-            __global const int *cols,
-            const unsigned int N)
-        {
-            const unsigned int NUM_THREADS = get_global_size(0);
-            unsigned int row = get_global_id(0);
-
-            while(row < N){
-                out[row] += in[cols[row]];
-                row += NUM_THREADS;
-            }
-        }
-        )";
-    }
-
-
-/// either b = mat * x
-/// or res = rhs - mat * x
-std::string OpenclKernels::get_blocked_matrix_operation_source(matrix_operation op) {
-    std::string s;
-    if (op == matrix_operation::spmv_op) {
-        s += "__kernel void spmv_blocked(";
-    } else {
-        s += "__kernel void residual_blocked(";
-    }
-        s += R"(__global const double *vals,
-            __global const int *cols,
-            __global const int *rows,
-            const int Nb,
-            __global const double *x,
-            )";
-    if (op == matrix_operation::residual_op) {
-        s += "__global const double *rhs,";
-    }
-        s += R"(
-            __global double *out,
-            const unsigned int block_size,
-            __local double *tmp)
-        {
-            const unsigned int warpsize = 32;
-            const unsigned int bsize = get_local_size(0);
-            const unsigned int idx_b = get_global_id(0) / bsize;
-            const unsigned int idx_t = get_local_id(0);
-            unsigned int idx = idx_b * bsize + idx_t;
-            const unsigned int bs = block_size;
-            const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
-            const unsigned int num_blocks_per_warp = warpsize/bs/bs;
-            const unsigned int NUM_THREADS = get_global_size(0);
-            const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
-            unsigned int target_block_row = idx / warpsize;
-            const unsigned int lane = idx_t % warpsize;
-            const unsigned int c = (lane / bs) % bs;
-            const unsigned int r = lane % bs;
-
-            // for 3x3 blocks:
-            // num_active_threads: 27
-            // num_blocks_per_warp: 3
-
-            while(target_block_row < Nb){
-                unsigned int first_block = rows[target_block_row];
-                unsigned int last_block = rows[target_block_row+1];
-                unsigned int block = first_block + lane / (bs*bs);
-                double local_out = 0.0;
-
-                if(lane < num_active_threads){
-                    for(; block < last_block; block += num_blocks_per_warp){
-                        double x_elem = x[cols[block]*bs + c];
-                        double A_elem = vals[block*bs*bs + c + r*bs];
-                        local_out += x_elem * A_elem;
-                    }
-                }
-
-                // do reduction in shared mem
-                tmp[lane] = local_out;
-                barrier(CLK_LOCAL_MEM_FENCE);
-
-                for(unsigned int offset = 3; offset <= 24; offset <<= 1)
-                {
-                    if (lane + offset < warpsize)
-                    {
-                        tmp[lane] += tmp[lane + offset];
-                    }
-                    barrier(CLK_LOCAL_MEM_FENCE);
-                }
-
-                if(lane < bs){
-                    unsigned int row = target_block_row*bs + lane;
-              )";
-    if (op == matrix_operation::spmv_op) {
-        s += "      out[row] = tmp[lane];";
-    } else {
-        s += "      out[row] = rhs[row] - tmp[lane];";
-    }
-        s += R"(
-                }
-                target_block_row += num_warps_in_grid;
-            }
-        }
-    )";
-    return s;
-}
-
-
-/// either b = mat * x
-/// or res = rhs - mat * x
-std::string OpenclKernels::get_matrix_operation_source(matrix_operation op, bool spmv_reset) {
-    std::string s;
-    if (op == matrix_operation::spmv_op) {
-        if (spmv_reset) {
-            s += "__kernel void spmv(";
-        } else {
-            s += "__kernel void spmv_noreset(";
-        }
-    } else {
-        s += "__kernel void residual(";
-    }
-    s += R"(__global const double *vals,
-            __global const int *cols,
-            __global const int *rows,
-            const int N,
-            __global const double *x,
-            )";
-    if (op == matrix_operation::residual_op) {
-        s += "__global const double *rhs,";
-    }
-        s += R"(
-            __global double *out,
-            __local double *tmp)
-        {
-            const unsigned int bsize = get_local_size(0);
-            const unsigned int idx_b = get_global_id(0) / bsize;
-            const unsigned int idx_t = get_local_id(0);
-            const unsigned int num_workgroups = get_num_groups(0);
-
-            int row = idx_b;
-
-            while (row < N) {
-                int rowStart = rows[row];
-                int rowEnd = rows[row+1];
-                int rowLength = rowEnd - rowStart;
-                double local_sum = 0.0;
-                for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
-                    int col = cols[j];
-                    local_sum += vals[j] * x[col];
-                }
-
-                tmp[idx_t] = local_sum;
-                barrier(CLK_LOCAL_MEM_FENCE);
-
-                int offset = bsize / 2;
-                while(offset > 0) {
-                    if (idx_t < offset) {
-                        tmp[idx_t] += tmp[idx_t + offset];
-                    }
-                    barrier(CLK_LOCAL_MEM_FENCE);
-                    offset = offset / 2;
-                }
-
-                if (idx_t == 0) {
-              )";
-    if (op == matrix_operation::spmv_op) {
-        if (spmv_reset) {
-            s += "  out[row] = tmp[idx_t];";
-        } else {
-            s += "  out[row] += tmp[idx_t];";
-        }
-    } else {
-        s += "      out[row] = rhs[row] - tmp[idx_t];";
-    }
-        s += R"(
-                }
-                row += num_workgroups;
-            }
-        }
-        )";
-    return s;
-}
-
-
-    std::string OpenclKernels::get_ILU_apply1_source(bool full_matrix) {
-        std::string s = R"(
-            __kernel void ILU_apply1(
-                __global const double *LUvals,
-                __global const unsigned int *LUcols,
-                __global const unsigned int *LUrows,
-                __global const int *diagIndex,
-                __global const double *y,
-                __global double *x,
-                __global const unsigned int *nodesPerColorPrefix,
-                const unsigned int color,
-                const unsigned int block_size,
-                __local double *tmp)
-            {
-                const unsigned int warpsize = 32;
-                const unsigned int bs = block_size;
-                const unsigned int idx_t = get_local_id(0);
-                const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
-                const unsigned int num_blocks_per_warp = warpsize/bs/bs;
-                const unsigned int NUM_THREADS = get_global_size(0);
-                const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
-                unsigned int idx = get_global_id(0);
-                unsigned int target_block_row = idx / warpsize;
-                target_block_row += nodesPerColorPrefix[color];
-                const unsigned int lane = idx_t % warpsize;
-                const unsigned int c = (lane / bs) % bs;
-                const unsigned int r = lane % bs;
-
-                while(target_block_row < nodesPerColorPrefix[color+1]){
-                    const unsigned int first_block = LUrows[target_block_row];
-                    )";
-        if (full_matrix) {
-            s += "const unsigned int last_block = diagIndex[target_block_row];  ";
-        } else {
-            s += "const unsigned int last_block = LUrows[target_block_row+1];  ";
-        }
-        s += R"(
-                    unsigned int block = first_block + lane / (bs*bs);
-                    double local_out = 0.0;
-                    if(lane < num_active_threads){
-                        if(lane < bs){
-                            local_out = y[target_block_row*bs+lane];
-                        }
-                        for(; block < last_block; block += num_blocks_per_warp){
-                            const double x_elem = x[LUcols[block]*bs + c];
-                            const double A_elem = LUvals[block*bs*bs + c + r*bs];
-                            local_out -= x_elem * A_elem;
-                        }
-                    }
-
-                    // do reduction in shared mem
-                    tmp[lane] = local_out;
-                    barrier(CLK_LOCAL_MEM_FENCE);
-
-                    for(unsigned int offset = 3; offset <= 24; offset <<= 1)
-                    {
-                        if (lane + offset < warpsize)
-                        {
-                            tmp[lane] += tmp[lane + offset];
-                        }
-                        barrier(CLK_LOCAL_MEM_FENCE);
-                    }
-
-                    if(lane < bs){
-                        const unsigned int row = target_block_row*bs + lane;
-                        x[row] = tmp[lane];
-                    }
-
-                    target_block_row += num_warps_in_grid;
-                }
-            }
-            )";
-        return s;
-    }
-
-
-    std::string OpenclKernels::get_ILU_apply2_source(bool full_matrix) {
-        std::string s = R"(
-            __kernel void ILU_apply2(
-                __global const double *LUvals,
-                __global const int *LUcols,
-                __global const int *LUrows,
-                __global const int *diagIndex,
-                __global const double *invDiagVals,
-                __global double *x,
-                __global const unsigned int *nodesPerColorPrefix,
-                const unsigned int color,
-                const unsigned int block_size,
-                __local double *tmp)
-            {
-                const unsigned int warpsize = 32;
-                const unsigned int bs = block_size;
-                const unsigned int idx_t = get_local_id(0);
-                const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
-                const unsigned int num_blocks_per_warp = warpsize/bs/bs;
-                const unsigned int NUM_THREADS = get_global_size(0);
-                const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
-                unsigned int idx_g = get_global_id(0);
-                unsigned int target_block_row = idx_g / warpsize;
-                target_block_row += nodesPerColorPrefix[color];
-                const unsigned int lane = idx_t % warpsize;
-                const unsigned int c = (lane / bs) % bs;
-                const unsigned int r = lane % bs;
-
-                while(target_block_row < nodesPerColorPrefix[color+1]){
-                    )";
-        if (full_matrix) {
-            s +=   "const unsigned int first_block = diagIndex[target_block_row] + 1;  ";
-        } else {
-            s +=   "const unsigned int first_block = LUrows[target_block_row];  ";
-        }
-        s += R"(
-                    const unsigned int last_block = LUrows[target_block_row+1];
-                    unsigned int block = first_block + lane / (bs*bs);
-                    double local_out = 0.0;
-                    if(lane < num_active_threads){
-                        if(lane < bs){
-                            const unsigned int row = target_block_row*bs+lane;
-                            local_out = x[row];
-                        }
-                        for(; block < last_block; block += num_blocks_per_warp){
-                            const double x_elem = x[LUcols[block]*bs + c];
-                            const double A_elem = LUvals[block*bs*bs + c + r*bs];
-                            local_out -= x_elem * A_elem;
-                        }
-                    }
-
-                    // do reduction in shared mem
-                    tmp[lane] = local_out;
-                    barrier(CLK_LOCAL_MEM_FENCE);
-
-                    for(unsigned int offset = 3; offset <= 24; offset <<= 1)
-                    {
-                        if (lane + offset < warpsize)
-                        {
-                            tmp[lane] += tmp[lane + offset];
-                        }
-                        barrier(CLK_LOCAL_MEM_FENCE);
-                    }
-                    local_out = tmp[lane];
-
-                    if(lane < bs){
-                        tmp[lane + bs*idx_t/warpsize] = local_out;
-                        double sum = 0.0;
-                        for(int i = 0; i < bs; ++i){
-                            sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
-                        }
-
-                        const unsigned int row = target_block_row*bs + lane;
-                        x[row] = sum;
-                    }
-
-                    target_block_row += num_warps_in_grid;
-                }
-            }
-        )";
-        return s;
-    }
-
-    std::string OpenclKernels::get_stdwell_apply_source(bool reorder) {
-        std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder";
-        std::string s = "__kernel void " + kernel_name + R"((
-                        __global const double *Cnnzs,
-                        __global const double *Dnnzs,
-                        __global const double *Bnnzs,
-                        __global const int *Ccols,
-                        __global const int *Bcols,
-                        __global const double *x,
-                        __global double *y,
-                        )";
-        if (reorder) {
-            s +=     R"(__global const int *toOrder,
-                        )";
-        }
-        s +=         R"(const unsigned int dim,
-                        const unsigned int dim_wells,
-                        __global const unsigned int *val_pointers,
-                        __local double *localSum,
-                        __local double *z1,
-                        __local double *z2){
-                int wgId = get_group_id(0);
-                int wiId = get_local_id(0);
-                int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
-                int valsPerBlock = dim*dim_wells;
-                int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
-                int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
-                int c = wiId % dim;
-                int r = (wiId/dim) % dim_wells;
-                double temp;
-
-                barrier(CLK_LOCAL_MEM_FENCE);
-
-                localSum[wiId] = 0;
-                if(wiId < numActiveWorkItems){
-                    int b = wiId/valsPerBlock + val_pointers[wgId];
-                    while(b < valSize + val_pointers[wgId]){
-                        )";
-        if (reorder) {
-            s +=       "int colIdx = toOrder[Bcols[b]];  ";
-        } else {
-            s +=       "int colIdx = Bcols[b];  ";
-        }
-        s += R"(
-                        localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
-                        b += numBlocksPerWarp;
-                    }
-
-                    if(wiId < valsPerBlock){
-                        localSum[wiId] += localSum[wiId + valsPerBlock];
-                    }
-
-                    b = wiId/valsPerBlock + val_pointers[wgId];
-
-                    if(c == 0 && wiId < valsPerBlock){
-                        for(unsigned int stride = 2; stride > 0; stride >>= 1){
-                            localSum[wiId] += localSum[wiId + stride];
-                        }
-                        z1[r] = localSum[wiId];
-                    }
-                }
-
-                barrier(CLK_LOCAL_MEM_FENCE);
-
-                if(wiId < dim_wells){
-                    temp = 0.0;
-                    for(unsigned int i = 0; i < dim_wells; ++i){
-                        temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
-                    }
-                    z2[wiId] = temp;
-                }
-
-                barrier(CLK_LOCAL_MEM_FENCE);
-
-                if(wiId < dim*valSize){
-                    temp = 0.0;
-                    int bb = wiId/dim + val_pointers[wgId];
-                    for (unsigned int j = 0; j < dim_wells; ++j){
-                        temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
-                    }
-                    )";
-        if (reorder) {
-            s +=   "int colIdx = toOrder[Ccols[bb]];  ";
-        } else {
-            s +=   "int colIdx = Ccols[bb];  ";
-        }
-        s += R"(
-                    y[colIdx*dim + c] -= temp;
-                }
-            }
-            )";
-        return s;
-    }
-
-
-    std::string OpenclKernels::get_ilu_decomp_source() {
-        return R"(
-
-        // a = a - (b * c)
-        __kernel void block_mult_sub(__global double *a, __local double *b, __global double *c)
-        {
-            const unsigned int block_size = 3;
-            const unsigned int hwarp_size = 16;
-            const unsigned int idx_t = get_local_id(0);                   // thread id in work group
-            const unsigned int thread_id_in_hwarp = idx_t % hwarp_size;   // thread id in warp (16 threads)
-            if(thread_id_in_hwarp < block_size * block_size){
-                const unsigned int row = thread_id_in_hwarp / block_size;
-                const unsigned int col = thread_id_in_hwarp % block_size;
-                double temp = 0.0;
-                for (unsigned int k = 0; k < block_size; k++) {
-                    temp += b[block_size * row + k] * c[block_size * k + col];
-                }
-                a[block_size * row + col] -= temp;
-            }
-        }
-
-        // c = a * b
-        __kernel void block_mult(__global double *a, __global double *b, __local double *c) {
-            const unsigned int block_size = 3;
-            const unsigned int hwarp_size = 16;
-            const unsigned int idx_t = get_local_id(0);                   // thread id in work group
-            const unsigned int thread_id_in_hwarp = idx_t % hwarp_size;   // thread id in warp (16 threads)
-            if(thread_id_in_hwarp < block_size * block_size){
-                const unsigned int row = thread_id_in_hwarp / block_size;
-                const unsigned int col = thread_id_in_hwarp % block_size;
-                double temp = 0.0;
-                for (unsigned int k = 0; k < block_size; k++) {
-                    temp += a[block_size * row + k] * b[block_size * k + col];
-                }
-                c[block_size * row + col] = temp;
-            }
-        }
-
-        // invert 3x3 matrix
-        __kernel void inverter(__global double *matrix, __global double *inverse) {
-            const unsigned int block_size = 3;
-            const unsigned int bs = block_size;                           // rename to shorter name
-            const unsigned int hwarp_size = 16;
-            const unsigned int idx_t = get_local_id(0);                   // thread id in work group
-            const unsigned int thread_id_in_hwarp = idx_t % hwarp_size;   // thread id in warp (16 threads)
-            if(thread_id_in_hwarp < bs * bs){
-                double t4  = matrix[0] * matrix[4];
-                double t6  = matrix[0] * matrix[5];
-                double t8  = matrix[1] * matrix[3];
-                double t10 = matrix[2] * matrix[3];
-                double t12 = matrix[1] * matrix[6];
-                double t14 = matrix[2] * matrix[6];
-
-                double det = (t4 * matrix[8] - t6 * matrix[7] - t8 * matrix[8] +
-                              t10 * matrix[7] + t12 * matrix[5] - t14 * matrix[4]);
-                double t17 = 1.0 / det;
-
-                const unsigned int r = thread_id_in_hwarp / bs;
-                const unsigned int c = thread_id_in_hwarp % bs;
-                const unsigned int r1 = (r+1) % bs;
-                const unsigned int c1 = (c+1) % bs;
-                const unsigned int r2 = (r+bs-1) % bs;
-                const unsigned int c2 = (c+bs-1) % bs;
-                inverse[c*bs+r] = ((matrix[r1*bs+c1] * matrix[r2*bs+c2]) - (matrix[r1*bs+c2] * matrix[r2*bs+c1])) * t17;
-            }
-        }
-
-        __kernel void ilu_decomp(const unsigned int firstRow,
-                                 const unsigned int lastRow,
-                                 __global double *LUvals,
-                                 __global const int *LUcols,
-                                 __global const int *LUrows,
-                                 __global double *invDiagVals,
-                                 __global int *diagIndex,
-                                 const unsigned int Nb,
-                                 __local double *pivot){
-
-            const unsigned int bs = 3;
-            const unsigned int hwarp_size = 16;
-            const unsigned int work_group_size = get_local_size(0);
-            const unsigned int work_group_id = get_group_id(0);
-            const unsigned int num_groups = get_num_groups(0);
-            const unsigned int hwarps_per_group = work_group_size / hwarp_size;
-            const unsigned int thread_id_in_group = get_local_id(0);      // thread id in work group
-            const unsigned int thread_id_in_hwarp = thread_id_in_group % hwarp_size;     // thread id in hwarp (16 threads)
-            const unsigned int hwarp_id_in_group = thread_id_in_group / hwarp_size;
-            const unsigned int lmem_offset = hwarp_id_in_group * bs * bs;  // each workgroup gets some lmem, but the workitems have to share it
-                                                                           // every workitem in a hwarp has the same lmem_offset
-
-            // go through all rows
-            for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group)
-            {
-                int iRowStart = LUrows[i];
-                int iRowEnd = LUrows[i + 1];
-
-                // go through all elements of the row
-                for (int ij = iRowStart; ij < iRowEnd; ij++) {
-                    int j = LUcols[ij];
-
-                    if (j < i) {
-                        // calculate the pivot of this row
-                        block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset);
-
-                        // copy pivot
-                        if (thread_id_in_hwarp < bs * bs) {
-                            LUvals[ij * bs * bs + thread_id_in_hwarp] = pivot[lmem_offset + thread_id_in_hwarp];
-                        }
-
-                        int jRowEnd = LUrows[j + 1];
-                        int jk = diagIndex[j] + 1;
-                        int ik = ij + 1;
-                        // subtract that row scaled by the pivot from this row.
-                        while (ik < iRowEnd && jk < jRowEnd) {
-                            if (LUcols[ik] == LUcols[jk]) {
-                                block_mult_sub(LUvals + ik * bs * bs, pivot + lmem_offset, LUvals + jk * bs * bs);
-                                ik++;
-                                jk++;
-                            } else {
-                                if (LUcols[ik] < LUcols[jk])
-                                { ik++; }
-                                else
-                                { jk++; }
-                            }
-                        }
-                    }
-                }
-
-                // store the inverse in the diagonal
-                inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs);
-
-                // copy inverse
-                if (thread_id_in_hwarp < bs * bs) {
-                    LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp];
-                }
-
-            }
-        }
-        )";
-    }
-
 } // namespace Accelerator
 } // namespace Opm
-
--- a/opm/simulators/linalg/bda/openclKernels.hpp
+++ b/opm/simulators/linalg/bda/openclKernels.hpp
@@ -61,11 +61,6 @@ private:
    static std::vector<double> tmp;     // used as tmp CPU buffer for dot() and norm()
    static bool initialized;

-    enum matrix_operation {
-        spmv_op,
-        residual_op
-    };
-
    static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
    static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
    static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
@@ -86,76 +81,34 @@ private:
    static std::unique_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
    static std::unique_ptr<ilu_decomp_kernel_type> ilu_decomp_k;

-    /// Generate string with axpy kernel
-    /// a = a + alpha * b
-    static std::string get_axpy_source();
-
-    /// Generate string with scale kernel
-    /// a = a * alpha
-    static std::string get_scale_source();
-
-    /// multiply vector with another vector and a scalar, element-wise
-    /// add result to a third vector
-    static std::string get_vmul_source();
-
-    /// returns partial sums, instead of the final dot product
-    /// partial sums are added on CPU
-    static std::string get_dot_1_source();
-
-    /// returns partial sums, instead of the final norm
-    /// the square root must be computed on CPU
-    static std::string get_norm_source();
-
-    /// Generate string with custom kernel
-    /// This kernel combines some ilubicgstab vector operations into 1
-    /// p = (p - omega * v) * beta + r
-    static std::string get_custom_source();
-
-    /// Transform blocked vector to scalar vector using pressure-weights
-    static std::string get_full_to_pressure_restriction_source();
-
-    /// Add the coarse pressure solution back to the finer, complete solution
-    static std::string get_add_coarse_pressure_correction_source();
-
-    /// Prolongate a vector during the AMG cycle
-    static std::string get_prolongate_vector_source();
-
-    /// b = mat * x
-    /// algorithm based on:
-    /// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
-    /// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
-    /// or
-    /// res = rhs - (mat * x)
-    static std::string get_blocked_matrix_operation_source(matrix_operation op);
-    static std::string get_matrix_operation_source(matrix_operation op, bool spmv_reset = true);
-
-    /// ILU apply part 1: forward substitution
-    /// solves L*x=y where L is a lower triangular sparse blocked matrix
-    /// this L can be it's own BSR matrix (if full_matrix is false),
-    /// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of L end
-    /// \param[in] full_matrix   whether the kernel should operate on a full (square) matrix or not
-    static std::string get_ILU_apply1_source(bool full_matrix);
-
-    /// ILU apply part 2: backward substitution
-    /// solves U*x=y where U is an upper triangular sparse blocked matrix
-    /// this U can be it's own BSR matrix (if full_matrix is false),
-    /// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of U start
-    /// \param[in] full_matrix   whether the kernel should operate on a full (square) matrix or not
-    static std::string get_ILU_apply2_source(bool full_matrix);
-
-    /// Generate string with the stdwell_apply kernels
-    /// If reorder is true, the B/Ccols do not correspond with the x/y vector
-    /// the x/y vector is reordered, use toOrder to address that
-    /// \param[in] reorder   whether the matrix is reordered or not
-    static std::string get_stdwell_apply_source(bool reorder);
-
-    /// Generate string with the exact ilu decomposition kernel
-    /// The kernel takes a full BSR matrix and performs inplace ILU decomposition
-    static std::string get_ilu_decomp_source();
-
    OpenclKernels(){}; // disable instantiation

 public:
+    static const std::string axpy_str;
+    static const std::string scale_str;
+    static const std::string vmul_str;
+    static const std::string dot_1_str;
+    static const std::string norm_str;
+    static const std::string custom_str;
+    static const std::string full_to_pressure_restriction_str;
+    static const std::string add_coarse_pressure_correction_str;
+    static const std::string prolongate_vector_str;
+    static const std::string spmv_blocked_str;
+    static const std::string spmv_str;
+    static const std::string spmv_noreset_str;
+    static const std::string residual_blocked_str;
+    static const std::string residual_str;
+#if CHOW_PATEL
+    static const std::string ILU_apply1_str;
+    static const std::string ILU_apply2_str;
+#else
+    static const std::string ILU_apply1_fm_str;
+    static const std::string ILU_apply2_fm_str;
+#endif
+    static const std::string stdwell_apply_str;
+    static const std::string stdwell_apply_no_reorder_str;
+    static const std::string ILU_decomp_str;
+
    static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity);

    static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);