Generating opencl kernels

2025-02-25 18:55:30 -06:00 · 2021-02-24 15:39:40 +01:00 · 2021-02-24 15:39:40 +01:00 · 5b4deab7e4
commit 5b4deab7e4
parent 141af23db5
3 changed files with 287 additions and 302 deletions
--- a/opm/simulators/linalg/bda/openclKernels.hpp
+++ b/opm/simulators/linalg/bda/openclKernels.hpp
@ -220,298 +220,266 @@ namespace bda

    // ILU apply part 1: forward substitution
    // solves L*x=y where L is a lower triangular sparse blocked matrix
-    inline const char* ILU_apply1_s = R"(
-    __kernel void ILU_apply1(
-        __global const double *Lvals,
-        __global const unsigned int *Lcols,
-        __global const unsigned int *Lrows,
-        __global const int *diagIndex,
-        __global const double *y,
-        __global double *x,
-        __global const unsigned int *nodesPerColorPrefix,
-        const unsigned int color,
-        const unsigned int block_size,
-        __local double *tmp)
-    {
-        const unsigned int warpsize = 32;
-        const unsigned int bs = block_size;
-        const unsigned int idx_t = get_local_id(0);
-        const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
-        const unsigned int num_blocks_per_warp = warpsize/bs/bs;
-        const unsigned int NUM_THREADS = get_global_size(0);
-        const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
-        unsigned int idx = get_global_id(0);
-        unsigned int target_block_row = idx / warpsize;
-        idx += nodesPerColorPrefix[color];
-        target_block_row += nodesPerColorPrefix[color];
-        const unsigned int lane = idx_t % warpsize;
-        const unsigned int c = (lane / bs) % bs;
-        const unsigned int r = lane % bs;
-
-        while(target_block_row < nodesPerColorPrefix[color+1]){
-            const unsigned int first_block = Lrows[target_block_row];
-            const unsigned int last_block = diagIndex[target_block_row];
-            unsigned int block = first_block + lane / (bs*bs);
-            double local_out = 0.0;
-            if(lane < num_active_threads){
-                if(lane < bs){
-                    local_out = y[target_block_row*bs+lane];
-                }
-                for(; block < last_block; block += num_blocks_per_warp){
-                    const double x_elem = x[Lcols[block]*bs + c];
-                    const double A_elem = Lvals[block*bs*bs + c + r*bs];
-                    local_out -= x_elem * A_elem;
-                }
-            }
-
-            // do reduction in shared mem
-            tmp[lane] = local_out;
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+    std::string get_ILU_apply1_string(bool full_matrix) {
+        std::string s = R"(
+            __kernel void ILU_apply1(
+                __global const double *LUvals,
+                __global const unsigned int *LUcols,
+                __global const unsigned int *LUrows,
+                __global const int *diagIndex,
+                __global const double *y,
+                __global double *x,
+                __global const unsigned int *nodesPerColorPrefix,
+                const unsigned int color,
+                const unsigned int block_size,
+                __local double *tmp)
            {
-                if (lane + offset < warpsize)
-                {
-                    tmp[lane] += tmp[lane + offset];
-                }
-                barrier(CLK_LOCAL_MEM_FENCE);
-            }
+                const unsigned int warpsize = 32;
+                const unsigned int bs = block_size;
+                const unsigned int idx_t = get_local_id(0);
+                const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+                const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+                const unsigned int NUM_THREADS = get_global_size(0);
+                const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+                unsigned int idx = get_global_id(0);
+                unsigned int target_block_row = idx / warpsize;
+                target_block_row += nodesPerColorPrefix[color];
+                const unsigned int lane = idx_t % warpsize;
+                const unsigned int c = (lane / bs) % bs;
+                const unsigned int r = lane % bs;

-            if(lane < bs){
-                const unsigned int row = target_block_row*bs + lane;
-                x[row] = tmp[lane];
-            }
-
-            target_block_row += num_warps_in_grid;
+                while(target_block_row < nodesPerColorPrefix[color+1]){
+                    const unsigned int first_block = LUrows[target_block_row];
+                    )";
+        if (full_matrix) {
+            s += "const unsigned int last_block = diagIndex[target_block_row];  ";
+        } else {
+            s += "const unsigned int last_block = LUrows[target_block_row+1];  ";
        }
+        s += R"(
+                    unsigned int block = first_block + lane / (bs*bs);
+                    double local_out = 0.0;
+                    if(lane < num_active_threads){
+                        if(lane < bs){
+                            local_out = y[target_block_row*bs+lane];
+                        }
+                        for(; block < last_block; block += num_blocks_per_warp){
+                            const double x_elem = x[LUcols[block]*bs + c];
+                            const double A_elem = LUvals[block*bs*bs + c + r*bs];
+                            local_out -= x_elem * A_elem;
+                        }
+                    }
+
+                    // do reduction in shared mem
+                    tmp[lane] = local_out;
+                    barrier(CLK_LOCAL_MEM_FENCE);
+
+                    for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+                    {
+                        if (lane + offset < warpsize)
+                        {
+                            tmp[lane] += tmp[lane + offset];
+                        }
+                        barrier(CLK_LOCAL_MEM_FENCE);
+                    }
+
+                    if(lane < bs){
+                        const unsigned int row = target_block_row*bs + lane;
+                        x[row] = tmp[lane];
+                    }
+
+                    target_block_row += num_warps_in_grid;
+                }
+            }
+            )";
+        return s;
    }
-    )";


    // ILU apply part 2: backward substitution
    // solves U*x=y where U is an upper triangular sparse blocked matrix
-    inline const char* ILU_apply2_s = R"(
-    __kernel void ILU_apply2(
-        __global const double *LUvals,
-        __global const int *LUcols,
-        __global const int *LUrows,
-        __global const int *diagIndex,
-        __global const double *invDiagVals,
-        __global double *x,
-        __global const unsigned int *nodesPerColorPrefix,
-        const unsigned int color,
-        const unsigned int block_size,
-        __local double *tmp)
-    {
-        const unsigned int warpsize = 32;
-        const unsigned int bs = block_size;
-        const unsigned int idx_t = get_local_id(0);
-        const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
-        const unsigned int num_blocks_per_warp = warpsize/bs/bs;
-        const unsigned int NUM_THREADS = get_global_size(0);
-        const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
-        unsigned int idx = get_global_id(0);
-        unsigned int target_block_row = idx / warpsize;
-        idx += nodesPerColorPrefix[color];
-        target_block_row += nodesPerColorPrefix[color];
-        const unsigned int lane = idx_t % warpsize;
-        const unsigned int c = (lane / bs) % bs;
-        const unsigned int r = lane % bs;
-        const double relaxation = 0.9;
-
-        while(target_block_row < nodesPerColorPrefix[color+1]){
-            const unsigned int first_block = diagIndex[target_block_row] + 1;
-            const unsigned int last_block = LUrows[target_block_row+1];
-            unsigned int block = first_block + lane / (bs*bs);
-            double local_out = 0.0;
-            if(lane < num_active_threads){
-                if(lane < bs){
-                    const unsigned int row = target_block_row*bs+lane;
-                    local_out = x[row];
-                }
-                for(; block < last_block; block += num_blocks_per_warp){
-                    const double x_elem = x[LUcols[block]*bs + c];
-                    const double A_elem = LUvals[block*bs*bs + c + r*bs];
-                    local_out -= x_elem * A_elem;
-                }
-            }
-
-            // do reduction in shared mem
-            tmp[lane] = local_out;
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+    std::string get_ILU_apply2_string(bool full_matrix) {
+        std::string s = R"(
+            __kernel void ILU_apply2(
+                __global const double *LUvals,
+                __global const int *LUcols,
+                __global const int *LUrows,
+                __global const int *diagIndex,
+                __global const double *invDiagVals,
+                __global double *x,
+                __global const unsigned int *nodesPerColorPrefix,
+                const unsigned int color,
+                const unsigned int block_size,
+                __local double *tmp)
            {
-                if (lane + offset < warpsize)
-                {
-                    tmp[lane] += tmp[lane + offset];
+                const unsigned int warpsize = 32;
+                const unsigned int bs = block_size;
+                const unsigned int idx_t = get_local_id(0);
+                const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
+                const unsigned int num_blocks_per_warp = warpsize/bs/bs;
+                const unsigned int NUM_THREADS = get_global_size(0);
+                const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
+                unsigned int idx = get_global_id(0);
+                unsigned int target_block_row = idx / warpsize;
+                target_block_row += nodesPerColorPrefix[color];
+                const unsigned int lane = idx_t % warpsize;
+                const unsigned int c = (lane / bs) % bs;
+                const unsigned int r = lane % bs;
+                const double relaxation = 0.9;
+
+                while(target_block_row < nodesPerColorPrefix[color+1]){
+                    )";
+        if (full_matrix) {
+            s +=   "const unsigned int first_block = diagIndex[target_block_row] + 1;  ";
+        } else {
+            s +=   "const unsigned int first_block = LUrows[target_block_row];  ";
+        }
+        s += R"(
+                    const unsigned int last_block = LUrows[target_block_row+1];
+                    unsigned int block = first_block + lane / (bs*bs);
+                    double local_out = 0.0;
+                    if(lane < num_active_threads){
+                        if(lane < bs){
+                            const unsigned int row = target_block_row*bs+lane;
+                            local_out = x[row];
+                        }
+                        for(; block < last_block; block += num_blocks_per_warp){
+                            const double x_elem = x[LUcols[block]*bs + c];
+                            const double A_elem = LUvals[block*bs*bs + c + r*bs];
+                            local_out -= x_elem * A_elem;
+                        }
+                    }
+
+                    // do reduction in shared mem
+                    tmp[lane] = local_out;
+                    barrier(CLK_LOCAL_MEM_FENCE);
+
+                    for(unsigned int offset = 3; offset <= 24; offset <<= 1)
+                    {
+                        if (lane + offset < warpsize)
+                        {
+                            tmp[lane] += tmp[lane + offset];
+                        }
+                        barrier(CLK_LOCAL_MEM_FENCE);
+                    }
+                    local_out = tmp[lane];
+
+                    if(lane < bs){
+                        tmp[lane + bs*idx_t/warpsize] = local_out;
+                        double sum = 0.0;
+                        for(int i = 0; i < bs; ++i){
+                            sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
+                        }
+
+                        const unsigned int row = target_block_row*bs + lane;
+                        x[row] = relaxation * sum;
+                    }
+
+                    target_block_row += num_warps_in_grid;
                }
+            }
+        )";
+        return s;
+    }
+
+    /// Generate the source string of the stdwell_apply kernel (or stdwell_apply_no_reorder).
+    /// If reorder is true, the x/y vectors are reordered, so the column indices in B/Ccols
+    /// do not address them directly; the kernel maps each column index through toOrder first.
+    /// \param[in] reorder   whether the matrix is reordered or not
+    std::string get_stdwell_apply_string(bool reorder) {
+        std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder";
+        std::string s = "__kernel void " + kernel_name + R"((
+                        __global const double *Cnnzs,
+                        __global const double *Dnnzs,
+                        __global const double *Bnnzs,
+                        __global const int *Ccols,
+                        __global const int *Bcols,
+                        __global const double *x,
+                        __global double *y,
+                        )";
+        if (reorder) {
+            s +=     R"(__global const int *toOrder,
+                        )";
+        }
+        s +=         R"(const unsigned int dim,
+                        const unsigned int dim_wells,
+                        __global const unsigned int *val_pointers,
+                        __local double *localSum,
+                        __local double *z1,
+                        __local double *z2){
+                int wgId = get_group_id(0);
+                int wiId = get_local_id(0);
+                int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
+                int valsPerBlock = dim*dim_wells;
+                int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
+                int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
+                int c = wiId % dim;
+                int r = (wiId/dim) % dim_wells;
+                double temp;
+
                barrier(CLK_LOCAL_MEM_FENCE);
-            }
-            local_out = tmp[lane];

-            if(lane < bs){
-                tmp[lane + bs*idx_t/warpsize] = local_out;
-                double sum = 0.0;
-                for(int i = 0; i < bs; ++i){
-                    sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
+                localSum[wiId] = 0;
+                if(wiId < numActiveWorkItems){
+                    int b = wiId/valsPerBlock + val_pointers[wgId];
+                    while(b < valSize + val_pointers[wgId]){
+                        )";
+        if (reorder) {
+            s +=       "int colIdx = toOrder[Bcols[b]];  ";
+        } else {
+            s +=       "int colIdx = Bcols[b];  ";
+        }
+        s += R"(
+                        localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
+                        b += numBlocksPerWarp;
+                    }
+
+                    if(wiId < valsPerBlock){
+                        localSum[wiId] += localSum[wiId + valsPerBlock];
+                    }
+
+                    b = wiId/valsPerBlock + val_pointers[wgId];
+
+                    if(c == 0 && wiId < valsPerBlock){
+                        for(unsigned int stride = 2; stride > 0; stride >>= 1){
+                            localSum[wiId] += localSum[wiId + stride];
+                        }
+                        z1[r] = localSum[wiId];
+                    }
                }

-                const unsigned int row = target_block_row*bs + lane;
-                x[row] = relaxation * sum;
-            }
+                barrier(CLK_LOCAL_MEM_FENCE);

-            target_block_row += num_warps_in_grid;
-        }
-    }
-    )";
-
-    inline const char* stdwell_apply_s = R"(
-    __kernel void stdwell_apply(__global const double *Cnnzs,
-                                __global const double *Dnnzs,
-                                __global const double *Bnnzs,
-                                __global const int *Ccols,
-                                __global const int *Bcols,
-                                __global const double *x,
-                                __global double *y,
-                                __global const int *toOrder,
-                                const unsigned int dim,
-                                const unsigned int dim_wells,
-                                __global const unsigned int *val_pointers,
-                                __local double *localSum,
-                                __local double *z1,
-                                __local double *z2){
-        int wgId = get_group_id(0);
-        int wiId = get_local_id(0);
-        int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
-        int valsPerBlock = dim*dim_wells;
-        int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
-        int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
-        int c = wiId % dim;
-        int r = (wiId/dim) % dim_wells;
-        double temp;
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        localSum[wiId] = 0;
-        if(wiId < numActiveWorkItems){
-            int b = wiId/valsPerBlock + val_pointers[wgId];
-            while(b < valSize + val_pointers[wgId]){
-                int colIdx = toOrder[Bcols[b]];
-                localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
-                b += numBlocksPerWarp;
-            }
-
-            if(wiId < valsPerBlock){
-                localSum[wiId] += localSum[wiId + valsPerBlock];
-            }
-
-            b = wiId/valsPerBlock + val_pointers[wgId];
-
-            if(c == 0 && wiId < valsPerBlock){
-                for(unsigned int stride = 2; stride > 0; stride >>= 1){
-                    localSum[wiId] += localSum[wiId + stride];
+                if(wiId < dim_wells){
+                    temp = 0.0;
+                    for(unsigned int i = 0; i < dim_wells; ++i){
+                        temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
+                    }
+                    z2[wiId] = temp;
                }
-                z1[r] = localSum[wiId];
-            }
+
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                if(wiId < dim*valSize){
+                    temp = 0.0;
+                    int bb = wiId/dim + val_pointers[wgId];
+                    for (unsigned int j = 0; j < dim_wells; ++j){
+                        temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
+                    }
+                    )";
+        if (reorder) {
+            s +=   "int colIdx = toOrder[Ccols[bb]];  ";
+        } else {
+            s +=   "int colIdx = Ccols[bb];  ";
        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if(wiId < dim_wells){
-            temp = 0.0;
-            for(unsigned int i = 0; i < dim_wells; ++i){
-                temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
-            }
-            z2[wiId] = temp;
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if(wiId < dim*valSize){
-            temp = 0.0;
-            int bb = wiId/dim + val_pointers[wgId];
-            for (unsigned int j = 0; j < dim_wells; ++j){
-                temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
-            }
-            int colIdx = toOrder[Ccols[bb]];
-            y[colIdx*dim + c] -= temp;
-        }
-    }
-    )";
-
-    inline const char* stdwell_apply_no_reorder_s = R"(
-    __kernel void stdwell_apply_no_reorder(__global const double *Cnnzs,
-                                __global const double *Dnnzs,
-                                __global const double *Bnnzs,
-                                __global const int *Ccols,
-                                __global const int *Bcols,
-                                __global const double *x,
-                                __global double *y,
-                                const unsigned int dim,
-                                const unsigned int dim_wells,
-                                __global const unsigned int *val_pointers,
-                                __local double *localSum,
-                                __local double *z1,
-                                __local double *z2){
-        int wgId = get_group_id(0);
-        int wiId = get_local_id(0);
-        int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
-        int valsPerBlock = dim*dim_wells;
-        int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
-        int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
-        int c = wiId % dim;
-        int r = (wiId/dim) % dim_wells;
-        double temp;
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        localSum[wiId] = 0;
-        if(wiId < numActiveWorkItems){
-            int b = wiId/valsPerBlock + val_pointers[wgId];
-            while(b < valSize + val_pointers[wgId]){
-                int colIdx = Bcols[b];
-                localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
-                b += numBlocksPerWarp;
-            }
-
-            if(wiId < valsPerBlock){
-                localSum[wiId] += localSum[wiId + valsPerBlock];
-            }
-
-            b = wiId/valsPerBlock + val_pointers[wgId];
-
-            if(c == 0 && wiId < valsPerBlock){
-                for(unsigned int stride = 2; stride > 0; stride >>= 1){
-                    localSum[wiId] += localSum[wiId + stride];
+        s += R"(
+                    y[colIdx*dim + c] -= temp;
                }
-                z1[r] = localSum[wiId];
            }
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if(wiId < dim_wells){
-            temp = 0.0;
-            for(unsigned int i = 0; i < dim_wells; ++i){
-                temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
-            }
-            z2[wiId] = temp;
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if(wiId < dim*valSize){
-            temp = 0.0;
-            int bb = wiId/dim + val_pointers[wgId];
-            for (unsigned int j = 0; j < dim_wells; ++j){
-                temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
-            }
-            int colIdx = Ccols[bb];
-            y[colIdx*dim + c] -= temp;
-        }
+            )";
+        return s;
    }
-    )";
+

    inline const char* ilu_decomp_s = R"(

--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@ -475,19 +475,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
    out.clear();

    try {
-        cl::Program::Sources source(1, std::make_pair(axpy_s, strlen(axpy_s)));  // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
-        source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s)));
-        source.emplace_back(std::make_pair(norm_s, strlen(norm_s)));
-        source.emplace_back(std::make_pair(custom_s, strlen(custom_s)));
-        source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
-        source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
-        source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
-        source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
-        source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s, strlen(stdwell_apply_no_reorder_s)));
-        source.emplace_back(std::make_pair(ilu_decomp_s, strlen(ilu_decomp_s)));
-        program = cl::Program(*context, source);
-        program.build(devices);
-
        prec->setOpenCLContext(context.get());
        prec->setOpenCLQueue(queue.get());

@ -519,6 +506,49 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
            d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
        }

+        get_opencl_kernels();
+
+        prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), ilu_decomp_k.get());
+
+    } catch (const cl::Error& error) {
+        std::ostringstream oss;
+        oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
+        oss << getErrorString(error.err());
+        // rethrow exception
+        OPM_THROW(std::logic_error, oss.str());
+    } catch (const std::logic_error& error) {
+        // rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs
+        throw error;
+    }
+
+    initialized = true;
+} // end initialize()
+
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::get_opencl_kernels() {
+
+        cl::Program::Sources source(1, std::make_pair(axpy_s, strlen(axpy_s)));  // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
+        source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s)));
+        source.emplace_back(std::make_pair(norm_s, strlen(norm_s)));
+        source.emplace_back(std::make_pair(custom_s, strlen(custom_s)));
+        source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
+#if CHOW_PATEL
+        bool full_matrix = false;
+#else
+        bool full_matrix = true;
+#endif
+        std::string ILU_apply1_s = get_ILU_apply1_string(full_matrix);
+        source.emplace_back(std::make_pair(ILU_apply1_s.c_str(), strlen(ILU_apply1_s.c_str())));
+        std::string ILU_apply2_s = get_ILU_apply2_string(full_matrix);
+        source.emplace_back(std::make_pair(ILU_apply2_s.c_str(), strlen(ILU_apply2_s.c_str())));
+        std::string stdwell_apply_s = get_stdwell_apply_string(true);
+        std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
+        source.emplace_back(std::make_pair(stdwell_apply_s.c_str(), strlen(stdwell_apply_s.c_str())));
+        source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s.c_str(), strlen(stdwell_apply_no_reorder_s.c_str())));
+        source.emplace_back(std::make_pair(ilu_decomp_s, strlen(ilu_decomp_s)));
+        cl::Program program = cl::Program(*context, source);
+        program.build(devices);
+
        // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
        // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking

@ -540,22 +570,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
                                                  cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply_no_reorder")));
        ilu_decomp_k.reset(new cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                               const int, cl::LocalSpaceArg>(cl::Kernel(program, "ilu_decomp")));
-
-        prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), ilu_decomp_k.get());
-
-    } catch (const cl::Error& error) {
-        std::ostringstream oss;
-        oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
-        oss << getErrorString(error.err());
-        // rethrow exception
-        OPM_THROW(std::logic_error, oss.str());
-    } catch (const std::logic_error& error) {
-        // rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs
-        throw error;
-    }
-
-    initialized = true;
-} // end initialize()
+} // end get_opencl_kernels()

 template <unsigned int block_size>
 void openclSolverBackend<block_size>::finalize() {
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@ -64,7 +64,6 @@ private:

    // shared pointers are also passed to other objects
    std::vector<cl::Device> devices;
-    cl::Program program;
    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
    std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
    std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
@ -149,6 +148,9 @@ private:
    /// \param[in] cols           array of columnIndices, contains nnz values
    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);

+    /// Generate and compile opencl kernels
+    void get_opencl_kernels();
+
    /// Clean memory
    void finalize();