__kernel void block_add(__global double *mat1, __global double *mat2, __global double *result)
{
    const unsigned int bs = 3;
    const unsigned int warpsize = 32;
    const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
    const unsigned int idx_t = get_local_id(0);
    const unsigned int lane = idx_t % warpsize;

    if(lane < num_active_threads){
        const unsigned int row = lane % bs;
        const unsigned int col = (lane / bs) % bs;
        result[bs * row + col] = mat1[bs * row + col] + mat2[bs * row + col];
    }
}

__kernel void block_mult_isai(__global double *mat1, __global double *mat2, __global double *result)
{
    const unsigned int bs = 3;
    const unsigned int warpsize = 32;
    const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
    const unsigned int idx_t = get_local_id(0);
    const unsigned int lane = idx_t % warpsize;

    if(lane < num_active_threads){
        const unsigned int row = lane % bs;
        const unsigned int col = (lane / bs) % bs;
        double temp = 0.0;

        for (unsigned int k = 0; k < bs; k++) {
            temp += mat1[bs * row + k] * mat2[bs * k + col];
        }

        result[bs * row + col] = temp;
    }
}

__kernel void isaiU(__global const int *diagIndex,
                    __global const int *colPtr,
                    __global const int *rowIndices,
                    __global const int *mapping,
                    __global const int *nvc,
                    __global const int *luIdxs,
                    __global const int *xxIdxs,
                    __global const int *dxIdxs,
                    __global const double *LU,
                    __global const double *invDiagVals,
                    __global double *invU,
                    const unsigned int Nb)
{
    const unsigned int warpsize = 32;
    const unsigned int idx_b = get_group_id(0);
    const unsigned int idx_t = get_local_id(0);
    const unsigned int idx = get_global_id(0);
    const unsigned int bs = 3;
    const unsigned int num_threads = get_global_size(0);
    const unsigned int num_warps_in_grid = num_threads / warpsize;
    const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
    const unsigned int num_blocks_per_warp = warpsize / bs / bs;
    const unsigned int lane = idx_t % warpsize;
    const unsigned int c = (lane / bs) % bs;
    const unsigned int r = lane % bs;
    unsigned int tcol = idx / warpsize;

    while(tcol < Nb){
        const unsigned int frow = colPtr[tcol];
        const unsigned int lrow = diagIndex[tcol];
        const unsigned int nx = lrow - frow + 1;

        if(lane < num_active_threads){
            block_add(invU + lrow * bs * bs, invDiagVals + tcol * bs * bs, invU + lrow * bs * bs);

            for(unsigned int v = nvc[tcol]; v < nvc[tcol + 1]; v++){
                block_mult_sub_isai(invU + xxIdxs[v] * bs * bs, LU + luIdxs[v] * bs * bs, invU + dxIdxs[v] * bs * bs);
            }

            for(unsigned int xid = 1; xid < nx; xid++){
                unsigned int xpos = mapping[lrow - xid];
                block_mult_isai(invDiagVals + rowIndices[lrow - xid] * bs * bs, invU + xpos * bs * bs, invU + xpos * bs * bs);
            }
        }

        tcol += num_warps_in_grid;
    }
}