atax_p_mpi.c

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <stdbool.h>
#include <omp.h>
#include <mpi.h>
#include <lcg.h>
#include <util.h>
#include <umacros.h>
#include <blockdist.h>

// arrays placement

#define T_base 1000000

#define eta_tmp(i0) 0
#define eta_tmp_mdp(i0) (-(i0)+M-1)
#define T_tmp (1 * T_base)

int tmp_chunk_size;
int tmp_actual_chunk_size;

#define eta_x(i0) (i0)
#define eta_x_mdp(i0) (M-1)
#define T_x (2 * T_base)

int x_chunk_size;
int x_actual_chunk_size;

#define eta_y(i0) (i0)
#define eta_y_mdp(i0) (-(i0)+N-1)
#define T_y (3 * T_base)

int y_chunk_size;
int y_actual_chunk_size;

#define eta_A(i1) (i1)
#define eta_A_mdp(i1) (-(i1)+N-1)
#define T_A (4 * T_base)

int A_chunk_size;
int A_actual_chunk_size;

// mpi routine

void atax_p_ilp_arrays(int M, int N, double** A, double* x, double* y, double* tmp) {
    if ((M <= 0) && (N >= 1)) {
        for(int ilpp=max(lR,0);ilpp<=min(uR,N-1);ilpp++) {
            y[ilpp] = 0;
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
    }
    if ((M >= 1) && (N == 0)) {
        for(int ilpp=max(lR,0);ilpp<=min(uR,0);ilpp++) {
            tmp[0] = 0;
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
    }
    if ((M >= 1) && (N >= 1)) {
        for(int ilpp=max(lR,0);ilpp<=min(uR,0);ilpp++) {
            tmp[0] = 0;
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
    }
    if (N <= 0) {
        for(int t0=max(0,N+1);t0<=M-1;t0++) {
            for(int ilpp=max(lR,t0);ilpp<=min(uR,t0);ilpp++) {
                tmp[t0] = 0;
            }
            MPI_Barrier_time(MPI_COMM_WORLD);
            // tmp[t0] in tmp[t0] = 0
            line_Q_write_vp_rev(0, t0, t0, t0, tmp, eta_tmp, T_tmp);
            //
        }
    }
    for(int t0=1;t0<=min(M-1,N-1);t0++) {
        // tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
        line_Q_read_vp_rev(-1, t0 - 1, 0, t0 - 1, tmp, eta_tmp, T_tmp);
        //
        for(int ilpp=max(lR,0);ilpp<=min(uR,t0-1);ilpp++) {
            tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp];
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
        // tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
        line_Q_write_vp_rev(-1, t0 - 1, 0, t0 - 1, tmp, eta_tmp, T_tmp);
        //
        for(int ilpp=max(lR,t0);ilpp<=min(uR,t0);ilpp++) {
            tmp[t0] = 0;
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
        // tmp[t0] in tmp[t0] = 0
        line_Q_write_vp_rev(0, t0, t0, t0, tmp, eta_tmp, T_tmp);
        //
    }
    if (M >= 1) {
        for(int t0=M;t0<=N-1;t0++) {
            // tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
            line_Q_read_vp_rev(-1, t0 - 1, t0 - M, t0 - 1, tmp, eta_tmp, T_tmp);
            //
            for(int ilpp=max(lR,t0-M);ilpp<=min(uR,t0-1);ilpp++) {
                tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp];
            }
            MPI_Barrier_time(MPI_COMM_WORLD);
            // tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
            line_Q_write_vp_rev(-1, t0 - 1, t0 - M, t0 - 1, tmp, eta_tmp, T_tmp);
            //
        }
    }
    if ((N >= 1) && (N <= M-1)) {
        // tmp[-ilpp+N-1] in tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
        // tmp[(N-1)/2] in tmp[(N-1)/2] = tmp[(N-1)/2] + A[(N-1)/2][(N-1)/2] * x[(N-1)/2];
        line_Q_read_vp_rev(-1, N - 1, 0, N - 1, tmp, eta_tmp, T_tmp);
        //
        for(int ilpp=max(lR,0);ilpp<=min(uR,N-1);ilpp++) {
            if (2*ilpp == N-1) {
                if ((N+1)%2 == 0) {
                    y[(N-1)/2] = 0;
                    tmp[(N-1)/2] = tmp[(N-1)/2] + A[(N-1)/2][(N-1)/2] * x[(N-1)/2];
                }
            }
            if (ilpp <= floord(N-2,2)) {
                y[ilpp] = 0;
            }
            if (ilpp <= floord(N-2,2)) {
                tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
            }
            if (ilpp >= ceild(N,2)) {
                tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
            }
            if (ilpp >= ceild(N,2)) {
                y[ilpp] = 0;
            }
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
        // tmp[-ilpp+N-1] in tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
        // tmp[(N-1)/2] in tmp[(N-1)/2] = tmp[(N-1)/2] + A[(N-1)/2][(N-1)/2] * x[(N-1)/2];
        line_Q_write_vp_rev(-1, N - 1, 0, N - 1, tmp, eta_tmp, T_tmp);
        //
        for(int ilpp=max(lR,N);ilpp<=min(uR,N);ilpp++) {
            tmp[N] = 0;
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
        // tmp[N] in tmp[N] = 0
        line_Q_write_vp_rev(0, N, N, N, tmp, eta_tmp, T_tmp);
        //
    }
    if ((M >= 1) && (N >= M)) {
        for(int ilpp=max(lR,0);ilpp<=min(uR,N-M-1);ilpp++) {
            y[ilpp] = 0;
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
        // tmp[-ilpp+N-1] in tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
        // tmp[(N-1)/2] in tmp[(N-1)/2] = tmp[(N-1)/2] + A[(N-1)/2][(N-1)/2] * x[(N-1)/2];
        line_Q_read_vp_rev(-1, N - 1, N - M, N - 1, tmp, eta_tmp, T_tmp);
        //
        for(int ilpp=max(lR,N-M);ilpp<=min(uR,N-1);ilpp++) {
            if (2*ilpp == N-1) {
                if ((N+1)%2 == 0) {
                    y[(N-1)/2] = 0;
                    tmp[(N-1)/2] = tmp[(N-1)/2] + A[(N-1)/2][(N-1)/2] * x[(N-1)/2];
                }
            }
            if (ilpp <= floord(N-2,2)) {
                y[ilpp] = 0;
            }
            if (ilpp >= ceild(N,2)) {
                tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
            }
            if (ilpp <= floord(N-2,2)) {
                tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
            }
            if (ilpp >= ceild(N,2)) {
                y[ilpp] = 0;
            }
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
        // tmp[-ilpp+N-1] in tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
        // tmp[(N-1)/2] in tmp[(N-1)/2] = tmp[(N-1)/2] + A[(N-1)/2][(N-1)/2] * x[(N-1)/2];
        line_Q_write_vp_rev(-1, N - 1, N - M, N - 1, tmp, eta_tmp, T_tmp);
        //
    }
    if (N >= 1) {
        for(int t0=N+1;t0<=M-1;t0++) {
            // tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
            line_Q_read_vp_rev(-1, t0 - 1, 0, N - 1, tmp, eta_tmp, T_tmp);
            //
            // tmp[t0-N-1] in y[ilpp] = y[ilpp] + A[t0-N-1][ilpp] * tmp[t0-N-1]
            line_Q_read_vp_rev(0, t0 - N - 1, 0, N - 1, tmp, eta_tmp, T_tmp);
            //
            for(int ilpp=max(lR,0);ilpp<=min(uR,N-1);ilpp++) {
                y[ilpp] = y[ilpp] + A[t0-N-1][ilpp] * tmp[t0-N-1];
                tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp];
            }
            MPI_Barrier_time(MPI_COMM_WORLD);
            // tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
            line_Q_write_vp_rev(-1, t0 - 1, 0, N - 1, tmp, eta_tmp, T_tmp);
            //
            for(int ilpp=max(lR,t0);ilpp<=min(uR,t0);ilpp++) {
                tmp[t0] = 0;
            }
            MPI_Barrier_time(MPI_COMM_WORLD);
            // tmp[t0] in tmp[t0] = 0
            line_Q_write_vp_rev(0, t0, t0, t0, tmp, eta_tmp, T_tmp);
            //
        }
    }
    for(int t0=max(M,N+1);t0<=N+M-1;t0++) {
        // tmp[t0-N-1] in y[ilpp] = y[ilpp] + A[t0-N-1][ilpp] * tmp[t0-N-1]
        line_Q_read_vp_rev(0, t0 - N - 1, 0, t0 - M - 1, tmp, eta_tmp, T_tmp);
        //
        for(int ilpp=max(lR,0);ilpp<=min(uR,t0-M-1);ilpp++) {
            y[ilpp] = y[ilpp] + A[t0-N-1][ilpp] * tmp[t0-N-1];
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
        // tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
        line_Q_read_vp_rev(-1, t0 - 1, t0 - M, N - 1, tmp, eta_tmp, T_tmp);
        //
        // tmp[t0-N-1] in y[ilpp] = y[ilpp] + A[t0-N-1][ilpp] * tmp[t0-N-1];
        line_Q_read_vp_rev(0, t0 - N - 1, t0 - M, N - 1, tmp, eta_tmp, T_tmp);
        //
        for(int ilpp=max(lR,t0-M);ilpp<=min(uR,N-1);ilpp++) {
            y[ilpp] = y[ilpp] + A[t0-N-1][ilpp] * tmp[t0-N-1];
            tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp];
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
        // tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
        line_Q_write_vp_rev(-1, t0 - 1, t0 - M, N - 1, tmp, eta_tmp, T_tmp);
        //
    }
    if ((M >= 1) && (N >= 1)) {
        // tmp[M-1] in y[ilpp] = y[ilpp] + A[M-1][ilpp] * tmp[M-1]
        line_Q_read_vp_rev(0, M - 1, 0, N - 1, tmp, eta_tmp, T_tmp);
        //
        for(int ilpp=max(lR,0);ilpp<=min(uR,N-1);ilpp++) {
            y[ilpp] = y[ilpp] + A[M-1][ilpp] * tmp[M-1];
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
    }
    collect_matrix_cols_double(&y, 1, N, y_chunk_size, false);
}

void atax_p_ilp_arrays_mdp(int M, int N, double** A, double* x, double* y, double* tmp, bool use_bcast, bool dist_input) {
    if ((M >= 0) && (N >= 0)) {
        for (int ilpp=max(lR,0);ilpp<=min(uR,N+M-1);ilpp++) {
            if (ilpp <= N-1) {
              y[-ilpp+N-1] = 0;
            }
            if (ilpp <= M-1) {
              tmp[-ilpp+M-1] = 0;
            }
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
    }
    if (M <= -1) {
        for (int ilpp=max(lR,0);ilpp<=min(uR,N-1);ilpp++) {
            y[-ilpp+N-1] = 0;
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
    }
    if (N <= -1) {
        for (int ilpp=max(lR,0);ilpp<=min(uR,M-1);ilpp++) {
            tmp[-ilpp+M-1] = 0;
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
    }
    if (M >= 1) {
        if (dist_input) {
            // x[t2] in tmp[-ilpp+M-1] = tmp[-ilpp+M-1] + A[-ilpp+M-1][t2] * x[t2]
            if (use_bcast) {
                IF_TIME(mpi_spent_time -= rtclock());
                MPI_Bcast(x, N, MPI_DOUBLE, blockdist_proc_rank(M-1), MPI_COMM_WORLD);
                IF_TIME(mpi_spent_time += rtclock());
            }
            else {
                line_Q_read_send(0, clamp(lr, 0, M-1), N-1, clamp(ur, 0, M-1), x, eta_x_mdp, T_x);
                line_Q_read_receive(0, clamp(lR, 0, M-1), N-1, clamp(uR, 0, M-1), x, eta_x_mdp, T_x);
            }
            //
            // A[-ilpp+M-1][t2] in tmp[-ilpp+M-1] = tmp[-ilpp+M-1] + A[-ilpp+M-1][t2] * x[t2]
            for (int ilpp=0;ilpp<=M-1;ilpp++) {
                if (use_bcast) {
                    IF_TIME(mpi_spent_time -= rtclock());
                    for (int r = 0; r < Q; r++) {
                        int actual_chunk_size = blockdist_array_distribute_r(N, r);
                        int chunk_start = blockdist_chunk_start_inv_r(N, A_chunk_size, actual_chunk_size, r);
                        MPI_Bcast(A[-ilpp+M-1] + chunk_start, actual_chunk_size, MPI_DOUBLE, r, MPI_COMM_WORLD);
                    }
                    IF_TIME(mpi_spent_time += rtclock());
                }
                else {
                    line_Q_read_send(blockdist_chunk_start_inv(N, A_chunk_size, A_actual_chunk_size), clamp(lr, 0, M-1),
                                     blockdist_chunk_end_inv(N, A_chunk_size, A_actual_chunk_size) - 1, clamp(ur, 0, M-1),
                                     A[-ilpp+M-1], eta_A_mdp, T_A + (-ilpp+M-1));
                    line_Q_read_receive(blockdist_chunk_start_inv_r(N, A_chunk_size, blockdist_array_distribute_r(N, r), r), clamp(lR, 0, M-1),
                                        blockdist_chunk_end_inv_r(N, A_chunk_size, blockdist_array_distribute_r(N, r), r) - 1, clamp(uR, 0, M-1),
                                        A[-ilpp+M-1], eta_A_mdp, T_A + (-ilpp+M-1));
                }
            }
            //
        }
        for (int ilpp=max(lR, 0);ilpp<=min(uR,M-1);ilpp++) {
            for (int t2=0;t2<=N-1;t2++) {
                tmp[-ilpp+M-1] = tmp[-ilpp+M-1] + A[-ilpp+M-1][t2] * x[t2];
            }
        }
        MPI_Barrier_time(MPI_COMM_WORLD);
    }
    if (N >= 1) {
        // tmp[t2] in y[-ilpp+N-1] = y[-ilpp+N-1] + A[t2][-ilpp+N-1] * tmp[t2];
        if (use_bcast) {
            IF_TIME(mpi_spent_time -= rtclock());
            for (int r = 0; r < Q; r++) {
                int actual_chunk_size = blockdist_array_distribute_r(M, r);
                int chunk_start = blockdist_chunk_start_inv_r(M, tmp_chunk_size, actual_chunk_size, r);
                MPI_Bcast(tmp + chunk_start, actual_chunk_size, MPI_DOUBLE, r, MPI_COMM_WORLD);
            }
            IF_TIME(mpi_spent_time += rtclock());
        }
        else {
            line_Q_read_send(blockdist_chunk_start_inv(M, tmp_chunk_size, tmp_actual_chunk_size), clamp(lr, 0, N-1),
                             blockdist_chunk_end_inv(M, tmp_chunk_size, tmp_actual_chunk_size) - 1, clamp(ur, 0, N-1),
                             tmp, eta_tmp_mdp, T_tmp);
            line_Q_read_receive(blockdist_chunk_start_inv_r(M, tmp_chunk_size, blockdist_array_distribute_r(M, r), r), clamp(lR, 0, N-1),
                                blockdist_chunk_end_inv_r(M, tmp_chunk_size, blockdist_array_distribute_r(M, r), r) - 1, clamp(uR, 0, N-1),
                                tmp, eta_tmp_mdp, T_tmp);
        }
        //
        int lb = max(lR, 0);
        int ub = min(uR, N-1);
        for (int t2=0;t2<=M-1;t2++) {
            // tmp[t2] in y[-ilpp+N-1] = y[-ilpp+N-1] + A[t2][-ilpp+N-1] * tmp[t2];
            //line_Q_read_send(t2, clamp(lr, 0, N-1), t2, clamp(ur, 0, N-1), tmp, eta_tmp_mdp, T_tmp, mpi_requests);
            //line_Q_read_receive(t2, clamp(lR, 0, N-1), t2, clamp(uR, 0, N-1), tmp, eta_tmp_mdp, T_tmp, mpi_statuses);
            //
            for (int ilpp=lb;ilpp<=ub;ilpp++) {
                y[-ilpp+N-1] = y[-ilpp+N-1] + A[t2][-ilpp+N-1] * tmp[t2];
            }
            MPI_Barrier_time(MPI_COMM_WORLD);
        }
    }
    collect_matrix_cols_double(&y, 1, N, y_chunk_size, true);
}

void atax_p_ilp_arrays_mdp_sendrecv_distinp(int M, int N, double** A, double* x, double* y, double* tmp) {
    atax_p_ilp_arrays_mdp(M, N, A, x, y, tmp, false, true);
}

void atax_p_ilp_arrays_mdp_bcast_distinp(int M, int N, double** A, double* x, double* y, double* tmp) {
    atax_p_ilp_arrays_mdp(M, N, A, x, y, tmp, true, true);
}

void atax_p_ilp_arrays_mdp_sendrecv_dupinp(int M, int N, double** A, double* x, double* y, double* tmp) {
    atax_p_ilp_arrays_mdp(M, N, A, x, y, tmp, false, false);
}

void atax_p_ilp_arrays_mdp_bcast_dupinp(int M, int N, double** A, double* x, double* y, double* tmp) {
    atax_p_ilp_arrays_mdp(M, N, A, x, y, tmp, true, false);
}

// pluto mpi routine

double pluto_spent_time = 0;

extern int M;
extern int N;
extern double** A;
extern double* x;
extern double* y;
extern double* tmp;

extern void atax_p_pluto_mpi();

void atax_p_pluto(int _M, int _N, double** _A, double* _x, double* _y, double* _tmp) {
    M = _M;
    N = _N;
    A = _A;
    x = _x;
    y = _y;
    tmp = _tmp;
    atax_p_pluto_mpi();
}

// handwritten mpi routine

void atax_by_hand(int M, int N, double** A, double* x, double* y, double* tmp) {
    for (int i = blockdist_chunk_start(y_chunk_size); i < blockdist_chunk_end(y_chunk_size, y_actual_chunk_size); i++) {
        y[i] = 0;
    }

    int tmp_chunk_size = blockdist_chunk_size(M, Q);
    int tmp_actual_chunk_size = blockdist_actual_chunk_size(tmp_chunk_size, M);

    for (int i = R * tmp_chunk_size; i < R * tmp_chunk_size + tmp_actual_chunk_size; i++) {
        tmp[i] = 0;
        for (int j = 0; j < N; j++)
            tmp[i] = tmp[i] + A[i][j] * x[j];
    }

    IF_TIME(mpi_spent_time -= rtclock());
    for (int r = 0; r < Q; r++) {
        tmp_actual_chunk_size = blockdist_actual_chunk_size_r(tmp_chunk_size, M, r);
        if (tmp_actual_chunk_size > 0) {
            MPI_Bcast(tmp + r * tmp_chunk_size, tmp_actual_chunk_size, MPI_DOUBLE, r, MPI_COMM_WORLD);
        }
    }
    IF_TIME(mpi_spent_time += rtclock());

    for (int i = 0; i < M; i++) {
        for (int j = blockdist_chunk_start(y_chunk_size); j < blockdist_chunk_end(y_chunk_size, y_actual_chunk_size); j++) {
            y[j] = y[j] + A[i][j] * tmp[i];
        }
    }

    MPI_Barrier_time(MPI_COMM_WORLD);

    collect_matrix_cols_double(&y, 1, N, y_chunk_size, false);
}

// sequential routine

void atax_vanilla(int M, int N, double** A, double* x, double* y, double* tmp) {
    if (R == 0) {
        for (int i = 0; i < N; i++)
            y[i] = 0;
        for (int i = 0; i < M; i++) {
            tmp[i] = 0;
            for (int j = 0; j < N; j++)
                tmp[i] = tmp[i] + A[i][j] * x[j];
            for (int j = 0; j < N; j++)
                y[j] = y[j] + A[i][j] * tmp[i];
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);
}

typedef void (*atax_solution)(int M, int N, double** A, double* x, double* y, double* tmp);
const char* method_names[] = {"vanilla",
                              "by_hand",
                              "p_ilp_arrays",
                              "p_ilp_arrays_mdp_sendrecv_distinp",
                              "p_ilp_arrays_mdp_bcast_distinp",
                              "p_ilp_arrays_mdp_sendrecv_dupinp",
                              "p_ilp_arrays_mdp_bcast_dupinp",
                              "p_pluto"};
const atax_solution method_impls[] = {&atax_vanilla,
                                      &atax_by_hand,
                                      &atax_p_ilp_arrays,
                                      &atax_p_ilp_arrays_mdp_sendrecv_distinp,
                                      &atax_p_ilp_arrays_mdp_bcast_distinp,
                                      &atax_p_ilp_arrays_mdp_sendrecv_dupinp,
                                      &atax_p_ilp_arrays_mdp_bcast_dupinp,
                                      &atax_p_pluto};

int main(int argc, char* argv[]) {
    int M = atoi(argv[1]);
    int N = atoi(argv[2]);
    int method = atoi(argv[3]);

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &Q);
    MPI_Comm_rank(MPI_COMM_WORLD, &R);

    const char* pass_count_str = getenv("PASS_COUNT");
    int pass_count = (pass_count_str != NULL) ? atoi(pass_count_str) : 1;

    if (R == 0) {
        printf("M = %d\n", M);
        printf("N = %d\n", N);
        printf("Using %s method...\n", method_names[method]);
        printf("Number of processes is %d\n", Q);
        printf("Pass count is %d\n", pass_count);
        fflush(stdout);
    }
    MPI_Barrier(MPI_COMM_WORLD);

    printf("Process #%d started.\n", R);
    fflush(stdout);
    MPI_Barrier(MPI_COMM_WORLD);

    // arrays distribution

    if (method_impls[method] == &atax_p_ilp_arrays) {
        int eta_tmp_min = 0; int eta_tmp_max = 0;
        int eta_x_min = 0; int eta_x_max = N - 1;
        int eta_y_min = 0; int eta_y_max = N - 1;
        int eta_A_min = 0; int eta_A_max = N - 1;
        int pi_S0_min = 0; int pi_S0_max = N - 1;
        int pi_S1_min = 0; int pi_S1_max = M - 1;
        int pi_S2_min = 0; int pi_S2_max = N - 1;
        int pi_S3_min = 0; int pi_S3_max = N - 1;
        L = min(eta_tmp_min, min(eta_x_min, min(eta_y_min, min(eta_A_min, min(pi_S0_min, min(pi_S1_min, min(pi_S2_min, pi_S3_min)))))));
        U = max(eta_tmp_max, max(eta_x_max, max(eta_y_max, max(eta_A_max, max(pi_S0_max, max(pi_S1_max, max(pi_S2_max, pi_S3_max)))))));
        blockdist_init();

        A_chunk_size = block_size; A_actual_chunk_size = blockdist_array_distribute(N);
        x_chunk_size = block_size; x_actual_chunk_size = blockdist_array_distribute(N);
        y_chunk_size = block_size; y_actual_chunk_size = blockdist_array_distribute(N);
        tmp_chunk_size = M; tmp_actual_chunk_size = (blockdist_proc_rank(0) == R) ? tmp_chunk_size : 0;
    }

    if (method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_distinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_distinp ||
            method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_dupinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_dupinp) {
        int eta_tmp_min = 0; int eta_tmp_max = M - 1;
        int eta_x_min = M - 1; int eta_x_max = M - 1;
        int eta_y_min = 0; int eta_y_max = N - 1;
        int eta_A_min = 0; int eta_A_max = N - 1;
        int pi_S0_min = 0; int pi_S0_max = N - 1;
        int pi_S1_min = 0; int pi_S1_max = M - 1;
        int pi_S2_min = 0; int pi_S2_max = M - 1;
        int pi_S3_min = 0; int pi_S3_max = N - 1;
        L = min(eta_tmp_min, min(eta_x_min, min(eta_y_min, min(eta_A_min, min(pi_S0_min, min(pi_S1_min, min(pi_S2_min, pi_S3_min)))))));
        U = max(eta_tmp_max, max(eta_x_max, max(eta_y_max, max(eta_A_max, max(pi_S0_max, max(pi_S1_max, max(pi_S2_max, pi_S3_max)))))));
        blockdist_init();

        if (method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_distinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_distinp) {
            A_chunk_size = block_size; A_actual_chunk_size = blockdist_array_distribute(N);
            x_chunk_size = N; x_actual_chunk_size = (blockdist_proc_rank(M - 1) == R) ? x_chunk_size : 0;
        }
        else {
            A_chunk_size = N; A_actual_chunk_size = N;
            x_chunk_size = N; x_actual_chunk_size = N;
        }
        y_chunk_size = block_size; y_actual_chunk_size = blockdist_array_distribute(N);
        tmp_chunk_size = block_size; tmp_actual_chunk_size = blockdist_array_distribute(M);
    }

    if (method_impls[method] == &atax_by_hand) {
        y_chunk_size = blockdist_chunk_size(N, Q); y_actual_chunk_size = blockdist_actual_chunk_size(y_chunk_size, N);
    }

    if (method_impls[method] == &atax_vanilla) {
        A_chunk_size = N; A_actual_chunk_size = (R == 0) ? A_chunk_size : 0;
        x_chunk_size = N; x_actual_chunk_size = (R == 0) ? x_chunk_size : 0;
        y_chunk_size = N; y_actual_chunk_size = (R == 0) ? y_chunk_size : 0;
        tmp_chunk_size = M; tmp_actual_chunk_size = (R == 0) ? tmp_chunk_size : 0;
    }

    double** A = allocate_matrix_double(M, N, "A");
    double* x = allocate_vector_double(N, "x");
    double* y = allocate_vector_double(N, "y");
    double* tmp = allocate_vector_double(M, "tmp");

    double* elapsed_time = allocate_vector_double(pass_count, "elapsed_time");
#ifdef TIME
    double* compute_ratio = (R == 0) ? allocate_vector_double(pass_count, "compute_ratio") : NULL;
#endif

    for (int pass = 0; pass < pass_count; pass++) {
        // arrays initializing

        time_t seed_time = time(0);
        MPI_Bcast(&seed_time, sizeof(time_t), MPI_BYTE, 0, MPI_COMM_WORLD);

        if (method_impls[method] == &atax_by_hand || method_impls[method] == &atax_p_pluto ||
                method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_dupinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_dupinp) {
            int seed = seed_time % MODULUS;
            fill_matrix_double(A, M, N, 0, M, 0, N, &seed);
            fill_vector_double(x, N, 0, N, &seed);
        }
        else {
            int seed = (seed_time + R) % MODULUS;
            if (method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_distinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_distinp) {
                fill_matrix_double(A, M, N, 0, M, blockdist_chunk_start_inv(N, A_chunk_size, A_actual_chunk_size), blockdist_chunk_end_inv(N, A_chunk_size, A_actual_chunk_size), &seed);
                fill_vector_double(x, N, 0, x_actual_chunk_size, &seed);
            }
            else {
                fill_matrix_double(A, M, N, 0, M, blockdist_chunk_start(A_chunk_size), blockdist_chunk_end(A_chunk_size, A_actual_chunk_size), &seed);
                fill_vector_double(x, N, blockdist_chunk_start(x_chunk_size), blockdist_chunk_end(x_chunk_size, x_actual_chunk_size), &seed);
            }
        }

        MPI_Barrier(MPI_COMM_WORLD);

        struct timeval start_time;
        struct timeval finish_time;

        if (R == 0) {
            printf("Starting %d pass...\n", pass + 1);
            fflush(stdout);
        }
        MPI_Barrier(MPI_COMM_WORLD);

#ifdef TIME
        mpi_spent_time = 0;
        sync_spent_time = 0;
#endif

        gettimeofday(&start_time, 0);
        (*method_impls[method])(M, N, A, x, y, tmp);
        gettimeofday(&finish_time, 0);

        elapsed_time[pass] = elapsed_msecs(start_time, finish_time);
#ifdef TIME
        if (method_impls[method] == &atax_p_pluto)
            elapsed_time[pass] = pluto_spent_time * 1000.0;
        mpi_spent_time *= 1000.0;
        sync_spent_time *= 1000.0;
        printf("Process rank %1$d: time spent %2$lf%5$s, mpi time spent %3$lf%5$s, sync time spent %4$lf%5$s, compute ratio %6$lf.\n",
               R, elapsed_time[pass], mpi_spent_time, sync_spent_time, " ms",
               (elapsed_time[pass] - mpi_spent_time - sync_spent_time) / elapsed_time[pass]);
        fflush(stdout);
        nanosleep((const struct timespec[]){{0, 100000000L}}, NULL);
        MPI_Barrier(MPI_COMM_WORLD);
        double total_time_spent;
        MPI_Reduce(&elapsed_time[pass], &total_time_spent, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        double total_mpi_time_spent;
        MPI_Reduce(&mpi_spent_time, &total_mpi_time_spent, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        double total_sync_time_spent;
        MPI_Reduce(&sync_spent_time, &total_sync_time_spent, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        if (R == 0)
            compute_ratio[pass] = 100.0 * (total_time_spent - total_mpi_time_spent - total_sync_time_spent) / total_time_spent;
#endif

        if (R == 0) {
            printf("Time spent: %lf milliseconds\n", elapsed_time[pass]);
#ifdef TIME
            printf("Compute ratio: %lf percents\n", compute_ratio[pass]);
#endif
            fflush(stdout);
        }

        //verify
        double max_diff_percents = 0;
        if (method_impls[method] != &atax_vanilla) {
            if (R == 0) {
                printf("Verification method is %s solution.\n", "sequential");
                fflush(stdout);
            }
            if (method_impls[method] == &atax_p_ilp_arrays) {
                collect_matrix_cols_double(A, M, N, A_chunk_size, false);
                collect_matrix_cols_double(&x, 1, N, x_chunk_size, false);
            }
            //if (method_impls[method] == &atax_p_ilp_arrays || method_impls[method] == &atax_by_hand) {
            //    collect_matrix_cols_double(&y, 1, N, y_chunk_size, false);
            //}
            if (method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_distinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_distinp) {
                collect_matrix_cols_double(A, M, N, A_chunk_size, true);
                collect_matrix_cols_double(&x, 1, N, x_chunk_size, true);
            }
            //if (method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_distinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_distinp ||
            //        method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_dupinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_dupinp) {
            //    collect_matrix_cols_double(&y, 1, N, y_chunk_size, true);
            //}
            double* z = allocate_vector_double((R == 0) ? N : 0, "z");
            atax_vanilla(M, N, A, x, z, tmp);
            if (R == 0) {
                max_diff_percents = compare_vectors_double(y, z, N) * 100;
                printf("Maximal diff is %lf percents\n", max_diff_percents);
                fflush(stdout);
                int i = 0;
                while (i < N && y[i] > 0)
                    i++;
                if (i < N) {
                    printf("ERROR: The result is not positive at position %d: %lf.\n", i, y[i]);
                    fflush(stdout);
                    max_diff_percents = 1000000;
                }
            }
            free(z);
        }

        if (R == 0) {
            printf((max_diff_percents <= 0.1) ? "PASSED\n" : "NOT PASSED\n");
            fflush(stdout);
        }
    }

    if (R == 0) {
        print_stats(elapsed_time, pass_count, "Elapsed Time ", " ms");
#ifdef TIME
        print_stats(compute_ratio, pass_count, "Compute Ratio ", " %");
#endif
    }

    free(elapsed_time);
#ifdef TIME
    free(compute_ratio);
#endif

    // free memory
    free_matrix_double(A, M);
    free(x);
    free(y);
    free(tmp);    

    MPI_Finalize();

    return 0;
}

blockdist.h

#ifndef BLOCKDIST_H
#define BLOCKDIST_H

#include <umacros.h>
#include <stdbool.h>
#include <mpi.h>

#ifdef TIME
#include <util.h>
double mpi_spent_time = 0;
double sync_spent_time = 0;
#endif

#define MPI_Barrier_time(comm) \
    IF_TIME(sync_spent_time -= rtclock()); \
    MPI_Barrier(comm); \
    IF_TIME(sync_spent_time += rtclock());

int Q = 0;
int R = 0;
int L = 0;
int U = 0;
int lR = 0;
int uR = 0;
int block_size = 0;
int actual_block_size = 0;

#define blockdist_chunk_size(N, Q) (((N) + (Q) - 1) / (Q))

#define blockdist_actual_chunk_size_r(chunk_size, N, R) max(0, min((chunk_size), (N) - (R) * (chunk_size)))
#define blockdist_actual_chunk_size(chunk_size, N) blockdist_actual_chunk_size_r(chunk_size, N, R)

#define blockdist_array_distribute_r(N, R) blockdist_actual_chunk_size_r(block_size, N, R)
#define blockdist_array_distribute(N) blockdist_array_distribute_r(N, R)

#define blockdist_chunk_start_r(chunk_size, R) ((R) * (chunk_size))
#define blockdist_chunk_start(chunk_size) blockdist_chunk_start_r(chunk_size, R)

#define blockdist_chunk_end_r(chunk_size, actual_chunk_size, R) (blockdist_chunk_start_r(chunk_size, R) + (actual_chunk_size))
#define blockdist_chunk_end(chunk_size, actual_chunk_size) blockdist_chunk_end_r(chunk_size, actual_chunk_size, R)

#define blockdist_chunk_start_inv_r(N, chunk_size, actual_chunk_size, R) ((N) - blockdist_chunk_end_r(chunk_size, actual_chunk_size, R))
#define blockdist_chunk_start_inv(N, chunk_size, actual_chunk_size) blockdist_chunk_start_inv_r(N, chunk_size, actual_chunk_size, R)

#define blockdist_chunk_end_inv_r(N, chunk_size, actual_chunk_size, R) (blockdist_chunk_start_inv_r(N, chunk_size, actual_chunk_size, R) + (actual_chunk_size))
#define blockdist_chunk_end_inv(N, chunk_size, actual_chunk_size) blockdist_chunk_end_inv_r(N, chunk_size, actual_chunk_size, R)

#define triple_choice(test_val, ref_val, eq_val, lt_val, gt_val) (((test_val) == (ref_val)) ? (eq_val) : (((test_val) > (ref_val)) ? (gt_val) : (lt_val)))
#define blockdist_actual_chunk_size_offset_r_int(free_cnt, fst_capacity, chunk_size, N, R) triple_choice(R, free_cnt, fst_capacity, 0, max(0, min((chunk_size), (N) - (fst_capacity) - ((R) - (free_cnt) - 1) * (chunk_size))))

#define blockdist_actual_chunk_size_offset_r(chunk_size, offset, N, R) blockdist_actual_chunk_size_offset_r_int((offset) / (chunk_size), min(N, (chunk_size) - (offset) % (chunk_size)), chunk_size, N, R)
#define blockdist_actual_chunk_size_offset(chunk_size, offset, N) blockdist_actual_chunk_size_offset_r(chunk_size, offset, N, R)

#define blockdist_array_distribute_offset_r(offset, N, R) blockdist_actual_chunk_size_offset_r(block_size, offset, N, R)
#define blockdist_array_distribute_offset(offset, N) blockdist_array_distribute_offset_r(offset, N, R)

#define blockdist_chunk_start_offset_r(chunk_size, offset, R) max(0, (R) * (chunk_size) - (offset))
#define blockdist_chunk_start_offset(chunk_size, offset) blockdist_chunk_start_offset_r(chunk_size, offset, R)

#define blockdist_chunk_end_offset_r(chunk_size, actual_chunk_size, offset, R) (blockdist_chunk_start_offset_r(chunk_size, offset, R) + (actual_chunk_size))
#define blockdist_chunk_end_offset(chunk_size, actual_chunk_size, offset) blockdist_chunk_end_offset_r(chunk_size, actual_chunk_size, offset, R)

#define blockdist_in_range(v) ((v) >= lR && (v) <= uR)
#define blockdist_proc_rank(v) (((v) - L) / block_size)
#define blockdist_nb_vproc (U - L + 1)

// Q is a line - part of a one-dimensional array

#define line_Q_read_send_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r) { \
        int r_actual_chunk_size = blockdist_actual_chunk_size_r(block_size, blockdist_nb_vproc, r); \
        int lr = L + r * block_size; \
        int ur = lr + r_actual_chunk_size - 1; \
        bool nonempty = (econd) && ((last_idx) >= (fst_idx)) && \
                        (blockdist_proc_rank(eta(fst_idx)) == R) && (blockdist_proc_rank(eta(last_idx)) == R) && \
                        ((fst_vp) >= lr && (fst_vp) <= ur) && ((last_vp) >= lr && (last_vp) <= ur); \
        if (nonempty) { \
            MPI_Request request; \
            MPI_Isend((arr) + (fst_idx), (last_idx) - (fst_idx) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &request); \
            MPI_Request_free(&request); \
        } \
    }

#define line_Q_read_send_econd_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
    for (int r = 0; r < Q; r++) { \
        if (r != R) { \
            line_Q_read_send_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
        } \
    }

#define line_Q_read_send_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
    for (int r = R + 1; r < Q; r++) { \
        line_Q_read_send_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
    }

#define line_Q_read_send(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_read_send_econd_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock());

#define line_Q_read_send_fco(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_read_send_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock())

#define line_Q_read_receive_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r) { \
        bool nonempty = (econd) && ((last_idx) >= (fst_idx)) && \
                        (blockdist_proc_rank(eta(fst_idx)) == r) && (blockdist_proc_rank(eta(last_idx)) == r) && \
                        blockdist_in_range(fst_vp) && blockdist_in_range(last_vp); \
        if (nonempty) { \
            MPI_Status status; \
            MPI_Recv((arr) + (fst_idx), (last_idx) - (fst_idx) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &status); \
        } \
    }

#define line_Q_read_receive_econd_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
    for (int r = 0; r < Q; r++) { \
        if (r != R) { \
            line_Q_read_receive_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
        } \
    }

#define line_Q_read_receive_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
    for (int r = 0; r < R; r++) { \
        line_Q_read_receive_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
    }

#define line_Q_read_receive(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_read_receive_econd_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock());

#define line_Q_read_receive_fco(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_read_receive_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock());

#define line_Q_write_send_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r) { \
        bool nonempty = (econd) && ((last_idx) >= (fst_idx)) && \
                        (blockdist_proc_rank(eta(fst_idx)) == r) && (blockdist_proc_rank(eta(last_idx)) == r) && \
                        blockdist_in_range(fst_vp) && blockdist_in_range(last_vp); \
        if (nonempty) { \
            MPI_Request request; \
            MPI_Isend((arr) + (fst_idx), (last_idx) - (fst_idx) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &request); \
            MPI_Request_free(&request); \
        } \
    }

#define line_Q_write_send_econd_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
    for (int r = 0; r < Q; r++) { \
        if (r != R) { \
            line_Q_write_send_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
        } \
    }

#define line_Q_write_send_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
    for (int r = R + 1; r < Q; r++) { \
        line_Q_write_send_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
    }

#define line_Q_write_send(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_write_send_econd_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock());

#define line_Q_write_send_fco(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_write_send_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock());

#define line_Q_write_receive_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r) { \
        int r_actual_chunk_size = blockdist_actual_chunk_size_r(block_size, blockdist_nb_vproc, r); \
        int lr = L + r * block_size; \
        int ur = lr + r_actual_chunk_size - 1; \
        bool nonempty = (econd) && ((last_idx) >= (fst_idx)) && \
                        (blockdist_proc_rank(eta(fst_idx)) == R) && (blockdist_proc_rank(eta(last_idx)) == R) && \
                        ((fst_vp) >= lr && (fst_vp) <= ur) && ((last_vp) >= lr && (last_vp) <= ur); \
        if (nonempty) { \
            MPI_Status status; \
            MPI_Recv((arr) + (fst_idx), (last_idx) - (fst_idx) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &status); \
        } \
    }

#define line_Q_write_receive_econd_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
    for (int r = 0; r < Q; r++) { \
        if (r != R) { \
            line_Q_write_receive_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
        } \
    }

#define line_Q_write_receive_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
    for (int r = 0; r < R; r++) { \
        line_Q_write_receive_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
    }

#define line_Q_write_receive(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_write_receive_econd_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock());

#define line_Q_write_receive_fco(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_write_receive_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock());

#define clamp(v, l, u) min(max(v, l), u)

#define line_Q_read_vp_rev_econd(mul, add, lb, ub, econd, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_read_send_econd_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
                               econd, arr, eta, tag); \
    line_Q_read_receive_econd_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
                                  econd, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define line_Q_read_vp_rev_econd_fco(mul, add, lb, ub, econd, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_read_send_econd_fco_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
                                   econd, arr, eta, tag); \
    line_Q_read_receive_econd_fco_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
                                      econd, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define line_Q_read_vp_rev(mul, add, lb, ub, arr, eta, tag) line_Q_read_vp_rev_econd(mul, add, lb, ub, true, arr, eta, tag)
#define line_Q_read_vp_rev_fco(mul, add, lb, ub, arr, eta, tag) line_Q_read_vp_rev_econd_fco(mul, add, lb, ub, true, arr, eta, tag)

#define line_Q_write_vp_rev_econd(mul, add, lb, ub, econd, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_write_send_econd_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
                                econd, arr, eta, tag); \
    line_Q_write_receive_econd_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
                                   econd, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define line_Q_write_vp_rev_econd_fco(mul, add, lb, ub, econd, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_write_send_econd_fco_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
                                    econd, arr, eta, tag); \
    line_Q_write_receive_econd_fco_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
                                       econd, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define line_Q_write_vp_rev(mul, add, lb, ub, arr, eta, tag) line_Q_write_vp_rev_econd(mul, add, lb, ub, true, arr, eta, tag)
#define line_Q_write_vp_rev_fco(mul, add, lb, ub, arr, eta, tag) line_Q_write_vp_rev_econd_fco(mul, add, lb, ub, true, arr, eta, tag)

#define line_Q_read_vp_econd(mul, add, lb, ub, econd, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_read_send_econd_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
                               econd, arr, eta, tag); \
    line_Q_read_receive_econd_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
                                  econd, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define line_Q_read_vp_econd_fco(mul, add, lb, ub, econd, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_read_send_econd_fco_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
                                   econd, arr, eta, tag); \
    line_Q_read_receive_econd_fco_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
                                      econd, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define line_Q_read_vp(mul, add, lb, ub, arr, eta, tag) line_Q_read_vp_econd(mul, add, lb, ub, true, arr, eta, tag)
#define line_Q_read_vp_fco(mul, add, lb, ub, arr, eta, tag) line_Q_read_vp_econd_fco(mul, add, lb, ub, true, arr, eta, tag)

#define line_Q_write_vp_econd(mul, add, lb, ub, econd, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_write_send_econd_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
                                econd, arr, eta, tag); \
    line_Q_write_receive_econd_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
                                   econd, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define line_Q_write_vp_econd_fco(mul, add, lb, ub, econd, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    line_Q_write_send_econd_fco_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
                                    econd, arr, eta, tag); \
    line_Q_write_receive_econd_fco_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
                                       econd, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define line_Q_write_vp(mul, add, lb, ub, arr, eta, tag) line_Q_write_vp_econd(mul, add, lb, ub, true, arr, eta, tag)
#define line_Q_write_vp_fco(mul, add, lb, ub, arr, eta, tag) line_Q_write_vp_econd_fco(mul, add, lb, ub, true, arr, eta, tag)

// Q is a column - part of a matrix column

#define col_Q_read_send_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r) { \
        int r_actual_chunk_size = blockdist_actual_chunk_size_r(block_size, blockdist_nb_vproc, r); \
        int lr = L + r * block_size; \
        int ur = lr + r_actual_chunk_size - 1; \
        bool nonempty = (econd) && ((last_row) >= (fst_row)) && \
                        (blockdist_proc_rank(eta(fst_row, col_idx)) == R) && (blockdist_proc_rank(eta(last_row, col_idx)) == R) && \
                        ((fst_vp) >= lr && (fst_vp) <= ur) && ((last_vp) >= lr && (last_vp) <= ur); \
        if (nonempty) { \
            for (int i = fst_row; i <= last_row; i++) { \
                buf[i] = arr[i][col_idx]; \
            } \
            MPI_Request request; \
            MPI_Isend((buf) + (fst_row), (last_row) - (fst_row) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &request); \
            MPI_Request_free(&request); \
        } \
    }

#define col_Q_read_send_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
    for (int r = 0; r < Q; r++) { \
        if (r != R) { \
            col_Q_read_send_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
        } \
    }

#define col_Q_read_send_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
    for (int r = R + 1; r < Q; r++) { \
        col_Q_read_send_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
    }

#define col_Q_read_send(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_read_send_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_read_send_fco(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_read_send_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_read_receive_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r) { \
        bool nonempty = (econd) && ((last_row) >= (fst_row)) && \
                        (blockdist_proc_rank(eta(fst_row, col_idx)) == r) && (blockdist_proc_rank(eta(last_row, col_idx)) == r) && \
                        blockdist_in_range(fst_vp) && blockdist_in_range(last_vp); \
        if (nonempty) { \
            MPI_Status status; \
            MPI_Recv((buf) + (fst_row), (last_row) - (fst_row) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &status); \
            for (int i = fst_row; i <= last_row; i++) { \
                arr[i][col_idx] = buf[i]; \
            } \
        } \
    }

#define col_Q_read_receive_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
    for (int r = 0; r < Q; r++) { \
        if (r != R) { \
            col_Q_read_receive_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
        } \
    }

#define col_Q_read_receive_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
    for (int r = 0; r < R; r++) { \
        col_Q_read_receive_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
    }

#define col_Q_read_receive(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_read_receive_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_read_receive_fco(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_read_receive_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_read_vp_econd(mul, add, lb, ub, col_idx, econd, buff, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_read_send_econd_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
                              col_idx, econd, buff, arr, eta, tag); \
    col_Q_read_receive_econd_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
                                 col_idx, econd, buff, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_read_vp_econd_fco(mul, add, lb, ub, col_idx, econd, buf, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_read_send_econd_fco_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
                                  col_idx, econd, buf, arr, eta, tag); \
    col_Q_read_receive_econd_fco_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
                                     col_idx, econd, buf, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_read_vp(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_read_vp_econd(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)
#define col_Q_read_vp_fco(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_read_vp_econd_fco(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)

#define col_Q_read_vp_rev_econd(mul, add, lb, ub, col_idx, econd, buff, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_read_send_econd_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
                              col_idx, econd, buff, arr, eta, tag); \
    col_Q_read_receive_econd_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
                                 col_idx, econd, buff, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_read_vp_rev_econd_fco(mul, add, lb, ub, col_idx, econd, buf, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_read_send_econd_fco_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
                                  col_idx, econd, buff, arr, eta, tag); \
    col_Q_read_receive_econd_fco_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
                                     col_idx, econd, buff, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_read_vp_rev(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_read_vp_rev_econd(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)
#define col_Q_read_vp_rev_fco(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_read_vp_rev_econd_fco(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)

#define col_Q_write_send_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r) { \
        bool nonempty = (econd) && ((last_row) >= (fst_row)) && \
                        (blockdist_proc_rank(eta(fst_row, col_idx)) == r) && (blockdist_proc_rank(eta(last_row, col_idx)) == r) && \
                        blockdist_in_range(fst_vp) && blockdist_in_range(last_vp); \
        if (nonempty) { \
            for (int i = fst_row; i <= last_row; i++) { \
                buf[i] = arr[i][col_idx]; \
            } \
            MPI_Request request; \
            MPI_Isend((buf) + (fst_row), (last_row) - (fst_row) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &request); \
            MPI_Request_free(&request); \
        } \
    }

#define col_Q_write_send_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
    for (int r = 0; r < Q; r++) { \
        if (r != R) { \
            col_Q_write_send_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
        } \
    }

#define col_Q_write_send_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
    for (int r = R + 1; r < Q; r++) { \
        col_Q_write_send_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
    }

#define col_Q_write_send(fst_row, fst_vp, last_row, last_vp, col_idx, buf, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_write_send_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, true, buf, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_write_send_fco(fst_row, fst_vp, last_row, last_vp, col_idx, buf, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_write_send_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, true, buf, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_write_receive_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r) { \
        int r_actual_chunk_size = blockdist_actual_chunk_size_r(block_size, blockdist_nb_vproc, r); \
        int lr = L + r * block_size; \
        int ur = lr + r_actual_chunk_size - 1; \
        bool nonempty = (econd) && ((last_row) >= (fst_row)) && \
                        (blockdist_proc_rank(eta(fst_row, col_idx)) == R) && (blockdist_proc_rank(eta(last_row, col_idx)) == R) && \
                        ((fst_vp) >= lr && (fst_vp) <= ur) && ((last_vp) >= lr && (last_vp) <= ur); \
        if (nonempty) { \
            MPI_Status status; \
            MPI_Recv((buf) + (fst_row), (last_row) - (fst_row) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &status); \
            for (int i = fst_row; i <= last_row; i++) { \
                arr[i][col_idx] = buf[i]; \
            } \
        } \
    }

#define col_Q_write_receive_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
    for (int r = 0; r < Q; r++) { \
        if (r != R) { \
            col_Q_write_receive_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
        } \
    }

#define col_Q_write_receive_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
    for (int r = 0; r < R; r++) { \
        col_Q_write_receive_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
    }

#define col_Q_write_receive(fst_row, fst_vp, last_row, last_vp, col_idx, buf, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_write_receive_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, true, buf, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_write_receive_fco(fst_row, fst_vp, last_row, last_vp, col_idx, buf, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_write_receive_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, true, buf, arr, eta, tag); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_write_vp_econd(mul, add, lb, ub, col_idx, econd, buf, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_write_send_econd_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
                               col_idx, econd, buf, arr, eta, tag); \
    col_Q_write_receive_econd_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
                                  col_idx, econd, buf, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_write_vp_econd_fco(mul, add, lb, ub, col_idx, econd, buf, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_write_send_econd_fco_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
                                   col_idx, econd, buf, arr, eta, tag); \
    col_Q_write_receive_econd_fco_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
                                      col_idx, econd, buf, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_write_vp(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_write_vp_econd(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)
#define col_Q_write_vp_fco(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_write_vp_econd_fco(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)

#define col_Q_write_vp_rev_econd(mul, add, lb, ub, col_idx, econd, buf, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_write_send_econd_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
                               col_idx, econd, buf, arr, eta, tag); \
    col_Q_write_receive_econd_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
                                  col_idx, econd, buf, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_write_vp_rev_econd_fco(mul, add, lb, ub, col_idx, econd, buf, arr, eta, tag) \
    IF_TIME(mpi_spent_time -= rtclock()); \
    col_Q_write_send_econd_fco_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
                                   col_idx, econd, buf, arr, eta, tag); \
    col_Q_write_receive_econd_fco_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
                                      col_idx, econd, buf, arr, eta, tag); \
    MPI_Barrier(MPI_COMM_WORLD); \
    IF_TIME(mpi_spent_time += rtclock());

#define col_Q_write_vp_rev(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_write_vp_rev_econd(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)
#define col_Q_write_vp_rev_fco(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_write_vp_rev_econd_fco(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)

// data layout utilities

void blockdist_init() {
    block_size = blockdist_chunk_size(blockdist_nb_vproc, Q);
    actual_block_size = blockdist_actual_chunk_size(block_size, blockdist_nb_vproc);
    lR = L + R * block_size;
    uR = lR + actual_block_size - 1;
}

void collect_matrix_rows_double(double** m, int rows, int cols, int chunk_size) {
    IF_TIME(mpi_spent_time -= rtclock());
    if (R == 0) {
        for (int r = 1; r < Q; r++) {
            int actual_chunk_size_r = blockdist_actual_chunk_size_r(chunk_size, rows, r);
            for (int i = blockdist_chunk_start_r(chunk_size, r); i < blockdist_chunk_end_r(chunk_size, actual_chunk_size_r, r); i++) {
                MPI_Status status;
                MPI_Recv(m[i], cols, MPI_DOUBLE, r, i, MPI_COMM_WORLD, &status);
            }
        }
    }
    else {
        int actual_chunk_size = blockdist_actual_chunk_size(chunk_size, rows);
        for (int i = blockdist_chunk_start(chunk_size); i < blockdist_chunk_end(chunk_size, actual_chunk_size); i++) {
            MPI_Send(m[i], cols, MPI_DOUBLE, 0, i, MPI_COMM_WORLD);
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);
    IF_TIME(mpi_spent_time += rtclock());
}

void collect_matrix_cols_double(double** m, int rows, int cols, int chunk_size, bool inv) {
    IF_TIME(mpi_spent_time -= rtclock());
    if (R == 0) {
        for (int i = 0; i < rows; i++) {
            for (int r = 1; r < Q; r++) {
                int receive_chunk_size = blockdist_actual_chunk_size_r(chunk_size, cols, r);
                if (receive_chunk_size > 0) {
                    MPI_Status status;
                    int offset = (inv) ? blockdist_chunk_start_inv_r(cols, chunk_size, receive_chunk_size, r) :
                                         blockdist_chunk_start_r(chunk_size, r);
                    MPI_Recv(m[i] + offset, receive_chunk_size, MPI_DOUBLE, r, i, MPI_COMM_WORLD, &status);
                }
            }
        }
    }
    else {
        int send_chunk_size = blockdist_actual_chunk_size(chunk_size, cols);
        if (send_chunk_size > 0) {
            for (int i = 0; i < rows; i++) {
                int offset = (inv) ? blockdist_chunk_start_inv(cols, chunk_size, send_chunk_size) :
                                     blockdist_chunk_start(chunk_size);
                MPI_Send(m[i] + offset, send_chunk_size, MPI_DOUBLE, 0, i, MPI_COMM_WORLD);
            }
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);
    IF_TIME(mpi_spent_time += rtclock());
}

void collect_matrix_cols_double_offset(double** m, int rows, int cols, int chunk_size, int offset) {
    IF_TIME(mpi_spent_time -= rtclock());
    if (R == 0) {
        for (int i = 0; i < rows; i++) {
            for (int r = 1; r < Q; r++) {
                int receive_chunk_size = blockdist_actual_chunk_size_offset_r(chunk_size, offset, cols, r);
                if (receive_chunk_size > 0) {
                    MPI_Status status;
                    MPI_Recv(m[i] + blockdist_chunk_start_offset_r(chunk_size, offset, r), receive_chunk_size, MPI_DOUBLE, r, i, MPI_COMM_WORLD, &status);
                }
            }
        }
    }
    else {
        int send_chunk_size = blockdist_actual_chunk_size_offset(chunk_size, offset, cols);
        if (send_chunk_size > 0) {
            for (int i = 0; i < rows; i++) {
                MPI_Send(m[i] + blockdist_chunk_start_offset(chunk_size, offset), send_chunk_size, MPI_DOUBLE, 0, i, MPI_COMM_WORLD);
            }
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);
    IF_TIME(mpi_spent_time += rtclock());
}

#endif // BLOCKDIST_H