atax_p_mpi.c
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <stdbool.h>
#include <omp.h>
#include <mpi.h>
#include <lcg.h>
#include <util.h>
#include <umacros.h>
#include <blockdist.h>
// arrays placement
#define T_base 1000000
#define eta_tmp(i0) 0
#define eta_tmp_mdp(i0) (-(i0)+M-1)
#define T_tmp (1 * T_base)
int tmp_chunk_size;
int tmp_actual_chunk_size;
#define eta_x(i0) (i0)
#define eta_x_mdp(i0) (M-1)
#define T_x (2 * T_base)
int x_chunk_size;
int x_actual_chunk_size;
#define eta_y(i0) (i0)
#define eta_y_mdp(i0) (-(i0)+N-1)
#define T_y (3 * T_base)
int y_chunk_size;
int y_actual_chunk_size;
#define eta_A(i1) (i1)
#define eta_A_mdp(i1) (-(i1)+N-1)
#define T_A (4 * T_base)
int A_chunk_size;
int A_actual_chunk_size;
// mpi routine
void atax_p_ilp_arrays(int M, int N, double** A, double* x, double* y, double* tmp) {
if ((M <= 0) && (N >= 1)) {
for(int ilpp=max(lR,0);ilpp<=min(uR,N-1);ilpp++) {
y[ilpp] = 0;
}
MPI_Barrier_time(MPI_COMM_WORLD);
}
if ((M >= 1) && (N == 0)) {
for(int ilpp=max(lR,0);ilpp<=min(uR,0);ilpp++) {
tmp[0] = 0;
}
MPI_Barrier_time(MPI_COMM_WORLD);
}
if ((M >= 1) && (N >= 1)) {
for(int ilpp=max(lR,0);ilpp<=min(uR,0);ilpp++) {
tmp[0] = 0;
}
MPI_Barrier_time(MPI_COMM_WORLD);
}
if (N <= 0) {
for(int t0=max(0,N+1);t0<=M-1;t0++) {
for(int ilpp=max(lR,t0);ilpp<=min(uR,t0);ilpp++) {
tmp[t0] = 0;
}
MPI_Barrier_time(MPI_COMM_WORLD);
// tmp[t0] in tmp[t0] = 0
line_Q_write_vp_rev(0, t0, t0, t0, tmp, eta_tmp, T_tmp);
//
}
}
for(int t0=1;t0<=min(M-1,N-1);t0++) {
// tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
line_Q_read_vp_rev(-1, t0 - 1, 0, t0 - 1, tmp, eta_tmp, T_tmp);
//
for(int ilpp=max(lR,0);ilpp<=min(uR,t0-1);ilpp++) {
tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp];
}
MPI_Barrier_time(MPI_COMM_WORLD);
// tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
line_Q_write_vp_rev(-1, t0 - 1, 0, t0 - 1, tmp, eta_tmp, T_tmp);
//
for(int ilpp=max(lR,t0);ilpp<=min(uR,t0);ilpp++) {
tmp[t0] = 0;
}
MPI_Barrier_time(MPI_COMM_WORLD);
// tmp[t0] in tmp[t0] = 0
line_Q_write_vp_rev(0, t0, t0, t0, tmp, eta_tmp, T_tmp);
//
}
if (M >= 1) {
for(int t0=M;t0<=N-1;t0++) {
// tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
line_Q_read_vp_rev(-1, t0 - 1, t0 - M, t0 - 1, tmp, eta_tmp, T_tmp);
//
for(int ilpp=max(lR,t0-M);ilpp<=min(uR,t0-1);ilpp++) {
tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp];
}
MPI_Barrier_time(MPI_COMM_WORLD);
// tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
line_Q_write_vp_rev(-1, t0 - 1, t0 - M, t0 - 1, tmp, eta_tmp, T_tmp);
//
}
}
if ((N >= 1) && (N <= M-1)) {
// tmp[-ilpp+N-1] in tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
// tmp[(N-1)/2] in tmp[(N-1)/2] = tmp[(N-1)/2] + A[(N-1)/2][(N-1)/2] * x[(N-1)/2];
line_Q_read_vp_rev(-1, N - 1, 0, N - 1, tmp, eta_tmp, T_tmp);
//
for(int ilpp=max(lR,0);ilpp<=min(uR,N-1);ilpp++) {
if (2*ilpp == N-1) {
if ((N+1)%2 == 0) {
y[(N-1)/2] = 0;
tmp[(N-1)/2] = tmp[(N-1)/2] + A[(N-1)/2][(N-1)/2] * x[(N-1)/2];
}
}
if (ilpp <= floord(N-2,2)) {
y[ilpp] = 0;
}
if (ilpp <= floord(N-2,2)) {
tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
}
if (ilpp >= ceild(N,2)) {
tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
}
if (ilpp >= ceild(N,2)) {
y[ilpp] = 0;
}
}
MPI_Barrier_time(MPI_COMM_WORLD);
// tmp[-ilpp+N-1] in tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
// tmp[(N-1)/2] in tmp[(N-1)/2] = tmp[(N-1)/2] + A[(N-1)/2][(N-1)/2] * x[(N-1)/2];
line_Q_write_vp_rev(-1, N - 1, 0, N - 1, tmp, eta_tmp, T_tmp);
//
for(int ilpp=max(lR,N);ilpp<=min(uR,N);ilpp++) {
tmp[N] = 0;
}
MPI_Barrier_time(MPI_COMM_WORLD);
// tmp[N] in tmp[N] = 0
line_Q_write_vp_rev(0, N, N, N, tmp, eta_tmp, T_tmp);
//
}
if ((M >= 1) && (N >= M)) {
for(int ilpp=max(lR,0);ilpp<=min(uR,N-M-1);ilpp++) {
y[ilpp] = 0;
}
MPI_Barrier_time(MPI_COMM_WORLD);
// tmp[-ilpp+N-1] in tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
// tmp[(N-1)/2] in tmp[(N-1)/2] = tmp[(N-1)/2] + A[(N-1)/2][(N-1)/2] * x[(N-1)/2];
line_Q_read_vp_rev(-1, N - 1, N - M, N - 1, tmp, eta_tmp, T_tmp);
//
for(int ilpp=max(lR,N-M);ilpp<=min(uR,N-1);ilpp++) {
if (2*ilpp == N-1) {
if ((N+1)%2 == 0) {
y[(N-1)/2] = 0;
tmp[(N-1)/2] = tmp[(N-1)/2] + A[(N-1)/2][(N-1)/2] * x[(N-1)/2];
}
}
if (ilpp <= floord(N-2,2)) {
y[ilpp] = 0;
}
if (ilpp >= ceild(N,2)) {
tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
}
if (ilpp <= floord(N-2,2)) {
tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
}
if (ilpp >= ceild(N,2)) {
y[ilpp] = 0;
}
}
MPI_Barrier_time(MPI_COMM_WORLD);
// tmp[-ilpp+N-1] in tmp[-ilpp+N-1] = tmp[-ilpp+N-1] + A[-ilpp+N-1][ilpp] * x[ilpp];
// tmp[(N-1)/2] in tmp[(N-1)/2] = tmp[(N-1)/2] + A[(N-1)/2][(N-1)/2] * x[(N-1)/2];
line_Q_write_vp_rev(-1, N - 1, N - M, N - 1, tmp, eta_tmp, T_tmp);
//
}
if (N >= 1) {
for(int t0=N+1;t0<=M-1;t0++) {
// tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
line_Q_read_vp_rev(-1, t0 - 1, 0, N - 1, tmp, eta_tmp, T_tmp);
//
// tmp[t0-N-1] in y[ilpp] = y[ilpp] + A[t0-N-1][ilpp] * tmp[t0-N-1]
line_Q_read_vp_rev(0, t0 - N - 1, 0, N - 1, tmp, eta_tmp, T_tmp);
//
for(int ilpp=max(lR,0);ilpp<=min(uR,N-1);ilpp++) {
y[ilpp] = y[ilpp] + A[t0-N-1][ilpp] * tmp[t0-N-1];
tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp];
}
MPI_Barrier_time(MPI_COMM_WORLD);
// tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
line_Q_write_vp_rev(-1, t0 - 1, 0, N - 1, tmp, eta_tmp, T_tmp);
//
for(int ilpp=max(lR,t0);ilpp<=min(uR,t0);ilpp++) {
tmp[t0] = 0;
}
MPI_Barrier_time(MPI_COMM_WORLD);
// tmp[t0] in tmp[t0] = 0
line_Q_write_vp_rev(0, t0, t0, t0, tmp, eta_tmp, T_tmp);
//
}
}
for(int t0=max(M,N+1);t0<=N+M-1;t0++) {
// tmp[t0-N-1] in y[ilpp] = y[ilpp] + A[t0-N-1][ilpp] * tmp[t0-N-1]
line_Q_read_vp_rev(0, t0 - N - 1, 0, t0 - M - 1, tmp, eta_tmp, T_tmp);
//
for(int ilpp=max(lR,0);ilpp<=min(uR,t0-M-1);ilpp++) {
y[ilpp] = y[ilpp] + A[t0-N-1][ilpp] * tmp[t0-N-1];
}
MPI_Barrier_time(MPI_COMM_WORLD);
// tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
line_Q_read_vp_rev(-1, t0 - 1, t0 - M, N - 1, tmp, eta_tmp, T_tmp);
//
// tmp[t0-N-1] in y[ilpp] = y[ilpp] + A[t0-N-1][ilpp] * tmp[t0-N-1];
line_Q_read_vp_rev(0, t0 - N - 1, t0 - M, N - 1, tmp, eta_tmp, T_tmp);
//
for(int ilpp=max(lR,t0-M);ilpp<=min(uR,N-1);ilpp++) {
y[ilpp] = y[ilpp] + A[t0-N-1][ilpp] * tmp[t0-N-1];
tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp];
}
MPI_Barrier_time(MPI_COMM_WORLD);
// tmp[t0-ilpp-1] in tmp[t0-ilpp-1] = tmp[t0-ilpp-1] + A[t0-ilpp-1][ilpp] * x[ilpp]
line_Q_write_vp_rev(-1, t0 - 1, t0 - M, N - 1, tmp, eta_tmp, T_tmp);
//
}
if ((M >= 1) && (N >= 1)) {
// tmp[M-1] in y[ilpp] = y[ilpp] + A[M-1][ilpp] * tmp[M-1]
line_Q_read_vp_rev(0, M - 1, 0, N - 1, tmp, eta_tmp, T_tmp);
//
for(int ilpp=max(lR,0);ilpp<=min(uR,N-1);ilpp++) {
y[ilpp] = y[ilpp] + A[M-1][ilpp] * tmp[M-1];
}
MPI_Barrier_time(MPI_COMM_WORLD);
}
collect_matrix_cols_double(&y, 1, N, y_chunk_size, false);
}
void atax_p_ilp_arrays_mdp(int M, int N, double** A, double* x, double* y, double* tmp, bool use_bcast, bool dist_input) {
if ((M >= 0) && (N >= 0)) {
for (int ilpp=max(lR,0);ilpp<=min(uR,N+M-1);ilpp++) {
if (ilpp <= N-1) {
y[-ilpp+N-1] = 0;
}
if (ilpp <= M-1) {
tmp[-ilpp+M-1] = 0;
}
}
MPI_Barrier_time(MPI_COMM_WORLD);
}
if (M <= -1) {
for (int ilpp=max(lR,0);ilpp<=min(uR,N-1);ilpp++) {
y[-ilpp+N-1] = 0;
}
MPI_Barrier_time(MPI_COMM_WORLD);
}
if (N <= -1) {
for (int ilpp=max(lR,0);ilpp<=min(uR,M-1);ilpp++) {
tmp[-ilpp+M-1] = 0;
}
MPI_Barrier_time(MPI_COMM_WORLD);
}
if (M >= 1) {
if (dist_input) {
// x[t2] in tmp[-ilpp+M-1] = tmp[-ilpp+M-1] + A[-ilpp+M-1][t2] * x[t2]
if (use_bcast) {
IF_TIME(mpi_spent_time -= rtclock());
MPI_Bcast(x, N, MPI_DOUBLE, blockdist_proc_rank(M-1), MPI_COMM_WORLD);
IF_TIME(mpi_spent_time += rtclock());
}
else {
line_Q_read_send(0, clamp(lr, 0, M-1), N-1, clamp(ur, 0, M-1), x, eta_x_mdp, T_x);
line_Q_read_receive(0, clamp(lR, 0, M-1), N-1, clamp(uR, 0, M-1), x, eta_x_mdp, T_x);
}
//
// A[-ilpp+M-1][t2] in tmp[-ilpp+M-1] = tmp[-ilpp+M-1] + A[-ilpp+M-1][t2] * x[t2]
for (int ilpp=0;ilpp<=M-1;ilpp++) {
if (use_bcast) {
IF_TIME(mpi_spent_time -= rtclock());
for (int r = 0; r < Q; r++) {
int actual_chunk_size = blockdist_array_distribute_r(N, r);
int chunk_start = blockdist_chunk_start_inv_r(N, A_chunk_size, actual_chunk_size, r);
MPI_Bcast(A[-ilpp+M-1] + chunk_start, actual_chunk_size, MPI_DOUBLE, r, MPI_COMM_WORLD);
}
IF_TIME(mpi_spent_time += rtclock());
}
else {
line_Q_read_send(blockdist_chunk_start_inv(N, A_chunk_size, A_actual_chunk_size), clamp(lr, 0, M-1),
blockdist_chunk_end_inv(N, A_chunk_size, A_actual_chunk_size) - 1, clamp(ur, 0, M-1),
A[-ilpp+M-1], eta_A_mdp, T_A + (-ilpp+M-1));
line_Q_read_receive(blockdist_chunk_start_inv_r(N, A_chunk_size, blockdist_array_distribute_r(N, r), r), clamp(lR, 0, M-1),
blockdist_chunk_end_inv_r(N, A_chunk_size, blockdist_array_distribute_r(N, r), r) - 1, clamp(uR, 0, M-1),
A[-ilpp+M-1], eta_A_mdp, T_A + (-ilpp+M-1));
}
}
//
}
for (int ilpp=max(lR, 0);ilpp<=min(uR,M-1);ilpp++) {
for (int t2=0;t2<=N-1;t2++) {
tmp[-ilpp+M-1] = tmp[-ilpp+M-1] + A[-ilpp+M-1][t2] * x[t2];
}
}
MPI_Barrier_time(MPI_COMM_WORLD);
}
if (N >= 1) {
// tmp[t2] in y[-ilpp+N-1] = y[-ilpp+N-1] + A[t2][-ilpp+N-1] * tmp[t2];
if (use_bcast) {
IF_TIME(mpi_spent_time -= rtclock());
for (int r = 0; r < Q; r++) {
int actual_chunk_size = blockdist_array_distribute_r(M, r);
int chunk_start = blockdist_chunk_start_inv_r(M, tmp_chunk_size, actual_chunk_size, r);
MPI_Bcast(tmp + chunk_start, actual_chunk_size, MPI_DOUBLE, r, MPI_COMM_WORLD);
}
IF_TIME(mpi_spent_time += rtclock());
}
else {
line_Q_read_send(blockdist_chunk_start_inv(M, tmp_chunk_size, tmp_actual_chunk_size), clamp(lr, 0, N-1),
blockdist_chunk_end_inv(M, tmp_chunk_size, tmp_actual_chunk_size) - 1, clamp(ur, 0, N-1),
tmp, eta_tmp_mdp, T_tmp);
line_Q_read_receive(blockdist_chunk_start_inv_r(M, tmp_chunk_size, blockdist_array_distribute_r(M, r), r), clamp(lR, 0, N-1),
blockdist_chunk_end_inv_r(M, tmp_chunk_size, blockdist_array_distribute_r(M, r), r) - 1, clamp(uR, 0, N-1),
tmp, eta_tmp_mdp, T_tmp);
}
//
int lb = max(lR, 0);
int ub = min(uR, N-1);
for (int t2=0;t2<=M-1;t2++) {
// tmp[t2] in y[-ilpp+N-1] = y[-ilpp+N-1] + A[t2][-ilpp+N-1] * tmp[t2];
//line_Q_read_send(t2, clamp(lr, 0, N-1), t2, clamp(ur, 0, N-1), tmp, eta_tmp_mdp, T_tmp, mpi_requests);
//line_Q_read_receive(t2, clamp(lR, 0, N-1), t2, clamp(uR, 0, N-1), tmp, eta_tmp_mdp, T_tmp, mpi_statuses);
//
for (int ilpp=lb;ilpp<=ub;ilpp++) {
y[-ilpp+N-1] = y[-ilpp+N-1] + A[t2][-ilpp+N-1] * tmp[t2];
}
MPI_Barrier_time(MPI_COMM_WORLD);
}
}
collect_matrix_cols_double(&y, 1, N, y_chunk_size, true);
}
void atax_p_ilp_arrays_mdp_sendrecv_distinp(int M, int N, double** A, double* x, double* y, double* tmp) {
atax_p_ilp_arrays_mdp(M, N, A, x, y, tmp, false, true);
}
void atax_p_ilp_arrays_mdp_bcast_distinp(int M, int N, double** A, double* x, double* y, double* tmp) {
atax_p_ilp_arrays_mdp(M, N, A, x, y, tmp, true, true);
}
void atax_p_ilp_arrays_mdp_sendrecv_dupinp(int M, int N, double** A, double* x, double* y, double* tmp) {
atax_p_ilp_arrays_mdp(M, N, A, x, y, tmp, false, false);
}
void atax_p_ilp_arrays_mdp_bcast_dupinp(int M, int N, double** A, double* x, double* y, double* tmp) {
atax_p_ilp_arrays_mdp(M, N, A, x, y, tmp, true, false);
}
// pluto mpi routine
double pluto_spent_time = 0;
extern int M;
extern int N;
extern double** A;
extern double* x;
extern double* y;
extern double* tmp;
extern void atax_p_pluto_mpi();
void atax_p_pluto(int _M, int _N, double** _A, double* _x, double* _y, double* _tmp) {
M = _M;
N = _N;
A = _A;
x = _x;
y = _y;
tmp = _tmp;
atax_p_pluto_mpi();
}
// handwritten mpi routine
void atax_by_hand(int M, int N, double** A, double* x, double* y, double* tmp) {
for (int i = blockdist_chunk_start(y_chunk_size); i < blockdist_chunk_end(y_chunk_size, y_actual_chunk_size); i++) {
y[i] = 0;
}
int tmp_chunk_size = blockdist_chunk_size(M, Q);
int tmp_actual_chunk_size = blockdist_actual_chunk_size(tmp_chunk_size, M);
for (int i = R * tmp_chunk_size; i < R * tmp_chunk_size + tmp_actual_chunk_size; i++) {
tmp[i] = 0;
for (int j = 0; j < N; j++)
tmp[i] = tmp[i] + A[i][j] * x[j];
}
IF_TIME(mpi_spent_time -= rtclock());
for (int r = 0; r < Q; r++) {
tmp_actual_chunk_size = blockdist_actual_chunk_size_r(tmp_chunk_size, M, r);
if (tmp_actual_chunk_size > 0) {
MPI_Bcast(tmp + r * tmp_chunk_size, tmp_actual_chunk_size, MPI_DOUBLE, r, MPI_COMM_WORLD);
}
}
IF_TIME(mpi_spent_time += rtclock());
for (int i = 0; i < M; i++) {
for (int j = blockdist_chunk_start(y_chunk_size); j < blockdist_chunk_end(y_chunk_size, y_actual_chunk_size); j++) {
y[j] = y[j] + A[i][j] * tmp[i];
}
}
MPI_Barrier_time(MPI_COMM_WORLD);
collect_matrix_cols_double(&y, 1, N, y_chunk_size, false);
}
// sequential routine
void atax_vanilla(int M, int N, double** A, double* x, double* y, double* tmp) {
if (R == 0) {
for (int i = 0; i < N; i++)
y[i] = 0;
for (int i = 0; i < M; i++) {
tmp[i] = 0;
for (int j = 0; j < N; j++)
tmp[i] = tmp[i] + A[i][j] * x[j];
for (int j = 0; j < N; j++)
y[j] = y[j] + A[i][j] * tmp[i];
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
typedef void (*atax_solution)(int M, int N, double** A, double* x, double* y, double* tmp);
const char* method_names[] = {"vanilla",
"by_hand",
"p_ilp_arrays",
"p_ilp_arrays_mdp_sendrecv_distinp",
"p_ilp_arrays_mdp_bcast_distinp",
"p_ilp_arrays_mdp_sendrecv_dupinp",
"p_ilp_arrays_mdp_bcast_dupinp",
"p_pluto"};
const atax_solution method_impls[] = {&atax_vanilla,
&atax_by_hand,
&atax_p_ilp_arrays,
&atax_p_ilp_arrays_mdp_sendrecv_distinp,
&atax_p_ilp_arrays_mdp_bcast_distinp,
&atax_p_ilp_arrays_mdp_sendrecv_dupinp,
&atax_p_ilp_arrays_mdp_bcast_dupinp,
&atax_p_pluto};
int main(int argc, char* argv[]) {
int M = atoi(argv[1]);
int N = atoi(argv[2]);
int method = atoi(argv[3]);
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &Q);
MPI_Comm_rank(MPI_COMM_WORLD, &R);
const char* pass_count_str = getenv("PASS_COUNT");
int pass_count = (pass_count_str != NULL) ? atoi(pass_count_str) : 1;
if (R == 0) {
printf("M = %d\n", M);
printf("N = %d\n", N);
printf("Using %s method...\n", method_names[method]);
printf("Number of processes is %d\n", Q);
printf("Pass count is %d\n", pass_count);
fflush(stdout);
}
MPI_Barrier(MPI_COMM_WORLD);
printf("Process #%d started.\n", R);
fflush(stdout);
MPI_Barrier(MPI_COMM_WORLD);
// arrays distribution
if (method_impls[method] == &atax_p_ilp_arrays) {
int eta_tmp_min = 0; int eta_tmp_max = 0;
int eta_x_min = 0; int eta_x_max = N - 1;
int eta_y_min = 0; int eta_y_max = N - 1;
int eta_A_min = 0; int eta_A_max = N - 1;
int pi_S0_min = 0; int pi_S0_max = N - 1;
int pi_S1_min = 0; int pi_S1_max = M - 1;
int pi_S2_min = 0; int pi_S2_max = N - 1;
int pi_S3_min = 0; int pi_S3_max = N - 1;
L = min(eta_tmp_min, min(eta_x_min, min(eta_y_min, min(eta_A_min, min(pi_S0_min, min(pi_S1_min, min(pi_S2_min, pi_S3_min)))))));
U = max(eta_tmp_max, max(eta_x_max, max(eta_y_max, max(eta_A_max, max(pi_S0_max, max(pi_S1_max, max(pi_S2_max, pi_S3_max)))))));
blockdist_init();
A_chunk_size = block_size; A_actual_chunk_size = blockdist_array_distribute(N);
x_chunk_size = block_size; x_actual_chunk_size = blockdist_array_distribute(N);
y_chunk_size = block_size; y_actual_chunk_size = blockdist_array_distribute(N);
tmp_chunk_size = M; tmp_actual_chunk_size = (blockdist_proc_rank(0) == R) ? tmp_chunk_size : 0;
}
if (method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_distinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_distinp ||
method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_dupinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_dupinp) {
int eta_tmp_min = 0; int eta_tmp_max = M - 1;
int eta_x_min = M - 1; int eta_x_max = M - 1;
int eta_y_min = 0; int eta_y_max = N - 1;
int eta_A_min = 0; int eta_A_max = N - 1;
int pi_S0_min = 0; int pi_S0_max = N - 1;
int pi_S1_min = 0; int pi_S1_max = M - 1;
int pi_S2_min = 0; int pi_S2_max = M - 1;
int pi_S3_min = 0; int pi_S3_max = N - 1;
L = min(eta_tmp_min, min(eta_x_min, min(eta_y_min, min(eta_A_min, min(pi_S0_min, min(pi_S1_min, min(pi_S2_min, pi_S3_min)))))));
U = max(eta_tmp_max, max(eta_x_max, max(eta_y_max, max(eta_A_max, max(pi_S0_max, max(pi_S1_max, max(pi_S2_max, pi_S3_max)))))));
blockdist_init();
if (method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_distinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_distinp) {
A_chunk_size = block_size; A_actual_chunk_size = blockdist_array_distribute(N);
x_chunk_size = N; x_actual_chunk_size = (blockdist_proc_rank(M - 1) == R) ? x_chunk_size : 0;
}
else {
A_chunk_size = N; A_actual_chunk_size = N;
x_chunk_size = N; x_actual_chunk_size = N;
}
y_chunk_size = block_size; y_actual_chunk_size = blockdist_array_distribute(N);
tmp_chunk_size = block_size; tmp_actual_chunk_size = blockdist_array_distribute(M);
}
if (method_impls[method] == &atax_by_hand) {
y_chunk_size = blockdist_chunk_size(N, Q); y_actual_chunk_size = blockdist_actual_chunk_size(y_chunk_size, N);
}
if (method_impls[method] == &atax_vanilla) {
A_chunk_size = N; A_actual_chunk_size = (R == 0) ? A_chunk_size : 0;
x_chunk_size = N; x_actual_chunk_size = (R == 0) ? x_chunk_size : 0;
y_chunk_size = N; y_actual_chunk_size = (R == 0) ? y_chunk_size : 0;
tmp_chunk_size = M; tmp_actual_chunk_size = (R == 0) ? tmp_chunk_size : 0;
}
double** A = allocate_matrix_double(M, N, "A");
double* x = allocate_vector_double(N, "x");
double* y = allocate_vector_double(N, "y");
double* tmp = allocate_vector_double(M, "tmp");
double* elapsed_time = allocate_vector_double(pass_count, "elapsed_time");
#ifdef TIME
double* compute_ratio = (R == 0) ? allocate_vector_double(pass_count, "compute_ratio") : NULL;
#endif
for (int pass = 0; pass < pass_count; pass++) {
// arrays initializing
time_t seed_time = time(0);
MPI_Bcast(&seed_time, sizeof(time_t), MPI_BYTE, 0, MPI_COMM_WORLD);
if (method_impls[method] == &atax_by_hand || method_impls[method] == &atax_p_pluto ||
method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_dupinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_dupinp) {
int seed = seed_time % MODULUS;
fill_matrix_double(A, M, N, 0, M, 0, N, &seed);
fill_vector_double(x, N, 0, N, &seed);
}
else {
int seed = (seed_time + R) % MODULUS;
if (method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_distinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_distinp) {
fill_matrix_double(A, M, N, 0, M, blockdist_chunk_start_inv(N, A_chunk_size, A_actual_chunk_size), blockdist_chunk_end_inv(N, A_chunk_size, A_actual_chunk_size), &seed);
fill_vector_double(x, N, 0, x_actual_chunk_size, &seed);
}
else {
fill_matrix_double(A, M, N, 0, M, blockdist_chunk_start(A_chunk_size), blockdist_chunk_end(A_chunk_size, A_actual_chunk_size), &seed);
fill_vector_double(x, N, blockdist_chunk_start(x_chunk_size), blockdist_chunk_end(x_chunk_size, x_actual_chunk_size), &seed);
}
}
MPI_Barrier(MPI_COMM_WORLD);
struct timeval start_time;
struct timeval finish_time;
if (R == 0) {
printf("Starting %d pass...\n", pass + 1);
fflush(stdout);
}
MPI_Barrier(MPI_COMM_WORLD);
#ifdef TIME
mpi_spent_time = 0;
sync_spent_time = 0;
#endif
gettimeofday(&start_time, 0);
(*method_impls[method])(M, N, A, x, y, tmp);
gettimeofday(&finish_time, 0);
elapsed_time[pass] = elapsed_msecs(start_time, finish_time);
#ifdef TIME
if (method_impls[method] == &atax_p_pluto)
elapsed_time[pass] = pluto_spent_time * 1000.0;
mpi_spent_time *= 1000.0;
sync_spent_time *= 1000.0;
printf("Process rank %1$d: time spent %2$lf%5$s, mpi time spent %3$lf%5$s, sync time spent %4$lf%5$s, compute ratio %6$lf.\n",
R, elapsed_time[pass], mpi_spent_time, sync_spent_time, " ms",
(elapsed_time[pass] - mpi_spent_time - sync_spent_time) / elapsed_time[pass]);
fflush(stdout);
nanosleep((const struct timespec[]){{0, 100000000L}}, NULL);
MPI_Barrier(MPI_COMM_WORLD);
double total_time_spent;
MPI_Reduce(&elapsed_time[pass], &total_time_spent, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
double total_mpi_time_spent;
MPI_Reduce(&mpi_spent_time, &total_mpi_time_spent, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
double total_sync_time_spent;
MPI_Reduce(&sync_spent_time, &total_sync_time_spent, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if (R == 0)
compute_ratio[pass] = 100.0 * (total_time_spent - total_mpi_time_spent - total_sync_time_spent) / total_time_spent;
#endif
if (R == 0) {
printf("Time spent: %lf milliseconds\n", elapsed_time[pass]);
#ifdef TIME
printf("Compute ratio: %lf percents\n", compute_ratio[pass]);
#endif
fflush(stdout);
}
//verify
double max_diff_percents = 0;
if (method_impls[method] != &atax_vanilla) {
if (R == 0) {
printf("Verification method is %s solution.\n", "sequential");
fflush(stdout);
}
if (method_impls[method] == &atax_p_ilp_arrays) {
collect_matrix_cols_double(A, M, N, A_chunk_size, false);
collect_matrix_cols_double(&x, 1, N, x_chunk_size, false);
}
//if (method_impls[method] == &atax_p_ilp_arrays || method_impls[method] == &atax_by_hand) {
// collect_matrix_cols_double(&y, 1, N, y_chunk_size, false);
//}
if (method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_distinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_distinp) {
collect_matrix_cols_double(A, M, N, A_chunk_size, true);
collect_matrix_cols_double(&x, 1, N, x_chunk_size, true);
}
//if (method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_distinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_distinp ||
// method_impls[method] == &atax_p_ilp_arrays_mdp_sendrecv_dupinp || method_impls[method] == &atax_p_ilp_arrays_mdp_bcast_dupinp) {
// collect_matrix_cols_double(&y, 1, N, y_chunk_size, true);
//}
double* z = allocate_vector_double((R == 0) ? N : 0, "z");
atax_vanilla(M, N, A, x, z, tmp);
if (R == 0) {
max_diff_percents = compare_vectors_double(y, z, N) * 100;
printf("Maximal diff is %lf percents\n", max_diff_percents);
fflush(stdout);
int i = 0;
while (i < N && y[i] > 0)
i++;
if (i < N) {
printf("ERROR: The result is not positive at position %d: %lf.\n", i, y[i]);
fflush(stdout);
max_diff_percents = 1000000;
}
}
free(z);
}
if (R == 0) {
printf((max_diff_percents <= 0.1) ? "PASSED\n" : "NOT PASSED\n");
fflush(stdout);
}
}
if (R == 0) {
print_stats(elapsed_time, pass_count, "Elapsed Time ", " ms");
#ifdef TIME
print_stats(compute_ratio, pass_count, "Compute Ratio ", " %");
#endif
}
free(elapsed_time);
#ifdef TIME
free(compute_ratio);
#endif
// free memory
free_matrix_double(A, M);
free(x);
free(y);
free(tmp);
MPI_Finalize();
return 0;
}
blockdist.h
#ifndef BLOCKDIST_H
#define BLOCKDIST_H
#include <umacros.h>
#include <stdbool.h>
#include <mpi.h>
#ifdef TIME
#include <util.h>
double mpi_spent_time = 0;
double sync_spent_time = 0;
#endif
#define MPI_Barrier_time(comm) \
IF_TIME(sync_spent_time -= rtclock()); \
MPI_Barrier(comm); \
IF_TIME(sync_spent_time += rtclock());
int Q = 0;
int R = 0;
int L = 0;
int U = 0;
int lR = 0;
int uR = 0;
int block_size = 0;
int actual_block_size = 0;
#define blockdist_chunk_size(N, Q) (((N) + (Q) - 1) / (Q))
#define blockdist_actual_chunk_size_r(chunk_size, N, R) max(0, min((chunk_size), (N) - (R) * (chunk_size)))
#define blockdist_actual_chunk_size(chunk_size, N) blockdist_actual_chunk_size_r(chunk_size, N, R)
#define blockdist_array_distribute_r(N, R) blockdist_actual_chunk_size_r(block_size, N, R)
#define blockdist_array_distribute(N) blockdist_array_distribute_r(N, R)
#define blockdist_chunk_start_r(chunk_size, R) ((R) * (chunk_size))
#define blockdist_chunk_start(chunk_size) blockdist_chunk_start_r(chunk_size, R)
#define blockdist_chunk_end_r(chunk_size, actual_chunk_size, R) (blockdist_chunk_start_r(chunk_size, R) + (actual_chunk_size))
#define blockdist_chunk_end(chunk_size, actual_chunk_size) blockdist_chunk_end_r(chunk_size, actual_chunk_size, R)
#define blockdist_chunk_start_inv_r(N, chunk_size, actual_chunk_size, R) ((N) - blockdist_chunk_end_r(chunk_size, actual_chunk_size, R))
#define blockdist_chunk_start_inv(N, chunk_size, actual_chunk_size) blockdist_chunk_start_inv_r(N, chunk_size, actual_chunk_size, R)
#define blockdist_chunk_end_inv_r(N, chunk_size, actual_chunk_size, R) (blockdist_chunk_start_inv_r(N, chunk_size, actual_chunk_size, R) + (actual_chunk_size))
#define blockdist_chunk_end_inv(N, chunk_size, actual_chunk_size) blockdist_chunk_end_inv_r(N, chunk_size, actual_chunk_size, R)
#define triple_choice(test_val, ref_val, eq_val, lt_val, gt_val) (((test_val) == (ref_val)) ? (eq_val) : (((test_val) > (ref_val)) ? (gt_val) : (lt_val)))
#define blockdist_actual_chunk_size_offset_r_int(free_cnt, fst_capacity, chunk_size, N, R) triple_choice(R, free_cnt, fst_capacity, 0, max(0, min((chunk_size), (N) - (fst_capacity) - ((R) - (free_cnt) - 1) * (chunk_size))))
#define blockdist_actual_chunk_size_offset_r(chunk_size, offset, N, R) blockdist_actual_chunk_size_offset_r_int((offset) / (chunk_size), min(N, (chunk_size) - (offset) % (chunk_size)), chunk_size, N, R)
#define blockdist_actual_chunk_size_offset(chunk_size, offset, N) blockdist_actual_chunk_size_offset_r(chunk_size, offset, N, R)
#define blockdist_array_distribute_offset_r(offset, N, R) blockdist_actual_chunk_size_offset_r(block_size, offset, N, R)
#define blockdist_array_distribute_offset(offset, N) blockdist_array_distribute_offset_r(offset, N, R)
#define blockdist_chunk_start_offset_r(chunk_size, offset, R) max(0, (R) * (chunk_size) - (offset))
#define blockdist_chunk_start_offset(chunk_size, offset) blockdist_chunk_start_offset_r(chunk_size, offset, R)
#define blockdist_chunk_end_offset_r(chunk_size, actual_chunk_size, offset, R) (blockdist_chunk_start_offset_r(chunk_size, offset, R) + (actual_chunk_size))
#define blockdist_chunk_end_offset(chunk_size, actual_chunk_size, offset) blockdist_chunk_end_offset_r(chunk_size, actual_chunk_size, offset, R)
#define blockdist_in_range(v) ((v) >= lR && (v) <= uR)
#define blockdist_proc_rank(v) (((v) - L) / block_size)
#define blockdist_nb_vproc (U - L + 1)
// Q is a line - part of a one-dimensional array
#define line_Q_read_send_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r) { \
int r_actual_chunk_size = blockdist_actual_chunk_size_r(block_size, blockdist_nb_vproc, r); \
int lr = L + r * block_size; \
int ur = lr + r_actual_chunk_size - 1; \
bool nonempty = (econd) && ((last_idx) >= (fst_idx)) && \
(blockdist_proc_rank(eta(fst_idx)) == R) && (blockdist_proc_rank(eta(last_idx)) == R) && \
((fst_vp) >= lr && (fst_vp) <= ur) && ((last_vp) >= lr && (last_vp) <= ur); \
if (nonempty) { \
MPI_Request request; \
MPI_Isend((arr) + (fst_idx), (last_idx) - (fst_idx) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &request); \
MPI_Request_free(&request); \
} \
}
#define line_Q_read_send_econd_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
for (int r = 0; r < Q; r++) { \
if (r != R) { \
line_Q_read_send_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
} \
}
#define line_Q_read_send_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
for (int r = R + 1; r < Q; r++) { \
line_Q_read_send_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
}
#define line_Q_read_send(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_read_send_econd_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock());
#define line_Q_read_send_fco(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_read_send_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock())
#define line_Q_read_receive_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r) { \
bool nonempty = (econd) && ((last_idx) >= (fst_idx)) && \
(blockdist_proc_rank(eta(fst_idx)) == r) && (blockdist_proc_rank(eta(last_idx)) == r) && \
blockdist_in_range(fst_vp) && blockdist_in_range(last_vp); \
if (nonempty) { \
MPI_Status status; \
MPI_Recv((arr) + (fst_idx), (last_idx) - (fst_idx) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &status); \
} \
}
#define line_Q_read_receive_econd_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
for (int r = 0; r < Q; r++) { \
if (r != R) { \
line_Q_read_receive_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
} \
}
#define line_Q_read_receive_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
for (int r = 0; r < R; r++) { \
line_Q_read_receive_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
}
#define line_Q_read_receive(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_read_receive_econd_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock());
#define line_Q_read_receive_fco(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_read_receive_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock());
#define line_Q_write_send_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r) { \
bool nonempty = (econd) && ((last_idx) >= (fst_idx)) && \
(blockdist_proc_rank(eta(fst_idx)) == r) && (blockdist_proc_rank(eta(last_idx)) == r) && \
blockdist_in_range(fst_vp) && blockdist_in_range(last_vp); \
if (nonempty) { \
MPI_Request request; \
MPI_Isend((arr) + (fst_idx), (last_idx) - (fst_idx) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &request); \
MPI_Request_free(&request); \
} \
}
#define line_Q_write_send_econd_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
for (int r = 0; r < Q; r++) { \
if (r != R) { \
line_Q_write_send_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
} \
}
#define line_Q_write_send_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
for (int r = R + 1; r < Q; r++) { \
line_Q_write_send_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
}
#define line_Q_write_send(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_write_send_econd_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock());
#define line_Q_write_send_fco(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_write_send_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock());
#define line_Q_write_receive_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r) { \
int r_actual_chunk_size = blockdist_actual_chunk_size_r(block_size, blockdist_nb_vproc, r); \
int lr = L + r * block_size; \
int ur = lr + r_actual_chunk_size - 1; \
bool nonempty = (econd) && ((last_idx) >= (fst_idx)) && \
(blockdist_proc_rank(eta(fst_idx)) == R) && (blockdist_proc_rank(eta(last_idx)) == R) && \
((fst_vp) >= lr && (fst_vp) <= ur) && ((last_vp) >= lr && (last_vp) <= ur); \
if (nonempty) { \
MPI_Status status; \
MPI_Recv((arr) + (fst_idx), (last_idx) - (fst_idx) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &status); \
} \
}
#define line_Q_write_receive_econd_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
for (int r = 0; r < Q; r++) { \
if (r != R) { \
line_Q_write_receive_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
} \
}
#define line_Q_write_receive_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag) \
for (int r = 0; r < R; r++) { \
line_Q_write_receive_econd_r_int(fst_idx, fst_vp, last_idx, last_vp, econd, arr, eta, tag, r); \
}
#define line_Q_write_receive(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_write_receive_econd_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock());
#define line_Q_write_receive_fco(fst_idx, fst_vp, last_idx, last_vp, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_write_receive_econd_fco_int(fst_idx, fst_vp, last_idx, last_vp, true, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock());
#define clamp(v, l, u) min(max(v, l), u)
#define line_Q_read_vp_rev_econd(mul, add, lb, ub, econd, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_read_send_econd_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
econd, arr, eta, tag); \
line_Q_read_receive_econd_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
econd, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define line_Q_read_vp_rev_econd_fco(mul, add, lb, ub, econd, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_read_send_econd_fco_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
econd, arr, eta, tag); \
line_Q_read_receive_econd_fco_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
econd, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define line_Q_read_vp_rev(mul, add, lb, ub, arr, eta, tag) line_Q_read_vp_rev_econd(mul, add, lb, ub, true, arr, eta, tag)
#define line_Q_read_vp_rev_fco(mul, add, lb, ub, arr, eta, tag) line_Q_read_vp_rev_econd_fco(mul, add, lb, ub, true, arr, eta, tag)
#define line_Q_write_vp_rev_econd(mul, add, lb, ub, econd, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_write_send_econd_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
econd, arr, eta, tag); \
line_Q_write_receive_econd_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
econd, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define line_Q_write_vp_rev_econd_fco(mul, add, lb, ub, econd, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_write_send_econd_fco_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
econd, arr, eta, tag); \
line_Q_write_receive_econd_fco_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
econd, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define line_Q_write_vp_rev(mul, add, lb, ub, arr, eta, tag) line_Q_write_vp_rev_econd(mul, add, lb, ub, true, arr, eta, tag)
#define line_Q_write_vp_rev_fco(mul, add, lb, ub, arr, eta, tag) line_Q_write_vp_rev_econd_fco(mul, add, lb, ub, true, arr, eta, tag)
#define line_Q_read_vp_econd(mul, add, lb, ub, econd, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_read_send_econd_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
econd, arr, eta, tag); \
line_Q_read_receive_econd_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
econd, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define line_Q_read_vp_econd_fco(mul, add, lb, ub, econd, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_read_send_econd_fco_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
econd, arr, eta, tag); \
line_Q_read_receive_econd_fco_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
econd, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define line_Q_read_vp(mul, add, lb, ub, arr, eta, tag) line_Q_read_vp_econd(mul, add, lb, ub, true, arr, eta, tag)
#define line_Q_read_vp_fco(mul, add, lb, ub, arr, eta, tag) line_Q_read_vp_econd_fco(mul, add, lb, ub, true, arr, eta, tag)
#define line_Q_write_vp_econd(mul, add, lb, ub, econd, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_write_send_econd_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
econd, arr, eta, tag); \
line_Q_write_receive_econd_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
econd, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define line_Q_write_vp_econd_fco(mul, add, lb, ub, econd, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
line_Q_write_send_econd_fco_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
econd, arr, eta, tag); \
line_Q_write_receive_econd_fco_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
econd, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define line_Q_write_vp(mul, add, lb, ub, arr, eta, tag) line_Q_write_vp_econd(mul, add, lb, ub, true, arr, eta, tag)
#define line_Q_write_vp_fco(mul, add, lb, ub, arr, eta, tag) line_Q_write_vp_econd_fco(mul, add, lb, ub, true, arr, eta, tag)
// Q is a column - part of a matrix column
#define col_Q_read_send_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r) { \
int r_actual_chunk_size = blockdist_actual_chunk_size_r(block_size, blockdist_nb_vproc, r); \
int lr = L + r * block_size; \
int ur = lr + r_actual_chunk_size - 1; \
bool nonempty = (econd) && ((last_row) >= (fst_row)) && \
(blockdist_proc_rank(eta(fst_row, col_idx)) == R) && (blockdist_proc_rank(eta(last_row, col_idx)) == R) && \
((fst_vp) >= lr && (fst_vp) <= ur) && ((last_vp) >= lr && (last_vp) <= ur); \
if (nonempty) { \
for (int i = fst_row; i <= last_row; i++) { \
buf[i] = arr[i][col_idx]; \
} \
MPI_Request request; \
MPI_Isend((buf) + (fst_row), (last_row) - (fst_row) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &request); \
MPI_Request_free(&request); \
} \
}
#define col_Q_read_send_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
for (int r = 0; r < Q; r++) { \
if (r != R) { \
col_Q_read_send_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
} \
}
#define col_Q_read_send_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
for (int r = R + 1; r < Q; r++) { \
col_Q_read_send_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
}
#define col_Q_read_send(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_read_send_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_read_send_fco(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_read_send_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_read_receive_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r) { \
bool nonempty = (econd) && ((last_row) >= (fst_row)) && \
(blockdist_proc_rank(eta(fst_row, col_idx)) == r) && (blockdist_proc_rank(eta(last_row, col_idx)) == r) && \
blockdist_in_range(fst_vp) && blockdist_in_range(last_vp); \
if (nonempty) { \
MPI_Status status; \
MPI_Recv((buf) + (fst_row), (last_row) - (fst_row) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &status); \
for (int i = fst_row; i <= last_row; i++) { \
arr[i][col_idx] = buf[i]; \
} \
} \
}
#define col_Q_read_receive_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
for (int r = 0; r < Q; r++) { \
if (r != R) { \
col_Q_read_receive_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
} \
}
#define col_Q_read_receive_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
for (int r = 0; r < R; r++) { \
col_Q_read_receive_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
}
#define col_Q_read_receive(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_read_receive_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_read_receive_fco(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_read_receive_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_read_vp_econd(mul, add, lb, ub, col_idx, econd, buff, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_read_send_econd_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
col_idx, econd, buff, arr, eta, tag); \
col_Q_read_receive_econd_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
col_idx, econd, buff, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_read_vp_econd_fco(mul, add, lb, ub, col_idx, econd, buf, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_read_send_econd_fco_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
col_idx, econd, buf, arr, eta, tag); \
col_Q_read_receive_econd_fco_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
col_idx, econd, buf, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_read_vp(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_read_vp_econd(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)
#define col_Q_read_vp_fco(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_read_vp_econd_fco(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)
#define col_Q_read_vp_rev_econd(mul, add, lb, ub, col_idx, econd, buff, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_read_send_econd_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
col_idx, econd, buff, arr, eta, tag); \
col_Q_read_receive_econd_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
col_idx, econd, buff, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_read_vp_rev_econd_fco(mul, add, lb, ub, col_idx, econd, buf, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_read_send_econd_fco_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
col_idx, econd, buff, arr, eta, tag); \
col_Q_read_receive_econd_fco_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
col_idx, econd, buff, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_read_vp_rev(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_read_vp_rev_econd(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)
#define col_Q_read_vp_rev_fco(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_read_vp_rev_econd_fco(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)
#define col_Q_write_send_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r) { \
bool nonempty = (econd) && ((last_row) >= (fst_row)) && \
(blockdist_proc_rank(eta(fst_row, col_idx)) == r) && (blockdist_proc_rank(eta(last_row, col_idx)) == r) && \
blockdist_in_range(fst_vp) && blockdist_in_range(last_vp); \
if (nonempty) { \
for (int i = fst_row; i <= last_row; i++) { \
buf[i] = arr[i][col_idx]; \
} \
MPI_Request request; \
MPI_Isend((buf) + (fst_row), (last_row) - (fst_row) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &request); \
MPI_Request_free(&request); \
} \
}
#define col_Q_write_send_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
for (int r = 0; r < Q; r++) { \
if (r != R) { \
col_Q_write_send_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
} \
}
#define col_Q_write_send_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
for (int r = R + 1; r < Q; r++) { \
col_Q_write_send_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
}
#define col_Q_write_send(fst_row, fst_vp, last_row, last_vp, col_idx, buf, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_write_send_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, true, buf, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_write_send_fco(fst_row, fst_vp, last_row, last_vp, col_idx, buf, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_write_send_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, true, buf, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_write_receive_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r) { \
int r_actual_chunk_size = blockdist_actual_chunk_size_r(block_size, blockdist_nb_vproc, r); \
int lr = L + r * block_size; \
int ur = lr + r_actual_chunk_size - 1; \
bool nonempty = (econd) && ((last_row) >= (fst_row)) && \
(blockdist_proc_rank(eta(fst_row, col_idx)) == R) && (blockdist_proc_rank(eta(last_row, col_idx)) == R) && \
((fst_vp) >= lr && (fst_vp) <= ur) && ((last_vp) >= lr && (last_vp) <= ur); \
if (nonempty) { \
MPI_Status status; \
MPI_Recv((buf) + (fst_row), (last_row) - (fst_row) + 1, MPI_DOUBLE, r, tag, MPI_COMM_WORLD, &status); \
for (int i = fst_row; i <= last_row; i++) { \
arr[i][col_idx] = buf[i]; \
} \
} \
}
#define col_Q_write_receive_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
for (int r = 0; r < Q; r++) { \
if (r != R) { \
col_Q_write_receive_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
} \
}
#define col_Q_write_receive_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag) \
for (int r = 0; r < R; r++) { \
col_Q_write_receive_econd_r_int(fst_row, fst_vp, last_row, last_vp, col_idx, econd, buf, arr, eta, tag, r); \
}
#define col_Q_write_receive(fst_row, fst_vp, last_row, last_vp, col_idx, buf, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_write_receive_econd_int(fst_row, fst_vp, last_row, last_vp, col_idx, true, buf, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_write_receive_fco(fst_row, fst_vp, last_row, last_vp, col_idx, buf, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_write_receive_econd_fco_int(fst_row, fst_vp, last_row, last_vp, col_idx, true, buf, arr, eta, tag); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_write_vp_econd(mul, add, lb, ub, col_idx, econd, buf, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_write_send_econd_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
col_idx, econd, buf, arr, eta, tag); \
col_Q_write_receive_econd_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
col_idx, econd, buf, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_write_vp_econd_fco(mul, add, lb, ub, col_idx, econd, buf, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_write_send_econd_fco_int((mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), (mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), \
col_idx, econd, buf, arr, eta, tag); \
col_Q_write_receive_econd_fco_int((mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), (mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), \
col_idx, econd, buf, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_write_vp(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_write_vp_econd(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)
#define col_Q_write_vp_fco(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_write_vp_econd_fco(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)
#define col_Q_write_vp_rev_econd(mul, add, lb, ub, col_idx, econd, buf, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_write_send_econd_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
col_idx, econd, buf, arr, eta, tag); \
col_Q_write_receive_econd_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
col_idx, econd, buf, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_write_vp_rev_econd_fco(mul, add, lb, ub, col_idx, econd, buf, arr, eta, tag) \
IF_TIME(mpi_spent_time -= rtclock()); \
col_Q_write_send_econd_fco_int((mul) * clamp(uR, lb, ub) + (add), clamp(uR, lb, ub), (mul) * clamp(lR, lb, ub) + (add), clamp(lR, lb, ub), \
col_idx, econd, buf, arr, eta, tag); \
col_Q_write_receive_econd_fco_int((mul) * clamp(ur, lb, ub) + (add), clamp(ur, lb, ub), (mul) * clamp(lr, lb, ub) + (add), clamp(lr, lb, ub), \
col_idx, econd, buf, arr, eta, tag); \
MPI_Barrier(MPI_COMM_WORLD); \
IF_TIME(mpi_spent_time += rtclock());
#define col_Q_write_vp_rev(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_write_vp_rev_econd(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)
#define col_Q_write_vp_rev_fco(mul, add, lb, ub, col_idx, buf, arr, eta, tag) col_Q_write_vp_rev_econd_fco(mul, add, lb, ub, col_idx, true, buf, arr, eta, tag)
// data layout utilities
void blockdist_init() {
block_size = blockdist_chunk_size(blockdist_nb_vproc, Q);
actual_block_size = blockdist_actual_chunk_size(block_size, blockdist_nb_vproc);
lR = L + R * block_size;
uR = lR + actual_block_size - 1;
}
void collect_matrix_rows_double(double** m, int rows, int cols, int chunk_size) {
IF_TIME(mpi_spent_time -= rtclock());
if (R == 0) {
for (int r = 1; r < Q; r++) {
int actual_chunk_size_r = blockdist_actual_chunk_size_r(chunk_size, rows, r);
for (int i = blockdist_chunk_start_r(chunk_size, r); i < blockdist_chunk_end_r(chunk_size, actual_chunk_size_r, r); i++) {
MPI_Status status;
MPI_Recv(m[i], cols, MPI_DOUBLE, r, i, MPI_COMM_WORLD, &status);
}
}
}
else {
int actual_chunk_size = blockdist_actual_chunk_size(chunk_size, rows);
for (int i = blockdist_chunk_start(chunk_size); i < blockdist_chunk_end(chunk_size, actual_chunk_size); i++) {
MPI_Send(m[i], cols, MPI_DOUBLE, 0, i, MPI_COMM_WORLD);
}
}
MPI_Barrier(MPI_COMM_WORLD);
IF_TIME(mpi_spent_time += rtclock());
}
void collect_matrix_cols_double(double** m, int rows, int cols, int chunk_size, bool inv) {
IF_TIME(mpi_spent_time -= rtclock());
if (R == 0) {
for (int i = 0; i < rows; i++) {
for (int r = 1; r < Q; r++) {
int receive_chunk_size = blockdist_actual_chunk_size_r(chunk_size, cols, r);
if (receive_chunk_size > 0) {
MPI_Status status;
int offset = (inv) ? blockdist_chunk_start_inv_r(cols, chunk_size, receive_chunk_size, r) :
blockdist_chunk_start_r(chunk_size, r);
MPI_Recv(m[i] + offset, receive_chunk_size, MPI_DOUBLE, r, i, MPI_COMM_WORLD, &status);
}
}
}
}
else {
int send_chunk_size = blockdist_actual_chunk_size(chunk_size, cols);
if (send_chunk_size > 0) {
for (int i = 0; i < rows; i++) {
int offset = (inv) ? blockdist_chunk_start_inv(cols, chunk_size, send_chunk_size) :
blockdist_chunk_start(chunk_size);
MPI_Send(m[i] + offset, send_chunk_size, MPI_DOUBLE, 0, i, MPI_COMM_WORLD);
}
}
}
MPI_Barrier(MPI_COMM_WORLD);
IF_TIME(mpi_spent_time += rtclock());
}
void collect_matrix_cols_double_offset(double** m, int rows, int cols, int chunk_size, int offset) {
IF_TIME(mpi_spent_time -= rtclock());
if (R == 0) {
for (int i = 0; i < rows; i++) {
for (int r = 1; r < Q; r++) {
int receive_chunk_size = blockdist_actual_chunk_size_offset_r(chunk_size, offset, cols, r);
if (receive_chunk_size > 0) {
MPI_Status status;
MPI_Recv(m[i] + blockdist_chunk_start_offset_r(chunk_size, offset, r), receive_chunk_size, MPI_DOUBLE, r, i, MPI_COMM_WORLD, &status);
}
}
}
}
else {
int send_chunk_size = blockdist_actual_chunk_size_offset(chunk_size, offset, cols);
if (send_chunk_size > 0) {
for (int i = 0; i < rows; i++) {
MPI_Send(m[i] + blockdist_chunk_start_offset(chunk_size, offset), send_chunk_size, MPI_DOUBLE, 0, i, MPI_COMM_WORLD);
}
}
}
MPI_Barrier(MPI_COMM_WORLD);
IF_TIME(mpi_spent_time += rtclock());
}
#endif // BLOCKDIST_H