/* parallel_jacobi.c -- parallel implementation of Jacobi's method
 *     for solving the linear system Ax = b.  Uses block distribution
 *     of the input vectors and block-row distribution of A.
 *
 *     The matrix is filled randomly using drand48.
 *
 * Input:
 *     n: order of system (not necessarily divisible by p, the number of
 *        processors; any spillover is handled by the last processor, p-1)
 *     tol: convergence tolerance
 *     max_iter: maximum number of iterations
 *     A: coefficient matrix
 *     b: right-hand side of system
 *
 * Output:
 *     x: the solution if the method converges
 *     max_iter: if the method fails to converge
 *
 * Notes:
 *     1. A should be strictly diagonally dominant in
 *        order to ensure convergence.
 *     2. A, x, and b are statically allocated.
 *
 * See Chap 10, pp. 220 & ff in PPMPI.
 * (A small serial reference sketch of the Jacobi update follows main() below.)
 *
 * Note: In many places in this code I issue MPI_Send(...,rem,...).  Before each
 * such call I check that rem != 0.  I would think that MPI_Send should work with
 * a zero-size count argument, but since I have heard otherwise I guard the call
 * with an if statement before executing the MPI_Send.
 *
 * Written by:
 *    --
 * John L. Weatherwax                2006-01-29
 *
 * email: wax@alum.mit.edu
 *
 * Please send comments and especially bug reports to the
 * above email address.
 *
 *-----
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mpi.h"

#define Swap(x,y) {float* temp; temp = x; x = y; y = temp;}
#define MAX_DIM 100

typedef float MATRIX_T[MAX_DIM][MAX_DIM];

int Parallel_jacobi(
        MATRIX_T  A_local   /* in  */,
        float     x_local[] /* out */,
        float     b_local[] /* in  */,
        int       n         /* in  */,
        float     tol       /* in  */,
        int       max_iter  /* in  */,
        int       p         /* in  */,
        int       my_rank   /* in  */);

void Read_matrix(char* prompt, MATRIX_T A_local, int n, int my_rank, int p);
void Read_vector(char* prompt, float x_local[], int n, int my_rank, int p);
void Print_matrix(char* title, MATRIX_T A_local, int n, int my_rank, int p);
void Print_vector(char* title, float x_local[], int n, int my_rank, int p);
void Gen_matrix(MATRIX_T A_local /* out */, int n /* in */,
                int my_rank /* in */, int p /* in */);
void Gen_vector(float x_local[] /* out */, int n /* in */,
                int my_rank /* in */, int p /* in */);


int main(int argc, char* argv[]) {
    int       p;
    int       my_rank;
    MATRIX_T  A_local;
    float     x_local[MAX_DIM];
    float     b_local[MAX_DIM];
    int       n;
    float     tol;
    int       max_iter;
    int       converged;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    if (my_rank == 0) {
        /* printf("Enter n, tolerance, and max number of iterations\n"); */
        scanf("%d %f %d", &n, &tol, &max_iter);
        /* if (n % p != 0)
             printf("n not divisible by p...invoking code designed for spillover...\n"); */
    }
    MPI_Bcast(&n,        1, MPI_INT,   0, MPI_COMM_WORLD);
    MPI_Bcast(&tol,      1, MPI_FLOAT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&max_iter, 1, MPI_INT,   0, MPI_COMM_WORLD);

    srand48((long int) p);   /* Seed the random number generator: */

    Gen_matrix(A_local, n, my_rank, p);
    Print_matrix("The random matrix GENERATED is:", A_local, n, my_rank, p);

    Gen_vector(b_local, n, my_rank, p);
    Print_vector("The random RHS GENERATED is:", b_local, n, my_rank, p);

    converged = Parallel_jacobi(A_local, x_local, b_local, n, tol,
                                max_iter, p, my_rank);

    if (converged)
        Print_vector("The solution is", x_local, n, my_rank, p);
    else if (my_rank == 0)
        printf("Failed to converge in %d iterations\n", max_iter);

    MPI_Finalize();
    return 0;
}  /* main */
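/*********************************************************************/
/* For reference: a minimal *serial* sketch of the update that
 * Parallel_jacobi distributes across processes,
 *
 *     x_i^(new) = ( b_i - sum_{j != i} a_ij * x_j^(old) ) / a_ii
 *
 * This helper is illustrative only and is not called anywhere in the
 * program.  The parallel routine below performs the same update on each
 * process's block of rows and then exchanges the updated pieces with
 * MPI_Allgather (plus an explicit send/receive for the spillover rows
 * held by process p-1). */
static void Serial_jacobi_step(
        MATRIX_T A       /* in  */,
        float    b[]     /* in  */,
        float    x_old[] /* in  */,
        float    x_new[] /* out */,
        int      n       /* in  */) {
    int i, j;

    for (i = 0; i < n; i++) {
        /* Start from b_i, subtract the off-diagonal terms, divide by a_ii: */
        x_new[i] = b[i];
        for (j = 0; j < n; j++)
            if (j != i)
                x_new[i] = x_new[i] - A[i][j]*x_old[j];
        x_new[i] = x_new[i]/A[i][i];
    }
}  /* Serial_jacobi_step */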
/*********************************************************************/
/* Return 1 if iteration converged, 0 otherwise */
/* MATRIX_T is a 2-dimensional array            */
int Parallel_jacobi(
        MATRIX_T  A_local   /* in  */,
        float     x_local[] /* out */,
        float     b_local[] /* in  */,
        int       n         /* in  */,
        float     tol       /* in  */,
        int       max_iter  /* in  */,
        int       p         /* in  */,
        int       my_rank   /* in  */) {
    int        i_local, i_global, i, j, loc_up_lim;
    int        n_bar, rem;
    int        iter_num;
    float      x_temp1[MAX_DIM];
    float      x_temp2[MAX_DIM];
    float*     x_old;
    float*     x_new;
    MPI_Status status;
    float Distance(float x[], float y[], int n);

    n_bar = n / p;
    rem   = n % p;

    /* Compute the local upper limit index (the last process also owns the
       rem spillover rows): */
    loc_up_lim = ((my_rank != p-1) ? n_bar : n_bar + rem);

    /* printf("my_rank=%d; loc_up_lim = %d\n", my_rank, loc_up_lim);
       Print_vector("b_local", b_local, n, my_rank, p);
       printf("rem = %d\n", rem); */

    /* Initialize x_temp1 */
    MPI_Allgather(b_local, n_bar, MPI_FLOAT, x_temp1, n_bar, MPI_FLOAT,
                  MPI_COMM_WORLD);
    if (rem != 0) {
        /* Get the additional elements held in process p-1 into x_temp1: */
        if (my_rank == p-1) {
            /* First fill processor p-1's data into x_temp1 ... */
            for (i = 0; i < rem; i++)
                x_temp1[n_bar*p + i] = b_local[n_bar + i];
            /* ... then send these rem elements to every other process. */
            for (i = 0; i < p-1; i++)
                MPI_Send(&(b_local[n_bar]), rem, MPI_FLOAT, i, 0, MPI_COMM_WORLD);
        } else {
            MPI_Recv(&(x_temp1[n_bar*p]), rem, MPI_FLOAT, p-1, 0,
                     MPI_COMM_WORLD, &status);
        }
    }

    x_new = x_temp1;
    x_old = x_temp2;
    iter_num = 0;
    do {
        iter_num++;

        /* Interchange x_old and x_new */
        Swap(x_old, x_new);

        /* Jacobi update for this process's rows (process p-1 also updates
           its rem spillover rows): */
        for (i_local = 0; i_local < loc_up_lim; i_local++) {
            i_global = i_local + my_rank*n_bar;
            x_local[i_local] = b_local[i_local];
            for (j = 0; j < i_global; j++)
                x_local[i_local] = x_local[i_local] - A_local[i_local][j]*x_old[j];
            for (j = i_global+1; j < n; j++)
                x_local[i_local] = x_local[i_local] - A_local[i_local][j]*x_old[j];
            x_local[i_local] = x_local[i_local]/A_local[i_local][i_global];
        }

        /* Collect the first n_bar updated components from every process ... */
        MPI_Allgather(x_local, n_bar, MPI_FLOAT, x_new, n_bar, MPI_FLOAT,
                      MPI_COMM_WORLD);

        /* ... and the rem spillover components held by process p-1: */
        if (rem != 0) {
            if (my_rank == p-1) {
                for (i = 0; i < rem; i++)
                    x_new[n_bar*p + i] = x_local[n_bar + i];
                for (i = 0; i < p-1; i++)
                    MPI_Send(&(x_local[n_bar]), rem, MPI_FLOAT, i, 0, MPI_COMM_WORLD);
            } else {
                MPI_Recv(&(x_new[n_bar*p]), rem, MPI_FLOAT, p-1, 0,
                         MPI_COMM_WORLD, &status);
            }
        }
    } while ((iter_num < max_iter) && (Distance(x_new, x_old, n) >= tol));

    if (Distance(x_new, x_old, n) < tol)
        return 1;
    else
        return 0;
}  /* Parallel_jacobi */


/*********************************************************************/
float Distance(float x[], float y[], int n) {
    int   i;
    float sum = 0.0;

    for (i = 0; i < n; i++) {
        sum = sum + (x[i] - y[i])*(x[i] - y[i]);
    }
    return sqrt(sum);
}  /* Distance */


/*********************************************************************/
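/* A short justification of the row-sum trick used in Gen_matrix below:
 * every entry produced by drand48 lies in [0,1), so after overwriting the
 * diagonal entry with the full row sum we have
 *
 *     a_ii = sum_{j=0..n-1} a_ij  >  sum_{j != i} a_ij = sum_{j != i} |a_ij|
 *
 * (strictly, since the original diagonal entry is positive with probability
 * one).  The matrix is therefore strictly diagonally dominant, which is a
 * sufficient condition for Jacobi's method to converge. */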
void Gen_matrix(
        MATRIX_T A_local /* out */,
        int      n       /* in  */,
        int      my_rank /* in  */,
        int      p       /* in  */) {
    int   i, j, global_index;
    float row_sum;
    int   n_bar, rem;

    n_bar = n / p;
    rem   = n % p;

    /* Fill the unused trailing columns of A_local with dummy values: */
    for (i = 0; i < n; i++)
        for (j = n; j < MAX_DIM; j++)
            A_local[i][j] = -1.0;

    /* Fill matrix randomly */
    for (i = 0; i < n_bar; i++)
        for (j = 0; j < n; j++)
            A_local[i][j] = (float) drand48();

    if (rem != 0 && my_rank == p-1) {
        /* Any remaining required rows are given to process "p-1": */
        for (i = 0; i < rem; i++)
            for (j = 0; j < n; j++)
                A_local[n_bar+i][j] = (float) drand48();
    }

    /* To guarantee strict diagonal dominance (and nice convergence
       properties), we overwrite the diagonal elements with the sum of all
       the elements in that row ... we do this now */
    for (i = 0; i < n_bar; i++) {
        row_sum = 0.0;
        for (j = 0; j < n; j++)
            row_sum += A_local[i][j];
        global_index = i + my_rank*n_bar;
        A_local[i][global_index] = row_sum;
    }
    if (rem != 0 && my_rank == p-1) {
        /* ... and the same for the spillover rows assigned to process "p-1"
           (local rows n_bar..n_bar+rem-1, whose global/diagonal index is
           n_bar + i + my_rank*n_bar): */
        for (i = 0; i < rem; i++) {
            row_sum = 0.0;
            for (j = 0; j < n; j++)
                row_sum += A_local[n_bar+i][j];
            global_index = n_bar + i + my_rank*n_bar;
            A_local[n_bar+i][global_index] = row_sum;
        }
    }
}  /* Gen_matrix */


/*********************************************************************/
void Gen_vector(
        float x_local[] /* out */,
        int   n         /* in  */,
        int   my_rank   /* in  */,
        int   p         /* in  */) {
    int i;
    int n_bar, rem;

    n_bar = n/p;
    rem   = n % p;

    for (i = 0; i < n_bar; i++)
        x_local[i] = (float) drand48();

    if (rem != 0 && my_rank == p-1) {
        /* Any remaining required elements are given to process "p-1": */
        for (i = 0; i < rem; i++)
            x_local[n_bar+i] = (float) drand48();
    }
}  /* Gen_vector */


/*********************************************************************/
void Print_matrix(
        char*    title   /* in */,
        MATRIX_T A_local /* in */,
        int      n       /* in */,
        int      my_rank /* in */,
        int      p       /* in */) {
    int        i, j;
    MATRIX_T   temp;
    int        n_bar, rem;
    MPI_Status status;

    n_bar = n/p;
    rem   = n % p;

    MPI_Gather(A_local, n_bar*MAX_DIM, MPI_FLOAT, temp, n_bar*MAX_DIM, MPI_FLOAT,
               0, MPI_COMM_WORLD);
    if (rem != 0 && my_rank == p-1) {
        /* Send the remaining rows on processor p-1 to processor 0 */
        MPI_Send(&(A_local[n_bar][0]), rem*MAX_DIM, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
    }
    if (rem != 0 && my_rank == 0) {
        MPI_Recv(&(temp[n_bar*p][0]), rem*MAX_DIM, MPI_FLOAT, p-1, 0,
                 MPI_COMM_WORLD, &status);
    }

    if (my_rank == 0) {
        printf("%s\n", title);
        for (i = 0; i < n; i++) {
            for (j = 0; j < n; j++)
                printf("%12.8f ", temp[i][j]);
            printf("\n");
        }
    }
}  /* Print_matrix */


/*********************************************************************/
void Print_vector(
        char* title     /* in */,
        float x_local[] /* in */,
        int   n         /* in */,
        int   my_rank   /* in */,
        int   p         /* in */) {
    int        i;
    float      temp[MAX_DIM];
    int        n_bar, rem;
    MPI_Status status;

    n_bar = n/p;
    rem   = n % p;

    MPI_Gather(x_local, n_bar, MPI_FLOAT, temp, n_bar, MPI_FLOAT, 0,
               MPI_COMM_WORLD);
    if (rem != 0 && my_rank == p-1) {
        /* Send the remaining elements on processor p-1 to processor 0 */
        MPI_Send(&(x_local[n_bar]), rem, MPI_FLOAT, 0, 1, MPI_COMM_WORLD);
    }
    if (rem != 0 && my_rank == 0) {
        MPI_Recv(&(temp[n_bar*p]), rem, MPI_FLOAT, p-1, 1, MPI_COMM_WORLD, &status);
    }

    if (my_rank == 0) {
        printf("%s\n", title);
        for (i = 0; i < n; i++)
            printf("%12.8f ", temp[i]);
        printf("\n");
    }
}  /* Print_vector */