/* parallel_jacobi.c -- parallel implementation of Jacobi's method
 *     for solving the linear system Ax = b.  Uses a block distribution
 *     of the input vectors and a block-row distribution of A.
 *
 * Input:
 *     n: order of the system.  n need not be divisible by p (the number
 *         of processes); all spillover is handled by the last process (p-1).
 *     tol: convergence tolerance
 *     max_iter: maximum number of iterations
 *     A: coefficient matrix
 *     b: right-hand side of the system
 *
 * Output:
 *     x: the solution if the method converges
 *     max_iter: if the method fails to converge
 *
 * Notes:
 *     1. A should be strictly diagonally dominant in
 *        order to ensure convergence.
 *     2. A, x, and b are statically allocated.
 *
 * See Chap 10, pp. 220 & ff in PPMPI.
 *
 * Written by:
 *    --
 *    John L. Weatherwax 2006-01-24
 *
 * email: wax@alum.mit.edu
 *
 * Please send comments and especially bug reports to the
 * above email address.
 *
 *-----
 */
#include <stdio.h>
#include "mpi.h"
#include <math.h>

#define Swap(x,y) {float* temp; temp = x; x = y; y = temp;}
#define MAX_DIM 100

typedef float MATRIX_T[MAX_DIM][MAX_DIM];

int Parallel_jacobi(
        MATRIX_T  A_local    /* in  */,
        float     x_local[]  /* out */,
        float     b_local[]  /* in  */,
        int       n          /* in  */,
        float     tol        /* in  */,
        int       max_iter   /* in  */,
        int       p          /* in  */,
        int       my_rank    /* in  */);

void Read_matrix(char* prompt, MATRIX_T A_local, int n,
         int my_rank, int p);
void Read_vector(char* prompt, float x_local[], int n,
         int my_rank, int p);
void Print_matrix(char* title, MATRIX_T A_local, int n,
         int my_rank, int p);
void Print_vector(char* title, float x_local[], int n,
         int my_rank, int p);

int main(int argc, char* argv[]) {
    int       p;
    int       my_rank;
    MATRIX_T  A_local;
    float     x_local[MAX_DIM];
    float     b_local[MAX_DIM];
    int       n;
    float     tol;
    int       max_iter;
    int       converged;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    if (my_rank == 0) {
        /* printf("Enter n, tolerance, and max number of iterations\n"); */
        scanf("%d %f %d", &n, &tol, &max_iter);
        /* if (n % p != 0)
               printf("n not divisible by p...invoking code designed for spillover...\n"); */
    }
    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&tol, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&max_iter, 1, MPI_INT, 0, MPI_COMM_WORLD);

    /* Read_matrix("Enter the matrix", A_local, n, my_rank, p); */
    Read_matrix("", A_local, n, my_rank, p);
    /* Print_matrix("Input matrix is:", A_local, n, my_rank, p); */

    /* Read_vector("Enter the right-hand side", b_local, n, my_rank, p); */
    Read_vector("", b_local, n, my_rank, p);
    /* Print_vector("Input vector is:", b_local, n, my_rank, p); */

    converged = Parallel_jacobi(A_local, x_local, b_local, n,
                    tol, max_iter, p, my_rank);
    if (converged)
        Print_vector("The solution is", x_local, n, my_rank, p);
    else if (my_rank == 0)
        printf("Failed to converge in %d iterations\n", max_iter);

    MPI_Finalize();
    return 0;
}  /* main */
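/*********************************************************************/
/* For reference only: a minimal serial sketch of the Jacobi sweep
 * that Parallel_jacobi below distributes across processes.
 * Serial_jacobi is an illustrative name added for exposition; it is
 * not part of the original PPMPI code.  Each sweep computes
 *     x_i = (b_i - sum_{j != i} A[i][j]*x_old[j]) / A[i][i]
 * and stops when successive iterates differ by less than tol.       */
static int Serial_jacobi(MATRIX_T A, float x[], float b[],
                         int n, float tol, int max_iter) {
    float x_old[MAX_DIM];
    float diff;
    int   i, j, iter;

    for (i = 0; i < n; i++) x[i] = b[i];   /* initial guess x = b */
    for (iter = 0; iter < max_iter; iter++) {
        for (i = 0; i < n; i++) x_old[i] = x[i];
        for (i = 0; i < n; i++) {
            x[i] = b[i];
            for (j = 0; j < n; j++)
                if (j != i) x[i] -= A[i][j]*x_old[j];
            x[i] /= A[i][i];
        }
        diff = 0.0;                        /* ||x - x_old||_2 */
        for (i = 0; i < n; i++)
            diff += (x[i] - x_old[i])*(x[i] - x_old[i]);
        if (sqrt(diff) < tol) return 1;    /* converged */
    }
    return 0;                              /* failed to converge */
}  /* Serial_jacobi */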
/*********************************************************************/
/* Return 1 if the iteration converged, 0 otherwise.                 */
/* MATRIX_T is a 2-dimensional array.                                */
int Parallel_jacobi(
        MATRIX_T  A_local    /* in  */,
        float     x_local[]  /* out */,
        float     b_local[]  /* in  */,
        int       n          /* in  */,
        float     tol        /* in  */,
        int       max_iter   /* in  */,
        int       p          /* in  */,
        int       my_rank    /* in  */) {

    int     i_local, i_global, i, j, loc_up_lim;
    int     n_bar, rem;
    int     iter_num;
    float   x_temp1[MAX_DIM];
    float   x_temp2[MAX_DIM];
    float*  x_old;
    float*  x_new;
    float   Distance(float x[], float y[], int n);

    n_bar = n/p;
    rem   = n % p;

    /* Compute the local upper limit index: the last process (p-1)
     * also handles the rem spillover rows. */
    loc_up_lim = ((my_rank != p-1) ? n_bar : n_bar + rem);
    /* printf("my_rank=%d; loc_up_lim = %d\n", my_rank, loc_up_lim);
       Print_vector("b_local", b_local, n, my_rank, p);
       printf("rem = %d\n", rem); */

    /* Initialize x_temp1 */
    MPI_Allgather(b_local, n_bar, MPI_FLOAT, x_temp1,
        n_bar, MPI_FLOAT, MPI_COMM_WORLD);
    if (rem != 0) {
        /* Get the additional elements held in process p-1 into x_temp1: */
        if (my_rank == p-1) {
            /* First fill processor p-1's data into x_temp1 */
            for (i = 0; i < rem; i++)
                x_temp1[n_bar*p + i] = b_local[n_bar + i];
        }
        /* Then distribute the spillover elements to every process */
        MPI_Bcast(&(x_temp1[n_bar*p]), rem, MPI_FLOAT, p-1,
            MPI_COMM_WORLD);
    }

    x_new = x_temp1;
    x_old = x_temp2;
    iter_num = 0;
    do {
        iter_num++;

        /* Interchange x_old and x_new */
        Swap(x_old, x_new);

        for (i_local = 0; i_local < loc_up_lim; i_local++) {
            i_global = i_local + my_rank*n_bar;
            x_local[i_local] = b_local[i_local];
            for (j = 0; j < i_global; j++)
                x_local[i_local] = x_local[i_local] -
                    A_local[i_local][j]*x_old[j];
            for (j = i_global+1; j < n; j++)
                x_local[i_local] = x_local[i_local] -
                    A_local[i_local][j]*x_old[j];
            x_local[i_local] = x_local[i_local]/
                A_local[i_local][i_global];
        }

        /* Collect the updated blocks into x_new on every process */
        MPI_Allgather(x_local, n_bar, MPI_FLOAT, x_new,
            n_bar, MPI_FLOAT, MPI_COMM_WORLD);
        if (rem != 0) {
            /* Fill in and distribute process p-1's spillover elements */
            if (my_rank == p-1) {
                for (i = 0; i < rem; i++)
                    x_new[n_bar*p + i] = x_local[n_bar + i];
            }
            MPI_Bcast(&(x_new[n_bar*p]), rem, MPI_FLOAT, p-1,
                MPI_COMM_WORLD);
        }
    } while ((iter_num < max_iter) &&
             (Distance(x_new, x_old, n) >= tol));

    if (Distance(x_new, x_old, n) < tol)
        return 1;
    else
        return 0;
}  /* Parallel_jacobi */

/*********************************************************************/
float Distance(float x[], float y[], int n) {
    int   i;
    float sum = 0.0;

    for (i = 0; i < n; i++) {
        sum = sum + (x[i] - y[i])*(x[i] - y[i]);
    }
    return sqrt(sum);
}  /* Distance */

/*********************************************************************/
void Read_matrix(
        char*     prompt   /* in  */,
        MATRIX_T  A_local  /* out */,
        int       n        /* in  */,
        int       my_rank  /* in  */,
        int       p        /* in  */) {

    int         i, j;
    MATRIX_T    temp;
    int         n_bar, rem;
    MPI_Status  status;

    n_bar = n/p;
    rem   = n % p;

    /* Fill dummy entries in temp with zeroes */
    for (i = 0; i < n; i++)
        for (j = n; j < MAX_DIM; j++)
            temp[i][j] = 0.0;

    if (my_rank == 0) {
        printf("%s\n", prompt);
        for (i = 0; i < n; i++)
            for (j = 0; j < n; j++)
                scanf("%f", &temp[i][j]);
    }
    MPI_Scatter(temp, n_bar*MAX_DIM, MPI_FLOAT, A_local,
        n_bar*MAX_DIM, MPI_FLOAT, 0, MPI_COMM_WORLD);
    if (rem != 0) {
        /* Any remaining rows go to process p-1: */
        if (my_rank == 0) {
            MPI_Send(&(temp[n_bar*p][0]), rem*MAX_DIM, MPI_FLOAT,
                p-1, 0, MPI_COMM_WORLD);
        }
        if (my_rank == p-1) {
            MPI_Recv(&(A_local[n_bar][0]), rem*MAX_DIM, MPI_FLOAT,
                0, 0, MPI_COMM_WORLD, &status);
        }
    }
}  /* Read_matrix */

/*********************************************************************/
void Read_vector(
        char*  prompt    /* in  */,
        float  x_local[] /* out */,
        int    n         /* in  */,
        int    my_rank   /* in  */,
        int    p         /* in  */) {

    int         i;
    float       temp[MAX_DIM];
    int         n_bar, rem;
    MPI_Status  status;

    n_bar = n/p;
    rem   = n % p;

    if (my_rank == 0) {
        printf("%s\n", prompt);
        for (i = 0; i < n; i++)
            scanf("%f", &temp[i]);
    }
    MPI_Scatter(temp, n_bar, MPI_FLOAT, x_local, n_bar,
        MPI_FLOAT, 0, MPI_COMM_WORLD);
    if (rem != 0) {
        /* Any remaining elements go to the last process, p-1: */
        if (my_rank == 0) {
            MPI_Send(&(temp[n_bar*p]), rem, MPI_FLOAT, p-1, 1,
                MPI_COMM_WORLD);
        }
        if (my_rank == p-1) {
            MPI_Recv(&(x_local[n_bar]), rem, MPI_FLOAT, 0, 1,
                MPI_COMM_WORLD, &status);
        }
    }
}  /* Read_vector */
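/*********************************************************************/
/* Illustration only (not part of the original PPMPI code): with the
 * block distribution used above, process q < p-1 owns global rows
 * q*n_bar .. (q+1)*n_bar - 1 (n_bar = n/p), and process p-1 also
 * owns the rem = n % p spillover rows n_bar*p .. n-1.  Owner_of_row
 * is a hypothetical helper that makes this mapping explicit; it
 * assumes n >= p so that n_bar > 0.                                  */
static int Owner_of_row(int i_global, int n, int p) {
    int n_bar = n/p;
    int q     = i_global/n_bar;
    return (q < p) ? q : p-1;   /* spillover rows belong to p-1 */
}  /* Owner_of_row */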
/*********************************************************************/
void Print_matrix(
        char*     title    /* in */,
        MATRIX_T  A_local  /* in */,
        int       n        /* in */,
        int       my_rank  /* in */,
        int       p        /* in */) {

    int         i, j;
    MATRIX_T    temp;
    int         n_bar, rem;
    MPI_Status  status;

    n_bar = n/p;
    rem   = n % p;

    MPI_Gather(A_local, n_bar*MAX_DIM, MPI_FLOAT, temp,
        n_bar*MAX_DIM, MPI_FLOAT, 0, MPI_COMM_WORLD);
    if (rem != 0) {
        /* Send the remaining rows on process p-1 to process 0 */
        if (my_rank == p-1) {
            MPI_Send(&(A_local[n_bar][0]), rem*MAX_DIM, MPI_FLOAT,
                0, 2, MPI_COMM_WORLD);
        }
        if (my_rank == 0) {
            MPI_Recv(&(temp[n_bar*p][0]), rem*MAX_DIM, MPI_FLOAT,
                p-1, 2, MPI_COMM_WORLD, &status);
        }
    }

    if (my_rank == 0) {
        printf("%s\n", title);
        for (i = 0; i < n; i++) {
            for (j = 0; j < n; j++)
                printf("%10.6f ", temp[i][j]);
            printf("\n");
        }
    }
}  /* Print_matrix */

/*********************************************************************/
void Print_vector(
        char*  title     /* in */,
        float  x_local[] /* in */,
        int    n         /* in */,
        int    my_rank   /* in */,
        int    p         /* in */) {

    int         i;
    float       temp[MAX_DIM];
    int         n_bar, rem;
    MPI_Status  status;

    n_bar = n/p;
    rem   = n % p;

    MPI_Gather(x_local, n_bar, MPI_FLOAT, temp, n_bar,
        MPI_FLOAT, 0, MPI_COMM_WORLD);
    if (rem != 0) {
        /* Send the remaining elements on process p-1 to process 0 */
        if (my_rank == p-1) {
            MPI_Send(&(x_local[n_bar]), rem, MPI_FLOAT, 0, 3,
                MPI_COMM_WORLD);
        }
        if (my_rank == 0) {
            MPI_Recv(&(temp[n_bar*p]), rem, MPI_FLOAT, p-1, 3,
                MPI_COMM_WORLD, &status);
        }
    }

    if (my_rank == 0) {
        printf("%s\n", title);
        for (i = 0; i < n; i++)
            printf("%12.8f ", temp[i]);
        printf("\n");
    }
}  /* Print_vector */
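/*********************************************************************/
/* Example usage (an assumption, not from the original source: any
 * standard MPI toolchain such as MPICH or Open MPI; the file names
 * below are illustrative):
 *
 *     mpicc -o parallel_jacobi parallel_jacobi.c -lm
 *     mpirun -np 4 ./parallel_jacobi < input.txt
 *
 * where input.txt holds n, tol, and max_iter, followed by the n x n
 * entries of A in row-major order and then the n entries of b.       */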