#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"
/* Exercise 6.10.2, see Chap. 6, p. 109 in PPMPI, by John Weatherwax.

   Here we read from a file a matrix stored in row-major order and
   distribute it across the individual processes in block-column
   order.  Each process creates a vector of ones and we compute the
   global matrix-vector product.
*/
#define MAX_ORDER 100

typedef float LOCAL_MATRIX_T[MAX_ORDER][MAX_ORDER];

int main( int argc, char* argv[] ){
    int            my_rank;
    int            p;
    LOCAL_MATRIX_T local_A;
    float          local_x[MAX_ORDER];
    float          global_b[MAX_ORDER];
    int            n;
    int            local_n;

    /* declarations: */
    void Read_Row_Major_Matrix(LOCAL_MATRIX_T local_A, float local_x[],
                               int *local_n, int *n, int my_rank, int p);
    void Print_Block_Col_Order(LOCAL_MATRIX_T local_A, int local_n, int n,
                               int my_rank, int p);
    void Mult_Mat_by_Vect(LOCAL_MATRIX_T local_A, float local_x[], int n,
                          int local_n, int my_rank, float global_b[]);
    void PrintResult(int n, int my_rank, float global_b[]);

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&p);
    MPI_Comm_rank(MPI_COMM_WORLD,&my_rank);

    Read_Row_Major_Matrix(local_A,local_x,&local_n,&n,my_rank,p);
    Mult_Mat_by_Vect(local_A,local_x,n,local_n,my_rank,global_b);
    Print_Block_Col_Order(local_A,local_n,n,my_rank,p);
    PrintResult(n,my_rank,global_b);

    MPI_Finalize();
    return 0;
}

/**********************************************************************/
void Mult_Mat_by_Vect(
         LOCAL_MATRIX_T  local_A    /* in  */,
         float           local_x[]  /* in  */,
         int             n          /* in  */,
         int             local_n    /* in  */,
         int             my_rank    /* in  */,
         float           global_b[] /* out */){

    float local_b[MAX_ORDER];
    int   i, j, xIndx;

    /* Sum the local contributions to the matrix-vector product.
       Process my_rank owns the block column holding global columns
       local_n*my_rank ... local_n*(my_rank+1)-1. */
    for( i=0; i < n; i++ ){
        local_b[i] = 0.0;
        for( j=0; j < local_n; j++ ){
            xIndx = local_n*my_rank + j;
            local_b[i] = local_b[i] + local_A[i][j]*local_x[xIndx];
        }
    }

    /* Reduce all partial-sum results ... this may not be the best
       way ... perhaps a derived type would be better. */
    for( i=0; i < n; i++ ){
        MPI_Allreduce(&(local_b[i]),&(global_b[i]),1,MPI_FLOAT,
                      MPI_SUM,MPI_COMM_WORLD);
    }
}
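/**********************************************************************/
/* A note on the reduction loop above: since local_b and global_b are
   contiguous buffers, the n single-element collectives can be
   collapsed into one MPI_Allreduce with count n -- no derived type is
   needed for this particular pattern.  The helper below is a minimal
   sketch added for illustration only; it is not called anywhere in
   this program. */
void Reduce_Partial_Products(
         float  local_b[]  /* in  */,
         float  global_b[] /* out */,
         int    n          /* in  */){

    /* one collective call sums all n partial entries across processes */
    MPI_Allreduce(local_b,global_b,n,MPI_FLOAT,MPI_SUM,MPI_COMM_WORLD);
}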
/**********************************************************************/
void PrintResult(
         int    n          /* in */,
         int    my_rank    /* in */,
         float  global_b[] /* in */){

    int i;

    if( my_rank==0 ){
        for( i=0; i < n; i++ ){
            printf("%7.3f ",global_b[i]);
        }
        printf("\n");
    }
}

/**********************************************************************/
void Print_Block_Col_Order(
         LOCAL_MATRIX_T  local_A /* in */,
         int             local_n /* in */,
         int             n       /* in */,
         int             my_rank /* in */,
         int             p       /* in */){

    int          i, j, bri, bci, start_row, start_col, tag;
    float        local_A0[MAX_ORDER][MAX_ORDER];
    MPI_Datatype loc_sub_mat_t;
    MPI_Status   status;

    /* Create an MPI type used in sending data to process 0: local_n
       blocks of local_n floats, separated by a stride of MAX_ORDER
       floats (one row of the statically sized array), i.e. one
       local_n x local_n submatrix.
       Q: WAS THE DECLARATION OF THIS TYPE (BELOW) SUFFICIENT? */
    MPI_Type_vector(local_n,local_n,MAX_ORDER,MPI_FLOAT,&loc_sub_mat_t);
    MPI_Type_commit(&loc_sub_mat_t);

    /* Every process (including 0) sends its p local blocks to process
       0 before process 0 posts any receives, so this step relies on
       the MPI implementation buffering these small messages.  The tag
       encodes the block's global coordinates: block row bri, block
       column my_rank. */
    for( bri=0; bri < p; bri++ ){
        start_row = bri*local_n;
        start_col = 0;
        tag       = my_rank + p * bri;
        MPI_Send(&(local_A[start_row][start_col]),1,loc_sub_mat_t,0,tag,
                 MPI_COMM_WORLD);
    }

    /* Print these suckers... */
    if( my_rank==0 ){
        for( bri=0; bri < p; bri++ ){
            for( bci=0; bci < p; bci++ ){
                start_col = bci*local_n;
                start_row = bri*local_n;
                tag       = bci + p * bri;
                MPI_Recv(&(local_A0[start_row][start_col]),1,loc_sub_mat_t,
                         MPI_ANY_SOURCE,tag,MPI_COMM_WORLD,&status);
            }
        }
        /* All submatrices received ... as process 0 I'll do my thing
           and print the assembled matrix. */
        for( i=0; i < n; i++ ){
            for( j=0; j < n; j++ ){
                printf("%7.2f ",local_A0[i][j]);
            }
            printf("\n");
        }
        printf("\n");
    }
    MPI_Type_free(&loc_sub_mat_t);
}

/**********************************************************************/
void Read_Row_Major_Matrix(
         LOCAL_MATRIX_T  local_A   /* out */,
         float           local_x[] /* out */,
         int            *local_n   /* out */,
         int            *n         /* out */,
         int             my_rank   /* in  */,
         int             p         /* in  */){

    int   i, j, bri, bci;
    int   send_start_row, send_start_col;
    int   recv_start_row, recv_start_col;
    char  mat_file[] = "prob_6.10.1.b.dat";
    FILE *fp;
    float local_A0[MAX_ORDER][MAX_ORDER];
    MPI_Datatype loc_sub_mat_t;
    MPI_Status   status;

    /* Use the rank==0 process as gatekeeper for all input data...
       this means we read all the data first and then send it out. */
    if( my_rank==0 ){
        if( (fp=fopen(mat_file,"r")) == NULL ){
            printf("Error opening file...exiting\n");
            /* abort the whole job; finalizing and exiting on rank 0
               alone would leave the other ranks blocked in the
               broadcasts below */
            MPI_Abort(MPI_COMM_WORLD,1);
        }
        /* read in the matrix size (n x n): */
        fscanf(fp,"%d",n);
        printf("n = %d\n",*n);
        /* compute the local block row/column size: */
        (*local_n) = (*n)/p;
        printf("local_n = %d\n",*local_n);
        /* read the matrix (row-major order) into local storage on
           process 0: */
        for( i=0; i < (*n); i++ ){
            for( j=0; j < (*n); j++ ){
                fscanf(fp,"%f",&local_A0[i][j]);
            }
        }
        fclose(fp);
        /* for debugging ... print the matrix */
        for( i=0; i < (*n); i++ ){
            for( j=0; j < (*n); j++ ){
                printf("%f ",local_A0[i][j]);
            }
            printf("\n");
        }
    } /* end data input processing for my_rank==0 */

    /* process 0 now has all the data; broadcast the sizes to all
       OTHER processes (MPI_INT, not the Fortran type MPI_INTEGER, is
       the datatype matching a C int)... */
    MPI_Bcast(n,1,MPI_INT,0,MPI_COMM_WORLD);
    MPI_Bcast(local_n,1,MPI_INT,0,MPI_COMM_WORLD);

    /* Create an MPI type used in sending data to the other processes */
    MPI_Type_vector((*local_n),(*local_n),MAX_ORDER,MPI_FLOAT,&loc_sub_mat_t);
    MPI_Type_commit(&loc_sub_mat_t);

    /* Issue sends and receives.  Global block (bri,bci) belongs to
       block COLUMN bci, so it is sent to process bci, which stores it
       at local rows bri*(*local_n); this matches the column indexing
       used in Mult_Mat_by_Vect.  Process 0's send to itself is posted
       immediately before the matching receive and relies on the MPI
       implementation buffering one block. */
    for( bri=0; bri < p; bri++ ){
        for( bci=0; bci < p; bci++ ){
            send_start_row = bri*(*local_n);
            send_start_col = bci*(*local_n);
            if( my_rank==0 ){
                MPI_Send(&(local_A0[send_start_row][send_start_col]),1,
                         loc_sub_mat_t,bci,0,MPI_COMM_WORLD);
            }
            recv_start_row = send_start_row;
            recv_start_col = 0;
            if( my_rank==bci ){
                MPI_Recv(&(local_A[recv_start_row][recv_start_col]),1,
                         loc_sub_mat_t,0,0,MPI_COMM_WORLD,&status);
            }
        }
    }
    MPI_Type_free(&loc_sub_mat_t);

    /* At this point the input matrix is distributed (in block
       columns) across all processes. */

    /* Create a vector of all ones in each local process */
    for( i=0; i < (*n); i++ )
        local_x[i] = 1.0;
}
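/**********************************************************************/
/* For reference, a hypothetical input file consistent with the fscanf
   calls in Read_Row_Major_Matrix: the order n first, followed by the
   n*n matrix entries in row-major order.  This example uses n = 4, so
   it divides evenly for p = 2 or p = 4 processes; the actual contents
   of "prob_6.10.1.b.dat" are not reproduced here.

       4
        1.0  2.0  3.0  4.0
        5.0  6.0  7.0  8.0
        9.0 10.0 11.0 12.0
       13.0 14.0 15.0 16.0
*/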