/****(c) 2020. A. Gerbessiotis. See ../KAagcopy.txt ****/
/*******************************************************/
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "avgaii.h"
#include "aimisc.h"

/* operators for bspfold */
/* Fold operator: store the larger of *x and *y in *z.
 * When the two compare equal, or the comparison is false for any
 * other reason, the value taken is *y unless *x >= *y holds. */
void foldmax(double *z, double *x, double *y)
{
  const double lhs = *x;
  const double rhs = *y;

  if (lhs >= rhs) {
    *z = lhs;
  } else {
    *z = rhs;
  }
}
/* Fold operator: store the smaller of *x and *y in *z.
 * *x is chosen only when *x < *y holds; otherwise *y is stored. */
void foldmin(double *z, double *x, double *y)
{
  const double lhs = *x;
  const double rhs = *y;

  if (lhs < rhs) {
    *z = lhs;
  } else {
    *z = rhs;
  }
}
/* Fold operator: store the integer sum *x + *y in *z. */
void foldsum(int *z,int *x,int *y)
{
  const int lhs = *x;
  const int rhs = *y;

  *z = lhs + rhs;
}
/* void oper_add(idata *result,idata *left,idata *right,int size)
{
        register int    i;
        for (i=0;i<size;i++)
                result[i]=left[i]+right[i];
} */

/* One-superstep broadcast (written to distribute command line args).
 *
 *   fromp : id of the processor that owns the data to broadcast
 *   from  : source buffer (read on processor fromp only)
 *   to    : destination buffer; registered and deregistered here
 *   size  : amount transferred and registered (presumably bytes, as the
 *           buffers are char* -- confirm against the AIHPPUT definition)
 *
 * Processor fromp issues one put per processor (itself included), each
 * targeting offset 0 of `to`.  All other processors only take part in
 * the registration and the communication start/end calls. */
void bspbroad(int fromp, char *from, char *to, int size)
{
 register int pid, nprocs, dest;

 pid = AIPID();
 nprocs = AINPROCS();

 /* registration is internal: the caller does not register `to` */
 AIREGISTER(to,size);
 AICOMMIT();

 AICOMSTART();
 if (pid == fromp) {
    /* only the source processor sends */
    for (dest = 0; dest < nprocs; dest++) {
       AIHPPUT(dest,from,to,0,size);
    }
 }
 AICOMEND();

 AIDEREGISTER((void*)to);
}

/* One-superstep scatter.
 *
 *   fromp : id of the processor that owns the data to distribute
 *   from  : source buffer on fromp; block number k starts at from + k*size
 *   to    : destination buffer; registered and deregistered here
 *   size  : length of one block (presumably bytes, as the buffers are
 *           char* -- confirm against the AIHPPUT definition)
 *
 * Processor fromp puts block k of `from` at offset 0 of `to` on
 * processor k, for every k in [0, nprocs).  The remaining processors
 * only take part in registration and the communication start/end. */
void bspscatter(int fromp, char *from, char *to, int size)
{
 register int pid, nprocs, dest;

 pid = AIPID();
 nprocs = AINPROCS();

 /* registration is internal: the caller does not register `to` */
 AIREGISTER(to,size);
 AICOMMIT();

 AICOMSTART();
 if (pid == fromp) {
    /* only the source processor sends, one distinct block per target */
    for (dest = 0; dest < nprocs; dest++) {
       AIHPPUT(dest,from+dest*(size),to,0,size);
    }
 }
 AICOMEND();

 AIDEREGISTER((void*)to);
}

/* One-superstep gather of equally sized blocks.
 *
 *   top  : id of the processor that collects the data
 *   from : this processor's block
 *   to   : destination buffer; registered here with nprocs*size on
 *          every processor and deregistered before returning
 *   size : length of one block (presumably bytes, as the buffers are
 *          char* -- confirm against the AIHPPUT definition)
 *
 * Every processor puts its block into `to` on processor `top` at offset
 * pid*size, so the blocks land there in processor order. */
void bspgather(int top, char *from, char *to, int size)
{
 register int pid,nprocs;

 pid=AIPID(); nprocs=AINPROCS();
 AIREGISTER(to,nprocs*size);   /* ALL REGISTRATIONS INTERNAL */
 AICOMMIT();
 AICOMSTART();
 /* one put per processor, addressed to the caller-supplied collector */
 AIHPPUT(top,from,to,pid*size,size);
 AICOMEND();
 AIDEREGISTER((void*)to);
}

/* One-superstep gather of blocks whose lengths differ per processor.
 *
 *   top    : id of the processor that collects the data
 *   from   : this processor's block
 *   to     : destination buffer; registered here with
 *            (sum of rrsize[0..nprocs-1])*size and deregistered before
 *            returning
 *   size   : scale factor applied to every count and offset (presumably
 *            the element size in bytes -- confirm against callers)
 *   rrsize : rrsize[k] is the number of elements processor k contributes
 *   prsize : prsize[k] is the element offset in `to` at which processor
 *            k's block is stored (presumably the exclusive prefix sums
 *            of rrsize -- confirm against callers)
 *
 * Every processor puts rrsize[pid]*size from `from` into `to` on
 * processor `top` at offset prsize[pid]*size. */
void bspunevengather(int top, char *from, char *to, int size, int* rrsize,int* prsize)
{
 register int pid,nprocs,i,total;

 pid=AIPID(); nprocs=AINPROCS();

 /* the registered extent must hold the contributions of all processors */
 total=0;
 for(i=0;i<nprocs;i++) total +=rrsize[i];

 AIREGISTER(to,total*size);   /* ALL REGISTRATIONS INTERNAL */
 AICOMMIT();
 AICOMSTART();
 /* one put per processor, addressed to the caller-supplied collector */
 AIHPPUT(top,from,to,prsize[pid]*size,rrsize[pid]*size);
 AICOMEND();
 AIDEREGISTER((void*)to);
}

/* One-superstep fold (reduction), used e.g. for the global min/max of
 * timing data.
 *
 *   oper : binary operator, called as oper(result, left, right)
 *   from : this processor's operand, `size` bytes
 *   to   : receives the folded value, `size` bytes
 *   size : operand length in bytes
 *
 * With a single processor the operand is simply copied to `to`.
 * Otherwise every processor sends its operand to every processor, and
 * each one folds the nprocs collected operands locally from left to
 * right, so all processors finish with the result in `to`.
 * Uses O(nprocs*size) temporary memory per processor; the program is
 * terminated if that memory cannot be obtained. */
void 
  bspfold(void (*oper)(void *,void*,void*),char *from,char *to,int size)
{
 register int  pid, nprocs, i, lsize; 
 char *list;

 pid=AIPID(); nprocs=AINPROCS(); 
 if (1==nprocs) {  /* if local, copy */
    memcpy(to,from,size*sizeof(char));
    return;
 }

 /* if global, allocate memory O(p) per processor */
 lsize = nprocs*size; 
 list = malloc(lsize*sizeof(char));
 /* a zero-byte request may legitimately yield NULL; only a failed
  * non-empty request is fatal */
 if (list == NULL && lsize > 0) {
    fprintf(stderr,"bspfold: unable to allocate %d bytes\n",lsize);
    exit(EXIT_FAILURE);
 }
 AIREGISTER(list,lsize);   /* ALL REGISTRATIONS INTERNAL */
 AICOMMIT();

 AICOMSTART();
 /* each proc sends its from to every other proc; results stored in list */
 for (i = 0; i < nprocs;i++) {  
   AIHPPUT(i,from,list,pid*size,size);
 }
 AICOMEND();

 /* apply operator to local list: slot i+1 is overwritten with the fold
  * of slots 0..i+1, so after the last step `to` holds the full fold */
 for (i = 0; i < nprocs-1;i++) {
   oper(to,(list+i*size),(list+(i+1)*size));
   memcpy((list+(i+1)*size),to,size);
 }

 /* free memory and deregister */
 AIDEREGISTER(list);
 free(list);
}

/* Parallel prefix (scan) over a vector of `multi` elements in two
 * communication phases, followed by a third phase that broadcasts the
 * overall totals.
 *
 *   operator : called as operator(result, left, right, count); here
 *              always invoked with result == right (presumably
 *              result[j] = left[j] op right[j] for j < count)
 *   multi    : number of elements in each processor's vector
 *   from     : this processor's input vector (multi*nbytes bytes)
 *   to       : receives the prefix vector: element j combines element j
 *              of processors 0..pid
 *   all      : receives element j combined over all processors
 *   nbytes   : size of one element in bytes
 *
 * The vector is cut into nprocs blocks; per the CEIL/FLOOR/LEFT helpers
 * (project macros, presumably ceil(multi/nprocs), floor(multi/nprocs)
 * and the remainder) the first left_temp blocks have ceil_temp elements
 * and the rest floor_temp.  Processor i is responsible for block i.
 * `to`, `all` and the temporary buffer are registered and deregistered
 * here.  The program is terminated if the temporary buffer cannot be
 * allocated. */
void ai2scan(void (*operator)(),int multi,char *from,char *to, char *all,int nbytes)
{
 register int   pid,nprocs, i, size, sze;
 register int   offset, ceil_temp,left_temp,floor_temp;
 register int   tbytes;                 /* size of temp in bytes */
 char           *temp;                  /* buffer array */

 pid = AIPID(); nprocs = AINPROCS();
 /* Split into blocks of either floor(multi/nprocs) or ceil(multi/nprocs) */
 left_temp = LEFT(multi,nprocs);
 ceil_temp = CEIL(multi,nprocs);  /* CEIL  if  pid <  left_temp */
 floor_temp = FLOOR(multi,nprocs);/* FLOOR if  pid >= left_temp */

 /* allocate temp space and check; room for nprocs blocks of the
  * larger block size */
 tbytes = ceil_temp*nprocs*nbytes;
 temp = malloc(tbytes);
 /* a zero-byte request may legitimately yield NULL; only a failed
  * non-empty request is fatal */
 if (temp == NULL && tbytes > 0) {
    fprintf(stderr,"ai2scan: unable to allocate %d bytes\n",tbytes);
    exit(EXIT_FAILURE);
 }
 AIMREGISTER(temp,tbytes);
 AIMCOMMIT();

 /* In first phase, each processor sends its i-th block to
  * processor i, which stores the block from processor pid in
  * slot pid of its temp buffer */
 AIMCOMSTART(temp);
 for (i=0;i<nprocs;i++) {
    if (i < left_temp) {
          sze= ceil_temp; /* send a ceil_temp block */
          offset=i*sze;   /* starting at... */
    } else {
          sze= floor_temp; /* send a floor_temp block */
          offset=left_temp*ceil_temp+(i-left_temp)*sze;
    }
    AIMHPPUT(i,&from[offset*nbytes],temp,pid*sze*nbytes,sze*nbytes);
 }
 AIMCOMEND(temp); 

 /* In second phase a local prefix operation is performed */

 /* Did pid get a ceil_temp or floor_temp block? */
 if (pid < left_temp)
       size=ceil_temp;
    else
       size=floor_temp;

 /* slot i becomes the combination of slots 0..i */
 for(i=1;i<nprocs;i++) {
    operator(&temp[i*size*nbytes],&temp[(i-1)*size*nbytes],
                                  &temp[i*size*nbytes],size);
 }

 /* Registration for put operations */
 AIMREGISTER(to,multi*nbytes);
 AIMCOMMIT();

 /* In third phase the computed prefixes are sent back */
 /* Where does pid send this block?  (sze equals size here) */
 if (pid < left_temp) {
         sze= ceil_temp;
         offset=pid*sze;
 } else {
         sze= floor_temp;
         offset=left_temp*ceil_temp+(pid-left_temp)*sze;
 }
 AIMCOMSTART(to);
 /* processor i receives slot i, i.e. the prefix over processors 0..i */
 for (i=0;i<nprocs;i++) {
         AIMPUT(i,&temp[i*size*nbytes],to,offset*nbytes,size*nbytes);
 }
 AIMCOMEND(to);

 /* Broadcast phase: the last slot holds the combination over all
  * processors and is sent to everybody */
 AIMREGISTER(all,multi*nbytes);
 AIMCOMMIT();
 AIMCOMSTART(all);
 for (i=0;i<nprocs;i++) {
         AIMPUT(i,&temp[(nprocs-1)*size*nbytes],all,offset*nbytes,size*nbytes);
 }
 AIMCOMEND(all);
 AIBARRIER();            /* NOT NEEDED JUST IN CASE */
 /* free memory and deregister information */
 AIMDEREGISTER(all);
 AIMDEREGISTER(to );
 AIMDEREGISTER(temp);
 free(temp);
}

/* One-superstep simple scan (inclusive prefix sum) of int vectors.
 *
 *   from : this processor's input vector, `size` ints
 *   to   : receives, for each k < size, the sum of element k over
 *          processors 0..pid
 *   size : number of ints in each processor's vector
 *
 * With a single processor the input is simply copied to `to`.
 * Otherwise every processor sends its vector to every processor and
 * each one sums the first pid+1 collected vectors locally.
 * Uses O(nprocs*size) temporary memory per processor; the program is
 * terminated if that memory cannot be obtained. */
void aiscan(int  *from,int  *to,int size)
{
 register int  pid, nprocs, i, k,lsize;
 int  *list;

 pid=AIPID(); nprocs=AINPROCS();
 if (1==nprocs){  /* if local, copy */
    memcpy(to,from,size*sizeof(int));
    return;
 }

 /* if global, allocate memory O(p) per processor */
 lsize = nprocs*size*sizeof(int);
 list = malloc(lsize);
 /* a zero-byte request may legitimately yield NULL; only a failed
  * non-empty request is fatal */
 if (list == NULL && lsize > 0) {
    fprintf(stderr,"aiscan: unable to allocate %d bytes\n",lsize);
    exit(EXIT_FAILURE);
 }
 AIREGISTER(list,lsize);
 AICOMMIT();

 AICOMSTART();
 /* each proc sends its from to every other proc; results stored in list,
  * with the vector of processor p occupying ints [p*size, (p+1)*size) */
 for (i = 0; i < nprocs;i++) {
   AIPUT(i,(char *)from,(char *)list,pid*size*sizeof(int),size*sizeof(int));
 }
 AICOMEND();

 /* start from processor 0's vector ... */
 for (k = 0; k < size; k++) {
   to[k]=list[k];
 }
 /* ... and add the vectors of processors 1..pid element by element;
  * every vector holds `size` ints, so that is the loop bound */
 for (i = 1; i <= pid;i++) {
   for (k = 0; k < size; k++) {
     to[k] =to[k]+list[i*size+k];
   }
 }

 /* free memory and deregister */
 AIDEREGISTER(list);
 free(list);
}

/* One-superstep simple parallel prefix with one int per processor.
 *
 *   from : this processor's single int
 *   list : caller-supplied buffer of nprocs ints; on return list[p]
 *          holds the value contributed by processor p
 *   to   : caller-supplied buffer of nprocs ints; on return to[p] is
 *          the sum of list[0..p-1] (exclusive prefix), so to[0] is 0
 *
 * Both buffers are zeroed first.  With a single processor only list[0]
 * is filled in and to[0] stays 0.  `list` is registered and
 * deregistered here. */
void aippf(int  *from,int *list, int  *to)
{
 register int  pid, nprocs, i;

 pid=AIPID(); nprocs=AINPROCS();
 memset(list,0,nprocs*sizeof(int));
 memset(to,0,nprocs*sizeof(int));
 if (1==nprocs){  /* if local, copy */
    memcpy(list,from,sizeof(int));
    return;
 }

 AIREGISTER(list,nprocs*sizeof(int));
 AICOMMIT();

 AICOMSTART();
 /* each proc sends its from to every other proc; results stored in list */
 for (i = 0; i < nprocs;i++) {
   AIPUT(i,(char *)from,(char *)list,pid*sizeof(int),sizeof(int));
 }
 AICOMEND();

 /* exclusive prefix sums of list; every entry of `to` is assigned
  * here, so no preliminary copy of list into to is needed */
 to[0]=0;
 for(i=1;i<nprocs;i++) {
   to[i] =list[i-1]+to[i-1];
 }

 /* deregister; list itself belongs to the caller */
 AIDEREGISTER(list);
 AIBARRIER();
}


/* Parallel prefix over a vector of `multi` elements in two
 * communication phases.
 *
 *   oper   : called as oper(result, left, right, count); here always
 *            invoked with result == right (presumably
 *            result[j] = left[j] op right[j] for j < count)
 *   multi  : number of elements in each processor's vector
 *   from   : this processor's input vector (multi*nbytes bytes)
 *   to     : receives the prefix vector: element j combines element j
 *            of processors 0..pid
 *   nbytes : size of one element in bytes
 *
 * The vector is cut into nprocs blocks; per the CEIL/FLOOR/LEFT helpers
 * (project macros, presumably ceil(multi/nprocs), floor(multi/nprocs)
 * and the remainder) the first left_temp blocks have ceil_temp elements
 * and the rest floor_temp.  Processor i is responsible for block i.
 * `to` and the temporary buffer are registered and deregistered here.
 * The program is terminated if the temporary buffer cannot be
 * allocated. */
void ai2prefix(void (*oper)(),int multi,char *from,char *to, int nbytes)
{
 register int   pid, nprocs;
 register int   i,size,sze,offset;    /* index variables */
 register int   ceil_temp,left_temp,floor_temp;
 register int   tbytes;                 /* size of temp in bytes */
 char           *temp;                  /* buffer array */

 pid = AIPID(); nprocs = AINPROCS();
 /* split array of size multi into blocks of size either*/
 /* floor(multi/nprocs) or ceil(multi/nprocs). Assign:  */
 left_temp = LEFT(multi,nprocs);
 ceil_temp = CEIL(multi,nprocs);  /* to procs less than left_temp */
 floor_temp = FLOOR(multi,nprocs);/* to procs at least left_temp */

 /* allocate temp space and check; room for nprocs blocks of the
  * larger block size */
 tbytes = ceil_temp*nprocs*nbytes;
 temp = malloc(tbytes);
 /* a zero-byte request may legitimately yield NULL; only a failed
  * non-empty request is fatal */
 if (temp == NULL && tbytes > 0) {
    fprintf(stderr,"ai2prefix: unable to allocate %d bytes\n",tbytes);
    exit(EXIT_FAILURE);
 }
 AIMREGISTER(temp,tbytes);
 AIMCOMMIT();


 /* In first phase, each processor sends its i-th block to
  * processor i, which stores the block from processor pid in
  * slot pid of its temp buffer */
 AIMCOMSTART(temp);  
 for (i=0;i<nprocs;i++) {
    if (i < left_temp) {
          sze= ceil_temp; /* send a ceil_temp block */
          offset=i*sze;   /* starting at... */
    } else {
          sze= floor_temp; /* send a floor_temp block */
          offset=left_temp*ceil_temp+(i-left_temp)*sze;
    }
    AIMHPPUT(i,&from[offset*nbytes],temp,pid*sze*nbytes,sze*nbytes);
 }
 AIMCOMEND(temp);

 /* In second phase a local prefix operation is performed */

 /* Did pid get a ceil_temp or floor_temp block? */
 if (pid < left_temp)
       size=ceil_temp;
    else
       size=floor_temp;

 /* slot i becomes the combination of slots 0..i */
 for(i=1;i<nprocs;i++) {
    oper(&temp[i*size*nbytes],&temp[(i-1)*size*nbytes],
                                  &temp[i*size*nbytes],size);
 }

 /* Registration for put operations */
 AIMREGISTER(to,multi*nbytes);
 AIMCOMMIT();

 /* In third phase the computed prefixes are sent back */
 /* Where does pid send this block?  (sze equals size here) */
 if (pid < left_temp) {
         sze= ceil_temp;
         offset=pid*sze;
 } else {
         sze= floor_temp;
         offset=left_temp*ceil_temp+(pid-left_temp)*sze;
 }
 AIMCOMSTART(to);
 /* processor i receives slot i, i.e. the prefix over processors 0..i */
 for (i=0;i<nprocs;i++) {
         AIMPUT(i,&temp[i*size*nbytes],to,offset*nbytes,size*nbytes);
 }
 AIMCOMEND(to);


 /* free memory and deregister information */
 AIBARRIER();
 AIMDEREGISTER(to);
 AIMDEREGISTER(temp);
 free(temp);
}
