#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <unistd.h>

/*
 * exported
 *
 * Every argument is passed by pointer; the scalar ones (nxp, pxp, kxp,
 * intercept, incy, inttype) are read through element [0] only.
 *
 * Common conventions, as used by the definitions in this file:
 *   nx  = *nxp  number of rows of xmat
 *   p   = *pxp  number of variables per row
 *   k   = *kxp  number of columns per variable
 *   xmat        read as nx consecutive rows of p*k doubles
 *   px2         rowlength(p, k, intercept, incy, inttype): the length of
 *               one expanded row z_i (optional leading 1, the p*k raw
 *               columns, then the products written by product())
 */

/* res[0..px2) = sum_i wts[i] * (z_i . yvec) * z_i;  yvec is indexed 0..px2-1. */
void XtWXy(int *nxp, int *pxp, int *kxp, 
   int *intercept, int *incy, int *inttype, 
   double *xmat, double *wts, double *yvec, double *res);

/* res[0..px2) = sum_i yvec[i] * z_i;  yvec is indexed 0..nx-1. */
void Xty(int *nxp, int *pxp, int *kxp, 
   int *intercept, int *incy, int *inttype, 
   double *xmat, double *yvec, double *res);

/* res[i] = z_i . yvec for i in 0..nx-1;  yvec is indexed 0..px2-1. */
void Xxy(int *nxp, int *pxp, int *kxp,
   int *intercept, int *incy, int *inttype, 
   double *xmat, double *yvec, double *res);

/* res[j1*px2 + j2] = sum_i wts[i] * z_i[j1] * z_i[j2]  (px2*px2 doubles). */
void XtWX(int *nxp, int *pxp, int *kxp, 
   int *interceptp, int *incyp, int *inttypep, 
   double *xmat, double *wts, double *res);

/* res[i*px2 + j] = z_i[j]  (nx*px2 doubles): the expanded matrix itself. */
void X(int *nxp, int *pxp, int *kxp, 
   int *interceptp, int *incyp, int *inttypep, 
   double *xmat, double *res);

/*
 * local
 *
 * NOTE: the parameter names in these prototypes do not match the roles the
 * definitions give them; the comments below describe the actual roles.
 */

/* product(p, k, incy, inttype, x, out): here `px`/`kx` are the variable
   count and the columns per variable, and the last parameter (`k`) is the
   output buffer that receives the pairwise products. */
double *product(int px, int kx, int incy, int inttype, double *x, double *k);
/* tproduct(k, x, y, ol, out, inttype): here `px` is the vector length,
   `kx` receives the number of values written and `k` is the output buffer. */
double *tproduct(int px, double *x, double *y, int *kx, double *k, int inttype);
/* fproduct(k, x, y, ol, out): same naming caveat as tproduct. */
double *fproduct(int px, double *x, double *y, int *kx, double *k);
/* rowlength(p, k, intercept, incy, inttype): length of one expanded row. */
int rowlength(int px, int kx, int intercept, int incy, int inttype);

/*
 * Small timing/smoke driver for the exported kernels.
 *
 * Prints the expanded row length, then runs XtWXy ten times on synthetic
 * data.  The loops with a zero iteration count are disabled benchmarks that
 * can be enabled by raising their bound.
 */
int main(int argc, char **argv)
{
   int nx = 1000;
   int px = 8;
   int kx = 15;
   int intercept=1,incy=1,inttype=3;
   int p2 = rowlength(px,kx,intercept,incy,inttype);
   int i;
   size_t n;
   /* Each row of xmat holds px*kx doubles (the kernels use a row stride of
      p*k), so the matrix needs nx*px*kx entries, not nx*px. */
   size_t nxmat = (size_t)nx * (size_t)px * (size_t)kx;

   (void)argc;
   (void)argv;

   printf("%d\n", p2);

   double *xmat = malloc(nxmat * sizeof(double));
   double *wts = malloc((size_t)nx * sizeof(double));
   double *beta = malloc((size_t)p2 * sizeof(double));
   double *yvec = malloc((size_t)nx * sizeof(double));
   /* res must hold either p2 values (XtWXy, Xty) or nx values (Xxy). */
   double *res = malloc((size_t)p2*sizeof(double) + (size_t)nx*sizeof(double));
   double *res2 = malloc((size_t)p2*(size_t)p2*sizeof(double) + (size_t)nx*sizeof(double));

   if (xmat == NULL || wts == NULL || beta == NULL || yvec == NULL || res == NULL || res2 == NULL) {
      fprintf(stderr,"Memory issues\n");
      exit(1);
   }

   /* Fill the inputs with deterministic values: the kernels read all of
      them, and malloc'ed memory is otherwise indeterminate. */
   for (n = 0; n < nxmat; n++) xmat[n] = (double)(n % 7) / 7.0;
   for (n = 0; n < (size_t)nx; n++) {
      wts[n] = 1.0;
      yvec[n] = (double)(n % 3);
   }
   for (n = 0; n < (size_t)p2; n++) beta[n] = 1.0 / (double)p2;

   for (i = 0; i < 0; i++) {
      Xxy(&nx, &px, &kx, &intercept, &incy, &inttype, xmat, beta, res ); 
      printf("Xxy done %d\n",i);
   }

   for (i = 0; i < 10; i++) {
      XtWXy(&nx, &px, &kx, &intercept, &incy, &inttype, xmat, wts, beta, res ); 
      printf("XtWXy done %d\n",i);
   }

   for (i = 0; i < 0; i++) {
      Xty(&nx, &px, &kx, &intercept, &incy, &inttype, xmat, yvec, res ); 
      printf("Xty done %d\n",i);
   }

   for (i = 0; i < 0; i++) {
      XtWX(&nx, &px, &kx, &intercept, &incy, &inttype, xmat, wts, res2 ); 
      printf("XtWX done %d\n",i);
   }

   /* write(STDOUT_FILENO, res, p2); */

   free(res2);
   free(res);
   free(yvec);
   free(beta);
   free(wts);
   free(xmat);
   return 0;
}

/*
 * Length of one expanded row.
 *
 *   p, k       number of variables and columns per variable (p*k raw columns)
 *   intercept  1 adds one leading column
 *   incy       1 means the last variable is handled separately, so the
 *              per-variable terms below count only p-1 variables
 *   inttype    bit mask selecting which extra terms are counted:
 *                1   pairs of distinct variables, k*k columns each
 *                2   within-variable pairs, k*(k-1)/2 columns per variable
 *                4   (incy only) k*k columns per counted variable
 *                8   (incy only) k*(k-1)/2 columns
 *                16  k extra columns per counted variable
 *                8 and 16 together (incy only) add k more columns
 */
int rowlength(int p, int k, int intercept, int incy, int inttype)
{
   const int with_y = (incy == 1);
   const int nvar = with_y ? p - 1 : p;
   int total = p * k;

   if (inttype & 1) total += (nvar*(nvar-1)/2)*k*k;
   if (inttype & 2) total += nvar * k*(k-1)/2;
   if (with_y && (inttype & 4)) total += nvar*k*k;
   if (with_y && (inttype & 8)) total += k*(k-1)/2;
   if (inttype & 16) total += nvar*k;
   if (with_y && (inttype & 8) && (inttype & 16)) total += k;
   if (intercept == 1) total += 1;

   return total;
}

/*
 * Writes the pairwise products of the p length-k segments of x into out.
 *
 * For every pair (a, b) with a <= b inside the limits below, the segments
 * x[a*k..] and x[b*k..] are combined: tproduct() when a == b, fproduct()
 * otherwise.  Results are appended to out one after another.
 *
 * Limits: both indices start with an upper bound of p.
 *   incy == 1: the first bound drops by one unless bit 8 of inttype is set,
 *              the second drops by one unless bit 4 is set.
 *   otherwise: the first bound drops by one unless bit 2 is set.
 *
 * Returns out.
 *
 * NOTE(review): nothing here bounds the number of values written; callers
 * size the buffer with rowlength() - confirm both agree for every inttype
 * in use.
 */
double *product(int p, int k, int incy, int inttype, double *x, double *out)
{
   int first_end = p;
   int second_end = p;
   int written = 0;
   int count = 0;
   int a, b;

   if (incy == 1) {
      if (!(inttype & 8)) first_end--;
      if (!(inttype & 4)) second_end--;
   } else if (!(inttype & 2)) {
      first_end--;
   }

   for (a = 0; a < first_end; a++) {
      for (b = a; b < second_end; b++) {
         double *left = x + a*k;
         double *right = x + b*k;
         double *dst = out + written;

         if (a == b) {
            tproduct(k, left, right, &count, dst, inttype);
         } else {
            fproduct(k, left, right, &count, dst);
         }
         written += count;
      }
   }

   return out;
}

/*
 * Upper-triangular products of two length-k vectors.
 *
 * Stores x[i]*y[j] for every i and every j >= i (j > i when bit 16 of
 * inttype is clear), row by row, into out.  *ol receives the number of
 * values stored.  Returns out.
 */
double *tproduct(int k, double *x, double *y, int *ol, double *out, int inttype)
{
   const int first_offset = (inttype & 16) ? 0 : 1;
   double *dst = out;
   int row, col;

   for (row = 0; row < k; row++) {
      for (col = row + first_offset; col < k; col++) {
         *dst++ = x[row] * y[col];
      }
   }

   *ol = (int)(dst - out);
   return out;
}

/*
 * Full outer product of two length-k vectors.
 *
 * Stores x[i]*y[j] for all i, j in 0..k-1 into out, with j varying fastest.
 * *ol receives the number of values stored.  Returns out.
 */
double *fproduct(int k, double *x, double *y, int *ol, double *out)
{
   double *dst = out;
   int row = 0;

   while (row < k) {
      int col = 0;
      while (col < k) {
         *dst = x[row] * y[col];
         dst++;
         col++;
      }
      row++;
   }

   *ol = (int)(dst - out);
   return out;
}


/*
 * res[0..px2) = sum over rows i of  wts[i] * (z_i . yvec) * z_i
 *
 * z_i is row i of xmat expanded to px2 = rowlength(...) columns: an optional
 * leading 1 (intercept == 1), the p*k raw columns, then the products written
 * by product().
 *
 *   nxp, pxp, kxp                  number of rows, variables, columns per variable
 *   interceptp, incyp, inttypep    options forwarded to rowlength()/product()
 *   xmat                           nx rows of p*k doubles
 *   wts                            nx row weights
 *   yvec                           px2 coefficients
 *   res                            px2 outputs, overwritten
 *
 * Calls exit(1) if a per-thread work buffer cannot be allocated.
 */
void XtWXy(int *nxp, int *pxp, int *kxp, 
   int *interceptp, int *incyp, int *inttypep, 
   double *xmat, double *wts, double *yvec, double *res)
{
   int nx = nxp[0];
   int p  = pxp[0];
   int k  = kxp[0];
   int px = p*k;

   int incy = incyp[0];
   int inttype = inttypep[0];
   int intercept = interceptp[0];

   /* Offset of the raw columns inside an expanded row.  Computed once here
      so that no thread writes a shared variable inside the parallel loop. */
   int s = (intercept == 1) ? 1 : 0;
   int px2 = rowlength(p,k,intercept,incy,inttype);
   int i,j;
   int tlimit = omp_get_thread_limit();
   /* num_threads() needs a positive value: when no sensible thread limit is
      configured, ask for the runtime's default team size instead of 0. */
   if (tlimit > 1000) tlimit = omp_get_max_threads();

   for (j=0; j < px2; j++) res[j]=0;

   #pragma omp parallel num_threads(tlimit)
   {
      int c;
      double *loc_sum = malloc(px2*sizeof(double));
      double *xi = malloc(px2*sizeof(double));

      if (loc_sum == NULL || xi == NULL) exit(1);
      for (c=0; c < px2; c++) loc_sum[c]=0;

      #pragma omp for schedule(static,1)
      for (i=0; i<nx; i++)
      {
         double *xr = &xmat[(size_t)i*px];
         double res1 = 0;

         if (s == 1) xi[0]=1;
         for (c=0; c < px; c++) xi[s+c]=xr[c];
         product(p, k, incy, inttype, &xi[s], &xi[px+s]);

         /* weighted fitted value of this row */
         for (c=0; c < px2; c++) res1=res1+xi[c]*yvec[c];
         res1=res1*wts[i];

         for (c=0; c < px2; c++) xi[c]=xi[c]*res1;
         for (c=0; c < px2; c++) loc_sum[c]=loc_sum[c]+xi[c];
      }

      /* fold this thread's partial sums into the shared result */
      #pragma omp critical
      for (c=0; c < px2; c++) res[c]=res[c]+loc_sum[c];

      free(loc_sum);
      free(xi);
   }
}


/*
 * res[0..px2) = sum over rows i of  yvec[i] * z_i
 *
 * z_i is row i of xmat expanded to px2 = rowlength(...) columns: an optional
 * leading 1 (intercept == 1), the p*k raw columns, then the products written
 * by product().
 *
 *   nxp, pxp, kxp                  number of rows, variables, columns per variable
 *   interceptp, incyp, inttypep    options forwarded to rowlength()/product()
 *   xmat                           nx rows of p*k doubles
 *   yvec                           nx values, one per row
 *   res                            px2 outputs, overwritten
 *
 * Calls exit(1) if a per-thread work buffer cannot be allocated.
 */
void Xty(int *nxp, int *pxp, int *kxp, 
   int *interceptp, int *incyp, int *inttypep, 
   double *xmat, double *yvec, double *res)
{
   int nx = nxp[0];
   int p  = pxp[0];
   int k  = kxp[0];
   int px = p*k;

   int incy = incyp[0];
   int inttype = inttypep[0];
   int intercept = interceptp[0];

   /* Offset of the raw columns inside an expanded row.  Computed once here
      so that no thread writes a shared variable inside the parallel loop. */
   int s = (intercept == 1) ? 1 : 0;
   int px2 = rowlength(p,k,intercept,incy,inttype);
   int i,j;
   int tlimit = omp_get_thread_limit();
   /* num_threads() needs a positive value: when no sensible thread limit is
      configured, ask for the runtime's default team size instead of 0. */
   if (tlimit > 1000) tlimit = omp_get_max_threads();

   for (j=0; j < px2; j++) res[j]=0;

   #pragma omp parallel num_threads(tlimit)
   {
      int c;
      double *loc_sum = malloc(px2*sizeof(double));
      double *xi = malloc(px2*sizeof(double));

      if (loc_sum == NULL || xi == NULL) exit(1);
      for (c=0; c < px2; c++) loc_sum[c]=0;

      #pragma omp for schedule(static,1)
      for (i=0; i<nx; i++)
      {
         double *xr = &xmat[(size_t)i*px];

         if (s == 1) xi[0]=1;
         for (c=0; c < px; c++) xi[s+c]=xr[c];
         product(p, k, incy, inttype, &xi[s], &xi[px+s]);

         for (c=0; c < px2; c++) loc_sum[c]=loc_sum[c]+xi[c]*yvec[i];
      }

      /* fold this thread's partial sums into the shared result */
      #pragma omp critical
      for (c=0; c < px2; c++) res[c]=res[c]+loc_sum[c];

      free(loc_sum);
      free(xi);
   }
}


/*
 * calculates X times y:  res[i] = z_i . yvec  for every row i
 *
 * z_i is row i of xmat expanded to px2 = rowlength(...) columns: an optional
 * leading 1 (intercept == 1), the p*k raw columns, then the products written
 * by product().
 *
 *   nxp, pxp, kxp                  number of rows, variables, columns per variable
 *   interceptp, incyp, inttypep    options forwarded to rowlength()/product()
 *   xmat                           nx rows of p*k doubles
 *   yvec                           px2 coefficients
 *   res                            nx outputs, overwritten
 *
 * Calls exit(1) if a per-thread work buffer cannot be allocated.
 */
void Xxy(int *nxp, int *pxp, int *kxp, 
   int *interceptp, int *incyp, int *inttypep, 
   double *xmat, double *yvec, double *res)
{
   int nx = nxp[0];
   int p  = pxp[0];
   int k  = kxp[0];
   int px = p*k;

   int incy = incyp[0];
   int inttype = inttypep[0];
   int intercept = interceptp[0];

   /* Offset of the raw columns inside an expanded row.  Computed once here
      so that no thread writes a shared variable inside the parallel loop. */
   int s = (intercept == 1) ? 1 : 0;
   int px2 = rowlength(p,k,intercept,incy,inttype);
   int i;
   int tlimit = omp_get_thread_limit();
   /* num_threads() needs a positive value: when no sensible thread limit is
      configured, ask for the runtime's default team size instead of 0. */
   if (tlimit > 1000) tlimit = omp_get_max_threads();

   #pragma omp parallel num_threads(tlimit)
   {
      int c;
      double *xi = malloc(px2 * sizeof(double));

      if (xi == NULL) exit(1);

      #pragma omp for schedule(static,1)
      for (i=0; i<nx; i++)
      {
         double *xr = &xmat[(size_t)i*px];
         double res1 = 0;

         if (s == 1) xi[0]=1;
         for (c=0; c < px; c++) xi[s+c]=xr[c];
         product(p, k, incy, inttype, &xi[s], &xi[px+s]);

         for (c=0; c < px2; c++) res1=res1+xi[c]*yvec[c];

         /* each row owns its own output slot, so no reduction is needed */
         res[i] = res1;
      }

      /* xi is private to this thread; freeing it needs no critical section */
      free(xi);
   }
}

/*
 * res[j1*px2 + j2] = sum over rows i of  wts[i] * z_i[j1] * z_i[j2]
 *
 * z_i is row i of xmat expanded to px2 = rowlength(...) columns: an optional
 * leading 1 (intercept == 1), the p*k raw columns, then the products written
 * by product().
 *
 *   nxp, pxp, kxp                  number of rows, variables, columns per variable
 *   interceptp, incyp, inttypep    options forwarded to rowlength()/product()
 *   xmat                           nx rows of p*k doubles
 *   wts                            nx row weights
 *   res                            px2*px2 outputs, overwritten
 *
 * Each thread accumulates into its own px2*px2 buffer, so memory use grows
 * with the team size.  Calls exit(1) if a work buffer cannot be allocated.
 */
void XtWX(int *nxp, int *pxp, int *kxp, 
   int *interceptp, int *incyp, int *inttypep, 
   double *xmat, double *wts, double *res)
{
   int nx = nxp[0];
   int p  = pxp[0];
   int k  = kxp[0];
   int px = p*k;

   int incy = incyp[0];
   int inttype = inttypep[0];
   int intercept = interceptp[0];

   /* Offset of the raw columns inside an expanded row.  Computed once here
      so that no thread writes a shared variable inside the parallel loop. */
   int s = (intercept == 1) ? 1 : 0;
   int px2 = rowlength(p,k,intercept,incy,inttype);
   /* element count of the square result, formed in size_t so that px2*px2
      cannot overflow int */
   size_t n2 = (size_t)px2 * (size_t)px2;
   size_t t;
   int i;
   int tlimit = omp_get_thread_limit();
   /* num_threads() needs a positive value: when no sensible thread limit is
      configured, ask for the runtime's default team size instead of 0. */
   if (tlimit > 1000) tlimit = omp_get_max_threads();

   for (t=0; t < n2; t++) res[t]=0;

   #pragma omp parallel num_threads(tlimit)
   {
      int c,j1,j2;
      size_t u;
      double *loc_sum = malloc(n2*sizeof(double));
      double *xi = malloc(px2 * sizeof(double));

      if (loc_sum == NULL || xi == NULL) exit(1);
      for (u=0; u < n2; u++) loc_sum[u]=0;

      #pragma omp for schedule(static,1)
      for (i=0; i<nx; i++)
      {
         double *xr = &xmat[(size_t)i*px];

         if (s == 1) xi[0]=1;
         for (c=0; c < px; c++) xi[s+c]=xr[c];
         product(p, k, incy, inttype, &xi[s], &xi[px+s]);

         for (j1=0; j1 < px2; j1++) {
            double *acc = &loc_sum[(size_t)j1*px2];
            for (j2=0; j2 < px2; j2++) {
               acc[j2]=acc[j2] + xi[j1]*xi[j2]*wts[i];
            }
         }
      }

      /* fold this thread's partial sums into the shared result */
      #pragma omp critical
      for (u=0; u < n2; u++) res[u]=res[u]+loc_sum[u];

      free(loc_sum);
      free(xi);
   }
}


/*
 * Materialises the expanded matrix:  res[i*px2 + j] = z_i[j]
 *
 * z_i is row i of xmat expanded to px2 = rowlength(...) columns: an optional
 * leading 1 (intercept == 1), the p*k raw columns, then the products written
 * by product().
 *
 *   nxp, pxp, kxp                  number of rows, variables, columns per variable
 *   interceptp, incyp, inttypep    options forwarded to rowlength()/product()
 *   xmat                           nx rows of p*k doubles
 *   res                            nx*px2 outputs, overwritten
 *
 * Calls exit(1) if a per-thread work buffer cannot be allocated.
 */
void X(int *nxp, int *pxp, int *kxp, 
   int *interceptp, int *incyp, int *inttypep, 
   double *xmat, double *res)
{
   int nx = nxp[0];
   int p  = pxp[0];
   int k  = kxp[0];
   int px = p*k;

   int incy = incyp[0];
   int inttype = inttypep[0];
   int intercept = interceptp[0];

   /* Offset of the raw columns inside an expanded row.  Computed once here
      so that no thread writes a shared variable inside the parallel loop. */
   int s = (intercept == 1) ? 1 : 0;
   int px2 = rowlength(p,k,intercept,incy,inttype);
   int i;
   int tlimit = omp_get_thread_limit();
   /* num_threads() needs a positive value: when no sensible thread limit is
      configured, ask for the runtime's default team size instead of 0. */
   if (tlimit > 1000) tlimit = omp_get_max_threads();

   /* No up-front zeroing of res: the loop below assigns all px2 entries of
      every one of the nx rows. */

   #pragma omp parallel num_threads(tlimit)
   {
      int c;
      double *xi = malloc(px2 * sizeof(double));

      if (xi == NULL) exit(1);

      #pragma omp for schedule(static,1)
      for (i=0; i<nx; i++)
      {
         double *xr = &xmat[(size_t)i*px];
         /* row offset formed in size_t so that i*px2 cannot overflow int */
         double *row = &res[(size_t)i*px2];

         if (s == 1) xi[0]=1;
         for (c=0; c < px; c++) xi[s+c]=xr[c];
         product(p, k, incy, inttype, &xi[s], &xi[px+s]);

         for (c=0; c < px2; c++) row[c] = xi[c];
      }

      /* xi is private to this thread; freeing it needs no critical section */
      free(xi);
   }
}
