#include "tensorTypes.h"

/*
 * In-place running sum modulo q.
 *
 * y is addressed as lts consecutive blocks, each holding (p-1) rows of rts
 * columns; logical element k lives at y[k*tupSize].  Within every column of
 * every block, rows 1..p-2 are replaced by (previous row + this row), where
 * the previous row has already been updated.  A sum that reaches q has q
 * subtracted once.
 *
 * NOTE(review): a single conditional subtraction only keeps results in
 * [0,q) if the inputs are already in that range -- confirm with callers.
 */
void lpRq (hInt_t* y, hShort_t tupSize, hDim_t lts, hDim_t rts, hDim_t p, hInt_t q) {
  const hDim_t blockLen = rts*(p-1);
  for (hDim_t block = 0; block < lts; ++block) {
    const hDim_t blockBase = block*blockLen;
    for (hDim_t col = 0; col < rts; ++col) {
      /* prev starts on row 0 of this column; cur is always one row below. */
      hDim_t prev = blockBase + col;
      for (int row = 1; row < p-1; ++row) {
        const hDim_t cur = prev + rts;
        const hInt_t sum = y[prev*tupSize] + y[cur*tupSize];
        y[cur*tupSize] = (sum >= q) ? (sum - q) : sum;
        prev = cur;
      }
    }
  }
}

/*
 * In-place running sum over plain integers (no modular reduction).
 *
 * y is addressed as lts blocks of (p-1) rows by rts columns, with logical
 * element k stored at y[k*tupSize].  For each column, rows 1..p-2 each
 * accumulate the already-updated row above them.
 */
void lpR (hInt_t* y, hShort_t tupSize, hDim_t lts, hDim_t rts, hDim_t p) {
  const hDim_t blockLen = rts*(p-1);
  hDim_t block = 0;

  while (block < lts) {
    const hDim_t blockBase = block*blockLen;
    for (hDim_t col = 0; col < rts; ++col) {
      /* Start on row 1; row 0 is left untouched. */
      hDim_t cur = blockBase + col + rts;
      for (int row = 1; row < p-1; ++row, cur += rts) {
        y[cur*tupSize] = y[cur*tupSize] + y[(cur-rts)*tupSize];
      }
    }
    ++block;
  }
}

/*
 * In-place running sum over doubles.
 *
 * Same traversal as the integer variants: lts blocks of (p-1) rows by rts
 * columns, logical element k at y[k*tupSize].  Rows 1..p-2 of every column
 * have the (already updated) row above added to them.
 */
void lpDouble (double* y, hShort_t tupSize, hDim_t lts, hDim_t rts, hDim_t p) {
  const hDim_t blockLen = rts*(p-1);

  for (hDim_t block = 0; block < lts; ++block) {
    const hDim_t blockBase = block*blockLen;
    for (hDim_t col = 0; col < rts; ++col) {
      hDim_t above = blockBase + col;      /* row 0 of this column */
      for (int row = 1; row < p-1; ++row) {
        const hDim_t here = above + rts;   /* one row further down */
        y[here*tupSize] += y[above*tupSize];
        above = here;
      }
    }
  }
}

/*
 * In-place running sum over complex values, using CMPLX_IADD to add the
 * second argument into the first.
 *
 * Layout matches the other lp* routines: lts blocks of (p-1) rows by rts
 * columns, logical element k at y[k*tupSize]; rows 1..p-2 of each column
 * accumulate the already-updated row above.
 */
void lpC (complex_t* y, hShort_t tupSize, hDim_t lts, hDim_t rts, hDim_t p) {
  const hDim_t blockLen = rts*(p-1);

  for (hDim_t block = 0; block < lts; ++block) {
    const hDim_t blockBase = block*blockLen;
    for (hDim_t col = 0; col < rts; ++col) {
      hDim_t above = blockBase + col;
      int row = 1;
      while (row < p-1) {
        const hDim_t here = above + rts;
        /* Operands stay as plain array lvalues so the macro expands safely. */
        CMPLX_IADD (y[here*tupSize], y[above*tupSize]);
        above = here;
        ++row;
      }
    }
  }
}

/*
 * In-place adjacent difference modulo q.
 *
 * y is addressed as lts blocks of (p-1) rows by rts columns, logical element
 * k at y[k*tupSize].  Each column is walked from row p-2 down to row 1, and
 * every visited row becomes (this row - row above + q), with q subtracted
 * again when the result is at least q.  Because the walk is bottom-up, the
 * row above is still unmodified when it is read.
 *
 * NOTE(review): adding q once only guarantees a result in [0,q) if inputs
 * are already in [0,q) -- confirm with callers.
 */
void lpInvRq (hInt_t* y, hShort_t tupSize, hDim_t lts, hDim_t rts, hDim_t p, hInt_t q) {
  const hDim_t blockLen = rts*(p-1);

  for (hDim_t block = 0; block < lts; ++block) {
    const hDim_t blockBase = block*blockLen;
    for (hDim_t col = 0; col < rts; ++col) {
      hDim_t cur = blockBase + col + (p-2) * rts;   /* last row of the column */
      int remaining = p-2;
      while (remaining != 0) {
        const hDim_t above = cur - rts;
        const hInt_t diff = y[cur*tupSize] - y[above*tupSize] + q;
        y[cur*tupSize] = (diff >= q) ? (diff - q) : diff;
        cur = above;
        --remaining;
      }
    }
  }
}

/*
 * In-place adjacent difference over plain integers.
 *
 * Layout: lts blocks of (p-1) rows by rts columns, logical element k at
 * y[k*tupSize].  Each column is processed from row p-2 down to row 1,
 * subtracting the (not yet modified) row above from the current row.
 */
void lpInvR (hInt_t* y, hShort_t tupSize, hDim_t lts, hDim_t rts, hDim_t p) {
  const hDim_t blockLen = rts*(p-1);
  hDim_t block = 0;

  while (block < lts) {
    const hDim_t blockBase = block*blockLen;
    for (hDim_t col = 0; col < rts; ++col) {
      hDim_t cur = blockBase + col + (p-2) * rts;
      for (int remaining = p-2; remaining != 0; --remaining, cur -= rts) {
        y[cur*tupSize] = y[cur*tupSize] - y[(cur-rts)*tupSize];
      }
    }
    ++block;
  }
}

/*
 * In-place adjacent difference over doubles.
 *
 * Layout: lts blocks of (p-1) rows by rts columns, logical element k at
 * y[k*tupSize].  Working upward from row p-2 to row 1 of each column, the
 * row above (still holding its original value) is subtracted from the
 * current row.
 */
void lpInvDouble (double* y, hShort_t tupSize, hDim_t lts, hDim_t rts, hDim_t p) {
  const hDim_t blockLen = rts*(p-1);

  for (hDim_t block = 0; block < lts; ++block) {
    const hDim_t blockBase = block*blockLen;
    for (hDim_t col = 0; col < rts; ++col) {
      hDim_t here = blockBase + col + (p-2) * rts;
      int remaining = p-2;
      while (remaining != 0) {
        const hDim_t above = here - rts;
        y[here*tupSize] -= y[above*tupSize];
        here = above;
        --remaining;
      }
    }
  }
}

/*
 * In-place adjacent difference over complex values, using CMPLX_ISUB to
 * subtract the second argument from the first.
 *
 * Layout: lts blocks of (p-1) rows by rts columns, logical element k at
 * y[k*tupSize].  Columns are walked from row p-2 down to row 1 so the row
 * above is read before it is itself overwritten.
 */
void lpInvC (complex_t* y, hShort_t tupSize, hDim_t lts, hDim_t rts, hDim_t p) {
  const hDim_t blockLen = rts*(p-1);

  for (hDim_t block = 0; block < lts; ++block) {
    const hDim_t blockBase = block*blockLen;
    for (hDim_t col = 0; col < rts; ++col) {
      hDim_t here = blockBase + col + (p-2) * rts;
      for (int remaining = p-2; remaining != 0; --remaining) {
        const hDim_t above = here - rts;
        /* Operands stay as plain array lvalues so the macro expands safely. */
        CMPLX_ISUB (y[here*tupSize], y[above*tupSize]);
        here = above;
      }
    }
  }
}

/*
 * Prime-power driver for lpRq.
 *
 * Treats y as an array of hInt_t holding tupSize interleaved components and
 * runs lpRq once per component, using that component's modulus qs[k].  The
 * left dimension handed to lpRq is lts * p^(e-1).  Does nothing when the
 * prime is 2.
 */
void ppLRq (void* y, hShort_t tupSize, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t* qs) {
    const hDim_t prime = pe.prime;
    const hShort_t exponent = pe.exponent;

    if (prime != 2) {
      hInt_t* const base = (hInt_t*)y;
      int slot = 0;
      while (slot < tupSize) {
        lpRq (base+slot, tupSize, lts*ipow(prime,exponent-1), rts, prime, qs[slot]);
        ++slot;
      }
    }
}

/*
 * Prime-power driver for lpR (plain integers, no modulus).
 *
 * Treats y as an array of hInt_t holding tupSize interleaved components and
 * runs lpR once per component with left dimension lts * p^(e-1).  Returns
 * immediately when the prime is 2.  qs is not used for the computation.
 */
void ppLR (void* y, hShort_t tupSize, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t* qs) {
#ifdef DEBUG_MODE
  /* No moduli apply here: tensorLR passes a null qs.  (The previous check
     referenced an undeclared `q` and did not compile in debug builds.) */
  ASSERT (qs == 0);
#endif
    hDim_t p = pe.prime;
    hShort_t e = pe.exponent;
    if(p == 2) return; /* lpR's row loop (1 <= i < p-1) is empty for p == 2 */
    for(int tupIdx = 0; tupIdx < tupSize; tupIdx++) {
      lpR (((hInt_t*)y)+tupIdx, tupSize, lts*ipow(p,e-1), rts, p);
    }
}

/*
 * Prime-power driver for lpDouble.
 *
 * Treats y as an array of double holding tupSize interleaved components and
 * runs lpDouble once per component with left dimension lts * p^(e-1).
 * Returns immediately when the prime is 2.  qs is not used for the
 * computation.
 */
void ppLDouble (void* y, hShort_t tupSize, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t* qs) {
#ifdef DEBUG_MODE
  /* No moduli apply here: tensorLDouble passes a null qs.  (The previous
     check referenced an undeclared `q` and did not compile in debug builds.) */
  ASSERT (qs == 0);
#endif
    hDim_t p = pe.prime;
    hShort_t e = pe.exponent;
    if(p == 2) return; /* lpDouble's row loop (1 <= i < p-1) is empty for p == 2 */
    for(int tupIdx = 0; tupIdx < tupSize; tupIdx++) {
      lpDouble (((double*)y)+tupIdx, tupSize, lts*ipow(p,e-1), rts, p);
    }
}

/*
 * Prime-power driver for lpC.
 *
 * Treats y as an array of complex_t holding tupSize interleaved components
 * and runs lpC once per component with left dimension lts * p^(e-1).
 * Returns immediately when the prime is 2.  qs is not used for the
 * computation.
 */
void ppLC (void* y, hShort_t tupSize, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t* qs) {
#ifdef DEBUG_MODE
  /* No moduli apply here: tensorLC passes a null qs.  (The previous check
     referenced an undeclared `q` and did not compile in debug builds.) */
  ASSERT (qs == 0);
#endif
    hDim_t p = pe.prime;
    hShort_t e = pe.exponent;
    if(p == 2) return; /* lpC's row loop (1 <= i < p-1) is empty for p == 2 */
    for(int tupIdx = 0; tupIdx < tupSize; tupIdx++) {
      lpC (((complex_t*)y)+tupIdx, tupSize, lts*ipow(p,e-1), rts, p);
    }
}


/*
 * Prime-power driver for lpInvRq.
 *
 * Treats y as an array of hInt_t holding tupSize interleaved components and
 * runs lpInvRq once per component, using that component's modulus qs[k].
 * The left dimension handed to lpInvRq is lts * p^(e-1).  Does nothing when
 * the prime is 2.
 */
void ppLInvRq (void* y, hShort_t tupSize, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t* qs) {
    const hDim_t prime = pe.prime;
    const hShort_t exponent = pe.exponent;

    if (prime != 2) {
      hInt_t* const base = (hInt_t*)y;
      int slot = 0;
      while (slot < tupSize) {
        lpInvRq (base+slot, tupSize, lts*ipow(prime,exponent-1), rts, prime, qs[slot]);
        ++slot;
      }
    }
}

/*
 * Prime-power driver for lpInvR (plain integers, no modulus).
 *
 * Treats y as an array of hInt_t holding tupSize interleaved components and
 * runs lpInvR once per component with left dimension lts * p^(e-1).
 * Returns immediately when the prime is 2.  qs is not used for the
 * computation.
 */
void ppLInvR (void* y, hShort_t tupSize, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t* qs) {
#ifdef DEBUG_MODE
  /* No moduli apply here: tensorLInvR passes a null qs.  (The previous check
     referenced an undeclared `q` and did not compile in debug builds.) */
  ASSERT (qs == 0);
#endif
    hDim_t p = pe.prime;
    hShort_t e = pe.exponent;
    if(p == 2) return; /* lpInvR's row loop starts at p-2, i.e. is empty for p == 2 */
    for(int tupIdx = 0; tupIdx < tupSize; tupIdx++) {
      lpInvR (((hInt_t*)y)+tupIdx, tupSize, lts*ipow(p,e-1), rts, p);
    }
}

/*
 * Prime-power driver for lpInvDouble.
 *
 * Treats y as an array of double holding tupSize interleaved components and
 * runs lpInvDouble once per component with left dimension lts * p^(e-1).
 * Returns immediately when the prime is 2.  qs is not used for the
 * computation.
 */
void ppLInvDouble (void* y, hShort_t tupSize, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t* qs) {
#ifdef DEBUG_MODE
  /* No moduli apply here: tensorLInvDouble passes a null qs.  (The previous
     check referenced an undeclared `q` and did not compile in debug builds.) */
  ASSERT (qs == 0);
#endif
    hDim_t p = pe.prime;
    hShort_t e = pe.exponent;
    if(p == 2) return; /* lpInvDouble's row loop starts at p-2, i.e. is empty for p == 2 */
    for(int tupIdx = 0; tupIdx < tupSize; tupIdx++) {
      lpInvDouble (((double*)y)+tupIdx, tupSize, lts*ipow(p,e-1), rts, p);
    }
}

/*
 * Prime-power driver for lpInvC.
 *
 * Treats y as an array of complex_t holding tupSize interleaved components
 * and runs lpInvC once per component with left dimension lts * p^(e-1).
 * Returns immediately when the prime is 2.  qs is not used for the
 * computation.
 */
void ppLInvC (void* y, hShort_t tupSize, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t* qs) {
#ifdef DEBUG_MODE
  /* No moduli apply here: tensorLInvC passes a null qs.  (The previous check
     referenced an undeclared `q` and did not compile in debug builds.) */
  ASSERT (qs == 0);
#endif
    hDim_t p = pe.prime;
    hShort_t e = pe.exponent;
    if(p == 2) return; /* lpInvC's row loop starts at p-2, i.e. is empty for p == 2 */
    for(int tupIdx = 0; tupIdx < tupSize; tupIdx++) {
      lpInvC (((complex_t*)y)+tupIdx, tupSize, lts*ipow(p,e-1), rts, p);
    }
}

#ifdef STATS
/* Call counters: each is incremented once on entry to the matching
   tensorL* function (lrq -> tensorLRq, lir -> tensorLInvR, and so on). */
int lrqCtr = 0;
int lrCtr = 0;
int ldCtr = 0;
int lcCtr = 0;
int lirqCtr = 0;
int lirCtr = 0;
int lidCtr = 0;
int licCtr = 0;

/* Accumulated elapsed time per tensorL* function: every call adds the
   difference between its exit and entry clock_gettime readings. */
struct timespec lrqTime = {0,0};
struct timespec lrTime = {0,0};
struct timespec ldTime = {0,0};
struct timespec lcTime = {0,0};
struct timespec lirqTime = {0,0};
struct timespec lirTime = {0,0};
struct timespec lidTime = {0,0};
struct timespec licTime = {0,0};
#endif


/*
 * Entry point for the mod-q L transform.
 *
 * Passes y, the factorisation peArr[0..sizeOfPE-1] and the per-component
 * moduli qs to tensorFuser together with ppLRq as the prime-power routine.
 *
 * With STATS defined, the call is counted in lrqCtr and its elapsed time is
 * added to lrqTime.  With DEBUG_MODE defined, the arguments are printed on
 * entry and a line is printed for every negative entry of y on exit.
 */
void tensorLRq (hShort_t tupSize, hInt_t* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE, hInt_t* qs) {
#ifdef STATS
    lrqCtr++;
    struct timespec s1,t1;
    clock_gettime(CLOCK_REALTIME, &s1);
#endif
#ifdef DEBUG_MODE
    hDim_t i;
    /* There is one modulus per tuple component, so print all of them.  (The
       previous code printed an undeclared scalar `q` and did not compile.) */
    printf("\n\nEntered tensorLRq\ttotm=%" PRId32 "\tnumFacts=%" PRId16 "\tqs=[", totm, sizeOfPE);
    for(i = 0; i < tupSize; i++) {
        printf("%" PRId64 ",", qs[i]);
    }
    /* The contents of y are deliberately not dumped; an empty "[]" is
       printed in their place, as before. */
    printf("]\n[");
    printf("]\n[");
    for(i = 0; i < sizeOfPE; i++) {
        printf("(%" PRId32 ",%" PRId16 "),", peArr[i].prime, peArr[i].exponent);
    }
    printf("]\n");
#endif
  tensorFuser (y, tupSize, ppLRq, totm, peArr, sizeOfPE, qs); // don't need to shift here
#ifdef DEBUG_MODE
  /* Report any negative entry left in the output. */
  for(i = 0; i < totm*tupSize; i++) {
      if(y[i]<0) {
          printf("tensorLRq\n");
      }
  }
#endif
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &t1);
    lrqTime = tsAdd(lrqTime, tsSubtract(t1,s1));
#endif
}

/*
 * Entry point for the plain-integer L transform.
 *
 * Passes y and the factorisation peArr[0..sizeOfPE-1] to tensorFuser with
 * ppLR as the prime-power routine and a null modulus pointer.
 *
 * With STATS defined, the call is counted in lrCtr and its elapsed time is
 * added to lrTime.  With DEBUG_MODE defined, the first totm entries of y and
 * the factorisation are printed on entry.
 * NOTE(review): the dump stops at totm although y appears to be strided by
 * tupSize -- confirm whether totm*tupSize entries were intended.
 */
void tensorLR (hShort_t tupSize, hInt_t* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE) {
#ifdef STATS
    struct timespec began, ended;
    ++lrCtr;
    clock_gettime(CLOCK_REALTIME, &began);
#endif
#ifdef DEBUG_MODE
    printf("\n\nEntered tensorLR\ttotm=%" PRId32 "\tnumFacts=%" PRId16 "\n[", totm, sizeOfPE);
    for (hDim_t k = 0; k < totm; ++k) {
        printf("%" PRId64 ",", y[k]);
    }
    printf("]\n[");
    for (hDim_t k = 0; k < sizeOfPE; ++k) {
        printf("(%" PRId32 ",%" PRId16 "),", peArr[k].prime, peArr[k].exponent);
    }
    printf("]\n");
#endif
  tensorFuser (y, tupSize, ppLR, totm, peArr, sizeOfPE, (hInt_t*)0);
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &ended);
    lrTime = tsAdd(lrTime, tsSubtract(ended, began));
#endif
}

/*
 * Entry point for the double-precision L transform.
 *
 * Passes y and the factorisation peArr[0..sizeOfPE-1] to tensorFuser with
 * ppLDouble as the prime-power routine and a null modulus pointer.
 *
 * With STATS defined, the call is counted in ldCtr and its elapsed time is
 * added to ldTime.  With DEBUG_MODE defined, the first totm entries of y and
 * the factorisation are printed on entry.
 * NOTE(review): the dump stops at totm although y appears to be strided by
 * tupSize -- confirm whether totm*tupSize entries were intended.
 */
void tensorLDouble (hShort_t tupSize, double* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE) {
#ifdef STATS
    struct timespec began, ended;
    ++ldCtr;
    clock_gettime(CLOCK_REALTIME, &began);
#endif
#ifdef DEBUG_MODE
    printf("\n\nEntered tensorLDouble\ttotm=%" PRId32 "\tnumFacts=%" PRId16 "\n[", totm, sizeOfPE);
    for (hDim_t k = 0; k < totm; ++k) {
        printf("%f,", y[k]);
    }
    printf("]\n[");
    for (hDim_t k = 0; k < sizeOfPE; ++k) {
        printf("(%" PRId32 ",%" PRId16 "),", peArr[k].prime, peArr[k].exponent);
    }
    printf("]\n");
#endif
  tensorFuser (y, tupSize, ppLDouble, totm, peArr, sizeOfPE, (hInt_t*)0);
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &ended);
    ldTime = tsAdd(ldTime, tsSubtract(ended, began));
#endif
}

/*
 * Entry point for the complex L transform.
 *
 * Passes y and the factorisation peArr[0..sizeOfPE-1] to tensorFuser with
 * ppLC as the prime-power routine and a null modulus pointer.
 *
 * With STATS defined, the call is counted in lcCtr and its elapsed time is
 * added to lcTime.  With DEBUG_MODE defined, the first totm entries of y
 * (as real/imag pairs) and the factorisation are printed on entry.
 * NOTE(review): the dump stops at totm although y appears to be strided by
 * tupSize -- confirm whether totm*tupSize entries were intended.
 */
void tensorLC (hShort_t tupSize, complex_t* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE) {
#ifdef STATS
    struct timespec began, ended;
    ++lcCtr;
    clock_gettime(CLOCK_REALTIME, &began);
#endif
#ifdef DEBUG_MODE
    printf("\n\nEntered tensorLC\ttotm=%" PRId32 "\tnumFacts=%" PRId16 "\n[", totm, sizeOfPE);
    for (hDim_t k = 0; k < totm; ++k) {
        printf("(%f,%f),", y[k].real, y[k].imag);
    }
    printf("]\n[");
    for (hDim_t k = 0; k < sizeOfPE; ++k) {
        printf("(%" PRId32 ",%" PRId16 "),", peArr[k].prime, peArr[k].exponent);
    }
    printf("]\n");
#endif
  tensorFuser (y, tupSize, ppLC, totm, peArr, sizeOfPE, (hInt_t*)0);
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &ended);
    lcTime = tsAdd(lcTime, tsSubtract(ended, began));
#endif
}

/*
 * Entry point for the mod-q inverse L transform.
 *
 * Passes y, the factorisation peArr[0..sizeOfPE-1] and the per-component
 * moduli qs to tensorFuser with ppLInvRq as the prime-power routine.
 *
 * With STATS defined, the call is counted in lirqCtr and its elapsed time is
 * added to lirqTime.  With DEBUG_MODE defined, a line is printed for every
 * negative value among the totm*tupSize output entries.
 */
void tensorLInvRq (hShort_t tupSize, hInt_t* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE, hInt_t* qs) {
#ifdef STATS
    struct timespec began, ended;
    ++lirqCtr;
    clock_gettime(CLOCK_REALTIME, &began);
#endif
  tensorFuser (y, tupSize, ppLInvRq, totm, peArr, sizeOfPE, qs);
#ifdef DEBUG_MODE
  for (hDim_t k = 0; k < totm*tupSize; ++k) {
      if (y[k] < 0) {
          printf("tensorLInvRq\n");
      }
  }
#endif
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &ended);
    lirqTime = tsAdd(lirqTime, tsSubtract(ended, began));
#endif
}

/*
 * Entry point for the plain-integer inverse L transform.
 *
 * Passes y and the factorisation peArr[0..sizeOfPE-1] to tensorFuser with
 * ppLInvR as the prime-power routine and a null modulus pointer.  With STATS
 * defined, the call is counted in lirCtr and its elapsed time is added to
 * lirTime.
 */
void tensorLInvR (hShort_t tupSize, hInt_t* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE) {
#ifdef STATS
    struct timespec began, ended;
    ++lirCtr;
    clock_gettime(CLOCK_REALTIME, &began);
#endif
  tensorFuser (y, tupSize, ppLInvR, totm, peArr, sizeOfPE, (hInt_t*)0);
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &ended);
    lirTime = tsAdd(lirTime, tsSubtract(ended, began));
#endif
}

/*
 * Entry point for the double-precision inverse L transform.
 *
 * Passes y and the factorisation peArr[0..sizeOfPE-1] to tensorFuser with
 * ppLInvDouble as the prime-power routine and a null modulus pointer.  With
 * STATS defined, the call is counted in lidCtr and its elapsed time is added
 * to lidTime.
 */
void tensorLInvDouble (hShort_t tupSize, double* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE) {
#ifdef STATS
    struct timespec began, ended;
    ++lidCtr;
    clock_gettime(CLOCK_REALTIME, &began);
#endif
  tensorFuser (y, tupSize, ppLInvDouble, totm, peArr, sizeOfPE, (hInt_t*)0);
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &ended);
    lidTime = tsAdd(lidTime, tsSubtract(ended, began));
#endif
}

/*
 * Entry point for the complex inverse L transform.
 *
 * Passes y and the factorisation peArr[0..sizeOfPE-1] to tensorFuser with
 * ppLInvC as the prime-power routine and a null modulus pointer.  With STATS
 * defined, the call is counted in licCtr and its elapsed time is added to
 * licTime.
 */
void tensorLInvC (hShort_t tupSize, complex_t* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE) {
#ifdef STATS
    struct timespec began, ended;
    ++licCtr;
    clock_gettime(CLOCK_REALTIME, &began);
#endif
  tensorFuser (y, tupSize, ppLInvC, totm, peArr, sizeOfPE, (hInt_t*)0);
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &ended);
    licTime = tsAdd(licTime, tsSubtract(ended, began));
#endif
}