#include "tensorTypes.h"

void lpRq (hInt_t* y, hDim_t lts, hDim_t rts, hDim_t p, hInt_t q) {
	hDim_t blockOffset;
	hDim_t modOffset;
	int i;
	hDim_t tmp1 = rts*(p-1);
	for (blockOffset = 0; blockOffset < lts; ++blockOffset) {
		hDim_t tmp2 = blockOffset*tmp1;
		for (modOffset = 0; modOffset < rts; ++modOffset) {
			hDim_t idx = tmp2 + modOffset + rts;
			for (i = 1; i < p-1; ++i) {
        hInt_t temp = y[idx-rts] + y[idx];
        if (temp >= q) y[idx]=temp-q;
        else y[idx] = temp;
				idx += rts;
			}
		}
	}
}

void lpR (hInt_t* y, hDim_t lts, hDim_t rts, hDim_t p) {
	hDim_t blockOffset;
	hDim_t modOffset;
	int i;

	hDim_t tmp1 = rts*(p-1);
	for (blockOffset = 0; blockOffset < lts; ++blockOffset)	{
		hDim_t tmp2 = blockOffset*tmp1;
		for (modOffset = 0; modOffset < rts; ++modOffset) {
			hDim_t idx = tmp2 + modOffset + rts;
			for (i = 1; i < p-1; ++i) {
				y[idx] += y[idx-rts];
				idx += rts;
			}
		}
	}
}

void lpDouble (double* y, hDim_t lts, hDim_t rts, hDim_t p) {
	hDim_t blockOffset;
	hDim_t modOffset;
	int i;

	hDim_t tmp1 = rts*(p-1);
	for (blockOffset = 0; blockOffset < lts; ++blockOffset) {
		hDim_t tmp2 = blockOffset*tmp1;
		for (modOffset = 0; modOffset < rts; ++modOffset) {
			hDim_t idx = tmp2 + modOffset + rts;
			for (i = 1; i < p-1; ++i) {
				y[idx] += y[idx-rts];
				idx += rts;
			}
		}
	}
}

void lpC (complex_t* y, hDim_t lts, hDim_t rts, hDim_t p) {
	hDim_t blockOffset;
	hDim_t modOffset;
	int i;

	hDim_t tmp1 = rts*(p-1);
	for (blockOffset = 0; blockOffset < lts; ++blockOffset) {
		hDim_t tmp2 = blockOffset*tmp1;
		for (modOffset = 0; modOffset < rts; ++modOffset) {
			hDim_t idx = tmp2 + modOffset + rts;
			for (i = 1; i < p-1; ++i) {
				CMPLX_IADD (y[idx], y[idx-rts]);
				idx += rts;
			}
		}
	}
}

void lpInvRq (hInt_t* y, hDim_t lts, hDim_t rts, hDim_t p, hInt_t q) {
	hDim_t blockOffset;
	hDim_t modOffset;
	int i;

	hDim_t tmp1 = rts*(p-1);
	for (blockOffset = 0; blockOffset < lts; ++blockOffset)	{
		hDim_t tmp2 = blockOffset*tmp1;
		for (modOffset = 0; modOffset < rts; ++ modOffset) {
			hDim_t tensorOffset = tmp2 + modOffset;
			hDim_t idx = tensorOffset + (p-2) * rts;
			for (i = p-2; i != 0; --i) {
        hInt_t temp = y[idx] - y[idx-rts] + q;
        if (temp >= q) y[idx]=temp-q;
        else y[idx] = temp;
				idx -= rts;
			}
		}
	}
}

void lpInvR (hInt_t* y, hDim_t lts, hDim_t rts, hDim_t p) {
	hDim_t blockOffset;
	hDim_t modOffset;
	int i;

	hDim_t tmp1 = rts*(p-1);
	for (blockOffset = 0; blockOffset < lts; ++blockOffset) {
		hDim_t tmp2 = blockOffset*tmp1;
		for (modOffset = 0; modOffset < rts; ++ modOffset) {
			hDim_t tensorOffset = tmp2 + modOffset;
			hDim_t idx = tensorOffset + (p-2) * rts;
			for (i = p-2; i != 0; --i) {
				y[idx] -= y[idx-rts] ;
				idx -= rts;
			}
		}
	}
}

void lpInvDouble (double* y, hDim_t lts, hDim_t rts, hDim_t p) {
	hDim_t blockOffset;
	hDim_t modOffset;
	int i;

	hDim_t tmp1 = rts*(p-1);
	for (blockOffset = 0; blockOffset < lts; ++blockOffset) {
		hDim_t tmp2 = blockOffset*tmp1;
		for (modOffset = 0; modOffset < rts; ++ modOffset) {
			hDim_t tensorOffset = tmp2 + modOffset;
			hDim_t idx = tensorOffset + (p-2) * rts;
			for (i = p-2; i != 0; --i) {
				y[idx] -= y[idx-rts] ;
				idx -= rts;
			}
		}
	}
}

void lpInvC (complex_t* y, hDim_t lts, hDim_t rts, hDim_t p) {
	hDim_t blockOffset;
	hDim_t modOffset;
	int i;

	hDim_t tmp1 = rts*(p-1);
	for (blockOffset = 0; blockOffset < lts; ++blockOffset) {
		hDim_t tmp2 = blockOffset*tmp1;
		for (modOffset = 0; modOffset < rts; ++ modOffset) {
			hDim_t tensorOffset = tmp2 + modOffset;
			hDim_t idx = tensorOffset + (p-2) * rts;
			for (i = p-2; i != 0; --i) {
				CMPLX_ISUB (y[idx], y[idx-rts]);
				idx -= rts;
			}
		}
	}
}

void ppLRq (void* y, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t q) {
    hDim_t p = pe.prime;
    hShort_t e = pe.exponent;
	lpRq ((hInt_t*)y, lts*ipow(p,e-1), rts, p, q);
}

void ppLR (void* y, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t q) {
#ifdef DEBUG_MODE
	ASSERT (q==0);
#endif
    hDim_t p = pe.prime;
    hShort_t e = pe.exponent;
	lpR ((hInt_t*)y, lts*ipow(p,e-1), rts, p);
}

void ppLDouble (void* y, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t q) {
#ifdef DEBUG_MODE
	ASSERT (q==0);
#endif
    hDim_t p = pe.prime;
    hShort_t e = pe.exponent;
	lpDouble ((double*)y, lts*ipow(p,e-1), rts, p);
}

void ppLC (void* y, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t q) {
#ifdef DEBUG_MODE
	ASSERT (q==0);
#endif
    hDim_t p = pe.prime;
    hShort_t e = pe.exponent;
	lpC ((complex_t*)y, lts*ipow(p,e-1), rts, p);
}


void ppLInvRq (void* y, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t q) {
    hDim_t p = pe.prime;
    hShort_t e = pe.exponent;
	lpInvRq ((hInt_t*)y, lts*ipow(p,e-1), rts, p, q);
}

void ppLInvR (void* y, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t q) {
#ifdef DEBUG_MODE
	ASSERT (q==0);
#endif
    hDim_t p = pe.prime;
    hShort_t e = pe.exponent;
	lpInvR ((hInt_t*)y, lts*ipow(p,e-1), rts, p);
}

void ppLInvDouble (void* y, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t q) {
#ifdef DEBUG_MODE
	ASSERT (q==0);
#endif
    hDim_t p = pe.prime;
    hShort_t e = pe.exponent;
	lpInvDouble ((double*)y, lts*ipow(p,e-1), rts, p);
}

void ppLInvC (void* y, PrimeExponent pe, hDim_t lts, hDim_t rts, hInt_t q) {
#ifdef DEBUG_MODE
	ASSERT (q==0);
#endif
    hDim_t p = pe.prime;
    hShort_t e = pe.exponent;
	lpInvC ((complex_t*)y, lts*ipow(p,e-1), rts, p);
}

#ifdef STATS
int lrqCtr = 0;
int lrCtr = 0;
int ldCtr = 0;
int lcCtr = 0;
int lirqCtr = 0;
int lirCtr = 0;
int lidCtr = 0;
int licCtr = 0;

struct timespec lrqTime = {0,0};
struct timespec lrTime = {0,0};
struct timespec ldTime = {0,0};
struct timespec lcTime = {0,0};
struct timespec lirqTime = {0,0};
struct timespec lirTime = {0,0};
struct timespec lidTime = {0,0};
struct timespec licTime = {0,0};
#endif


void tensorLRq (hInt_t* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE, hInt_t q) {
#ifdef STATS
    lrqCtr++;
    struct timespec s1,t1;
    clock_gettime(CLOCK_REALTIME, &s1);
#endif
#ifdef DEBUG_MODE
    hDim_t i;
    printf("\n\nEntered tensorLRq\ttotm=%" PRId32 "\tnumFacts=%" PRId16 "\tq=%" PRId64 "\n[", totm, sizeOfPE, q);
    /*for(i = 0; i < totm; i++) {
        printf("%" PRId64 ",", y[i]);
    }*/
    printf("]\n[");
    for(i = 0; i < sizeOfPE; i++) {
        printf("(%" PRId32 ",%" PRId16 "),", peArr[i].prime, peArr[i].exponent);
    }
    printf("]\n");
#endif
	tensorFuser (y, ppLRq, totm, peArr, sizeOfPE, q); // don't need to shift here
#ifdef DEBUG_MODE
	for(i = 0; i < totm; i++) {
	    if(y[i]<0) {
	        printf("tensorLRq\n");
	    }
	}
#endif
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &t1);
    lrqTime = tsAdd(lrqTime, tsSubtract(t1,s1));
#endif
}

void tensorLR (hInt_t* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE) {
#ifdef STATS
    lrCtr++;
    struct timespec s1,t1;
    clock_gettime(CLOCK_REALTIME, &s1);
#endif
#ifdef DEBUG_MODE
    printf("\n\nEntered tensorLR\ttotm=%" PRId32 "\tnumFacts=%" PRId16 "\n[", totm, sizeOfPE);
    hDim_t i;
    for(i = 0; i < totm; i++) {
        printf("%" PRId64 ",", y[i]);
    }
    printf("]\n[");
    for(i = 0; i < sizeOfPE; i++) {
        printf("(%" PRId32 ",%" PRId16 "),", peArr[i].prime, peArr[i].exponent);
    }
    printf("]\n");
#endif
	tensorFuser (y, ppLR, totm, peArr, sizeOfPE, 0);
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &t1);
    lrTime = tsAdd(lrTime, tsSubtract(t1,s1));
#endif
}

void tensorLDouble (double* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE) {
#ifdef STATS
    ldCtr++;
    struct timespec s1,t1;
    clock_gettime(CLOCK_REALTIME, &s1);
#endif
#ifdef DEBUG_MODE
    printf("\n\nEntered tensorLDouble\ttotm=%" PRId32 "\tnumFacts=%" PRId16 "\n[", totm, sizeOfPE);
    hDim_t i;
    for(i = 0; i < totm; i++) {
        printf("%f,", y[i]);
    }
    printf("]\n[");
    for(i = 0; i < sizeOfPE; i++) {
        printf("(%" PRId32 ",%" PRId16 "),", peArr[i].prime, peArr[i].exponent);
    }
    printf("]\n");
#endif
	tensorFuser (y, ppLDouble, totm, peArr, sizeOfPE, 0);
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &t1);
    ldTime = tsAdd(ldTime, tsSubtract(t1,s1));
#endif
}

void tensorLC (complex_t* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE) {
#ifdef STATS
    lcCtr++;
    struct timespec s1,t1;
    clock_gettime(CLOCK_REALTIME, &s1);
#endif
#ifdef DEBUG_MODE
    printf("\n\nEntered tensorLC\ttotm=%" PRId32 "\tnumFacts=%" PRId16 "\n[", totm, sizeOfPE);
    hDim_t i;
    for(i = 0; i < totm; i++) {
        printf("(%f,%f),", y[i].real, y[i].imag);
    }
    printf("]\n[");
    for(i = 0; i < sizeOfPE; i++) {
        printf("(%" PRId32 ",%" PRId16 "),", peArr[i].prime, peArr[i].exponent);
    }
    printf("]\n");
#endif
	tensorFuser (y, ppLC, totm, peArr, sizeOfPE, 0);
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &t1);
    lcTime = tsAdd(lcTime, tsSubtract(t1,s1));
#endif
}

void tensorLInvRq (hInt_t* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE, hInt_t q) {
#ifdef STATS
    lirqCtr++;
    struct timespec s1,t1;
    clock_gettime(CLOCK_REALTIME, &s1);
#endif
	tensorFuser (y, ppLInvRq, totm, peArr, sizeOfPE, q);  // don't need to shift here
#ifdef DEBUG_MODE
	hDim_t i;
	for(i = 0; i < totm; i++)
	{
	    if(y[i]<0)
	    {
	        printf("tensorLInvRq\n");
	    }
	}
#endif
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &t1);
    lirqTime = tsAdd(lirqTime, tsSubtract(t1,s1));
#endif
}

void tensorLInvR (hInt_t* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE) {
#ifdef STATS
    lirCtr++;
    struct timespec s1,t1;
    clock_gettime(CLOCK_REALTIME, &s1);
#endif
	tensorFuser (y, ppLInvR, totm, peArr, sizeOfPE, 0);
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &t1);
    lirTime = tsAdd(lirTime, tsSubtract(t1,s1));
#endif
}

void tensorLInvDouble (double* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE) {
#ifdef STATS
    lidCtr++;
    struct timespec s1,t1;
    clock_gettime(CLOCK_REALTIME, &s1);
#endif
	tensorFuser (y, ppLInvDouble, totm, peArr, sizeOfPE, 0);
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &t1);
    lidTime = tsAdd(lidTime, tsSubtract(t1,s1));
#endif
}

void tensorLInvC (complex_t* y, hDim_t totm, PrimeExponent* peArr, hShort_t sizeOfPE) {
#ifdef STATS
    licCtr++;
    struct timespec s1,t1;
    clock_gettime(CLOCK_REALTIME, &s1);
#endif
	tensorFuser (y, ppLInvC, totm, peArr, sizeOfPE, 0);
#ifdef STATS
    clock_gettime(CLOCK_REALTIME, &t1);
    licTime = tsAdd(licTime, tsSubtract(t1,s1));
#endif
}