//
// Institute for Signal Processing (University of Luebeck, Germany)
// Copyright (c) 2011 by Radoslaw Mazur
//
// Permission to use, copy, modify, and distribute this software without
// fee is hereby granted FOR RESEARCH/EDUCATION PURPOSES only, provided
// that this copyright notice appears in all copies and in all supporting
// documentation, and that the software is not redistributed for any
// fee (except for a nominal shipping charge).
//
// For any other uses of this software, in original or modified form,
// including but not limited to consulting, production or distribution
// in whole or in part, specific prior permission must be obtained
// from the author.
// Signal processing methods and algorithms implemented by this
// software may be claimed by patents owned by others.
//
// The author makes no representation about the suitability of this
// software for any purpose. It is provided "as is" without warranty
// of any kind, either expressed or implied.
// Beware of the bugs.
//
//     Revision history
//
//     Ver     Date         Description
//     -------------------------------------------------------------------
//     0.5     14-10-2011   basic version

#include <iostream>
#include <stdio.h>


using namespace std;

#include "../include/shorteningData.h"

// Zero-fills the first `count` entries of the double array `data`.
//
// Each thread handles the single element at its flat 1-D index
// (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index is not
// below `count` do nothing, so the launch may cover more threads than
// there are elements.
__global__ void clear_mem(int count, double *data){
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= count) {
        return;   // surplus thread at the tail of the grid
    }
    data[idx] = 0.0;
}

// Zero-fills the first `count` entries of the Complex array `data` by
// setting both the .x and the .y member of each entry to 0.
//
// Each thread handles the single element at its flat 1-D index
// (blockIdx.x * blockDim.x + threadIdx.x); threads whose index is not
// below `count` do nothing.
__global__ void clear_mem(int count, Complex *data){
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= count) {
        return;   // surplus thread at the tail of the grid
    }
    Complex &cell = data[idx];
    cell.x = 0;
    cell.y = 0;
}

// Writes the complex conjugate of in[0..count) to out[0..count) and
// zero-fills out[count..total).
//
//   count : number of entries that are conjugated (.x copied, .y negated)
//   total : entries with index >= count and < total are set to (0, 0)
//   in    : source array, read only for indices below `count`
//   out   : destination array
//
// Each thread handles the single element at its flat 1-D index; threads
// outside both ranges do nothing.
__global__ void complex_conj(int count, int total, Complex *in, Complex *out) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < count) {
        // Conjugate: keep the .x member, flip the sign of the .y member.
        out[idx].x = in[idx].x;
        out[idx].y = -in[idx].y;
        return;
    }

    if (idx < total) {
        // Padding region behind the conjugated data.
        out[idx].x = 0.0;
        out[idx].y = 0.0;
    }
}

// For every (microphone, frequency bin) pair, accumulates the complex
// products C * in over all speakers and stores the sum scaled by fft_fact.
//
// Index layout, as used by the code below (m = microphone index,
// s = speaker index, f = bin index, all 0-based):
//   C  [f + s*sf_len + m*speakers*sf_len]
//   in [f + s*sf_len]
//   out[f + m*sf_len]
//
//   sf_len      : number of bins per (microphone, speaker) pair
//   speakers    : number of terms summed for each output element
//   microphones : number of output rows; sf_len * microphones elements
//                 of `out` are written
//   fft_fact    : real factor applied to both parts of each sum
//
// Each thread produces one output element; threads whose flat index is
// not below sf_len * microphones do nothing.
__global__ void calc_filter_coef_forward(Complex *C, Complex *in, Complex *out,
                                         int  sf_len, int speakers, int microphones, double fft_fact){
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= (sf_len * microphones)) {
        return;
    }

    const int mic = idx / sf_len;
    const int bin = idx % sf_len;

    // Offset of the first speaker's entry for this microphone and bin.
    const int c_base = bin + mic * speakers * sf_len;

    double acc_re = 0;
    double acc_im = 0;

    for (int spk = 0; spk < speakers; ++spk) {
        const Complex &c = C[c_base + spk * sf_len];
        const Complex &v = in[bin + spk * sf_len];

        // Complex multiply-accumulate: acc += c * v.
        acc_re = acc_re + c.x * v.x - c.y * v.y;
        acc_im = acc_im + c.y * v.x + c.x * v.y;
    }

    out[idx].x = acc_re * fft_fact;
    out[idx].y = acc_im * fft_fact;
}

// Element-wise helper. With g = gk[i] * wu[i] it computes
//   add_out[i] = |g|^(pu-1) * |g|
//   bu[i]      = wu[i] * |g|^(pu-1), carrying the sign of g
// for every i below `len`.
//
//   gk, wu  : input arrays (read)
//   bu      : output array, signed weighted power term
//   add_out : output array, the per-element term |g|^(pu-1) * |g|
//   pu      : exponent; the power actually evaluated is pu - 1
//   len     : number of elements to process
//
// Each thread handles the single element at its flat 1-D index; threads
// whose index is not below `len` do nothing.
__global__ void make_calcs(double *gk, double *wu, double *bu, double *add_out, double pu, int len){
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= len) {
        return;
    }

    const double weight    = wu[idx];
    const double weighted  = gk[idx] * weight;
    const double magnitude = abs(weighted);
    const double mag_pow   = pow(magnitude, pu - 1.0);

    add_out[idx] = mag_pow * magnitude;
    // copysign keeps the magnitude of the first argument and takes the
    // sign from the weighted sample.
    bu[idx] = copysign(weight * mag_pow, weighted);
}


// Sums the first `len` entries of `in`.
//
// Every thread first accumulates a private partial sum with a grid-stride
// loop (start at its flat index, step blockDim.x * gridDim.x). The
// per-thread partials of a block are then combined in dynamic shared
// memory, and thread 0 of each block stores
//   out[blockIdx.x] = block sum
//   *inv_SG         = 1.0 / block sum
//
// Launch requirements:
//   - dynamic shared memory: blockDim.x * sizeof(double) bytes
//   - blockDim.x may be any positive value; the in-block reduction no
//     longer assumes a power of two (the former halving loop dropped
//     partial sums for other block sizes)
//
// NOTE: *inv_SG is written by thread 0 of EVERY block. It equals the
// reciprocal of the complete sum only when the kernel is launched with a
// single block; with several blocks it holds the reciprocal of one
// block's partial sum and the partials in `out` still have to be combined
// by the caller. A sum of zero yields a division by zero here.
__global__ void sum_reduce(double *in, double *out, int len, double *inv_SG) {
    extern __shared__ double cache[];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    // Per-thread partial sum over the strided slice of the input.
    double temp = 0.0;
    while (tid < len) {
        temp += in[tid];
        tid += blockDim.x * gridDim.x;
    }
    cache[cacheIndex] = temp;
    __syncthreads();

    // Tree reduction over cache[0 .. active). Each round folds the upper
    // part onto the lower part; `half` is rounded up so that for an odd
    // `active` the middle element is kept instead of being lost. For
    // power-of-two block sizes this performs exactly the same additions
    // as a plain halving loop. The loop bounds depend only on blockDim.x,
    // so every thread of the block reaches each __syncthreads().
    int active = blockDim.x;
    while (active > 1) {
        int half = (active + 1) / 2;
        if (cacheIndex + half < active) {
            cache[cacheIndex] += cache[cacheIndex + half];
        }
        __syncthreads();
        active = half;
    }

    if (cacheIndex == 0) {
        out[blockIdx.x] = cache[0];
        *inv_SG = 1.0 / cache[0];
    }
}



// Element-wise weighted difference:
//   out[i] = (*SAgu) * bu[i] - (*SAgd) * bd[i]   for every i below `count`.
//
//   bu, bd     : input arrays
//   SAgu, SAgd : pointers to the two scalar factors; they are
//                dereferenced inside the kernel
//   out        : output array
//   count      : number of elements to process
//
// Each thread handles the single element at its flat 1-D index; threads
// whose index is not below `count` do nothing.
__global__ void gradient_pre_calc(double *bu, double *bd, double *SAgu, double *SAgd, double *out, int count){
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= count) {
        return;
    }

    const double factor_u = *SAgu;
    const double factor_d = *SAgd;
    out[idx] = factor_u * bu[idx] - factor_d * bd[idx];
}



// For every (speaker, frequency bin) pair, accumulates the complex
// products Ck * in over all microphones and stores the sum scaled by
// fft_fact.
//
// Index layout, as used by the code below (m = microphone index,
// s = speaker index, f = bin index, all 0-based):
//   Ck [f + m*(sf_len*speakers) + s*sf_len]
//   in [f + m*sf_len]
//   out[f + s*sf_len]
//
//   sf_len      : number of bins per (microphone, speaker) pair
//   speakers    : number of output rows; sf_len * speakers elements of
//                 `out` are written
//   microphones : number of terms summed for each output element
//   fft_fact    : real factor applied to both parts of each sum
//
// Each thread produces one output element; threads whose flat index is
// not below sf_len * speakers do nothing.
__global__ void calc_filter_coef_back(Complex *Ck, Complex *in, Complex *out,
                                      int  sf_len, int speakers, int microphones, double fft_fact){
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= (sf_len * speakers)) {
        return;
    }

    const int spk = idx / sf_len;
    const int bin = idx % sf_len;

    // Offset of the first microphone's entry for this speaker and bin;
    // consecutive microphones are sf_len * speakers entries apart.
    const int c_base   = bin + spk * sf_len;
    const int c_stride = sf_len * speakers;

    double acc_re = 0;
    double acc_im = 0;

    for (int mic = 0; mic < microphones; ++mic) {
        const Complex &c = Ck[c_base + mic * c_stride];
        const Complex &v = in[bin + mic * sf_len];

        // Complex multiply-accumulate: acc += c * v.
        acc_re = acc_re + c.x * v.x - c.y * v.y;
        acc_im = acc_im + c.y * v.x + c.x * v.y;
    }

    out[idx].x = acc_re * fft_fact;
    out[idx].y = acc_im * fft_fact;
}


// Gradient step with truncation, applied to `speakers` consecutive
// segments of length `fft_len` stored back to back in `h`:
//   - positions whose offset inside the segment is below Lh are updated
//     as h[i] = h[i] - mue * F1[i]
//   - all remaining positions of the segment are set to 0.0
//
//   F1      : gradient values, same layout as h (read only where the
//             offset is below Lh)
//   h       : array updated in place
//   mue     : step size multiplying the gradient
//   fft_len : length of one segment
//   Lh      : number of leading positions per segment that are kept
//   speakers: number of segments
//
// Each thread handles the single element at its flat 1-D index; threads
// whose index is not below fft_len * speakers do nothing.
__global__ void apply_gradient(double *F1, double *h, double mue, int fft_len, int Lh, int speakers){
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= fft_len * speakers) {
        return;
    }

    const int offset = idx % (fft_len);   // position inside the segment
    h[idx] = (offset < Lh) ? (h[idx] - mue * F1[idx]) : 0.0;
}


// Point-wise complex product with scaling and zero padding:
//   out[i] = in1[i] * in2[i] * scale   for i below `count`
//   out[i] = (0, 0)                    for i >= count and below `fft_len`
//
//   count   : number of entries that are multiplied
//   fft_len : entries from `count` up to this index are zero-filled
//   in1,in2 : input arrays, read only for indices below `count`
//   out     : output array
//   scale   : real factor applied to both parts of the product
//
// Both parts of the product are computed before anything is stored to
// `out`. Each thread handles the single element at its flat 1-D index;
// threads outside both ranges do nothing.
__global__ void complex_point_mult_scale(int count, int fft_len, Complex *in1, Complex *in2, Complex *out, double scale){
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < count) {
        const Complex &a = in1[idx];
        const Complex &b = in2[idx];

        const double prod_im = (a.x * b.y + a.y * b.x)*scale;
        const double prod_re = (a.x * b.x - a.y * b.y)*scale;

        out[idx].x = prod_re;
        out[idx].y = prod_im;
        return;
    }

    if (idx < fft_len) {
        // Padding region behind the multiplied data.
        out[idx].x = 0;
        out[idx].y = 0;
    }
}



