//
// Institute for Signal Processing (University of Luebeck, Germany)
// Copyright (c) 2011 by Radoslaw Mazur
//
// Permission to use, copy, modify, and distribute this software without
// fee is hereby granted FOR RESEARCH/EDUCATION PURPOSES only, provided
// that this copyright notice appears in all copies and in all supporting
// documentation, and that the software is not redistributed for any
// fee (except for a nominal shipping charge).
//
// For any other uses of this software, in original or modified form,
// including but not limited to consulting, production or distribution
// in whole or in part, specific prior permission must be obtained
// from the author.
// Signal processing methods and algorithms implemented by this
// software may be claimed by patents owned by others.
//
// The author makes no representation about the suitability of this
// software for any purpose. It is provided "as is" without warranty
// of any kind, either expressed or implied.
// Beware of the bugs.
//
//     Revision history
//
//     Ver     Date         Description
//     -------------------------------------------------------------------
//     0.5     14-10-2011   basic version



#include <iostream>
#include <stdio.h>
#include <math.h>
#include <time.h>

#include <cufft.h>

using namespace std;

#include "../include/shorteningData.h"
#include "../include/kernels.h"

namespace {

// Logs a failed cuFFT call on stderr; returns true when `status` signals success.
bool shortening_cufft_ok(cufftResult status, const char *what) {
    if (status == CUFFT_SUCCESS) return true;
    cerr << "cuFFT error " << (int)status << " during " << what << endl;
    return false;
}

// Logs a failed CUDA runtime call on stderr; returns true when `status` signals success.
bool shortening_cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) return true;
    cerr << "CUDA error during " << what << ": " << cudaGetErrorString(status) << endl;
    return false;
}

// Owns one batched 1-D cuFFT plan of length fft_len and destroys it on scope exit.
struct ShorteningFftPlan {
    cufftHandle handle;
    bool valid;   // true only when plan creation succeeded

    ShorteningFftPlan(int fft_len, cufftType type, int batch, const char *what)
        : handle(0), valid(false) {
        valid = shortening_cufft_ok(
            cufftPlanMany(&handle, 1, &fft_len, NULL, 1, 0, NULL, 1, 0, type, batch), what);
    }

    ~ShorteningFftPlan() {
        if (valid) cufftDestroy(handle);
    }

private:
    // Non-copyable: the handle must be destroyed exactly once.
    ShorteningFftPlan(const ShorteningFftPlan &);
    ShorteningFftPlan &operator=(const ShorteningFftPlan &);
};

} // namespace

// Runs the iterative (gradient descent) multichannel filter shaping on the GPU.
//
// The host buffers are handed to a shorteningData object, which is copied to
// the device before the iterations and copied back afterwards.
//
//   c, wd, wu, h, ret_re  host buffers forwarded to shorteningData
//                         (presumably room impulse responses, the two window
//                         functions, the equalizer and the result buffer --
//                         confirm against shorteningData)
//   Lc                    RIR length
//   Lh                    equalizer length
//   Lg                    forwarded to shorteningData only
//   fft_len               FFT length used for all transforms
//   iter                  number of gradient iterations
//   mue                   initial step size; halved whenever the cost J rises
//   pd, pu                exponents passed to make_calcs and used in
//                         J = Su^(1/pu) / Sd^(1/pd)
//   speakers, microphones channel counts (FFT batch sizes)
//
// cuFFT / CUDA failures are reported on stderr and stop the iterations early;
// sd.copyToHost() is still executed afterwards, as before.
void do_shortening(double *c, double *wd, double *wu, double *h, int Lc, int Lh, int Lg, int fft_len, int iter, double mue, double pd, double pu, int speakers, int microphones, double *ret_re){
    const int verbose = 2;

    if (verbose >= 2) cout << endl << endl << "Multichannel filtershaping using cuda:" << endl;

    shorteningData sd(c, wd, wu, h, ret_re, Lc, Lg, Lh, fft_len, speakers, microphones);

    const double fft_fact = 1.0/(double)fft_len;
    const int sf_len  = fft_len/2+1;
    const int threads = 256;

    // Launch geometry is loop invariant. The expressions are kept exactly as
    // they were (they may launch one spare block) because the kernels' own
    // bounds handling is not visible from this file.
    const int grid_all      = (fft_len*speakers*microphones+threads)/threads;
    const int grid_mics     = (fft_len*microphones+threads)/threads;
    const int grid_speakers = (fft_len*speakers+threads)/threads;

    // Two-stage sum reduction: 256 blocks of 256 threads write partial sums,
    // then a single block folds the 256 partials.
    const int reduce_blocks  = 256;
    const int reduce_threads = 256;
    const int smemSize       = reduce_threads * sizeof(double);

    sd.copyToDevice();

    if (verbose >= 2) {
            cout << "The overall setup:" << endl;
            cout << "+- Loudspeakers: " << speakers << "    Microphones: " << microphones << endl;
            cout << "+- RIR length: " << Lc << endl;
            cout << "+- Equalizer length: " << Lh << endl;
            cout << "+- FFT  length: " << fft_len << endl;
            cout << "+- Iterations: " << iter << "  Stepsize: " << mue <<endl;
    }

    // Plans: one for the FFT of all RIRs, four for the per-iteration transforms.
    ShorteningFftPlan plan_C_full   (fft_len, CUFFT_D2Z, speakers * microphones, "creation of plan_C_full");
    ShorteningFftPlan plan_h_forward(fft_len, CUFFT_D2Z, speakers,               "creation of plan_h_forward");
    ShorteningFftPlan plan_g_back   (fft_len, CUFFT_Z2D, microphones,            "creation of plan_g_back");
    ShorteningFftPlan plan_g_forward(fft_len, CUFFT_D2Z, microphones,            "creation of plan_g_forward");
    ShorteningFftPlan plan_h_back   (fft_len, CUFFT_Z2D, speakers,               "creation of plan_h_back");

    bool ok = plan_C_full.valid && plan_h_forward.valid && plan_g_back.valid
           && plan_g_forward.valid && plan_h_back.valid;

    // Calculate the FFT of the RIRs. Needed for fast convolution.
    if (ok) {
        ok = shortening_cufft_ok(
            cufftExecD2Z(plan_C_full.handle, (cufftDoubleReal *)sd.dev_c, (cufftDoubleComplex *)sd.C),
            "FFT of the RIRs");
    }
    if (ok) {
        // NOTE(review): c_ges_len is passed for both leading arguments; confirm
        // against the signature of complex_conj.
        complex_conj<<<grid_all,threads>>>(sd.c_ges_len, sd.c_ges_len, sd.C, sd.Ck);
        // Wait for the setup work so that the timer below covers the descent only.
        ok = shortening_cuda_ok(cudaGetLastError(), "launch of complex_conj")
          && shortening_cuda_ok(cudaDeviceSynchronize(), "setup");
    }

    double Sd = 0.0;
    double Su = 0.0;

    double J_old = 100000000000.0;
    int half_count = 0;
    int steps_done = 0;

    const clock_t start = clock();

    // Gradient descent
    for (int ii = 0; ok && ii < iter; ii++) {
        // Calculate g_i. Equation (11)
        if (!shortening_cufft_ok(
                cufftExecD2Z(plan_h_forward.handle, (cufftDoubleReal *)sd.dev_h, (cufftDoubleComplex *)sd.conv_tmp_in),
                "forward FFT of h")) { ok = false; break; }
        calc_filter_coef_forward<<<grid_mics,threads>>>
               (sd.C, sd.conv_tmp_in, sd.conv_tmp_out, sf_len, speakers, microphones, fft_fact);
        if (!shortening_cufft_ok(
                cufftExecZ2D(plan_g_back.handle, (cufftDoubleComplex *)sd.conv_tmp_out, (cufftDoubleReal *)sd.gk),
                "inverse FFT of g")) { ok = false; break; }

        // Calculate the values needed for the gradient
        make_calcs<<<grid_mics,threads>>>(sd.gk, sd.dev_wu, sd.bu, sd.add_tmp1, pu, fft_len*microphones);
        make_calcs<<<grid_mics,threads>>>(sd.gk, sd.dev_wd, sd.bd, sd.add_tmp2, pd, fft_len*microphones);

        // Calculate SAgu and SAgd using sum-reduction. The blocking copies
        // fetch the totals needed on the host for the cost J.
        sum_reduce<<<reduce_blocks, reduce_threads, smemSize>>>(sd.add_tmp1, sd.add_partial,fft_len*microphones, sd.SAgu);
        sum_reduce<<<1, reduce_threads, smemSize>>>(sd.add_partial, sd.add_partial,256,sd.SAgu);
        if (!shortening_cuda_ok(
                cudaMemcpy(&Su, sd.add_partial,  1 * sizeof(double), cudaMemcpyDeviceToHost),
                "copy of Su")) { ok = false; break; }

        sum_reduce<<<reduce_blocks, reduce_threads, smemSize>>>(sd.add_tmp2, sd.add_partial,fft_len*microphones, sd.SAgd);
        sum_reduce<<<1, reduce_threads, smemSize>>>(sd.add_partial, sd.add_partial,256,sd.SAgd);
        if (!shortening_cuda_ok(
                cudaMemcpy(&Sd, sd.add_partial,  1 * sizeof(double), cudaMemcpyDeviceToHost),
                "copy of Sd")) { ok = false; break; }

        // SAgu*bu - SAgd*bd
        gradient_pre_calc<<<grid_mics,threads>>>(sd.bu, sd.bd, sd.SAgu, sd.SAgd, sd.gradient_out, fft_len * microphones);

        if (!shortening_cufft_ok(
                cufftExecD2Z(plan_g_forward.handle, (cufftDoubleReal *)sd.gradient_out, (cufftDoubleComplex *)sd.conv_tmp_in),
                "forward FFT of the gradient")) { ok = false; break; }
        calc_filter_coef_back<<<grid_speakers,threads>>>
                (sd.Ck, sd.conv_tmp_in, sd.conv_tmp_out, sf_len, speakers, microphones, fft_fact);
        if (!shortening_cufft_ok(
                cufftExecZ2D(plan_h_back.handle, (cufftDoubleComplex *)sd.conv_tmp_out, (cufftDoubleReal *)sd.F1),
                "inverse FFT of the gradient")) { ok = false; break; }

        apply_gradient<<<grid_speakers,threads>>>(sd.F1, sd.dev_h, mue, fft_len, Lh, speakers);

        // Launch failures are recorded by the runtime until queried; one query
        // per iteration is enough to notice them without adding a sync.
        if (!shortening_cuda_ok(cudaGetLastError(), "kernel launch in gradient step")) { ok = false; break; }

        // Cost of this step, built from the sums taken before the update was applied.
        const double J = pow(Su, 1.0/pu) / pow(Sd, 1.0/pd);
        if (J > J_old) {
            mue /=2.0;
            cout << "------> Stepsize halved at iteration: " << ii << endl;
            half_count++;
        }
        J_old = J;
        steps_done++;
    }

    // Kernels queued after the last blocking copy may still be running; wait
    // for them so the measured time is complete and late errors surface here.
    if (!shortening_cuda_ok(cudaDeviceSynchronize(), "gradient descent")) ok = false;
    const clock_t end = clock();

    if (verbose >= 2) {
        const double seconds = ((double)(end-start))/((double)(CLOCKS_PER_SEC));
        cout << "Time used for gradient descent (CUDA):  " << seconds << "s" <<endl;
        // Skip the per-step figure when no step ran (avoids a division by zero).
        if (steps_done > 0) {
            cout << "Time used for one step of the gradient: " << 1000*seconds/(double(steps_done)) << "ms" <<endl;
        }
        cout << "Stepsize halfed: " << half_count << " times." << endl;
    }

    if (!ok) {
        cerr << "do_shortening: GPU errors occurred, the returned result may be invalid." << endl;
    }

    // Retrieve the result. The cuFFT plans are released by their destructors.
    sd.copyToHost();
}
