// Perform Coulomb collisions and calculate the fusion rate for each cell
// in the simulation.
// Input:  - velocity vectors of all particles in the cell
//         - density of particles in cell
//         - cell volume
//         - max number of collision particles allocated per cell
// Result: - New post-collision velocities of particles
//         - Fusion rate (in fusion events-per-second)
// Author: Andrew M. Chap
// Last edited September 2017

__global__ void performCollisions(
        float *vx, float *vr, float *vt,
        int *Ic,
        float *rho, const float *bmax, const float *CellVolumesPM,
        const int maxParticlesPerCell, int *cellResidents,
        const int *cellOccupants, int *cellOccupantsMax,
        float *fusionRate,
        float *partNs, float *partAs,
        curandState *myCurandstate, 
        const float PIdt, const float a_coeff,
        const int nPM, const float L){
    
    // Parallelized by cell.  Each process performs all the collisions in
    // a cell and then moves on to the next cell
    int c = threadIdx.x  + blockDim.x * blockIdx.x;
    
    // All Equation numbers and Figure numbers refer to:
    // Chap, Andrew M., Sedwick, Raymond J., “Coulomb Collision Model for Use in
    // Nonthermal Plasma Simulation”, Phys. Rev. E 95:6 063209 (2017)

    while(c < nPM){ // nPM is number of cells
        int np = cellOccupants[c];
	// Just recording the max number of occupants that have been seen in this cell
        if(np > cellOccupantsMax[c])       
            cellOccupantsMax[c] = np;
        // If we have more particles than the max allocated for, 
        // we need to truncate our number of particles there
        int npTrunc = np <= maxParticlesPerCell ? np : maxParticlesPerCell; 

        // Generate random permutation of particles in the cell
        // (Fisher Yates algorithm)
        for (int p = npTrunc-1; p >= 0; --p){
            // Generate a random number [0, p-1]
	    // curand generates unsigned int random numbers, so we need to convert to int
            int r = (int)(curand(&myCurandstate[c]) % (((unsigned int)(p))+1)); 
            // Swap the last element with element at random index
            int temp = cellResidents[c*maxParticlesPerCell + p];
            cellResidents[c*maxParticlesPerCell + p] = cellResidents[c*maxParticlesPerCell + r];
            cellResidents[c*maxParticlesPerCell + r] = temp;
        }

        float fusionRateHere = 0.0f; // initialize total fusion rate to zero
	// right now we don't collide the last particle if we have an odd number
        for (int p = 0; p < npTrunc-1; p+=2){ // loop through particle pairs
            // -------------------------------------------------
            // Calculate relative velocity
            // -------------------------------------------------
            // select particles to be collided
            int p1 = cellResidents[c*maxParticlesPerCell + p];
            int p2 = cellResidents[c*maxParticlesPerCell + p+1];
            // COM velocity
            float VCx = 0.5f*(vx[p1]+vx[p2]);
            float VCr = 0.5f*(vr[p1]+vr[p2]);
            float VCt = 0.5f*(vt[p1]+vt[p2]);
            // Relative velocity of p1 in the COM frame
            float VRx = vx[p1] - VCx;
            float VRr = vr[p1] - VCr;
            float VRt = vt[p1] - VCt;
            // Rel velocity squared and magnitude
            float VR2 = VRx*VRx + VRr*VRr + VRt*VRt;
            float VRm = sqrtf(VR2);
            // The velocity difference
            float VDx = vx[p1] - vx[p2];
            float VDr = vr[p1] - vr[p2];
            float VDt = vt[p1] - vt[p2];
            // The velocity difference squared and magnitude
            float VD2 = VDx*VDx + VDr*VDr + VDt*VDt;
            float VDm = sqrtf(VD2);
            float VD2i = 1.0f/VD2;

            // ------------------------------------
            // Calculate fusion rate
            // ------------------------------------
            float v = VDm*SECONDSCONVERSION;
            float vInv = 1.0f/v;
            float v2 = v*v;
            float v4 = v2*v2;
            float SV;
            // fusion cross section calculation from fit functions to experimental data
            if (v < V400){
                SV = CV0 + CV1*v2 + CV2*v4 + AVL/((v2-VS148)*(v2-VS148) + VQ235);
            }else if(v < V642){
                float peak400 = (v2 - VS400);
                float peak400squared = peak400*peak400;
                SV = DV0 + 
                     DV1*peak400 - 
                     DV2*peak400*peak400 - 
                     DV3*peak400squared*peak400squared*peak400;
            }else{
                SV = BV0 + 
                     AV0/((v2 - VS5813)*(v2 - VS5813) + VQ857) +
                     AV1/((v2 - VS1083)*(v2 - VS1083) + VQ234) +
                     AV2/((v2 - VS2405)*(v2 - VS2405) + VQ138) +
                     AV3/((v2 - VS3344)*(v2 - VS3344) + VQ309);
            }
            // Density squared multiplied by volume (Have to divide rho by volume to get
            // Density, so in this case we just divide once by volume)
            float fusionContribution = rho[c]*rho[c]/(CellVolumesPM[c]*L*L*L) * 
                    vInv*expf(-VG*vInv)*(1.0f/SECONDSCONVERSION)*SV/4.0f/(float(npTrunc));
            // Above is Eq. (4.18) from my PhD thesis (not in Coulomb paper)
            // Later (in MATLAB plotting) it gets multiplied by dE (8.7e6 MeV)
            // i.e. Eq. (4.19) in PhD thesis
           
            if (fusionContribution != fusionContribution){ //if nan
                fusionContribution = 0.0f;
            }
	    // add fusion rate contribution of these two particles total
            fusionRateHere += fusionContribution;

            // -------------------------------------------------
            // Calculate Coulomb collision scatttering angle
            // -------------------------------------------------
            // The '(float(np)/float(npTrunc))' is there to offset any 'under-colliding'
            // that would be done if we aren't operating on all the particles
            float N = (((PIdt*VDm)*bmax[c]*bmax[c])*rho[c]) / 
                         (CellVolumesPM[c]*(L*L*L))*(float(np)/float(npTrunc));  // Eq. (2)
            // Above eqution divides by cell volume because our "rho" is 
            // actually particles per cell, rather than particles per m^3
            
            // inverse of the magnitude of the relative velocity
            float VRi = VRm>0 ? 1.0f/VRm : 0.0f;
            
            if (N > 0){
                float theta;
                float a = a_coeff*VD2i/bmax[c]; // Eq. (6)
                float sTildeAsqoN = (a*sqrtf(N))*(KS4 - KS1*expf(-KS2*powf(N,KS3))); // Eq. (31a)
                float s = sTildeAsqoN*pow((1.0f + pow((TWOOVERPI*sTildeAsqoN),KS5)),(1.0f/KS5)); // Eq. (32a)
                // Create random variables U (for scattering angle theta) 
                // and V (for azimuthal angle phi)
                float U = curand_uniform(&myCurandstate[c]);
                float V = curand_uniform(&myCurandstate[c]);
                
                if(s > PIOVERTWO-0.2f){ 
                // If s is close to pi/2, we are approximately isotropic.  this happens
                // when two particles are traveling close to the same speed
                    theta = acosf(1.9999f*U-0.9998f);  
                    // use 1.9999 and 0.9998 instead of 2 and 1 to avoid the remote
                    // possibility of an argument equal to -1 or 1
                }else{
                    // Eqs. (31b-31d)
                    float kTilde = 1.0f - KK1*expf(-KK2*powf(N,KK3));
                    float lTilde = KL1*expf(-KL2*powf(N,KL3));
                    float hTilde = KH1*expf(-KH2*powf(N,KH3));
	            // Eqs. (32b-32d)
                    float k = kTilde*expf(KK4*powf(s,KK5));
                    float l = lTilde*expf(-KL4*powf(s,KL5));
                    float h = hTilde*expf(-KH4*powf(s,KH5));
                    
                    // if any of k, l, or h are greater than 1 or less than 0, set within these bounds
                    k = k > 1.0f ? 1.0f : k;
                    k = k < 0.0f ? 0.0f : k;
                    l = l > 1.0f ? 1.0f : l;
                    l = l < 0.0f ? 0.0f : l;
                    h = h > 1.0f ? 1.0f : h;
                    h = h < 0.0f ? 0.0f : h;
                    
                    // Eq. (27) branching (see Fig. 3 in my Coulomb collision paper a graphical representation)
                    if (U >= h){ // theta_transition or theta_low
			// Called "varSigma" because that is the name of the LaTeX character I used to represent it
                        float varSigma = cosf(s)/(sinf(s)*sinf(s)); // Eq. (22)
                        float invVarSigma = 1.0f/varSigma;
                        if (U >= l){ 
                            // Most common route: theta_low
                            // (with extra "abs" to prevent negative number in logarithm,
                            // though U should never be low enough to reach that point)
                            // (u_low-1)/kappa + 1 > 0, so: u_low > 1-kappa 
                            theta = // three different expressions to prevent 
                                    (varSigma>1.0e6f)
                                    ?
                                        // Taylor expansion of acos(1-x) at x=0 is sqrt(2*x).  we need this taylor 
                                        // expansion when varSigma > 1e6 or so, because evaluating 1+1e-8
                                        // in single results in 1
                                        sqrtf(-2*invVarSigma*logf(fabsf((U-1.0f)/k+1.0f))) 
                                        :
                                        (
                                            (varSigma>80.0f)
                                            ?
                                            (acosf(1.0f + invVarSigma*logf(fabsf((U-1.0f)/k+1.0f)))) // eq. (25)
                                            :
                                            (acosf(invVarSigma*logf(fabsf(expf(-varSigma)
                                                + 2.0f*sinhf(varSigma)*((U-1.0f)/k+1.0f))))) // eq. (24)
                                         )
                                    ;
                        }else{ // second-most common route: theta_mid
                            // to evaluate, we need expressions similar to theta_low (above) and theta_high (below)
                            float thetaHighUhigh = 2.0f*atanf(a*sqrtf(N/h));
                     
                            float thetaLowUlow = 
                                    (varSigma>1.0e6f)
                                    ?
                                        sqrtf(-2*invVarSigma*logf(fabsf((l-1.0f)/k+1.0f)))
                                        :
                                        (
                                            (varSigma>80.0f)
                                            ?
                                            (acosf(1.0f + invVarSigma*logf(fabsf((l-1.0f)/k+1.0f))))  // eq. (25)
                                            :
                                            (acosf(invVarSigma*logf(fabsf(expf(-varSigma)
                                                + 2.0f*sinhf(varSigma)*((l-1.0f)/k+1.0f))))) // eq. (24)
                                         )
                                    ;

                            theta = thetaLowUlow*powf((U/l), (logf(thetaLowUlow/thetaHighUhigh)/logf(l/h)));
                            // Eq. (26)
                        }
                    }else{ // least common route: theta_high
                        theta = 2.0f*atanf(a*sqrtf(N/U)); // eq. (21)
                    }
                }

                // -------------------------------------------------
                // Perform Collision
                // -------------------------------------------------

                // rel velocity normalized to a unit vector
                float VRNx = VRx*VRi;
                float VRNr = VRr*VRi;
                float VRNt = VRt*VRi;
                // collision scattering angle
                float sinTheta = sinf(theta);
                float cosTheta = cosf(theta);
                // azimuthal angle of collision
                float phi = TWOPI*V; // random between 0 and 2*pi
                float sinPhi = sinf(phi);
                float cosPhi = cosf(phi);
                // create vector "R" perpendicular to v_rel_com_norm
                float Rx, Rr, Rt;
                if (VRNr!=1.0f){
                    Rx = -VRNt;
                    Rr = 0.0f;
                    Rt = VRNx;
                }else{ // avoids the very very unlikely possibility of getting the zero vector
                    Rx = 0.0f;
                    Rr = VRNr;
                    Rt = -VRNt;
                }
                // vector "U" = VRN cross R
                float Ux = VRNr*Rt - VRNt*Rr;
                float Ur = VRNt*Rx - VRNx*Rt;
                float Ut = VRNx*Rr - VRNr*Rx;
                // vector "P" = U*sin(phi) + R*cos(phi)
                float Px = Ux*sinPhi + Rx*cosPhi;
                float Pr = Ur*sinPhi + Rr*cosPhi;
                float Pt = Ut*sinPhi + Rt*cosPhi;
                // Normalize P
                float P2 = Px*Px + Pr*Pr + Pt*Pt;
                float Pi = P2>0.0f ? rsqrtf(P2) : 0.0f; // check for zero
                Px *= Pi;
                Pr *= Pi;
                Pt *= Pi;
                // Vector "T" is the unnormalized rel velocity rotated towards "P" by theta,
                // this is the new velocity of p1 in the COM frame
                float Tx = VRx*cosTheta + (Pr*VRt - Pt*VRr)*sinTheta;
                float Tr = VRr*cosTheta + (Pt*VRx - Px*VRt)*sinTheta;
                float Tt = VRt*cosTheta + (Px*VRr - Pr*VRx)*sinTheta;
                // Vector "C" is the change in velocity in the lab frame
                float Cx = Tx - VRx;
                float Cr = Tr - VRr;
                float Ct = Tt - VRt;
                // Check for NaN's and zero them out if necessary
                if (Cx!=Cx)
                    Cx = 0.0f;
                if (Cr!=Cr)
                    Cr = 0.0f;
                if (Ct!=Ct)
                    Ct = 0.0f;
                // apply collision velocity change to particles
                vx[p1] += Cx;
                vr[p1] += Cr;
                vt[p1] += Ct;
                vx[p2] -= Cx;
                vr[p2] -= Cr;
                vt[p2] -= Ct;
            }
        }
        fusionRate[c] = fusionRateHere; // record fusion rate in this cell
        c += blockDim.x*gridDim.x;      // go to next cell assigned to this process
    }
}