


#include <math.h>
#include <cstdio>
#include <algorithm>

#define WITHIN_KERNEL 
#define KERNEL
#define GLOBAL_MEM
#define LOCAL_MEM

using std::min;

struct XYZ {
    int x;
    int y;
    int z;
};

thread_local XYZ blockIdx;
XYZ gridDim;

WITHIN_KERNEL int get_local_id(unsigned int dim) { return 0; }

WITHIN_KERNEL int get_group_id(unsigned int dim)
{
    if(dim == 0) return blockIdx.x;
    if(dim == 1) return blockIdx.y;
    if(dim == 2) return blockIdx.z;
    return 0;
}

WITHIN_KERNEL int get_local_size(unsigned int dim) { return 1; }

WITHIN_KERNEL int get_num_groups(unsigned int dim)
{
    if(dim == 0) return gridDim.x;
    if(dim == 1) return gridDim.y;
    if(dim == 2) return gridDim.z;
    return 1;
}
WITHIN_KERNEL int get_global_size(unsigned int dim)
{
    return get_num_groups(dim);
}
WITHIN_KERNEL int get_global_id(unsigned int dim)
{
    return get_group_id(dim);
}

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <iostream>

namespace py = pybind11;






#ifndef CUTDE_COMMON
#define CUTDE_COMMON



#define Real double
#define EPS 2.220446049250313e-16

#ifndef M_PI
  #define M_PI   3.14159265358979323846264338327950288
#endif

typedef struct Real2 {
    Real x;
    Real y;
} Real2;

typedef struct Real3 {
    Real x;
    Real y;
    Real z;
} Real3;

typedef struct Real6 {
    Real x;
    Real y;
    Real z;
    Real a;
    Real b;
    Real c;
} Real6;

WITHIN_KERNEL void print(Real x) {
    printf("%f \n", x);
}

    

WITHIN_KERNEL Real2 add2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x + b.x;
        
        out.y = a.y + b.y;
    return out;
}

    

WITHIN_KERNEL Real2 sub2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x - b.x;
        
        out.y = a.y - b.y;
    return out;
}

    

WITHIN_KERNEL Real2 mul2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x * b.x;
        
        out.y = a.y * b.y;
    return out;
}

    

WITHIN_KERNEL Real2 div2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x / b.x;
        
        out.y = a.y / b.y;
    return out;
}


    

WITHIN_KERNEL Real2 add_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x + b;
        
        out.y = a.y + b;
    return out;
}

    

WITHIN_KERNEL Real2 sub_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x - b;
        
        out.y = a.y - b;
    return out;
}

    

WITHIN_KERNEL Real2 mul_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x * b;
        
        out.y = a.y * b;
    return out;
}

    

WITHIN_KERNEL Real2 div_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x / b;
        
        out.y = a.y / b;
    return out;
}


    WITHIN_KERNEL void print2(Real2 x) {
        
        printf("%f %f  \n", x.x,x.y);
    }

    WITHIN_KERNEL Real sum2(Real2 x) {
        Real out = 0.0;
            out += x.x;
            out += x.y;
        return out;
    }

    WITHIN_KERNEL Real dot2(Real2 x, Real2 y) {
        return sum2(mul2(x, y));
    }

    WITHIN_KERNEL Real length2(Real2 x) {
        return sqrt(dot2(x, x));
    }

    WITHIN_KERNEL Real2 negate2(Real2 x) {
        return mul_scalar2(x, -1);
    }

    WITHIN_KERNEL Real2 normalize2(Real2 x) {
        return div_scalar2(x, length2(x));
    }

    WITHIN_KERNEL Real2 make2(
        Real x
            ,
        Real y
    ) {
        Real2 out;
            out.x = x;
            out.y = y;
        return out;
    }
    

WITHIN_KERNEL Real3 add3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x + b.x;
        
        out.y = a.y + b.y;
        
        out.z = a.z + b.z;
    return out;
}

    

WITHIN_KERNEL Real3 sub3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x - b.x;
        
        out.y = a.y - b.y;
        
        out.z = a.z - b.z;
    return out;
}

    

WITHIN_KERNEL Real3 mul3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x * b.x;
        
        out.y = a.y * b.y;
        
        out.z = a.z * b.z;
    return out;
}

    

WITHIN_KERNEL Real3 div3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x / b.x;
        
        out.y = a.y / b.y;
        
        out.z = a.z / b.z;
    return out;
}


    

WITHIN_KERNEL Real3 add_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x + b;
        
        out.y = a.y + b;
        
        out.z = a.z + b;
    return out;
}

    

WITHIN_KERNEL Real3 sub_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x - b;
        
        out.y = a.y - b;
        
        out.z = a.z - b;
    return out;
}

    

WITHIN_KERNEL Real3 mul_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x * b;
        
        out.y = a.y * b;
        
        out.z = a.z * b;
    return out;
}

    

WITHIN_KERNEL Real3 div_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x / b;
        
        out.y = a.y / b;
        
        out.z = a.z / b;
    return out;
}


    WITHIN_KERNEL void print3(Real3 x) {
        
        printf("%f %f %f  \n", x.x,x.y,x.z);
    }

    WITHIN_KERNEL Real sum3(Real3 x) {
        Real out = 0.0;
            out += x.x;
            out += x.y;
            out += x.z;
        return out;
    }

    WITHIN_KERNEL Real dot3(Real3 x, Real3 y) {
        return sum3(mul3(x, y));
    }

    WITHIN_KERNEL Real length3(Real3 x) {
        return sqrt(dot3(x, x));
    }

    WITHIN_KERNEL Real3 negate3(Real3 x) {
        return mul_scalar3(x, -1);
    }

    WITHIN_KERNEL Real3 normalize3(Real3 x) {
        return div_scalar3(x, length3(x));
    }

    WITHIN_KERNEL Real3 make3(
        Real x
            ,
        Real y
            ,
        Real z
    ) {
        Real3 out;
            out.x = x;
            out.y = y;
            out.z = z;
        return out;
    }
    

WITHIN_KERNEL Real6 add6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x + b.x;
        
        out.y = a.y + b.y;
        
        out.z = a.z + b.z;
        
        out.a = a.a + b.a;
        
        out.b = a.b + b.b;
        
        out.c = a.c + b.c;
    return out;
}

    

WITHIN_KERNEL Real6 sub6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x - b.x;
        
        out.y = a.y - b.y;
        
        out.z = a.z - b.z;
        
        out.a = a.a - b.a;
        
        out.b = a.b - b.b;
        
        out.c = a.c - b.c;
    return out;
}

    

WITHIN_KERNEL Real6 mul6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x * b.x;
        
        out.y = a.y * b.y;
        
        out.z = a.z * b.z;
        
        out.a = a.a * b.a;
        
        out.b = a.b * b.b;
        
        out.c = a.c * b.c;
    return out;
}

    

WITHIN_KERNEL Real6 div6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x / b.x;
        
        out.y = a.y / b.y;
        
        out.z = a.z / b.z;
        
        out.a = a.a / b.a;
        
        out.b = a.b / b.b;
        
        out.c = a.c / b.c;
    return out;
}


    

WITHIN_KERNEL Real6 add_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x + b;
        
        out.y = a.y + b;
        
        out.z = a.z + b;
        
        out.a = a.a + b;
        
        out.b = a.b + b;
        
        out.c = a.c + b;
    return out;
}

    

WITHIN_KERNEL Real6 sub_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x - b;
        
        out.y = a.y - b;
        
        out.z = a.z - b;
        
        out.a = a.a - b;
        
        out.b = a.b - b;
        
        out.c = a.c - b;
    return out;
}

    

WITHIN_KERNEL Real6 mul_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x * b;
        
        out.y = a.y * b;
        
        out.z = a.z * b;
        
        out.a = a.a * b;
        
        out.b = a.b * b;
        
        out.c = a.c * b;
    return out;
}

    

WITHIN_KERNEL Real6 div_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x / b;
        
        out.y = a.y / b;
        
        out.z = a.z / b;
        
        out.a = a.a / b;
        
        out.b = a.b / b;
        
        out.c = a.c / b;
    return out;
}


    WITHIN_KERNEL void print6(Real6 x) {
        
        printf("%f %f %f %f %f %f  \n", x.x,x.y,x.z,x.a,x.b,x.c);
    }

    WITHIN_KERNEL Real sum6(Real6 x) {
        Real out = 0.0;
            out += x.x;
            out += x.y;
            out += x.z;
            out += x.a;
            out += x.b;
            out += x.c;
        return out;
    }

    WITHIN_KERNEL Real dot6(Real6 x, Real6 y) {
        return sum6(mul6(x, y));
    }

    WITHIN_KERNEL Real length6(Real6 x) {
        return sqrt(dot6(x, x));
    }

    WITHIN_KERNEL Real6 negate6(Real6 x) {
        return mul_scalar6(x, -1);
    }

    WITHIN_KERNEL Real6 normalize6(Real6 x) {
        return div_scalar6(x, length6(x));
    }

    WITHIN_KERNEL Real6 make6(
        Real x
            ,
        Real y
            ,
        Real z
            ,
        Real a
            ,
        Real b
            ,
        Real c
    ) {
        Real6 out;
            out.x = x;
            out.y = y;
            out.z = z;
            out.a = a;
            out.b = b;
            out.c = c;
        return out;
    }


WITHIN_KERNEL Real2 transform2(Real2 a, Real2 b, Real2 v) {
    Real2 out;
    out.x = dot2(a,v);
    out.y = dot2(b,v);
    return out;
}

WITHIN_KERNEL Real2 inv_transform2(Real2 a, Real2 b, Real2 v) {
    Real2 out;
    out.x = a.x * v.x + b.x * v.y;
    out.y = a.y * v.x + b.y * v.y;
    return out;
}

WITHIN_KERNEL Real3 transform3(Real3 a, Real3 b, Real3 c, Real3 v) {
    Real3 out;
    out.x = dot3(a,v);
    out.y = dot3(b,v);
    out.z = dot3(c,v);
    return out;
}

WITHIN_KERNEL Real3 inv_transform3(Real3 a, Real3 b, Real3 c, Real3 v) {
    Real3 out;
    out.x = a.x * v.x + b.x * v.y + c.x * v.z;
    out.y = a.y * v.x + b.y * v.y + c.y * v.z;
    out.z = a.z * v.x + b.z * v.y + c.z * v.z;
    return out;
}

WITHIN_KERNEL Real3 cross3(Real3 x, Real3 y) {
    Real3 out;
    out.x = x.y * y.z - x.z * y.y;
    out.y = x.z * y.x - x.x * y.z;
    out.z = x.x * y.y - x.y * y.x;
    return out;
}

WITHIN_KERNEL Real6 tensor_transform3(Real3 a, Real3 b, Real3 c, Real6 tensor) {
    Real A[9];
    A[0] = a.x; A[1] = a.y; A[2] = a.z;
    A[3] = b.x; A[4] = b.y; A[5] = b.z;
    A[6] = c.x; A[7] = c.y; A[8] = c.z;
    Real6 out;
    out.x = A[0]*A[0]*tensor.x+2*A[0]*A[3]*tensor.a+2*A[0]*A[6]*tensor.b+
        2*A[3]*A[6]*tensor.c+ A[3]*A[3]*tensor.y+A[6]*A[6]*tensor.z;
    out.y = A[1]*A[1]*tensor.x+2*A[1]*A[4]*tensor.a+2*A[1]*A[7]*tensor.b+
        2*A[4]*A[7]*tensor.c+A[4]*A[4]*tensor.y+A[7]*A[7]*tensor.z;
    out.z = A[2]*A[2]*tensor.x+2*A[2]*A[5]*tensor.a+2*A[2]*A[8]*tensor.b+
        2*A[5]*A[8]*tensor.c+A[5]*A[5]*tensor.y+A[8]*A[8]*tensor.z;
    out.a = A[0]*A[1]*tensor.x+(A[0]*A[4]+A[1]*A[3])*tensor.a+(A[0]*A[7]+
        A[1]*A[6])*tensor.b+(A[7]*A[3]+A[6]*A[4])*tensor.c+A[4]*A[3]*tensor.y+
        A[6]*A[7]*tensor.z;
    out.b = A[0]*A[2]*tensor.x+(A[0]*A[5]+A[2]*A[3])*tensor.a+(A[0]*A[8]+
        A[2]*A[6])*tensor.b+(A[8]*A[3]+A[6]*A[5])*tensor.c+A[5]*A[3]*tensor.y+
        A[6]*A[8]*tensor.z;
    out.c = A[1]*A[2]*tensor.x+(A[2]*A[4]+A[1]*A[5])*tensor.a+(A[2]*A[7]+
        A[1]*A[8])*tensor.b+(A[7]*A[5]+A[8]*A[4])*tensor.c+A[4]*A[5]*tensor.y+
        A[7]*A[8]*tensor.z;
    return out;
}

WITHIN_KERNEL int trimodefinder(Real3 obs, Real3 tri0, Real3 tri1, Real3 tri2) {
    // trimodefinder calculates the normalized barycentric coordinates of
    // the points with respect to the TD vertices and specifies the appropriate
    // artefact-free configuration of the angular dislocations for the
    // calculations. The input matrices x, y and z share the same size and
    // correspond to the y, z and x coordinates in the TDCS, respectively. p1,
    // p2 and p3 are two-component matrices representing the y and z coordinates
    // of the TD vertices in the TDCS, respectively.
    // The components of the output (trimode) corresponding to each calculation
    // points, are 1 for the first configuration, -1 for the second
    // configuration and 0 for the calculation point that lie on the TD sides.

    Real a = ((tri1.z-tri2.z)*(obs.y-tri2.y)+(tri2.y-tri1.y)*(obs.z-tri2.z))/
        ((tri1.z-tri2.z)*(tri0.y-tri2.y)+(tri2.y-tri1.y)*(tri0.z-tri2.z));
    Real b = ((tri2.z-tri0.z)*(obs.y-tri2.y)+(tri0.y-tri2.y)*(obs.z-tri2.z))/
        ((tri1.z-tri2.z)*(tri0.y-tri2.y)+(tri2.y-tri1.y)*(tri0.z-tri2.z));
    Real c = 1-a-b;

    int result = 1;
    if ((a<=0 && b>c && c>a) ||
            (b<=0 && c>a && a>b) ||
            (c<=0 && a>b && b>c)) {
        result = -1;
    }

    if ((a==0 && b>=0 && c>=0) ||
            (a>=0 && b==0 && c>=0) ||
            (a>=0 && b>=0 && c==0)) {
        result = 0;
    }
    if (result == 0 && obs.x != 0) {
        result = 1;
    }

    return result;
}

WITHIN_KERNEL Real3 AngDisDisp(Real x, Real y, Real z, Real alpha, Real bx, Real by, Real bz, Real nu) {

    Real cosA = cos(alpha);
    Real sinA = sin(alpha);
    Real eta = y*cosA-z*sinA;
    Real zeta = y*sinA+z*cosA;
    Real r = sqrt(x * x + y * y + z * z);

    Real ux = bx/8/M_PI/(1-nu)*(x*y/r/(r-z)-x*eta/r/(r-zeta));
    Real vx = bx/8/M_PI/(1-nu)*(eta*sinA/(r-zeta)-y*eta/r/(r-zeta)+        y*y/r/(r-z)+(1-2*nu)*(cosA*log(r-zeta)-log(r-z)));
    Real wx = bx/8/M_PI/(1-nu)*(eta*cosA/(r-zeta)-y/r-eta*z/r/(r-zeta)-        (1-2*nu)*sinA*log(r-zeta));

    Real uy = by/8/M_PI/(1-nu)*(x*x*cosA/r/(r-zeta)-x*x/r/(r-z)-         (1-2*nu)*(cosA*log(r-zeta)-log(r-z)));
    Real vy = by*x/8/M_PI/(1-nu)*(y*cosA/r/(r-zeta)-sinA*cosA/(r-zeta)-y/r/(r-z));
    Real wy = by*x/8/M_PI/(1-nu)*(z*cosA/r/(r-zeta)-cosA*cosA/(r-zeta)+1/r);

    Real uz = bz*sinA/8/M_PI/(1-nu)*((1-2*nu)*log(r-zeta)-x*x/r/(r-zeta));
    Real vz = bz*x*sinA/8/M_PI/(1-nu)*(sinA/(r-zeta)-y/r/(r-zeta));
    Real wz = bz*x*sinA/8/M_PI/(1-nu)*(cosA/(r-zeta)-z/r/(r-zeta));
    return make3(ux+uy+uz, vx+vy+vz, wx+wy+wz);
}

WITHIN_KERNEL Real3 TDSetupD(Real3 obs, Real alpha, Real3 slip, Real nu, Real3 TriVertex, Real3 SideVec) {
    // TDSetupD transforms coordinates of the calculation points as well as
    // slip vector components from ADCS into TDCS. It then calculates the
    // displacements in ADCS and transforms them into TDCS.

    Real2 A1 = make2(SideVec.z, -SideVec.y);
    Real2 A2 = make2(SideVec.y, SideVec.z);
    Real2 r1 = transform2(A1, A2, make2(obs.y - TriVertex.y, obs.z - TriVertex.z));
    Real y1 = r1.x;
    Real z1 = r1.y;

    Real2 r2 = transform2(A1, A2, make2(slip.y, slip.z));
    Real by1 = r2.x;
    Real bz1 = r2.y;

    Real3 uvw = AngDisDisp(obs.x, y1, z1, -M_PI + alpha, slip.x, by1, bz1, nu);

    Real2 r3 = inv_transform2(A1, A2, make2(uvw.y, uvw.z));
    Real v = r3.x;
    Real w = r3.y;
    return make3(uvw.x, v, w);
}

WITHIN_KERNEL Real6 AngDisStrain(Real x, Real y, Real z, Real alpha, Real bx, Real by, Real bz, Real nu) {
    // AngDisStrain calculates the strains associated with an angular 
    // dislocation in an elastic full-space.

    Real cosA = cos(alpha);
    Real sinA = sin(alpha);
    Real eta = y*cosA-z*sinA;
    Real zeta = y*sinA+z*cosA;

    Real x2 = x*x;
    Real y2 = y*y;
    Real z2 = z*z;
    Real r2 = x2+y2+z2;
    Real r = sqrt(r2);
    Real r3 = r*r2;
    Real rz = r*(r-z);
    Real rmz = (r-z);
    Real r2z2 = r2*rmz*rmz;
    Real r3z = r3*rmz;

    Real W = zeta-r;
    Real W2 = W*W;
    Real Wr = W*r;
    Real W2r = W2*r;
    Real Wr3 = W*r3;
    Real W2r2 = W2*r2;

    Real C = (r*cosA-z)/Wr;
    Real S = (r*sinA-y)/Wr;

    // Partial derivatives of the Burgers' function
    Real rFi_rx = (eta/r/(r-zeta)-y/r/(r-z))/4/M_PI;
    Real rFi_ry = (x/r/(r-z)-cosA*x/r/(r-zeta))/4/M_PI;
    Real rFi_rz = (sinA*x/r/(r-zeta))/4/M_PI;

    Real6 out;
    out.x = bx*(rFi_rx)+
        bx/8/M_PI/(1-nu)*(eta/Wr+eta*x2/W2r2-eta*x2/Wr3+y/rz-
        x2*y/r2z2-x2*y/r3z)-
        by*x/8/M_PI/(1-nu)*(((2*nu+1)/Wr+x2/W2r2-x2/Wr3)*cosA+
        (2*nu+1)/rz-x2/r2z2-x2/r3z)+
        bz*x*sinA/8/M_PI/(1-nu)*((2*nu+1)/Wr+x2/W2r2-x2/Wr3);

    out.y = by*(rFi_ry)+
        bx/8/M_PI/(1-nu)*((1/Wr+S*S-y2/Wr3)*eta+(2*nu+1)*y/rz-y*y*y/r2z2-
        y*y*y/r3z-2*nu*cosA*S)-
        by*x/8/M_PI/(1-nu)*(1/rz-y2/r2z2-y2/r3z+
        (1/Wr+S*S-y2/Wr3)*cosA)+
        bz*x*sinA/8/M_PI/(1-nu)*(1/Wr+S*S-y2/Wr3);

    out.z = bz*(rFi_rz)+
        bx/8/M_PI/(1-nu)*(eta/W/r+eta*C*C-eta*z2/Wr3+y*z/r3+
        2*nu*sinA*C)-
        by*x/8/M_PI/(1-nu)*((1/Wr+C*C-z2/Wr3)*cosA+z/r3)+
        bz*x*sinA/8/M_PI/(1-nu)*(1/Wr+C*C-z2/Wr3);

    out.a = bx*(rFi_ry)/2+by*(rFi_rx)/2-
        bx/8/M_PI/(1-nu)*(x*y2/r2z2-nu*x/rz+x*y2/r3z-nu*x*cosA/Wr+
        eta*x*S/Wr+eta*x*y/Wr3)+
        by/8/M_PI/(1-nu)*(x2*y/r2z2-nu*y/rz+x2*y/r3z+nu*cosA*S+
        x2*y*cosA/Wr3+x2*cosA*S/Wr)-
        bz*sinA/8/M_PI/(1-nu)*(nu*S+x2*S/Wr+x2*y/Wr3);

    out.b = bx*(rFi_rz)/2+bz*(rFi_rx)/2-
        bx/8/M_PI/(1-nu)*(-x*y/r3+nu*x*sinA/Wr+eta*x*C/Wr+
        eta*x*z/Wr3)+
        by/8/M_PI/(1-nu)*(-x2/r3+nu/r+nu*cosA*C+x2*z*cosA/Wr3+
        x2*cosA*C/Wr)-
        bz*sinA/8/M_PI/(1-nu)*(nu*C+x2*C/Wr+x2*z/Wr3);

    out.c = by*(rFi_rz)/2+bz*(rFi_ry)/2+
        bx/8/M_PI/(1-nu)*(y2/r3-nu/r-nu*cosA*C+nu*sinA*S+eta*sinA*cosA/W2-
        eta*(y*cosA+z*sinA)/W2r+eta*y*z/W2r2-eta*y*z/Wr3)-
        by*x/8/M_PI/(1-nu)*(y/r3+sinA*cosA*cosA/W2-cosA*(y*cosA+z*sinA)/
        W2r+y*z*cosA/W2r2-y*z*cosA/Wr3)-
        bz*x*sinA/8/M_PI/(1-nu)*(y*z/Wr3-sinA*cosA/W2+(y*cosA+z*sinA)/
        W2r-y*z/W2r2);
    return out;
}


WITHIN_KERNEL Real6 TDSetupS(Real3 obs, Real alpha, Real3 slip, Real nu,
    Real3 TriVertex, Real3 SideVec) 
{
    // TDSetupS transforms coordinates of the calculation points as well as 
    // slip vector components from ADCS into TDCS. It then calculates the 
    // strains in ADCS and transforms them into TDCS.

    // Transformation matrix
    Real2 A1 = make2(SideVec.z, -SideVec.y);
    Real2 A2 = make2(SideVec.y, SideVec.z);

    // Transform coordinates of the calculation points from TDCS into ADCS
    Real2 r1 = transform2(A1, A2, make2(obs.y - TriVertex.y, obs.z - TriVertex.z));
    Real y1 = r1.x;
    Real z1 = r1.y;

    // Transform the in-plane slip vector components from TDCS into ADCS
    Real2 r2 = transform2(A1, A2, make2(slip.y, slip.z));
    Real by1 = r2.x;
    Real bz1 = r2.y;

    // Calculate strains associated with an angular dislocation in ADCS
    Real6 out_adcs = AngDisStrain(obs.x,y1,z1,-M_PI+alpha,slip.x,by1,bz1,nu);

    // Transform strains from ADCS into TDCS
    Real3 B0 = make3(1.0, 0.0, 0.0);
    Real3 B1 = make3(0.0, A1.x, A1.y);
    Real3 B2 = make3(0.0, A2.x, A2.y);
    return tensor_transform3(B0, B1, B2, out_adcs);
}

WITHIN_KERNEL Real3 AngDisDispFSC(Real y1, Real y2, Real y3, Real beta, 
                                  Real b1, Real b2, Real b3, Real nu, Real a) {

    Real sinB = sin(beta);
    Real cosB = cos(beta);
    Real cotB = cosB / sinB;
    Real y3b = y3+2*a;
    Real z1b = y1*cosB+y3b*sinB;
    Real z3b = -y1*sinB+y3b*cosB;
    Real r2b = y1*y1+y2*y2+y3b*y3b;
    Real rb = sqrt(r2b);

    Real Fib = 2*atan(-y2/(-(rb+y3b)*(1.0 / tan(beta/2))+y1)); // The Burgers' function

    Real v1cb1 = b1/4/M_PI/(1-nu)*(-2*(1-nu)*(1-2*nu)*Fib*(cotB*cotB)+(1-2*nu)*y2/
        (rb+y3b)*((1-2*nu-a/rb)*cotB-y1/(rb+y3b)*(nu+a/rb))+(1-2*nu)*
        y2*cosB*cotB/(rb+z3b)*(cosB+a/rb)+a*y2*(y3b-a)*cotB/(rb*rb*rb)+y2*
        (y3b-a)/(rb*(rb+y3b))*(-(1-2*nu)*cotB+y1/(rb+y3b)*(2*nu+a/rb)+
        a*y1/(rb*rb))+y2*(y3b-a)/(rb*(rb+z3b))*(cosB/(rb+z3b)*((rb*
        cosB+y3b)*((1-2*nu)*cosB-a/rb)*cotB+2*(1-nu)*(rb*sinB-y1)*cosB)-
        a*y3b*cosB*cotB/(rb*rb)));

    Real v2cb1 = b1/4/M_PI/(1-nu)*((1-2*nu)*((2*(1-nu)*(cotB*cotB)-nu)*log(rb+y3b)-(2*
        (1-nu)*(cotB*cotB)+1-2*nu)*cosB*log(rb+z3b))-(1-2*nu)/(rb+y3b)*(y1*
        cotB*(1-2*nu-a/rb)+nu*y3b-a+(y2*y2)/(rb+y3b)*(nu+a/rb))-(1-2*
        nu)*z1b*cotB/(rb+z3b)*(cosB+a/rb)-a*y1*(y3b-a)*cotB/(rb*rb*rb)+
        (y3b-a)/(rb+y3b)*(-2*nu+1/rb*((1-2*nu)*y1*cotB-a)+(y2*y2)/(rb*
        (rb+y3b))*(2*nu+a/rb)+a*(y2*y2)/(rb*rb*rb))+(y3b-a)/(rb+z3b)*((cosB*cosB)-
        1/rb*((1-2*nu)*z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/(rb*
        (rb+z3b))*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*(rb*cosB+y3b))));

    Real v3cb1 = b1/4/M_PI/(1-nu)*(2*(1-nu)*(((1-2*nu)*Fib*cotB)+(y2/(rb+y3b)*(2*
        nu+a/rb))-(y2*cosB/(rb+z3b)*(cosB+a/rb)))+y2*(y3b-a)/rb*(2*
        nu/(rb+y3b)+a/(rb*rb))+y2*(y3b-a)*cosB/(rb*(rb+z3b))*(1-2*nu-
        (rb*cosB+y3b)/(rb+z3b)*(cosB+a/rb)-a*y3b/(rb*rb)));

    Real v1cb2 = b2/4/M_PI/(1-nu)*((1-2*nu)*((2*(1-nu)*(cotB*cotB)+nu)*log(rb+y3b)-(2*
        (1-nu)*(cotB*cotB)+1)*cosB*log(rb+z3b))+(1-2*nu)/(rb+y3b)*(-(1-2*nu)*
        y1*cotB+nu*y3b-a+a*y1*cotB/rb+(y1*y1)/(rb+y3b)*(nu+a/rb))-(1-2*
        nu)*cotB/(rb+z3b)*(z1b*cosB-a*(rb*sinB-y1)/(rb*cosB))-a*y1*
        (y3b-a)*cotB/(rb*rb*rb)+(y3b-a)/(rb+y3b)*(2*nu+1/rb*((1-2*nu)*y1*
        cotB+a)-(y1*y1)/(rb*(rb+y3b))*(2*nu+a/rb)-a*(y1*y1)/(rb*rb*rb))+(y3b-a)*
        cotB/(rb+z3b)*(-cosB*sinB+a*y1*y3b/((rb*rb*rb)*cosB)+(rb*sinB-y1)/
        rb*(2*(1-nu)*cosB-(rb*cosB+y3b)/(rb+z3b)*(1+a/(rb*cosB)))));
                    
    Real v2cb2 = b2/4/M_PI/(1-nu)*(2*(1-nu)*(1-2*nu)*Fib*(cotB*cotB)+(1-2*nu)*y2/
        (rb+y3b)*(-(1-2*nu-a/rb)*cotB+y1/(rb+y3b)*(nu+a/rb))-(1-2*nu)*
        y2*cotB/(rb+z3b)*(1+a/(rb*cosB))-a*y2*(y3b-a)*cotB/(rb*rb*rb)+y2*
        (y3b-a)/(rb*(rb+y3b))*((1-2*nu)*cotB-2*nu*y1/(rb+y3b)-a*y1/rb*
        (1/rb+1/(rb+y3b)))+y2*(y3b-a)*cotB/(rb*(rb+z3b))*(-2*(1-nu)*
        cosB+(rb*cosB+y3b)/(rb+z3b)*(1+a/(rb*cosB))+a*y3b/((rb*rb)*cosB)));
                    
    Real v3cb2 = b2/4/M_PI/(1-nu)*(-2*(1-nu)*(1-2*nu)*cotB*(log(rb+y3b)-cosB*
        log(rb+z3b))-2*(1-nu)*y1/(rb+y3b)*(2*nu+a/rb)+2*(1-nu)*z1b/(rb+
        z3b)*(cosB+a/rb)+(y3b-a)/rb*((1-2*nu)*cotB-2*nu*y1/(rb+y3b)-a*
        y1/(rb*rb))-(y3b-a)/(rb+z3b)*(cosB*sinB+(rb*cosB+y3b)*cotB/rb*
        (2*(1-nu)*cosB-(rb*cosB+y3b)/(rb+z3b))+a/rb*(sinB-y3b*z1b/
        (rb*rb)-z1b*(rb*cosB+y3b)/(rb*(rb+z3b)))));

    Real v1cb3 = b3/4/M_PI/(1-nu)*((1-2*nu)*(y2/(rb+y3b)*(1+a/rb)-y2*cosB/(rb+
        z3b)*(cosB+a/rb))-y2*(y3b-a)/rb*(a/(rb*rb)+1/(rb+y3b))+y2*
        (y3b-a)*cosB/(rb*(rb+z3b))*((rb*cosB+y3b)/(rb+z3b)*(cosB+a/
        rb)+a*y3b/(rb*rb)));
                    
    Real v2cb3 = b3/4/M_PI/(1-nu)*((1-2*nu)*(-sinB*log(rb+z3b)-y1/(rb+y3b)*(1+a/
        rb)+z1b/(rb+z3b)*(cosB+a/rb))+y1*(y3b-a)/rb*(a/(rb*rb)+1/(rb+
        y3b))-(y3b-a)/(rb+z3b)*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/
        (rb*rb))-1/(rb*(rb+z3b))*((y2*y2)*cosB*sinB-a*z1b/rb*(rb*cosB+y3b))));
                    
    Real v3cb3 = b3/4/M_PI/(1-nu)*(2*(1-nu)*Fib+2*(1-nu)*(y2*sinB/(rb+z3b)*(cosB+
        a/rb))+y2*(y3b-a)*sinB/(rb*(rb+z3b))*(1+(rb*cosB+y3b)/(rb+
        z3b)*(cosB+a/rb)+a*y3b/(rb*rb)));

    return make3(
        v1cb1+v1cb2+v1cb3,
        v2cb1+v2cb2+v2cb3,
        v3cb1+v3cb2+v3cb3
    );
}

WITHIN_KERNEL Real6 AngDisStrainFSC(Real y1, Real y2, Real y3, Real beta,
                                    Real b1, Real b2, Real b3, Real nu, Real a) {

    Real sinB = sin(beta);
    Real cosB = cos(beta);
    Real cotB = cosB / sinB;
    Real y3b = y3+2*a;
    Real z1b = y1*cosB+y3b*sinB;
    Real z3b = -y1*sinB+y3b*cosB;
    Real rb2 = y1*y1+y2*y2+y3b*y3b;
    Real rb = sqrt(rb2);

    Real W1 = rb*cosB+y3b;
    Real W2 = cosB+a/rb;
    Real W3 = cosB+y3b/rb;
    Real W4 = nu+a/rb;
    Real W5 = 2*nu+a/rb;
    Real W6 = rb+y3b;
    Real W7 = rb+z3b;
    Real W8 = y3+a;
    Real W9 = 1+a/rb/cosB;

    Real N1 = 1-2*nu;

    Real rFib_ry2 = z1b/rb/(rb+z3b)-y1/rb/(rb+y3b);
    Real rFib_ry1 = y2/rb/(rb+y3b)-cosB*y2/rb/(rb+z3b);
    Real rFib_ry3 = -sinB*y2/rb/(rb+z3b);

    Real6 out;
    out.x = b1*(1.0/4.0*((-2+2*nu)*N1*rFib_ry1*(cotB*cotB)-N1*y2/(W6*W6)*((1-W5)*cotB- y1/W6*W4)/rb*y1+N1*y2/W6*(a/(rb*rb*rb)*y1*cotB-1/W6*W4+(y1*y1)/ (W6*W6)*W4/rb+(y1*y1)/W6*a/(rb*rb*rb))-N1*y2*cosB*cotB/(W7*W7)*W2*(y1/ rb-sinB)-N1*y2*cosB*cotB/W7*a/(rb*rb*rb)*y1-3*a*y2*W8*cotB/(rb*rb2*rb2)* y1-y2*W8/(rb*rb*rb)/W6*(-N1*cotB+y1/W6*W5+a*y1/rb2)*y1-y2*W8/ rb2/(W6*W6)*(-N1*cotB+y1/W6*W5+a*y1/rb2)*y1+y2*W8/rb/W6* (1/W6*W5-(y1*y1)/(W6*W6)*W5/rb-(y1*y1)/W6*a/(rb*rb*rb)+a/rb2-2*a*(y1*y1) /(rb2*rb2))-y2*W8/(rb*rb*rb)/W7*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+ (2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/rb2)*y1-y2*W8/rb/ (W7*W7)*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)* cosB)-a*y3b*cosB*cotB/rb2)*(y1/rb-sinB)+y2*W8/rb/W7*(-cosB/ (W7*W7)*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)*(y1/ rb-sinB)+cosB/W7*(1/rb*cosB*y1*(N1*cosB-a/rb)*cotB+W1*a/(rb*rb2) *y1*cotB+(2-2*nu)*(1/rb*sinB*y1-1)*cosB)+2*a*y3b*cosB*cotB/ (rb2*rb2)*y1))/M_PI/(1-nu))+ b2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)+nu)/rb*y1/W6-((2-2*nu)*(cotB*cotB)+1)* cosB*(y1/rb-sinB)/W7)-N1/(W6*W6)*(-N1*y1*cotB+nu*y3b-a+a*y1* cotB/rb+(y1*y1)/W6*W4)/rb*y1+N1/W6*(-N1*cotB+a*cotB/rb-a* (y1*y1)*cotB/(rb*rb*rb)+2*y1/W6*W4-(y1*y1*y1)/(W6*W6)*W4/rb-(y1*y1*y1)/W6*a/ (rb*rb*rb))+N1*cotB/(W7*W7)*(z1b*cosB-a*(rb*sinB-y1)/rb/cosB)*(y1/ rb-sinB)-N1*cotB/W7*((cosB*cosB)-a*(1/rb*sinB*y1-1)/rb/cosB+a* (rb*sinB-y1)/(rb*rb*rb)/cosB*y1)-a*W8*cotB/(rb*rb*rb)+3*a*(y1*y1)*W8* cotB/(rb*rb2*rb2)-W8/(W6*W6)*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6* W5-a*(y1*y1)/(rb*rb*rb))/rb*y1+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB+a)*y1+ 1/rb*N1*cotB-2*y1/rb/W6*W5+(y1*y1*y1)/(rb*rb*rb)/W6*W5+(y1*y1*y1)/rb2/ (W6*W6)*W5+(y1*y1*y1)/(rb2*rb2)/W6*a-2*a/(rb*rb*rb)*y1+3*a*(y1*y1*y1)/(rb*rb2*rb2))-W8* cotB/(W7*W7)*(-cosB*sinB+a*y1*y3b/(rb*rb*rb)/cosB+(rb*sinB-y1)/rb* ((2-2*nu)*cosB-W1/W7*W9))*(y1/rb-sinB)+W8*cotB/W7*(a*y3b/ (rb*rb*rb)/cosB-3*a*(y1*y1)*y3b/(rb*rb2*rb2)/cosB+(1/rb*sinB*y1-1)/rb* ((2-2*nu)*cosB-W1/W7*W9)-(rb*sinB-y1)/(rb*rb*rb)*((2-2*nu)*cosB-W1/ W7*W9)*y1+(rb*sinB-y1)/rb*(-1/rb*cosB*y1/W7*W9+W1/(W7*W7)* W9*(y1/rb-sinB)+W1/W7*a/(rb*rb*rb)/cosB*y1)))/M_PI/(1-nu))+ b3*(1.0/4.0*(N1*(-y2/(W6*W6)*(1+a/rb)/rb*y1-y2/W6*a/(rb*rb*rb)*y1+y2* cosB/(W7*W7)*W2*(y1/rb-sinB)+y2*cosB/W7*a/(rb*rb*rb)*y1)+y2*W8/ (rb*rb*rb)*(a/rb2+1/W6)*y1-y2*W8/rb*(-2*a/(rb2*rb2)*y1-1/(W6*W6)/ rb*y1)-y2*W8*cosB/(rb*rb*rb)/W7*(W1/W7*W2+a*y3b/rb2)*y1-y2*W8* cosB/rb/(W7*W7)*(W1/W7*W2+a*y3b/rb2)*(y1/rb-sinB)+y2*W8* cosB/rb/W7*(1/rb*cosB*y1/W7*W2-W1/(W7*W7)*W2*(y1/rb-sinB)- W1/W7*a/(rb*rb*rb)*y1-2*a*y3b/(rb2*rb2)*y1))/M_PI/(1-nu));

    out.y = b1*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)-nu)/rb*y2/W6-((2-2*nu)*(cotB*cotB)+1- 2*nu)*cosB/rb*y2/W7)+N1/(W6*W6)*(y1*cotB*(1-W5)+nu*y3b-a+(y2*y2) /W6*W4)/rb*y2-N1/W6*(a*y1*cotB/(rb*rb*rb)*y2+2*y2/W6*W4-(y2*y2*y2) /(W6*W6)*W4/rb-(y2*y2*y2)/W6*a/(rb*rb*rb))+N1*z1b*cotB/(W7*W7)*W2/rb* y2+N1*z1b*cotB/W7*a/(rb*rb*rb)*y2+3*a*y2*W8*cotB/(rb*rb2*rb2)*y1-W8/ (W6*W6)*(-2*nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+a*(y2*y2)/ (rb*rb*rb))/rb*y2+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB-a)*y2+2*y2/rb/ W6*W5-(y2*y2*y2)/(rb*rb*rb)/W6*W5-(y2*y2*y2)/rb2/(W6*W6)*W5-(y2*y2*y2)/(rb2*rb2)/W6* a+2*a/(rb*rb*rb)*y2-3*a*(y2*y2*y2)/(rb*rb2*rb2))-W8/(W7*W7)*((cosB*cosB)-1/rb*(N1* z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/rb/W7*((y2*y2)*(cosB*cosB)- a*z1b*cotB/rb*W1))/rb*y2+W8/W7*(1/(rb*rb*rb)*(N1*z1b*cotB+a* cosB)*y2-3*a*y3b*z1b*cotB/(rb*rb2*rb2)*y2+1/(rb*rb*rb)/W7*((y2*y2)*(cosB*cosB)- a*z1b*cotB/rb*W1)*y2+1/rb2/(W7*W7)*((y2*y2)*(cosB*cosB)-a*z1b*cotB/ rb*W1)*y2-1/rb/W7*(2*y2*(cosB*cosB)+a*z1b*cotB/(rb*rb*rb)*W1*y2-a* z1b*cotB/rb2*cosB*y2)))/M_PI/(1-nu))+ b2*(1.0/4.0*((2-2*nu)*N1*rFib_ry2*(cotB*cotB)+N1/W6*((W5-1)*cotB+y1/W6* W4)-N1*(y2*y2)/(W6*W6)*((W5-1)*cotB+y1/W6*W4)/rb+N1*y2/W6*(-a/ (rb*rb*rb)*y2*cotB-y1/(W6*W6)*W4/rb*y2-y2/W6*a/(rb*rb*rb)*y1)-N1*cotB/ W7*W9+N1*(y2*y2)*cotB/(W7*W7)*W9/rb+N1*(y2*y2)*cotB/W7*a/(rb*rb*rb)/ cosB-a*W8*cotB/(rb*rb*rb)+3*a*(y2*y2)*W8*cotB/(rb*rb2*rb2)+W8/rb/W6*(N1* cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))-(y2*y2)*W8/(rb*rb*rb)/W6* (N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))-(y2*y2)*W8/rb2/(W6*W6) *(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))+y2*W8/rb/W6* (2*nu*y1/(W6*W6)/rb*y2+a*y1/(rb*rb*rb)*(1/rb+1/W6)*y2-a*y1/rb* (-1/(rb*rb*rb)*y2-1/(W6*W6)/rb*y2))+W8*cotB/rb/W7*((-2+2*nu)*cosB+ W1/W7*W9+a*y3b/rb2/cosB)-(y2*y2)*W8*cotB/(rb*rb*rb)/W7*((-2+2*nu)* cosB+W1/W7*W9+a*y3b/rb2/cosB)-(y2*y2)*W8*cotB/rb2/(W7*W7)*((-2+ 2*nu)*cosB+W1/W7*W9+a*y3b/rb2/cosB)+y2*W8*cotB/rb/W7*(1/ rb*cosB*y2/W7*W9-W1/(W7*W7)*W9/rb*y2-W1/W7*a/(rb*rb*rb)/cosB*y2- 2*a*y3b/(rb2*rb2)/cosB*y2))/M_PI/(1-nu))+ b3*(1.0/4.0*(N1*(-sinB/rb*y2/W7+y2/(W6*W6)*(1+a/rb)/rb*y1+y2/W6* a/(rb*rb*rb)*y1-z1b/(W7*W7)*W2/rb*y2-z1b/W7*a/(rb*rb*rb)*y2)-y2*W8/ (rb*rb*rb)*(a/rb2+1/W6)*y1+y1*W8/rb*(-2*a/(rb2*rb2)*y2-1/(W6*W6)/ rb*y2)+W8/(W7*W7)*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/rb2)-1/ rb/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1))/rb*y2-W8/W7*(sinB*a/ (rb*rb*rb)*y2-z1b/(rb*rb*rb)*(1+a*y3b/rb2)*y2-2*z1b/(rb*rb2*rb2)*a*y3b*y2+ 1/(rb*rb*rb)/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1)*y2+1/rb2/(W7*W7)* ((y2*y2)*cosB*sinB-a*z1b/rb*W1)*y2-1/rb/W7*(2*y2*cosB*sinB+a* z1b/(rb*rb*rb)*W1*y2-a*z1b/rb2*cosB*y2)))/M_PI/(1-nu));

    out.z = b1*(1.0/4.0*((2-2*nu)*(N1*rFib_ry3*cotB-y2/(W6*W6)*W5*(y3b/rb+1)- 1.0/2.0*y2/W6*a/(rb*rb*rb)*2*y3b+y2*cosB/(W7*W7)*W2*W3+1.0/2.0*y2*cosB/W7* a/(rb*rb*rb)*2*y3b)+y2/rb*(2*nu/W6+a/rb2)-1.0/2.0*y2*W8/(rb*rb*rb)*(2* nu/W6+a/rb2)*2*y3b+y2*W8/rb*(-2*nu/(W6*W6)*(y3b/rb+1)-a/ (rb2*rb2)*2*y3b)+y2*cosB/rb/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)- 1.0/2.0*y2*W8*cosB/(rb*rb*rb)/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)*2* y3b-y2*W8*cosB/rb/(W7*W7)*(1-2*nu-W1/W7*W2-a*y3b/rb2)*W3+y2* W8*cosB/rb/W7*(-(cosB*y3b/rb+1)/W7*W2+W1/(W7*W7)*W2*W3+1.0/2.0* W1/W7*a/(rb*rb*rb)*2*y3b-a/rb2+a*y3b/(rb2*rb2)*2*y3b))/M_PI/(1-nu))+ b2*(1.0/4.0*((-2+2*nu)*N1*cotB*((y3b/rb+1)/W6-cosB*W3/W7)+(2-2*nu)* y1/(W6*W6)*W5*(y3b/rb+1)+1.0/2.0*(2-2*nu)*y1/W6*a/(rb*rb*rb)*2*y3b+(2- 2*nu)*sinB/W7*W2-(2-2*nu)*z1b/(W7*W7)*W2*W3-1.0/2.0*(2-2*nu)*z1b/ W7*a/(rb*rb*rb)*2*y3b+1/rb*(N1*cotB-2*nu*y1/W6-a*y1/rb2)-1.0/2.0* W8/(rb*rb*rb)*(N1*cotB-2*nu*y1/W6-a*y1/rb2)*2*y3b+W8/rb*(2*nu* y1/(W6*W6)*(y3b/rb+1)+a*y1/(rb2*rb2)*2*y3b)-1/W7*(cosB*sinB+W1* cotB/rb*((2-2*nu)*cosB-W1/W7)+a/rb*(sinB-y3b*z1b/rb2-z1b* W1/rb/W7))+W8/(W7*W7)*(cosB*sinB+W1*cotB/rb*((2-2*nu)*cosB-W1/ W7)+a/rb*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7))*W3-W8/W7*((cosB* y3b/rb+1)*cotB/rb*((2-2*nu)*cosB-W1/W7)-1.0/2.0*W1*cotB/(rb*rb*rb)* ((2-2*nu)*cosB-W1/W7)*2*y3b+W1*cotB/rb*(-(cosB*y3b/rb+1)/W7+ W1/(W7*W7)*W3)-1.0/2.0*a/(rb*rb*rb)*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7)* 2*y3b+a/rb*(-z1b/rb2-y3b*sinB/rb2+y3b*z1b/(rb2*rb2)*2*y3b- sinB*W1/rb/W7-z1b*(cosB*y3b/rb+1)/rb/W7+1.0/2.0*z1b*W1/(rb*rb*rb)/ W7*2*y3b+z1b*W1/rb/(W7*W7)*W3)))/M_PI/(1-nu))+ b3*(1.0/4.0*((2-2*nu)*rFib_ry3-(2-2*nu)*y2*sinB/(W7*W7)*W2*W3-1.0/2.0* (2-2*nu)*y2*sinB/W7*a/(rb*rb*rb)*2*y3b+y2*sinB/rb/W7*(1+W1/W7* W2+a*y3b/rb2)-1.0/2.0*y2*W8*sinB/(rb*rb*rb)/W7*(1+W1/W7*W2+a*y3b/ rb2)*2*y3b-y2*W8*sinB/rb/(W7*W7)*(1+W1/W7*W2+a*y3b/rb2)*W3+ y2*W8*sinB/rb/W7*((cosB*y3b/rb+1)/W7*W2-W1/(W7*W7)*W2*W3- 1.0/2.0*W1/W7*a/(rb*rb*rb)*2*y3b+a/rb2-a*y3b/(rb2*rb2)*2*y3b))/M_PI/(1-nu));

    out.a = b1/2*(1.0/4.0*((-2+2*nu)*N1*rFib_ry2*(cotB*cotB)+N1/W6*((1-W5)*cotB-y1/ W6*W4)-N1*(y2*y2)/(W6*W6)*((1-W5)*cotB-y1/W6*W4)/rb+N1*y2/W6* (a/(rb*rb*rb)*y2*cotB+y1/(W6*W6)*W4/rb*y2+y2/W6*a/(rb*rb*rb)*y1)+N1* cosB*cotB/W7*W2-N1*(y2*y2)*cosB*cotB/(W7*W7)*W2/rb-N1*(y2*y2)*cosB* cotB/W7*a/(rb*rb*rb)+a*W8*cotB/(rb*rb*rb)-3*a*(y2*y2)*W8*cotB/(rb*rb2*rb2)+W8/ rb/W6*(-N1*cotB+y1/W6*W5+a*y1/rb2)-(y2*y2)*W8/(rb*rb*rb)/W6*(-N1* cotB+y1/W6*W5+a*y1/rb2)-(y2*y2)*W8/rb2/(W6*W6)*(-N1*cotB+y1/ W6*W5+a*y1/rb2)+y2*W8/rb/W6*(-y1/(W6*W6)*W5/rb*y2-y2/W6* a/(rb*rb*rb)*y1-2*a*y1/(rb2*rb2)*y2)+W8/rb/W7*(cosB/W7*(W1*(N1* cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/ rb2)-(y2*y2)*W8/(rb*rb*rb)/W7*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2- 2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/rb2)-(y2*y2)*W8/rb2/ (W7*W7)*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)* cosB)-a*y3b*cosB*cotB/rb2)+y2*W8/rb/W7*(-cosB/(W7*W7)*(W1* (N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)/rb*y2+cosB/ W7*(1/rb*cosB*y2*(N1*cosB-a/rb)*cotB+W1*a/(rb*rb*rb)*y2*cotB+(2-2* nu)/rb*sinB*y2*cosB)+2*a*y3b*cosB*cotB/(rb2*rb2)*y2))/M_PI/(1-nu))+ b2/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)+nu)/rb*y2/W6-((2-2*nu)*(cotB*cotB)+1)* cosB/rb*y2/W7)-N1/(W6*W6)*(-N1*y1*cotB+nu*y3b-a+a*y1*cotB/rb+ (y1*y1)/W6*W4)/rb*y2+N1/W6*(-a*y1*cotB/(rb*rb*rb)*y2-(y1*y1)/(W6*W6) *W4/rb*y2-(y1*y1)/W6*a/(rb*rb*rb)*y2)+N1*cotB/(W7*W7)*(z1b*cosB-a* (rb*sinB-y1)/rb/cosB)/rb*y2-N1*cotB/W7*(-a/rb2*sinB*y2/ cosB+a*(rb*sinB-y1)/(rb*rb*rb)/cosB*y2)+3*a*y2*W8*cotB/(rb*rb2*rb2)*y1- W8/(W6*W6)*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6*W5-a*(y1*y1)/ (rb*rb*rb))/rb*y2+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB+a)*y2+(y1*y1)/(rb*rb2) /W6*W5*y2+(y1*y1)/rb2/(W6*W6)*W5*y2+(y1*y1)/(rb2*rb2)/W6*a*y2+3* a*(y1*y1)/(rb*rb2*rb2)*y2)-W8*cotB/(W7*W7)*(-cosB*sinB+a*y1*y3b/(rb*rb*rb)/ cosB+(rb*sinB-y1)/rb*((2-2*nu)*cosB-W1/W7*W9))/rb*y2+W8*cotB/ W7*(-3*a*y1*y3b/(rb*rb2*rb2)/cosB*y2+1/rb2*sinB*y2*((2-2*nu)*cosB- W1/W7*W9)-(rb*sinB-y1)/(rb*rb*rb)*((2-2*nu)*cosB-W1/W7*W9)*y2+(rb* sinB-y1)/rb*(-1/rb*cosB*y2/W7*W9+W1/(W7*W7)*W9/rb*y2+W1/W7* a/(rb*rb*rb)/cosB*y2)))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(1/W6*(1+a/rb)-(y2*y2)/(W6*W6)*(1+a/rb)/rb-(y2*y2)/ W6*a/(rb*rb*rb)-cosB/W7*W2+(y2*y2)*cosB/(W7*W7)*W2/rb+(y2*y2)*cosB/W7* a/(rb*rb*rb))-W8/rb*(a/rb2+1/W6)+(y2*y2)*W8/(rb*rb*rb)*(a/rb2+1/W6)- y2*W8/rb*(-2*a/(rb2*rb2)*y2-1/(W6*W6)/rb*y2)+W8*cosB/rb/W7* (W1/W7*W2+a*y3b/rb2)-(y2*y2)*W8*cosB/(rb*rb*rb)/W7*(W1/W7*W2+a* y3b/rb2)-(y2*y2)*W8*cosB/rb2/(W7*W7)*(W1/W7*W2+a*y3b/rb2)+y2* W8*cosB/rb/W7*(1/rb*cosB*y2/W7*W2-W1/(W7*W7)*W2/rb*y2-W1/ W7*a/(rb*rb*rb)*y2-2*a*y3b/(rb2*rb2)*y2))/M_PI/(1-nu))+ b1/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)-nu)/rb*y1/W6-((2-2*nu)*(cotB*cotB)+1- 2*nu)*cosB*(y1/rb-sinB)/W7)+N1/(W6*W6)*(y1*cotB*(1-W5)+nu*y3b- a+(y2*y2)/W6*W4)/rb*y1-N1/W6*((1-W5)*cotB+a*(y1*y1)*cotB/(rb*rb*rb)- (y2*y2)/(W6*W6)*W4/rb*y1-(y2*y2)/W6*a/(rb*rb*rb)*y1)-N1*cosB*cotB/W7* W2+N1*z1b*cotB/(W7*W7)*W2*(y1/rb-sinB)+N1*z1b*cotB/W7*a/(rb*rb2) *y1-a*W8*cotB/(rb*rb*rb)+3*a*(y1*y1)*W8*cotB/(rb*rb2*rb2)-W8/(W6*W6)*(-2* nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+a*(y2*y2)/(rb*rb*rb))/rb* y1+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB-a)*y1+1/rb*N1*cotB-(y2*y2)/ (rb*rb*rb)/W6*W5*y1-(y2*y2)/rb2/(W6*W6)*W5*y1-(y2*y2)/(rb2*rb2)/W6*a*y1- 3*a*(y2*y2)/(rb*rb2*rb2)*y1)-W8/(W7*W7)*((cosB*cosB)-1/rb*(N1*z1b*cotB+a* cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/rb/W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/ rb*W1))*(y1/rb-sinB)+W8/W7*(1/(rb*rb*rb)*(N1*z1b*cotB+a*cosB)* y1-1/rb*N1*cosB*cotB+a*y3b*cosB*cotB/(rb*rb*rb)-3*a*y3b*z1b*cotB/ (rb*rb2*rb2)*y1+1/(rb*rb*rb)/W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1)*y1+1/ rb/(W7*W7)*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1)*(y1/rb-sinB)-1/rb/ W7*(-a*cosB*cotB/rb*W1+a*z1b*cotB/(rb*rb*rb)*W1*y1-a*z1b*cotB/ rb2*cosB*y1)))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((2-2*nu)*N1*rFib_ry1*(cotB*cotB)-N1*y2/(W6*W6)*((W5-1)*cotB+ y1/W6*W4)/rb*y1+N1*y2/W6*(-a/(rb*rb*rb)*y1*cotB+1/W6*W4-(y1*y1) /(W6*W6)*W4/rb-(y1*y1)/W6*a/(rb*rb*rb))+N1*y2*cotB/(W7*W7)*W9*(y1/ rb-sinB)+N1*y2*cotB/W7*a/(rb*rb*rb)/cosB*y1+3*a*y2*W8*cotB/(rb*rb2*rb2) *y1-y2*W8/(rb*rb*rb)/W6*(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/ W6))*y1-y2*W8/rb2/(W6*W6)*(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/ rb+1/W6))*y1+y2*W8/rb/W6*(-2*nu/W6+2*nu*(y1*y1)/(W6*W6)/rb-a/ rb*(1/rb+1/W6)+a*(y1*y1)/(rb*rb*rb)*(1/rb+1/W6)-a*y1/rb*(-1/ (rb*rb*rb)*y1-1/(W6*W6)/rb*y1))-y2*W8*cotB/(rb*rb*rb)/W7*((-2+2*nu)* cosB+W1/W7*W9+a*y3b/rb2/cosB)*y1-y2*W8*cotB/rb/(W7*W7)*((-2+ 2*nu)*cosB+W1/W7*W9+a*y3b/rb2/cosB)*(y1/rb-sinB)+y2*W8* cotB/rb/W7*(1/rb*cosB*y1/W7*W9-W1/(W7*W7)*W9*(y1/rb-sinB)- W1/W7*a/(rb*rb*rb)/cosB*y1-2*a*y3b/(rb2*rb2)/cosB*y1))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(-sinB*(y1/rb-sinB)/W7-1/W6*(1+a/rb)+(y1*y1)/(W6*W6) *(1+a/rb)/rb+(y1*y1)/W6*a/(rb*rb*rb)+cosB/W7*W2-z1b/(W7*W7)*W2* (y1/rb-sinB)-z1b/W7*a/(rb*rb*rb)*y1)+W8/rb*(a/rb2+1/W6)-(y1*y1)* W8/(rb*rb*rb)*(a/rb2+1/W6)+y1*W8/rb*(-2*a/(rb2*rb2)*y1-1/(W6*W6)/ rb*y1)+W8/(W7*W7)*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/rb2)-1/ rb/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1))*(y1/rb-sinB)-W8/W7* (sinB*a/(rb*rb*rb)*y1+cosB/rb*(1+a*y3b/rb2)-z1b/(rb*rb*rb)*(1+a*y3b/ rb2)*y1-2*z1b/(rb*rb2*rb2)*a*y3b*y1+1/(rb*rb*rb)/W7*((y2*y2)*cosB*sinB-a* z1b/rb*W1)*y1+1/rb/(W7*W7)*((y2*y2)*cosB*sinB-a*z1b/rb*W1)* (y1/rb-sinB)-1/rb/W7*(-a*cosB/rb*W1+a*z1b/(rb*rb*rb)*W1*y1-a* z1b/rb2*cosB*y1)))/M_PI/(1-nu));

    out.b = b1/2*(1.0/4.0*((-2+2*nu)*N1*rFib_ry3*(cotB*cotB)-N1*y2/(W6*W6)*((1-W5)* cotB-y1/W6*W4)*(y3b/rb+1)+N1*y2/W6*(1.0/2.0*a/(rb*rb*rb)*2*y3b*cotB+ y1/(W6*W6)*W4*(y3b/rb+1)+1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b)-N1*y2*cosB* cotB/(W7*W7)*W2*W3-1.0/2.0*N1*y2*cosB*cotB/W7*a/(rb*rb*rb)*2*y3b+a/ (rb*rb*rb)*y2*cotB-3.0/2.0*a*y2*W8*cotB/(rb*rb2*rb2)*2*y3b+y2/rb/W6*(-N1* cotB+y1/W6*W5+a*y1/rb2)-1.0/2.0*y2*W8/(rb*rb*rb)/W6*(-N1*cotB+y1/ W6*W5+a*y1/rb2)*2*y3b-y2*W8/rb/(W6*W6)*(-N1*cotB+y1/W6*W5+ a*y1/rb2)*(y3b/rb+1)+y2*W8/rb/W6*(-y1/(W6*W6)*W5*(y3b/rb+ 1)-1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b-a*y1/(rb2*rb2)*2*y3b)+y2/rb/W7* (cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)- a*y3b*cosB*cotB/rb2)-1.0/2.0*y2*W8/(rb*rb*rb)/W7*(cosB/W7*(W1*(N1* cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/ rb2)*2*y3b-y2*W8/rb/(W7*W7)*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+ (2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/rb2)*W3+y2*W8/rb/ W7*(-cosB/(W7*W7)*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)* cosB)*W3+cosB/W7*((cosB*y3b/rb+1)*(N1*cosB-a/rb)*cotB+1.0/2.0*W1* a/(rb*rb*rb)*2*y3b*cotB+1.0/2.0*(2-2*nu)/rb*sinB*2*y3b*cosB)-a*cosB* cotB/rb2+a*y3b*cosB*cotB/(rb2*rb2)*2*y3b))/M_PI/(1-nu))+ b2/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)+nu)*(y3b/rb+1)/W6-((2-2*nu)*(cotB*cotB) +1)*cosB*W3/W7)-N1/(W6*W6)*(-N1*y1*cotB+nu*y3b-a+a*y1*cotB/ rb+(y1*y1)/W6*W4)*(y3b/rb+1)+N1/W6*(nu-1.0/2.0*a*y1*cotB/(rb*rb*rb)*2* y3b-(y1*y1)/(W6*W6)*W4*(y3b/rb+1)-1.0/2.0*(y1*y1)/W6*a/(rb*rb*rb)*2*y3b)+ N1*cotB/(W7*W7)*(z1b*cosB-a*(rb*sinB-y1)/rb/cosB)*W3-N1*cotB/ W7*(cosB*sinB-1.0/2.0*a/rb2*sinB*2*y3b/cosB+1.0/2.0*a*(rb*sinB-y1)/ (rb*rb*rb)/cosB*2*y3b)-a/(rb*rb*rb)*y1*cotB+3.0/2.0*a*y1*W8*cotB/(rb*rb2*rb2)*2* y3b+1/W6*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6*W5-a*(y1*y1)/ (rb*rb*rb))-W8/(W6*W6)*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6*W5-a* (y1*y1)/(rb*rb*rb))*(y3b/rb+1)+W8/W6*(-1.0/2.0/(rb*rb*rb)*(N1*y1*cotB+a)*2* y3b+1.0/2.0*(y1*y1)/(rb*rb*rb)/W6*W5*2*y3b+(y1*y1)/rb/(W6*W6)*W5*(y3b/rb+ 1)+1.0/2.0*(y1*y1)/(rb2*rb2)/W6*a*2*y3b+3.0/2.0*a*(y1*y1)/(rb*rb2*rb2)*2*y3b)+ cotB/W7*(-cosB*sinB+a*y1*y3b/(rb*rb*rb)/cosB+(rb*sinB-y1)/rb*((2- 2*nu)*cosB-W1/W7*W9))-W8*cotB/(W7*W7)*(-cosB*sinB+a*y1*y3b/(rb*rb2) /cosB+(rb*sinB-y1)/rb*((2-2*nu)*cosB-W1/W7*W9))*W3+W8*cotB/ W7*(a/(rb*rb*rb)/cosB*y1-3.0/2.0*a*y1*y3b/(rb*rb2*rb2)/cosB*2*y3b+1.0/2.0/ rb2*sinB*2*y3b*((2-2*nu)*cosB-W1/W7*W9)-1.0/2.0*(rb*sinB-y1)/(rb*rb2) *((2-2*nu)*cosB-W1/W7*W9)*2*y3b+(rb*sinB-y1)/rb*(-(cosB*y3b/ rb+1)/W7*W9+W1/(W7*W7)*W9*W3+1.0/2.0*W1/W7*a/(rb*rb*rb)/cosB*2* y3b)))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(-y2/(W6*W6)*(1+a/rb)*(y3b/rb+1)-1.0/2.0*y2/W6*a/ (rb*rb*rb)*2*y3b+y2*cosB/(W7*W7)*W2*W3+1.0/2.0*y2*cosB/W7*a/(rb*rb*rb)*2* y3b)-y2/rb*(a/rb2+1/W6)+1.0/2.0*y2*W8/(rb*rb*rb)*(a/rb2+1/W6)*2* y3b-y2*W8/rb*(-a/(rb2*rb2)*2*y3b-1/(W6*W6)*(y3b/rb+1))+y2*cosB/ rb/W7*(W1/W7*W2+a*y3b/rb2)-1.0/2.0*y2*W8*cosB/(rb*rb*rb)/W7*(W1/ W7*W2+a*y3b/rb2)*2*y3b-y2*W8*cosB/rb/(W7*W7)*(W1/W7*W2+a* y3b/rb2)*W3+y2*W8*cosB/rb/W7*((cosB*y3b/rb+1)/W7*W2-W1/ (W7*W7)*W2*W3-1.0/2.0*W1/W7*a/(rb*rb*rb)*2*y3b+a/rb2-a*y3b/(rb2*rb2)*2* y3b))/M_PI/(1-nu))+ b1/2.0*(1.0/4.0*((2-2*nu)*(N1*rFib_ry1*cotB-y1/(W6*W6)*W5/rb*y2-y2/W6* a/(rb*rb*rb)*y1+y2*cosB/(W7*W7)*W2*(y1/rb-sinB)+y2*cosB/W7*a/(rb*rb2) *y1)-y2*W8/(rb*rb*rb)*(2*nu/W6+a/rb2)*y1+y2*W8/rb*(-2*nu/(W6*W6) /rb*y1-2*a/(rb2*rb2)*y1)-y2*W8*cosB/(rb*rb*rb)/W7*(1-2*nu-W1/W7* W2-a*y3b/rb2)*y1-y2*W8*cosB/rb/(W7*W7)*(1-2*nu-W1/W7*W2-a* y3b/rb2)*(y1/rb-sinB)+y2*W8*cosB/rb/W7*(-1/rb*cosB*y1/W7* W2+W1/(W7*W7)*W2*(y1/rb-sinB)+W1/W7*a/(rb*rb*rb)*y1+2*a*y3b/(rb2*rb2) *y1))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((-2+2*nu)*N1*cotB*(1/rb*y1/W6-cosB*(y1/rb-sinB)/W7)- (2-2*nu)/W6*W5+(2-2*nu)*(y1*y1)/(W6*W6)*W5/rb+(2-2*nu)*(y1*y1)/W6* a/(rb*rb*rb)+(2-2*nu)*cosB/W7*W2-(2-2*nu)*z1b/(W7*W7)*W2*(y1/rb- sinB)-(2-2*nu)*z1b/W7*a/(rb*rb*rb)*y1-W8/(rb*rb*rb)*(N1*cotB-2*nu*y1/ W6-a*y1/rb2)*y1+W8/rb*(-2*nu/W6+2*nu*(y1*y1)/(W6*W6)/rb-a/rb2+ 2*a*(y1*y1)/(rb2*rb2))+W8/(W7*W7)*(cosB*sinB+W1*cotB/rb*((2-2*nu)* cosB-W1/W7)+a/rb*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7))*(y1/rb- sinB)-W8/W7*(1/rb2*cosB*y1*cotB*((2-2*nu)*cosB-W1/W7)-W1* cotB/(rb*rb*rb)*((2-2*nu)*cosB-W1/W7)*y1+W1*cotB/rb*(-1/rb*cosB* y1/W7+W1/(W7*W7)*(y1/rb-sinB))-a/(rb*rb*rb)*(sinB-y3b*z1b/rb2- z1b*W1/rb/W7)*y1+a/rb*(-y3b*cosB/rb2+2*y3b*z1b/(rb2*rb2)*y1- cosB*W1/rb/W7-z1b/rb2*cosB*y1/W7+z1b*W1/(rb*rb*rb)/W7*y1+z1b* W1/rb/(W7*W7)*(y1/rb-sinB))))/M_PI/(1-nu))+ b3/2*(1.0/4.0*((2-2*nu)*rFib_ry1-(2-2*nu)*y2*sinB/(W7*W7)*W2*(y1/rb- sinB)-(2-2*nu)*y2*sinB/W7*a/(rb*rb*rb)*y1-y2*W8*sinB/(rb*rb*rb)/W7*(1+ W1/W7*W2+a*y3b/rb2)*y1-y2*W8*sinB/rb/(W7*W7)*(1+W1/W7*W2+ a*y3b/rb2)*(y1/rb-sinB)+y2*W8*sinB/rb/W7*(1/rb*cosB*y1/ W7*W2-W1/(W7*W7)*W2*(y1/rb-sinB)-W1/W7*a/(rb*rb*rb)*y1-2*a*y3b/ (rb2*rb2)*y1))/M_PI/(1-nu));

    out.c = b1/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)-nu)*(y3b/rb+1)/W6-((2-2*nu)* (cotB*cotB)+1-2*nu)*cosB*W3/W7)+N1/(W6*W6)*(y1*cotB*(1-W5)+nu*y3b-a+ (y2*y2)/W6*W4)*(y3b/rb+1)-N1/W6*(1.0/2.0*a*y1*cotB/(rb*rb*rb)*2*y3b+ nu-(y2*y2)/(W6*W6)*W4*(y3b/rb+1)-1.0/2.0*(y2*y2)/W6*a/(rb*rb*rb)*2*y3b)-N1* sinB*cotB/W7*W2+N1*z1b*cotB/(W7*W7)*W2*W3+1.0/2.0*N1*z1b*cotB/W7* a/(rb*rb*rb)*2*y3b-a/(rb*rb*rb)*y1*cotB+3.0/2.0*a*y1*W8*cotB/(rb*rb2*rb2)*2*y3b+ 1/W6*(-2*nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+a*(y2*y2)/ (rb*rb*rb))-W8/(W6*W6)*(-2*nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+ a*(y2*y2)/(rb*rb*rb))*(y3b/rb+1)+W8/W6*(-1.0/2.0/(rb*rb*rb)*(N1*y1*cotB-a)* 2*y3b-1.0/2.0*(y2*y2)/(rb*rb*rb)/W6*W5*2*y3b-(y2*y2)/rb/(W6*W6)*W5*(y3b/ rb+1)-1.0/2.0*(y2*y2)/(rb2*rb2)/W6*a*2*y3b-3.0/2.0*a*(y2*y2)/(rb*rb2*rb2)*2*y3b)+ 1/W7*((cosB*cosB)-1/rb*(N1*z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb2) -1/rb/W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1))-W8/(W7*W7)*((cosB*cosB)- 1/rb*(N1*z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/rb/W7* ((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1))*W3+W8/W7*(1.0/2.0/(rb*rb*rb)*(N1* z1b*cotB+a*cosB)*2*y3b-1/rb*N1*sinB*cotB+a*z1b*cotB/(rb*rb*rb)+a* y3b*sinB*cotB/(rb*rb*rb)-3.0/2.0*a*y3b*z1b*cotB/(rb*rb2*rb2)*2*y3b+1.0/2.0/(rb*rb2) /W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1)*2*y3b+1/rb/(W7*W7)*((y2*y2) *(cosB*cosB)-a*z1b*cotB/rb*W1)*W3-1/rb/W7*(-a*sinB*cotB/rb*W1+ 1.0/2.0*a*z1b*cotB/(rb*rb*rb)*W1*2*y3b-a*z1b*cotB/rb*(cosB*y3b/rb+ 1))))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((2-2*nu)*N1*rFib_ry3*(cotB*cotB)-N1*y2/(W6*W6)*((W5-1)*cotB+ y1/W6*W4)*(y3b/rb+1)+N1*y2/W6*(-1.0/2.0*a/(rb*rb*rb)*2*y3b*cotB-y1/ (W6*W6)*W4*(y3b/rb+1)-1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b)+N1*y2*cotB/ (W7*W7)*W9*W3+1.0/2.0*N1*y2*cotB/W7*a/(rb*rb*rb)/cosB*2*y3b-a/(rb*rb*rb)* y2*cotB+3.0/2.0*a*y2*W8*cotB/(rb*rb2*rb2)*2*y3b+y2/rb/W6*(N1*cotB-2* nu*y1/W6-a*y1/rb*(1/rb+1/W6))-1.0/2.0*y2*W8/(rb*rb*rb)/W6*(N1* cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))*2*y3b-y2*W8/rb/(W6*W6) *(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))*(y3b/rb+1)+y2* W8/rb/W6*(2*nu*y1/(W6*W6)*(y3b/rb+1)+1.0/2.0*a*y1/(rb*rb*rb)*(1/rb+ 1/W6)*2*y3b-a*y1/rb*(-1.0/2.0/(rb*rb*rb)*2*y3b-1/(W6*W6)*(y3b/rb+ 1)))+y2*cotB/rb/W7*((-2+2*nu)*cosB+W1/W7*W9+a*y3b/rb2/cosB)- 1.0/2.0*y2*W8*cotB/(rb*rb*rb)/W7*((-2+2*nu)*cosB+W1/W7*W9+a*y3b/ rb2/cosB)*2*y3b-y2*W8*cotB/rb/(W7*W7)*((-2+2*nu)*cosB+W1/W7* W9+a*y3b/rb2/cosB)*W3+y2*W8*cotB/rb/W7*((cosB*y3b/rb+1)/ W7*W9-W1/(W7*W7)*W9*W3-1.0/2.0*W1/W7*a/(rb*rb*rb)/cosB*2*y3b+a/rb2/ cosB-a*y3b/(rb2*rb2)/cosB*2*y3b))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(-sinB*W3/W7+y1/(W6*W6)*(1+a/rb)*(y3b/rb+1)+ 1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b+sinB/W7*W2-z1b/(W7*W7)*W2*W3-1.0/2.0* z1b/W7*a/(rb*rb*rb)*2*y3b)+y1/rb*(a/rb2+1/W6)-1.0/2.0*y1*W8/(rb*rb2) *(a/rb2+1/W6)*2*y3b+y1*W8/rb*(-a/(rb2*rb2)*2*y3b-1/(W6*W6)* (y3b/rb+1))-1/W7*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/rb2)-1/ rb/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1))+W8/(W7*W7)*(sinB*(cosB- a/rb)+z1b/rb*(1+a*y3b/rb2)-1/rb/W7*((y2*y2)*cosB*sinB-a*z1b/ rb*W1))*W3-W8/W7*(1.0/2.0*sinB*a/(rb*rb*rb)*2*y3b+sinB/rb*(1+a*y3b/ rb2)-1.0/2.0*z1b/(rb*rb*rb)*(1+a*y3b/rb2)*2*y3b+z1b/rb*(a/rb2-a* y3b/(rb2*rb2)*2*y3b)+1.0/2.0/(rb*rb*rb)/W7*((y2*y2)*cosB*sinB-a*z1b/rb* W1)*2*y3b+1/rb/(W7*W7)*((y2*y2)*cosB*sinB-a*z1b/rb*W1)*W3-1/ rb/W7*(-a*sinB/rb*W1+1.0/2.0*a*z1b/(rb*rb*rb)*W1*2*y3b-a*z1b/rb* (cosB*y3b/rb+1))))/M_PI/(1-nu))+ b1/2.0*(1.0/4.0*((2-2*nu)*(N1*rFib_ry2*cotB+1/W6*W5-(y2*y2)/(W6*W6)*W5/ rb-(y2*y2)/W6*a/(rb*rb*rb)-cosB/W7*W2+(y2*y2)*cosB/(W7*W7)*W2/rb+(y2*y2)* cosB/W7*a/(rb*rb*rb))+W8/rb*(2*nu/W6+a/rb2)-(y2*y2)*W8/(rb*rb*rb)*(2* nu/W6+a/rb2)+y2*W8/rb*(-2*nu/(W6*W6)/rb*y2-2*a/(rb2*rb2)*y2)+ W8*cosB/rb/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)-(y2*y2)*W8*cosB/ (rb*rb*rb)/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)-(y2*y2)*W8*cosB/rb2/(W7*W7) *(1-2*nu-W1/W7*W2-a*y3b/rb2)+y2*W8*cosB/rb/W7*(-1/rb* cosB*y2/W7*W2+W1/(W7*W7)*W2/rb*y2+W1/W7*a/(rb*rb*rb)*y2+2*a* y3b/(rb2*rb2)*y2))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((-2+2*nu)*N1*cotB*(1/rb*y2/W6-cosB/rb*y2/W7)+(2- 2*nu)*y1/(W6*W6)*W5/rb*y2+(2-2*nu)*y1/W6*a/(rb*rb*rb)*y2-(2-2* nu)*z1b/(W7*W7)*W2/rb*y2-(2-2*nu)*z1b/W7*a/(rb*rb*rb)*y2-W8/(rb*rb2) *(N1*cotB-2*nu*y1/W6-a*y1/rb2)*y2+W8/rb*(2*nu*y1/(W6*W6)/ rb*y2+2*a*y1/(rb2*rb2)*y2)+W8/(W7*W7)*(cosB*sinB+W1*cotB/rb*((2- 2*nu)*cosB-W1/W7)+a/rb*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7))/ rb*y2-W8/W7*(1/rb2*cosB*y2*cotB*((2-2*nu)*cosB-W1/W7)-W1* cotB/(rb*rb*rb)*((2-2*nu)*cosB-W1/W7)*y2+W1*cotB/rb*(-cosB/rb* y2/W7+W1/(W7*W7)/rb*y2)-a/(rb*rb*rb)*(sinB-y3b*z1b/rb2-z1b*W1/ rb/W7)*y2+a/rb*(2*y3b*z1b/(rb2*rb2)*y2-z1b/rb2*cosB*y2/W7+ z1b*W1/(rb*rb*rb)/W7*y2+z1b*W1/rb2/(W7*W7)*y2)))/M_PI/(1-nu))+ b3/2*(1.0/4.0*((2-2*nu)*rFib_ry2+(2-2*nu)*sinB/W7*W2-(2-2*nu)*(y2*y2)* sinB/(W7*W7)*W2/rb-(2-2*nu)*(y2*y2)*sinB/W7*a/(rb*rb*rb)+W8*sinB/rb/ W7*(1+W1/W7*W2+a*y3b/rb2)-(y2*y2)*W8*sinB/(rb*rb*rb)/W7*(1+W1/ W7*W2+a*y3b/rb2)-(y2*y2)*W8*sinB/rb2/(W7*W7)*(1+W1/W7*W2+a* y3b/rb2)+y2*W8*sinB/rb/W7*(1/rb*cosB*y2/W7*W2-W1/(W7*W7)* W2/rb*y2-W1/W7*a/(rb*rb*rb)*y2-2*a*y3b/(rb2*rb2)*y2))/M_PI/(1-nu));

    return out;
}

WITHIN_KERNEL Real3 AngSetupFSC(Real3 obs, Real3 slip, Real3 PA, Real3 PB, Real nu) {
    Real3 SideVec = sub3(PB, PA);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real beta = acos(-dot3(normalize3(SideVec), eZ));
    if (fabs(beta) < EPS || fabs(M_PI-beta) < EPS) {
        return make3(0.0f, 0.0f, 0.0f);
    }
    Real3 ey1 = SideVec;
    ey1.z = 0;
    ey1 = normalize3(ey1);

    Real3 ey3 = negate3(eZ);
    Real3 ey2 = cross3(ey3, ey1);

    Real3 yA = transform3(ey1, ey2, ey3, sub3(obs, PA));
    Real3 yAB = transform3(ey1, ey2, ey3, SideVec);
    Real3 yB = sub3(yA, yAB);

    Real3 slip_adcs = transform3(ey1, ey2, ey3, slip);

    Real configuration = beta;
    if (beta*yA.x >= 0) {
        configuration = -M_PI+beta;
    }

    Real3 vA = AngDisDispFSC(
        yA.x, yA.y, yA.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PA.z
    );
    Real3 vB = AngDisDispFSC(
        yB.x, yB.y, yB.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PB.z
    );

    Real3 v = sub3(vB, vA);
    return inv_transform3(ey1, ey2, ey3, v);
}

WITHIN_KERNEL Real6 AngSetupFSC_S(Real3 obs, Real3 slip, Real3 PA, Real3 PB, Real nu) {
    Real3 SideVec = sub3(PB, PA);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real beta = acos(-dot3(normalize3(SideVec), eZ));
    if (fabs(beta) < EPS || fabs(M_PI-beta) < EPS) {
        return make6(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
    }
    Real3 ey1 = SideVec;
    ey1.z = 0;
    ey1 = normalize3(ey1);

    Real3 ey3 = negate3(eZ);
    Real3 ey2 = cross3(ey3, ey1);

    Real3 yA = transform3(ey1, ey2, ey3, sub3(obs, PA));
    Real3 yAB = transform3(ey1, ey2, ey3, SideVec);
    Real3 yB = sub3(yA, yAB);

    Real3 slip_adcs = transform3(ey1, ey2, ey3, slip);

    Real configuration = beta;
    if (beta*yA.x >= 0) {
        configuration = -M_PI+beta;
    }

    Real6 vA = AngDisStrainFSC(
        yA.x, yA.y, yA.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PA.z
    );
    Real6 vB = AngDisStrainFSC(
        yB.x, yB.y, yB.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PB.z
    );
    Real6 v = sub6(vB, vA);
    return tensor_transform3(ey1, ey2, ey3, v);
}

#endif





KERNEL
void pairs_disp_fs(GLOBAL_MEM Real* results, int n_pairs, 
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM Real* slips, Real nu)
{
    int i = get_global_id(0);
    if (i >= n_pairs) {
        return;
    }
    Real3 obs;
        Real3 tri0;
        obs.x = obs_pts[i * 3 + 0];
            tri0.x = tris[i * 9 + 0 * 3 + 0];
            tri0.y = tris[i * 9 + 0 * 3 + 1];
            tri0.z = tris[i * 9 + 0 * 3 + 2];
        Real3 tri1;
        obs.y = obs_pts[i * 3 + 1];
            tri1.x = tris[i * 9 + 1 * 3 + 0];
            tri1.y = tris[i * 9 + 1 * 3 + 1];
            tri1.z = tris[i * 9 + 1 * 3 + 2];
        Real3 tri2;
        obs.z = obs_pts[i * 3 + 2];
            tri2.x = tris[i * 9 + 2 * 3 + 0];
            tri2.y = tris[i * 9 + 2 * 3 + 1];
            tri2.z = tris[i * 9 + 2 * 3 + 2];

    Real3 slip = make3(
        slips[i * 3 + 2],
        slips[i * 3 + 0],
        slips[i * 3 + 1]
    );

    
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        results[i * 3 + 0] = full_out.x;
        results[i * 3 + 1] = full_out.y;
        results[i * 3 + 2] = full_out.z;
}


KERNEL
void pairs_disp_hs(GLOBAL_MEM Real* results, int n_pairs, 
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM Real* slips, Real nu)
{
    int i = get_global_id(0);
    if (i >= n_pairs) {
        return;
    }
    Real3 obs;
        Real3 tri0;
        obs.x = obs_pts[i * 3 + 0];
            tri0.x = tris[i * 9 + 0 * 3 + 0];
            tri0.y = tris[i * 9 + 0 * 3 + 1];
            tri0.z = tris[i * 9 + 0 * 3 + 2];
        Real3 tri1;
        obs.y = obs_pts[i * 3 + 1];
            tri1.x = tris[i * 9 + 1 * 3 + 0];
            tri1.y = tris[i * 9 + 1 * 3 + 1];
            tri1.z = tris[i * 9 + 1 * 3 + 2];
        Real3 tri2;
        obs.z = obs_pts[i * 3 + 2];
            tri2.x = tris[i * 9 + 2 * 3 + 0];
            tri2.y = tris[i * 9 + 2 * 3 + 1];
            tri2.z = tris[i * 9 + 2 * 3 + 2];

    Real3 slip = make3(
        slips[i * 3 + 2],
        slips[i * 3 + 0],
        slips[i * 3 + 1]
    );

    
    Real3 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real3 uvw0 = AngSetupFSC(obs, efcs_slip, tri0, tri1, nu);
        Real3 uvw1 = AngSetupFSC(obs, efcs_slip, tri1, tri2, nu);
        Real3 uvw2 = AngSetupFSC(obs, efcs_slip, tri2, tri0, nu);
        Real3 fsc_term = add3(add3(uvw0, uvw1), uvw2);

        summed_terms = add3(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add3(summed_terms, full_out);
    }
    Real3 full_out = summed_terms;


        results[i * 3 + 0] = full_out.x;
        results[i * 3 + 1] = full_out.y;
        results[i * 3 + 2] = full_out.z;
}


KERNEL
void pairs_strain_fs(GLOBAL_MEM Real* results, int n_pairs, 
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM Real* slips, Real nu)
{
    int i = get_global_id(0);
    if (i >= n_pairs) {
        return;
    }
    Real3 obs;
        Real3 tri0;
        obs.x = obs_pts[i * 3 + 0];
            tri0.x = tris[i * 9 + 0 * 3 + 0];
            tri0.y = tris[i * 9 + 0 * 3 + 1];
            tri0.z = tris[i * 9 + 0 * 3 + 2];
        Real3 tri1;
        obs.y = obs_pts[i * 3 + 1];
            tri1.x = tris[i * 9 + 1 * 3 + 0];
            tri1.y = tris[i * 9 + 1 * 3 + 1];
            tri1.z = tris[i * 9 + 1 * 3 + 2];
        Real3 tri2;
        obs.z = obs_pts[i * 3 + 2];
            tri2.x = tris[i * 9 + 2 * 3 + 0];
            tri2.y = tris[i * 9 + 2 * 3 + 1];
            tri2.z = tris[i * 9 + 2 * 3 + 2];

    Real3 slip = make3(
        slips[i * 3 + 2],
        slips[i * 3 + 0],
        slips[i * 3 + 1]
    );

    
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        results[i * 6 + 0] = full_out.x;
        results[i * 6 + 1] = full_out.y;
        results[i * 6 + 2] = full_out.z;
        results[i * 6 + 3] = full_out.a;
        results[i * 6 + 4] = full_out.b;
        results[i * 6 + 5] = full_out.c;
}


KERNEL
void pairs_strain_hs(GLOBAL_MEM Real* results, int n_pairs, 
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM Real* slips, Real nu)
{
    int i = get_global_id(0);
    if (i >= n_pairs) {
        return;
    }
    Real3 obs;
        Real3 tri0;
        obs.x = obs_pts[i * 3 + 0];
            tri0.x = tris[i * 9 + 0 * 3 + 0];
            tri0.y = tris[i * 9 + 0 * 3 + 1];
            tri0.z = tris[i * 9 + 0 * 3 + 2];
        Real3 tri1;
        obs.y = obs_pts[i * 3 + 1];
            tri1.x = tris[i * 9 + 1 * 3 + 0];
            tri1.y = tris[i * 9 + 1 * 3 + 1];
            tri1.z = tris[i * 9 + 1 * 3 + 2];
        Real3 tri2;
        obs.z = obs_pts[i * 3 + 2];
            tri2.x = tris[i * 9 + 2 * 3 + 0];
            tri2.y = tris[i * 9 + 2 * 3 + 1];
            tri2.z = tris[i * 9 + 2 * 3 + 2];

    Real3 slip = make3(
        slips[i * 3 + 2],
        slips[i * 3 + 0],
        slips[i * 3 + 1]
    );

    
    Real6 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real6 uvw0 = AngSetupFSC_S(obs, efcs_slip, tri0, tri1, nu);
        Real6 uvw1 = AngSetupFSC_S(obs, efcs_slip, tri1, tri2, nu);
        Real6 uvw2 = AngSetupFSC_S(obs, efcs_slip, tri2, tri0, nu);
        Real6 fsc_term = add6(add6(uvw0, uvw1), uvw2);

        summed_terms = add6(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add6(summed_terms, full_out);
    }
    Real6 full_out = summed_terms;


        results[i * 6 + 0] = full_out.x;
        results[i * 6 + 1] = full_out.y;
        results[i * 6 + 2] = full_out.z;
        results[i * 6 + 3] = full_out.a;
        results[i * 6 + 4] = full_out.b;
        results[i * 6 + 5] = full_out.c;
}







#ifndef CUTDE_COMMON
#define CUTDE_COMMON



#define Real double
#define EPS 2.220446049250313e-16

#ifndef M_PI
  #define M_PI   3.14159265358979323846264338327950288
#endif

typedef struct Real2 {
    Real x;
    Real y;
} Real2;

typedef struct Real3 {
    Real x;
    Real y;
    Real z;
} Real3;

typedef struct Real6 {
    Real x;
    Real y;
    Real z;
    Real a;
    Real b;
    Real c;
} Real6;

WITHIN_KERNEL void print(Real x) {
    printf("%f \n", x);
}

    

WITHIN_KERNEL Real2 add2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x + b.x;
        
        out.y = a.y + b.y;
    return out;
}

    

WITHIN_KERNEL Real2 sub2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x - b.x;
        
        out.y = a.y - b.y;
    return out;
}

    

WITHIN_KERNEL Real2 mul2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x * b.x;
        
        out.y = a.y * b.y;
    return out;
}

    

WITHIN_KERNEL Real2 div2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x / b.x;
        
        out.y = a.y / b.y;
    return out;
}


    

WITHIN_KERNEL Real2 add_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x + b;
        
        out.y = a.y + b;
    return out;
}

    

WITHIN_KERNEL Real2 sub_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x - b;
        
        out.y = a.y - b;
    return out;
}

    

WITHIN_KERNEL Real2 mul_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x * b;
        
        out.y = a.y * b;
    return out;
}

    

WITHIN_KERNEL Real2 div_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x / b;
        
        out.y = a.y / b;
    return out;
}


    WITHIN_KERNEL void print2(Real2 x) {
        
        printf("%f %f  \n", x.x,x.y);
    }

    WITHIN_KERNEL Real sum2(Real2 x) {
        Real out = 0.0;
            out += x.x;
            out += x.y;
        return out;
    }

    WITHIN_KERNEL Real dot2(Real2 x, Real2 y) {
        return sum2(mul2(x, y));
    }

    WITHIN_KERNEL Real length2(Real2 x) {
        return sqrt(dot2(x, x));
    }

    WITHIN_KERNEL Real2 negate2(Real2 x) {
        return mul_scalar2(x, -1);
    }

    WITHIN_KERNEL Real2 normalize2(Real2 x) {
        return div_scalar2(x, length2(x));
    }

    WITHIN_KERNEL Real2 make2(
        Real x
            ,
        Real y
    ) {
        Real2 out;
            out.x = x;
            out.y = y;
        return out;
    }
    

WITHIN_KERNEL Real3 add3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x + b.x;
        
        out.y = a.y + b.y;
        
        out.z = a.z + b.z;
    return out;
}

    

WITHIN_KERNEL Real3 sub3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x - b.x;
        
        out.y = a.y - b.y;
        
        out.z = a.z - b.z;
    return out;
}

    

WITHIN_KERNEL Real3 mul3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x * b.x;
        
        out.y = a.y * b.y;
        
        out.z = a.z * b.z;
    return out;
}

    

WITHIN_KERNEL Real3 div3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x / b.x;
        
        out.y = a.y / b.y;
        
        out.z = a.z / b.z;
    return out;
}


    

WITHIN_KERNEL Real3 add_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x + b;
        
        out.y = a.y + b;
        
        out.z = a.z + b;
    return out;
}

    

WITHIN_KERNEL Real3 sub_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x - b;
        
        out.y = a.y - b;
        
        out.z = a.z - b;
    return out;
}

    

WITHIN_KERNEL Real3 mul_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x * b;
        
        out.y = a.y * b;
        
        out.z = a.z * b;
    return out;
}

    

WITHIN_KERNEL Real3 div_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x / b;
        
        out.y = a.y / b;
        
        out.z = a.z / b;
    return out;
}


    WITHIN_KERNEL void print3(Real3 x) {
        
        printf("%f %f %f  \n", x.x,x.y,x.z);
    }

    WITHIN_KERNEL Real sum3(Real3 x) {
        Real out = 0.0;
            out += x.x;
            out += x.y;
            out += x.z;
        return out;
    }

    WITHIN_KERNEL Real dot3(Real3 x, Real3 y) {
        return sum3(mul3(x, y));
    }

    WITHIN_KERNEL Real length3(Real3 x) {
        return sqrt(dot3(x, x));
    }

    WITHIN_KERNEL Real3 negate3(Real3 x) {
        return mul_scalar3(x, -1);
    }

    WITHIN_KERNEL Real3 normalize3(Real3 x) {
        return div_scalar3(x, length3(x));
    }

    WITHIN_KERNEL Real3 make3(
        Real x
            ,
        Real y
            ,
        Real z
    ) {
        Real3 out;
            out.x = x;
            out.y = y;
            out.z = z;
        return out;
    }
    

WITHIN_KERNEL Real6 add6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x + b.x;
        
        out.y = a.y + b.y;
        
        out.z = a.z + b.z;
        
        out.a = a.a + b.a;
        
        out.b = a.b + b.b;
        
        out.c = a.c + b.c;
    return out;
}

    

WITHIN_KERNEL Real6 sub6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x - b.x;
        
        out.y = a.y - b.y;
        
        out.z = a.z - b.z;
        
        out.a = a.a - b.a;
        
        out.b = a.b - b.b;
        
        out.c = a.c - b.c;
    return out;
}

    

WITHIN_KERNEL Real6 mul6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x * b.x;
        
        out.y = a.y * b.y;
        
        out.z = a.z * b.z;
        
        out.a = a.a * b.a;
        
        out.b = a.b * b.b;
        
        out.c = a.c * b.c;
    return out;
}

    

WITHIN_KERNEL Real6 div6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x / b.x;
        
        out.y = a.y / b.y;
        
        out.z = a.z / b.z;
        
        out.a = a.a / b.a;
        
        out.b = a.b / b.b;
        
        out.c = a.c / b.c;
    return out;
}


    

WITHIN_KERNEL Real6 add_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x + b;
        
        out.y = a.y + b;
        
        out.z = a.z + b;
        
        out.a = a.a + b;
        
        out.b = a.b + b;
        
        out.c = a.c + b;
    return out;
}

    

WITHIN_KERNEL Real6 sub_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x - b;
        
        out.y = a.y - b;
        
        out.z = a.z - b;
        
        out.a = a.a - b;
        
        out.b = a.b - b;
        
        out.c = a.c - b;
    return out;
}

    

WITHIN_KERNEL Real6 mul_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x * b;
        
        out.y = a.y * b;
        
        out.z = a.z * b;
        
        out.a = a.a * b;
        
        out.b = a.b * b;
        
        out.c = a.c * b;
    return out;
}

    

WITHIN_KERNEL Real6 div_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x / b;
        
        out.y = a.y / b;
        
        out.z = a.z / b;
        
        out.a = a.a / b;
        
        out.b = a.b / b;
        
        out.c = a.c / b;
    return out;
}


    WITHIN_KERNEL void print6(Real6 x) {
        
        printf("%f %f %f %f %f %f  \n", x.x,x.y,x.z,x.a,x.b,x.c);
    }

    WITHIN_KERNEL Real sum6(Real6 x) {
        Real out = 0.0;
            out += x.x;
            out += x.y;
            out += x.z;
            out += x.a;
            out += x.b;
            out += x.c;
        return out;
    }

    WITHIN_KERNEL Real dot6(Real6 x, Real6 y) {
        return sum6(mul6(x, y));
    }

    WITHIN_KERNEL Real length6(Real6 x) {
        return sqrt(dot6(x, x));
    }

    WITHIN_KERNEL Real6 negate6(Real6 x) {
        return mul_scalar6(x, -1);
    }

    WITHIN_KERNEL Real6 normalize6(Real6 x) {
        return div_scalar6(x, length6(x));
    }

    WITHIN_KERNEL Real6 make6(
        Real x
            ,
        Real y
            ,
        Real z
            ,
        Real a
            ,
        Real b
            ,
        Real c
    ) {
        Real6 out;
            out.x = x;
            out.y = y;
            out.z = z;
            out.a = a;
            out.b = b;
            out.c = c;
        return out;
    }


WITHIN_KERNEL Real2 transform2(Real2 a, Real2 b, Real2 v) {
    Real2 out;
    out.x = dot2(a,v);
    out.y = dot2(b,v);
    return out;
}

WITHIN_KERNEL Real2 inv_transform2(Real2 a, Real2 b, Real2 v) {
    Real2 out;
    out.x = a.x * v.x + b.x * v.y;
    out.y = a.y * v.x + b.y * v.y;
    return out;
}

WITHIN_KERNEL Real3 transform3(Real3 a, Real3 b, Real3 c, Real3 v) {
    Real3 out;
    out.x = dot3(a,v);
    out.y = dot3(b,v);
    out.z = dot3(c,v);
    return out;
}

WITHIN_KERNEL Real3 inv_transform3(Real3 a, Real3 b, Real3 c, Real3 v) {
    Real3 out;
    out.x = a.x * v.x + b.x * v.y + c.x * v.z;
    out.y = a.y * v.x + b.y * v.y + c.y * v.z;
    out.z = a.z * v.x + b.z * v.y + c.z * v.z;
    return out;
}

WITHIN_KERNEL Real3 cross3(Real3 x, Real3 y) {
    Real3 out;
    out.x = x.y * y.z - x.z * y.y;
    out.y = x.z * y.x - x.x * y.z;
    out.z = x.x * y.y - x.y * y.x;
    return out;
}

WITHIN_KERNEL Real6 tensor_transform3(Real3 a, Real3 b, Real3 c, Real6 tensor) {
    Real A[9];
    A[0] = a.x; A[1] = a.y; A[2] = a.z;
    A[3] = b.x; A[4] = b.y; A[5] = b.z;
    A[6] = c.x; A[7] = c.y; A[8] = c.z;
    Real6 out;
    out.x = A[0]*A[0]*tensor.x+2*A[0]*A[3]*tensor.a+2*A[0]*A[6]*tensor.b+
        2*A[3]*A[6]*tensor.c+ A[3]*A[3]*tensor.y+A[6]*A[6]*tensor.z;
    out.y = A[1]*A[1]*tensor.x+2*A[1]*A[4]*tensor.a+2*A[1]*A[7]*tensor.b+
        2*A[4]*A[7]*tensor.c+A[4]*A[4]*tensor.y+A[7]*A[7]*tensor.z;
    out.z = A[2]*A[2]*tensor.x+2*A[2]*A[5]*tensor.a+2*A[2]*A[8]*tensor.b+
        2*A[5]*A[8]*tensor.c+A[5]*A[5]*tensor.y+A[8]*A[8]*tensor.z;
    out.a = A[0]*A[1]*tensor.x+(A[0]*A[4]+A[1]*A[3])*tensor.a+(A[0]*A[7]+
        A[1]*A[6])*tensor.b+(A[7]*A[3]+A[6]*A[4])*tensor.c+A[4]*A[3]*tensor.y+
        A[6]*A[7]*tensor.z;
    out.b = A[0]*A[2]*tensor.x+(A[0]*A[5]+A[2]*A[3])*tensor.a+(A[0]*A[8]+
        A[2]*A[6])*tensor.b+(A[8]*A[3]+A[6]*A[5])*tensor.c+A[5]*A[3]*tensor.y+
        A[6]*A[8]*tensor.z;
    out.c = A[1]*A[2]*tensor.x+(A[2]*A[4]+A[1]*A[5])*tensor.a+(A[2]*A[7]+
        A[1]*A[8])*tensor.b+(A[7]*A[5]+A[8]*A[4])*tensor.c+A[4]*A[5]*tensor.y+
        A[7]*A[8]*tensor.z;
    return out;
}

WITHIN_KERNEL int trimodefinder(Real3 obs, Real3 tri0, Real3 tri1, Real3 tri2) {
    // trimodefinder calculates the normalized barycentric coordinates of
    // the points with respect to the TD vertices and specifies the appropriate
    // artefact-free configuration of the angular dislocations for the
    // calculations. The input matrices x, y and z share the same size and
    // correspond to the y, z and x coordinates in the TDCS, respectively. p1,
    // p2 and p3 are two-component matrices representing the y and z coordinates
    // of the TD vertices in the TDCS, respectively.
    // The components of the output (trimode) corresponding to each calculation
    // points, are 1 for the first configuration, -1 for the second
    // configuration and 0 for the calculation point that lie on the TD sides.

    Real a = ((tri1.z-tri2.z)*(obs.y-tri2.y)+(tri2.y-tri1.y)*(obs.z-tri2.z))/
        ((tri1.z-tri2.z)*(tri0.y-tri2.y)+(tri2.y-tri1.y)*(tri0.z-tri2.z));
    Real b = ((tri2.z-tri0.z)*(obs.y-tri2.y)+(tri0.y-tri2.y)*(obs.z-tri2.z))/
        ((tri1.z-tri2.z)*(tri0.y-tri2.y)+(tri2.y-tri1.y)*(tri0.z-tri2.z));
    Real c = 1-a-b;

    int result = 1;
    if ((a<=0 && b>c && c>a) ||
            (b<=0 && c>a && a>b) ||
            (c<=0 && a>b && b>c)) {
        result = -1;
    }

    if ((a==0 && b>=0 && c>=0) ||
            (a>=0 && b==0 && c>=0) ||
            (a>=0 && b>=0 && c==0)) {
        result = 0;
    }
    if (result == 0 && obs.x != 0) {
        result = 1;
    }

    return result;
}

WITHIN_KERNEL Real3 AngDisDisp(Real x, Real y, Real z, Real alpha, Real bx, Real by, Real bz, Real nu) {

    Real cosA = cos(alpha);
    Real sinA = sin(alpha);
    Real eta = y*cosA-z*sinA;
    Real zeta = y*sinA+z*cosA;
    Real r = sqrt(x * x + y * y + z * z);

    Real ux = bx/8/M_PI/(1-nu)*(x*y/r/(r-z)-x*eta/r/(r-zeta));
    Real vx = bx/8/M_PI/(1-nu)*(eta*sinA/(r-zeta)-y*eta/r/(r-zeta)+        y*y/r/(r-z)+(1-2*nu)*(cosA*log(r-zeta)-log(r-z)));
    Real wx = bx/8/M_PI/(1-nu)*(eta*cosA/(r-zeta)-y/r-eta*z/r/(r-zeta)-        (1-2*nu)*sinA*log(r-zeta));

    Real uy = by/8/M_PI/(1-nu)*(x*x*cosA/r/(r-zeta)-x*x/r/(r-z)-         (1-2*nu)*(cosA*log(r-zeta)-log(r-z)));
    Real vy = by*x/8/M_PI/(1-nu)*(y*cosA/r/(r-zeta)-sinA*cosA/(r-zeta)-y/r/(r-z));
    Real wy = by*x/8/M_PI/(1-nu)*(z*cosA/r/(r-zeta)-cosA*cosA/(r-zeta)+1/r);

    Real uz = bz*sinA/8/M_PI/(1-nu)*((1-2*nu)*log(r-zeta)-x*x/r/(r-zeta));
    Real vz = bz*x*sinA/8/M_PI/(1-nu)*(sinA/(r-zeta)-y/r/(r-zeta));
    Real wz = bz*x*sinA/8/M_PI/(1-nu)*(cosA/(r-zeta)-z/r/(r-zeta));
    return make3(ux+uy+uz, vx+vy+vz, wx+wy+wz);
}

WITHIN_KERNEL Real3 TDSetupD(Real3 obs, Real alpha, Real3 slip, Real nu, Real3 TriVertex, Real3 SideVec) {
    // TDSetupD transforms coordinates of the calculation points as well as
    // slip vector components from ADCS into TDCS. It then calculates the
    // displacements in ADCS and transforms them into TDCS.

    Real2 A1 = make2(SideVec.z, -SideVec.y);
    Real2 A2 = make2(SideVec.y, SideVec.z);
    Real2 r1 = transform2(A1, A2, make2(obs.y - TriVertex.y, obs.z - TriVertex.z));
    Real y1 = r1.x;
    Real z1 = r1.y;

    Real2 r2 = transform2(A1, A2, make2(slip.y, slip.z));
    Real by1 = r2.x;
    Real bz1 = r2.y;

    Real3 uvw = AngDisDisp(obs.x, y1, z1, -M_PI + alpha, slip.x, by1, bz1, nu);

    Real2 r3 = inv_transform2(A1, A2, make2(uvw.y, uvw.z));
    Real v = r3.x;
    Real w = r3.y;
    return make3(uvw.x, v, w);
}

WITHIN_KERNEL Real6 AngDisStrain(Real x, Real y, Real z, Real alpha, Real bx, Real by, Real bz, Real nu) {
    // AngDisStrain calculates the strains associated with an angular 
    // dislocation in an elastic full-space.

    Real cosA = cos(alpha);
    Real sinA = sin(alpha);
    Real eta = y*cosA-z*sinA;
    Real zeta = y*sinA+z*cosA;

    Real x2 = x*x;
    Real y2 = y*y;
    Real z2 = z*z;
    Real r2 = x2+y2+z2;
    Real r = sqrt(r2);
    Real r3 = r*r2;
    Real rz = r*(r-z);
    Real rmz = (r-z);
    Real r2z2 = r2*rmz*rmz;
    Real r3z = r3*rmz;

    Real W = zeta-r;
    Real W2 = W*W;
    Real Wr = W*r;
    Real W2r = W2*r;
    Real Wr3 = W*r3;
    Real W2r2 = W2*r2;

    Real C = (r*cosA-z)/Wr;
    Real S = (r*sinA-y)/Wr;

    // Partial derivatives of the Burgers' function
    Real rFi_rx = (eta/r/(r-zeta)-y/r/(r-z))/4/M_PI;
    Real rFi_ry = (x/r/(r-z)-cosA*x/r/(r-zeta))/4/M_PI;
    Real rFi_rz = (sinA*x/r/(r-zeta))/4/M_PI;

    Real6 out;
    out.x = bx*(rFi_rx)+
        bx/8/M_PI/(1-nu)*(eta/Wr+eta*x2/W2r2-eta*x2/Wr3+y/rz-
        x2*y/r2z2-x2*y/r3z)-
        by*x/8/M_PI/(1-nu)*(((2*nu+1)/Wr+x2/W2r2-x2/Wr3)*cosA+
        (2*nu+1)/rz-x2/r2z2-x2/r3z)+
        bz*x*sinA/8/M_PI/(1-nu)*((2*nu+1)/Wr+x2/W2r2-x2/Wr3);

    out.y = by*(rFi_ry)+
        bx/8/M_PI/(1-nu)*((1/Wr+S*S-y2/Wr3)*eta+(2*nu+1)*y/rz-y*y*y/r2z2-
        y*y*y/r3z-2*nu*cosA*S)-
        by*x/8/M_PI/(1-nu)*(1/rz-y2/r2z2-y2/r3z+
        (1/Wr+S*S-y2/Wr3)*cosA)+
        bz*x*sinA/8/M_PI/(1-nu)*(1/Wr+S*S-y2/Wr3);

    out.z = bz*(rFi_rz)+
        bx/8/M_PI/(1-nu)*(eta/W/r+eta*C*C-eta*z2/Wr3+y*z/r3+
        2*nu*sinA*C)-
        by*x/8/M_PI/(1-nu)*((1/Wr+C*C-z2/Wr3)*cosA+z/r3)+
        bz*x*sinA/8/M_PI/(1-nu)*(1/Wr+C*C-z2/Wr3);

    out.a = bx*(rFi_ry)/2+by*(rFi_rx)/2-
        bx/8/M_PI/(1-nu)*(x*y2/r2z2-nu*x/rz+x*y2/r3z-nu*x*cosA/Wr+
        eta*x*S/Wr+eta*x*y/Wr3)+
        by/8/M_PI/(1-nu)*(x2*y/r2z2-nu*y/rz+x2*y/r3z+nu*cosA*S+
        x2*y*cosA/Wr3+x2*cosA*S/Wr)-
        bz*sinA/8/M_PI/(1-nu)*(nu*S+x2*S/Wr+x2*y/Wr3);

    out.b = bx*(rFi_rz)/2+bz*(rFi_rx)/2-
        bx/8/M_PI/(1-nu)*(-x*y/r3+nu*x*sinA/Wr+eta*x*C/Wr+
        eta*x*z/Wr3)+
        by/8/M_PI/(1-nu)*(-x2/r3+nu/r+nu*cosA*C+x2*z*cosA/Wr3+
        x2*cosA*C/Wr)-
        bz*sinA/8/M_PI/(1-nu)*(nu*C+x2*C/Wr+x2*z/Wr3);

    out.c = by*(rFi_rz)/2+bz*(rFi_ry)/2+
        bx/8/M_PI/(1-nu)*(y2/r3-nu/r-nu*cosA*C+nu*sinA*S+eta*sinA*cosA/W2-
        eta*(y*cosA+z*sinA)/W2r+eta*y*z/W2r2-eta*y*z/Wr3)-
        by*x/8/M_PI/(1-nu)*(y/r3+sinA*cosA*cosA/W2-cosA*(y*cosA+z*sinA)/
        W2r+y*z*cosA/W2r2-y*z*cosA/Wr3)-
        bz*x*sinA/8/M_PI/(1-nu)*(y*z/Wr3-sinA*cosA/W2+(y*cosA+z*sinA)/
        W2r-y*z/W2r2);
    return out;
}


WITHIN_KERNEL Real6 TDSetupS(Real3 obs, Real alpha, Real3 slip, Real nu,
    Real3 TriVertex, Real3 SideVec) 
{
    // TDSetupS transforms coordinates of the calculation points as well as 
    // slip vector components from ADCS into TDCS. It then calculates the 
    // strains in ADCS and transforms them into TDCS.

    // Transformation matrix
    Real2 A1 = make2(SideVec.z, -SideVec.y);
    Real2 A2 = make2(SideVec.y, SideVec.z);

    // Transform coordinates of the calculation points from TDCS into ADCS
    Real2 r1 = transform2(A1, A2, make2(obs.y - TriVertex.y, obs.z - TriVertex.z));
    Real y1 = r1.x;
    Real z1 = r1.y;

    // Transform the in-plane slip vector components from TDCS into ADCS
    Real2 r2 = transform2(A1, A2, make2(slip.y, slip.z));
    Real by1 = r2.x;
    Real bz1 = r2.y;

    // Calculate strains associated with an angular dislocation in ADCS
    Real6 out_adcs = AngDisStrain(obs.x,y1,z1,-M_PI+alpha,slip.x,by1,bz1,nu);

    // Transform strains from ADCS into TDCS
    Real3 B0 = make3(1.0, 0.0, 0.0);
    Real3 B1 = make3(0.0, A1.x, A1.y);
    Real3 B2 = make3(0.0, A2.x, A2.y);
    return tensor_transform3(B0, B1, B2, out_adcs);
}

WITHIN_KERNEL Real3 AngDisDispFSC(Real y1, Real y2, Real y3, Real beta, 
                                  Real b1, Real b2, Real b3, Real nu, Real a) {

    Real sinB = sin(beta);
    Real cosB = cos(beta);
    Real cotB = cosB / sinB;
    Real y3b = y3+2*a;
    Real z1b = y1*cosB+y3b*sinB;
    Real z3b = -y1*sinB+y3b*cosB;
    Real r2b = y1*y1+y2*y2+y3b*y3b;
    Real rb = sqrt(r2b);

    Real Fib = 2*atan(-y2/(-(rb+y3b)*(1.0 / tan(beta/2))+y1)); // The Burgers' function

    Real v1cb1 = b1/4/M_PI/(1-nu)*(-2*(1-nu)*(1-2*nu)*Fib*(cotB*cotB)+(1-2*nu)*y2/
        (rb+y3b)*((1-2*nu-a/rb)*cotB-y1/(rb+y3b)*(nu+a/rb))+(1-2*nu)*
        y2*cosB*cotB/(rb+z3b)*(cosB+a/rb)+a*y2*(y3b-a)*cotB/(rb*rb*rb)+y2*
        (y3b-a)/(rb*(rb+y3b))*(-(1-2*nu)*cotB+y1/(rb+y3b)*(2*nu+a/rb)+
        a*y1/(rb*rb))+y2*(y3b-a)/(rb*(rb+z3b))*(cosB/(rb+z3b)*((rb*
        cosB+y3b)*((1-2*nu)*cosB-a/rb)*cotB+2*(1-nu)*(rb*sinB-y1)*cosB)-
        a*y3b*cosB*cotB/(rb*rb)));

    Real v2cb1 = b1/4/M_PI/(1-nu)*((1-2*nu)*((2*(1-nu)*(cotB*cotB)-nu)*log(rb+y3b)-(2*
        (1-nu)*(cotB*cotB)+1-2*nu)*cosB*log(rb+z3b))-(1-2*nu)/(rb+y3b)*(y1*
        cotB*(1-2*nu-a/rb)+nu*y3b-a+(y2*y2)/(rb+y3b)*(nu+a/rb))-(1-2*
        nu)*z1b*cotB/(rb+z3b)*(cosB+a/rb)-a*y1*(y3b-a)*cotB/(rb*rb*rb)+
        (y3b-a)/(rb+y3b)*(-2*nu+1/rb*((1-2*nu)*y1*cotB-a)+(y2*y2)/(rb*
        (rb+y3b))*(2*nu+a/rb)+a*(y2*y2)/(rb*rb*rb))+(y3b-a)/(rb+z3b)*((cosB*cosB)-
        1/rb*((1-2*nu)*z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/(rb*
        (rb+z3b))*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*(rb*cosB+y3b))));

    Real v3cb1 = b1/4/M_PI/(1-nu)*(2*(1-nu)*(((1-2*nu)*Fib*cotB)+(y2/(rb+y3b)*(2*
        nu+a/rb))-(y2*cosB/(rb+z3b)*(cosB+a/rb)))+y2*(y3b-a)/rb*(2*
        nu/(rb+y3b)+a/(rb*rb))+y2*(y3b-a)*cosB/(rb*(rb+z3b))*(1-2*nu-
        (rb*cosB+y3b)/(rb+z3b)*(cosB+a/rb)-a*y3b/(rb*rb)));

    Real v1cb2 = b2/4/M_PI/(1-nu)*((1-2*nu)*((2*(1-nu)*(cotB*cotB)+nu)*log(rb+y3b)-(2*
        (1-nu)*(cotB*cotB)+1)*cosB*log(rb+z3b))+(1-2*nu)/(rb+y3b)*(-(1-2*nu)*
        y1*cotB+nu*y3b-a+a*y1*cotB/rb+(y1*y1)/(rb+y3b)*(nu+a/rb))-(1-2*
        nu)*cotB/(rb+z3b)*(z1b*cosB-a*(rb*sinB-y1)/(rb*cosB))-a*y1*
        (y3b-a)*cotB/(rb*rb*rb)+(y3b-a)/(rb+y3b)*(2*nu+1/rb*((1-2*nu)*y1*
        cotB+a)-(y1*y1)/(rb*(rb+y3b))*(2*nu+a/rb)-a*(y1*y1)/(rb*rb*rb))+(y3b-a)*
        cotB/(rb+z3b)*(-cosB*sinB+a*y1*y3b/((rb*rb*rb)*cosB)+(rb*sinB-y1)/
        rb*(2*(1-nu)*cosB-(rb*cosB+y3b)/(rb+z3b)*(1+a/(rb*cosB)))));
                    
    Real v2cb2 = b2/4/M_PI/(1-nu)*(2*(1-nu)*(1-2*nu)*Fib*(cotB*cotB)+(1-2*nu)*y2/
        (rb+y3b)*(-(1-2*nu-a/rb)*cotB+y1/(rb+y3b)*(nu+a/rb))-(1-2*nu)*
        y2*cotB/(rb+z3b)*(1+a/(rb*cosB))-a*y2*(y3b-a)*cotB/(rb*rb*rb)+y2*
        (y3b-a)/(rb*(rb+y3b))*((1-2*nu)*cotB-2*nu*y1/(rb+y3b)-a*y1/rb*
        (1/rb+1/(rb+y3b)))+y2*(y3b-a)*cotB/(rb*(rb+z3b))*(-2*(1-nu)*
        cosB+(rb*cosB+y3b)/(rb+z3b)*(1+a/(rb*cosB))+a*y3b/((rb*rb)*cosB)));
                    
    Real v3cb2 = b2/4/M_PI/(1-nu)*(-2*(1-nu)*(1-2*nu)*cotB*(log(rb+y3b)-cosB*
        log(rb+z3b))-2*(1-nu)*y1/(rb+y3b)*(2*nu+a/rb)+2*(1-nu)*z1b/(rb+
        z3b)*(cosB+a/rb)+(y3b-a)/rb*((1-2*nu)*cotB-2*nu*y1/(rb+y3b)-a*
        y1/(rb*rb))-(y3b-a)/(rb+z3b)*(cosB*sinB+(rb*cosB+y3b)*cotB/rb*
        (2*(1-nu)*cosB-(rb*cosB+y3b)/(rb+z3b))+a/rb*(sinB-y3b*z1b/
        (rb*rb)-z1b*(rb*cosB+y3b)/(rb*(rb+z3b)))));

    Real v1cb3 = b3/4/M_PI/(1-nu)*((1-2*nu)*(y2/(rb+y3b)*(1+a/rb)-y2*cosB/(rb+
        z3b)*(cosB+a/rb))-y2*(y3b-a)/rb*(a/(rb*rb)+1/(rb+y3b))+y2*
        (y3b-a)*cosB/(rb*(rb+z3b))*((rb*cosB+y3b)/(rb+z3b)*(cosB+a/
        rb)+a*y3b/(rb*rb)));
                    
    Real v2cb3 = b3/4/M_PI/(1-nu)*((1-2*nu)*(-sinB*log(rb+z3b)-y1/(rb+y3b)*(1+a/
        rb)+z1b/(rb+z3b)*(cosB+a/rb))+y1*(y3b-a)/rb*(a/(rb*rb)+1/(rb+
        y3b))-(y3b-a)/(rb+z3b)*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/
        (rb*rb))-1/(rb*(rb+z3b))*((y2*y2)*cosB*sinB-a*z1b/rb*(rb*cosB+y3b))));
                    
    Real v3cb3 = b3/4/M_PI/(1-nu)*(2*(1-nu)*Fib+2*(1-nu)*(y2*sinB/(rb+z3b)*(cosB+
        a/rb))+y2*(y3b-a)*sinB/(rb*(rb+z3b))*(1+(rb*cosB+y3b)/(rb+
        z3b)*(cosB+a/rb)+a*y3b/(rb*rb)));

    return make3(
        v1cb1+v1cb2+v1cb3,
        v2cb1+v2cb2+v2cb3,
        v3cb1+v3cb2+v3cb3
    );
}

WITHIN_KERNEL Real6 AngDisStrainFSC(Real y1, Real y2, Real y3, Real beta,
                                    Real b1, Real b2, Real b3, Real nu, Real a) {

    Real sinB = sin(beta);
    Real cosB = cos(beta);
    Real cotB = cosB / sinB;
    Real y3b = y3+2*a;
    Real z1b = y1*cosB+y3b*sinB;
    Real z3b = -y1*sinB+y3b*cosB;
    Real rb2 = y1*y1+y2*y2+y3b*y3b;
    Real rb = sqrt(rb2);

    Real W1 = rb*cosB+y3b;
    Real W2 = cosB+a/rb;
    Real W3 = cosB+y3b/rb;
    Real W4 = nu+a/rb;
    Real W5 = 2*nu+a/rb;
    Real W6 = rb+y3b;
    Real W7 = rb+z3b;
    Real W8 = y3+a;
    Real W9 = 1+a/rb/cosB;

    Real N1 = 1-2*nu;

    Real rFib_ry2 = z1b/rb/(rb+z3b)-y1/rb/(rb+y3b);
    Real rFib_ry1 = y2/rb/(rb+y3b)-cosB*y2/rb/(rb+z3b);
    Real rFib_ry3 = -sinB*y2/rb/(rb+z3b);

    Real6 out;
    out.x = b1*(1.0/4.0*((-2+2*nu)*N1*rFib_ry1*(cotB*cotB)-N1*y2/(W6*W6)*((1-W5)*cotB- y1/W6*W4)/rb*y1+N1*y2/W6*(a/(rb*rb*rb)*y1*cotB-1/W6*W4+(y1*y1)/ (W6*W6)*W4/rb+(y1*y1)/W6*a/(rb*rb*rb))-N1*y2*cosB*cotB/(W7*W7)*W2*(y1/ rb-sinB)-N1*y2*cosB*cotB/W7*a/(rb*rb*rb)*y1-3*a*y2*W8*cotB/(rb*rb2*rb2)* y1-y2*W8/(rb*rb*rb)/W6*(-N1*cotB+y1/W6*W5+a*y1/rb2)*y1-y2*W8/ rb2/(W6*W6)*(-N1*cotB+y1/W6*W5+a*y1/rb2)*y1+y2*W8/rb/W6* (1/W6*W5-(y1*y1)/(W6*W6)*W5/rb-(y1*y1)/W6*a/(rb*rb*rb)+a/rb2-2*a*(y1*y1) /(rb2*rb2))-y2*W8/(rb*rb*rb)/W7*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+ (2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/rb2)*y1-y2*W8/rb/ (W7*W7)*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)* cosB)-a*y3b*cosB*cotB/rb2)*(y1/rb-sinB)+y2*W8/rb/W7*(-cosB/ (W7*W7)*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)*(y1/ rb-sinB)+cosB/W7*(1/rb*cosB*y1*(N1*cosB-a/rb)*cotB+W1*a/(rb*rb2) *y1*cotB+(2-2*nu)*(1/rb*sinB*y1-1)*cosB)+2*a*y3b*cosB*cotB/ (rb2*rb2)*y1))/M_PI/(1-nu))+ b2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)+nu)/rb*y1/W6-((2-2*nu)*(cotB*cotB)+1)* cosB*(y1/rb-sinB)/W7)-N1/(W6*W6)*(-N1*y1*cotB+nu*y3b-a+a*y1* cotB/rb+(y1*y1)/W6*W4)/rb*y1+N1/W6*(-N1*cotB+a*cotB/rb-a* (y1*y1)*cotB/(rb*rb*rb)+2*y1/W6*W4-(y1*y1*y1)/(W6*W6)*W4/rb-(y1*y1*y1)/W6*a/ (rb*rb*rb))+N1*cotB/(W7*W7)*(z1b*cosB-a*(rb*sinB-y1)/rb/cosB)*(y1/ rb-sinB)-N1*cotB/W7*((cosB*cosB)-a*(1/rb*sinB*y1-1)/rb/cosB+a* (rb*sinB-y1)/(rb*rb*rb)/cosB*y1)-a*W8*cotB/(rb*rb*rb)+3*a*(y1*y1)*W8* cotB/(rb*rb2*rb2)-W8/(W6*W6)*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6* W5-a*(y1*y1)/(rb*rb*rb))/rb*y1+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB+a)*y1+ 1/rb*N1*cotB-2*y1/rb/W6*W5+(y1*y1*y1)/(rb*rb*rb)/W6*W5+(y1*y1*y1)/rb2/ (W6*W6)*W5+(y1*y1*y1)/(rb2*rb2)/W6*a-2*a/(rb*rb*rb)*y1+3*a*(y1*y1*y1)/(rb*rb2*rb2))-W8* cotB/(W7*W7)*(-cosB*sinB+a*y1*y3b/(rb*rb*rb)/cosB+(rb*sinB-y1)/rb* ((2-2*nu)*cosB-W1/W7*W9))*(y1/rb-sinB)+W8*cotB/W7*(a*y3b/ (rb*rb*rb)/cosB-3*a*(y1*y1)*y3b/(rb*rb2*rb2)/cosB+(1/rb*sinB*y1-1)/rb* ((2-2*nu)*cosB-W1/W7*W9)-(rb*sinB-y1)/(rb*rb*rb)*((2-2*nu)*cosB-W1/ W7*W9)*y1+(rb*sinB-y1)/rb*(-1/rb*cosB*y1/W7*W9+W1/(W7*W7)* W9*(y1/rb-sinB)+W1/W7*a/(rb*rb*rb)/cosB*y1)))/M_PI/(1-nu))+ b3*(1.0/4.0*(N1*(-y2/(W6*W6)*(1+a/rb)/rb*y1-y2/W6*a/(rb*rb*rb)*y1+y2* cosB/(W7*W7)*W2*(y1/rb-sinB)+y2*cosB/W7*a/(rb*rb*rb)*y1)+y2*W8/ (rb*rb*rb)*(a/rb2+1/W6)*y1-y2*W8/rb*(-2*a/(rb2*rb2)*y1-1/(W6*W6)/ rb*y1)-y2*W8*cosB/(rb*rb*rb)/W7*(W1/W7*W2+a*y3b/rb2)*y1-y2*W8* cosB/rb/(W7*W7)*(W1/W7*W2+a*y3b/rb2)*(y1/rb-sinB)+y2*W8* cosB/rb/W7*(1/rb*cosB*y1/W7*W2-W1/(W7*W7)*W2*(y1/rb-sinB)- W1/W7*a/(rb*rb*rb)*y1-2*a*y3b/(rb2*rb2)*y1))/M_PI/(1-nu));

    out.y = b1*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)-nu)/rb*y2/W6-((2-2*nu)*(cotB*cotB)+1- 2*nu)*cosB/rb*y2/W7)+N1/(W6*W6)*(y1*cotB*(1-W5)+nu*y3b-a+(y2*y2) /W6*W4)/rb*y2-N1/W6*(a*y1*cotB/(rb*rb*rb)*y2+2*y2/W6*W4-(y2*y2*y2) /(W6*W6)*W4/rb-(y2*y2*y2)/W6*a/(rb*rb*rb))+N1*z1b*cotB/(W7*W7)*W2/rb* y2+N1*z1b*cotB/W7*a/(rb*rb*rb)*y2+3*a*y2*W8*cotB/(rb*rb2*rb2)*y1-W8/ (W6*W6)*(-2*nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+a*(y2*y2)/ (rb*rb*rb))/rb*y2+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB-a)*y2+2*y2/rb/ W6*W5-(y2*y2*y2)/(rb*rb*rb)/W6*W5-(y2*y2*y2)/rb2/(W6*W6)*W5-(y2*y2*y2)/(rb2*rb2)/W6* a+2*a/(rb*rb*rb)*y2-3*a*(y2*y2*y2)/(rb*rb2*rb2))-W8/(W7*W7)*((cosB*cosB)-1/rb*(N1* z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/rb/W7*((y2*y2)*(cosB*cosB)- a*z1b*cotB/rb*W1))/rb*y2+W8/W7*(1/(rb*rb*rb)*(N1*z1b*cotB+a* cosB)*y2-3*a*y3b*z1b*cotB/(rb*rb2*rb2)*y2+1/(rb*rb*rb)/W7*((y2*y2)*(cosB*cosB)- a*z1b*cotB/rb*W1)*y2+1/rb2/(W7*W7)*((y2*y2)*(cosB*cosB)-a*z1b*cotB/ rb*W1)*y2-1/rb/W7*(2*y2*(cosB*cosB)+a*z1b*cotB/(rb*rb*rb)*W1*y2-a* z1b*cotB/rb2*cosB*y2)))/M_PI/(1-nu))+ b2*(1.0/4.0*((2-2*nu)*N1*rFib_ry2*(cotB*cotB)+N1/W6*((W5-1)*cotB+y1/W6* W4)-N1*(y2*y2)/(W6*W6)*((W5-1)*cotB+y1/W6*W4)/rb+N1*y2/W6*(-a/ (rb*rb*rb)*y2*cotB-y1/(W6*W6)*W4/rb*y2-y2/W6*a/(rb*rb*rb)*y1)-N1*cotB/ W7*W9+N1*(y2*y2)*cotB/(W7*W7)*W9/rb+N1*(y2*y2)*cotB/W7*a/(rb*rb*rb)/ cosB-a*W8*cotB/(rb*rb*rb)+3*a*(y2*y2)*W8*cotB/(rb*rb2*rb2)+W8/rb/W6*(N1* cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))-(y2*y2)*W8/(rb*rb*rb)/W6* (N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))-(y2*y2)*W8/rb2/(W6*W6) *(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))+y2*W8/rb/W6* (2*nu*y1/(W6*W6)/rb*y2+a*y1/(rb*rb*rb)*(1/rb+1/W6)*y2-a*y1/rb* (-1/(rb*rb*rb)*y2-1/(W6*W6)/rb*y2))+W8*cotB/rb/W7*((-2+2*nu)*cosB+ W1/W7*W9+a*y3b/rb2/cosB)-(y2*y2)*W8*cotB/(rb*rb*rb)/W7*((-2+2*nu)* cosB+W1/W7*W9+a*y3b/rb2/cosB)-(y2*y2)*W8*cotB/rb2/(W7*W7)*((-2+ 2*nu)*cosB+W1/W7*W9+a*y3b/rb2/cosB)+y2*W8*cotB/rb/W7*(1/ rb*cosB*y2/W7*W9-W1/(W7*W7)*W9/rb*y2-W1/W7*a/(rb*rb*rb)/cosB*y2- 2*a*y3b/(rb2*rb2)/cosB*y2))/M_PI/(1-nu))+ b3*(1.0/4.0*(N1*(-sinB/rb*y2/W7+y2/(W6*W6)*(1+a/rb)/rb*y1+y2/W6* a/(rb*rb*rb)*y1-z1b/(W7*W7)*W2/rb*y2-z1b/W7*a/(rb*rb*rb)*y2)-y2*W8/ (rb*rb*rb)*(a/rb2+1/W6)*y1+y1*W8/rb*(-2*a/(rb2*rb2)*y2-1/(W6*W6)/ rb*y2)+W8/(W7*W7)*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/rb2)-1/ rb/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1))/rb*y2-W8/W7*(sinB*a/ (rb*rb*rb)*y2-z1b/(rb*rb*rb)*(1+a*y3b/rb2)*y2-2*z1b/(rb*rb2*rb2)*a*y3b*y2+ 1/(rb*rb*rb)/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1)*y2+1/rb2/(W7*W7)* ((y2*y2)*cosB*sinB-a*z1b/rb*W1)*y2-1/rb/W7*(2*y2*cosB*sinB+a* z1b/(rb*rb*rb)*W1*y2-a*z1b/rb2*cosB*y2)))/M_PI/(1-nu));

    out.z = b1*(1.0/4.0*((2-2*nu)*(N1*rFib_ry3*cotB-y2/(W6*W6)*W5*(y3b/rb+1)- 1.0/2.0*y2/W6*a/(rb*rb*rb)*2*y3b+y2*cosB/(W7*W7)*W2*W3+1.0/2.0*y2*cosB/W7* a/(rb*rb*rb)*2*y3b)+y2/rb*(2*nu/W6+a/rb2)-1.0/2.0*y2*W8/(rb*rb*rb)*(2* nu/W6+a/rb2)*2*y3b+y2*W8/rb*(-2*nu/(W6*W6)*(y3b/rb+1)-a/ (rb2*rb2)*2*y3b)+y2*cosB/rb/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)- 1.0/2.0*y2*W8*cosB/(rb*rb*rb)/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)*2* y3b-y2*W8*cosB/rb/(W7*W7)*(1-2*nu-W1/W7*W2-a*y3b/rb2)*W3+y2* W8*cosB/rb/W7*(-(cosB*y3b/rb+1)/W7*W2+W1/(W7*W7)*W2*W3+1.0/2.0* W1/W7*a/(rb*rb*rb)*2*y3b-a/rb2+a*y3b/(rb2*rb2)*2*y3b))/M_PI/(1-nu))+ b2*(1.0/4.0*((-2+2*nu)*N1*cotB*((y3b/rb+1)/W6-cosB*W3/W7)+(2-2*nu)* y1/(W6*W6)*W5*(y3b/rb+1)+1.0/2.0*(2-2*nu)*y1/W6*a/(rb*rb*rb)*2*y3b+(2- 2*nu)*sinB/W7*W2-(2-2*nu)*z1b/(W7*W7)*W2*W3-1.0/2.0*(2-2*nu)*z1b/ W7*a/(rb*rb*rb)*2*y3b+1/rb*(N1*cotB-2*nu*y1/W6-a*y1/rb2)-1.0/2.0* W8/(rb*rb*rb)*(N1*cotB-2*nu*y1/W6-a*y1/rb2)*2*y3b+W8/rb*(2*nu* y1/(W6*W6)*(y3b/rb+1)+a*y1/(rb2*rb2)*2*y3b)-1/W7*(cosB*sinB+W1* cotB/rb*((2-2*nu)*cosB-W1/W7)+a/rb*(sinB-y3b*z1b/rb2-z1b* W1/rb/W7))+W8/(W7*W7)*(cosB*sinB+W1*cotB/rb*((2-2*nu)*cosB-W1/ W7)+a/rb*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7))*W3-W8/W7*((cosB* y3b/rb+1)*cotB/rb*((2-2*nu)*cosB-W1/W7)-1.0/2.0*W1*cotB/(rb*rb*rb)* ((2-2*nu)*cosB-W1/W7)*2*y3b+W1*cotB/rb*(-(cosB*y3b/rb+1)/W7+ W1/(W7*W7)*W3)-1.0/2.0*a/(rb*rb*rb)*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7)* 2*y3b+a/rb*(-z1b/rb2-y3b*sinB/rb2+y3b*z1b/(rb2*rb2)*2*y3b- sinB*W1/rb/W7-z1b*(cosB*y3b/rb+1)/rb/W7+1.0/2.0*z1b*W1/(rb*rb*rb)/ W7*2*y3b+z1b*W1/rb/(W7*W7)*W3)))/M_PI/(1-nu))+ b3*(1.0/4.0*((2-2*nu)*rFib_ry3-(2-2*nu)*y2*sinB/(W7*W7)*W2*W3-1.0/2.0* (2-2*nu)*y2*sinB/W7*a/(rb*rb*rb)*2*y3b+y2*sinB/rb/W7*(1+W1/W7* W2+a*y3b/rb2)-1.0/2.0*y2*W8*sinB/(rb*rb*rb)/W7*(1+W1/W7*W2+a*y3b/ rb2)*2*y3b-y2*W8*sinB/rb/(W7*W7)*(1+W1/W7*W2+a*y3b/rb2)*W3+ y2*W8*sinB/rb/W7*((cosB*y3b/rb+1)/W7*W2-W1/(W7*W7)*W2*W3- 1.0/2.0*W1/W7*a/(rb*rb*rb)*2*y3b+a/rb2-a*y3b/(rb2*rb2)*2*y3b))/M_PI/(1-nu));

    out.a = b1/2*(1.0/4.0*((-2+2*nu)*N1*rFib_ry2*(cotB*cotB)+N1/W6*((1-W5)*cotB-y1/ W6*W4)-N1*(y2*y2)/(W6*W6)*((1-W5)*cotB-y1/W6*W4)/rb+N1*y2/W6* (a/(rb*rb*rb)*y2*cotB+y1/(W6*W6)*W4/rb*y2+y2/W6*a/(rb*rb*rb)*y1)+N1* cosB*cotB/W7*W2-N1*(y2*y2)*cosB*cotB/(W7*W7)*W2/rb-N1*(y2*y2)*cosB* cotB/W7*a/(rb*rb*rb)+a*W8*cotB/(rb*rb*rb)-3*a*(y2*y2)*W8*cotB/(rb*rb2*rb2)+W8/ rb/W6*(-N1*cotB+y1/W6*W5+a*y1/rb2)-(y2*y2)*W8/(rb*rb*rb)/W6*(-N1* cotB+y1/W6*W5+a*y1/rb2)-(y2*y2)*W8/rb2/(W6*W6)*(-N1*cotB+y1/ W6*W5+a*y1/rb2)+y2*W8/rb/W6*(-y1/(W6*W6)*W5/rb*y2-y2/W6* a/(rb*rb*rb)*y1-2*a*y1/(rb2*rb2)*y2)+W8/rb/W7*(cosB/W7*(W1*(N1* cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/ rb2)-(y2*y2)*W8/(rb*rb*rb)/W7*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2- 2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/rb2)-(y2*y2)*W8/rb2/ (W7*W7)*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)* cosB)-a*y3b*cosB*cotB/rb2)+y2*W8/rb/W7*(-cosB/(W7*W7)*(W1* (N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)/rb*y2+cosB/ W7*(1/rb*cosB*y2*(N1*cosB-a/rb)*cotB+W1*a/(rb*rb*rb)*y2*cotB+(2-2* nu)/rb*sinB*y2*cosB)+2*a*y3b*cosB*cotB/(rb2*rb2)*y2))/M_PI/(1-nu))+ b2/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)+nu)/rb*y2/W6-((2-2*nu)*(cotB*cotB)+1)* cosB/rb*y2/W7)-N1/(W6*W6)*(-N1*y1*cotB+nu*y3b-a+a*y1*cotB/rb+ (y1*y1)/W6*W4)/rb*y2+N1/W6*(-a*y1*cotB/(rb*rb*rb)*y2-(y1*y1)/(W6*W6) *W4/rb*y2-(y1*y1)/W6*a/(rb*rb*rb)*y2)+N1*cotB/(W7*W7)*(z1b*cosB-a* (rb*sinB-y1)/rb/cosB)/rb*y2-N1*cotB/W7*(-a/rb2*sinB*y2/ cosB+a*(rb*sinB-y1)/(rb*rb*rb)/cosB*y2)+3*a*y2*W8*cotB/(rb*rb2*rb2)*y1- W8/(W6*W6)*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6*W5-a*(y1*y1)/ (rb*rb*rb))/rb*y2+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB+a)*y2+(y1*y1)/(rb*rb2) /W6*W5*y2+(y1*y1)/rb2/(W6*W6)*W5*y2+(y1*y1)/(rb2*rb2)/W6*a*y2+3* a*(y1*y1)/(rb*rb2*rb2)*y2)-W8*cotB/(W7*W7)*(-cosB*sinB+a*y1*y3b/(rb*rb*rb)/ cosB+(rb*sinB-y1)/rb*((2-2*nu)*cosB-W1/W7*W9))/rb*y2+W8*cotB/ W7*(-3*a*y1*y3b/(rb*rb2*rb2)/cosB*y2+1/rb2*sinB*y2*((2-2*nu)*cosB- W1/W7*W9)-(rb*sinB-y1)/(rb*rb*rb)*((2-2*nu)*cosB-W1/W7*W9)*y2+(rb* sinB-y1)/rb*(-1/rb*cosB*y2/W7*W9+W1/(W7*W7)*W9/rb*y2+W1/W7* a/(rb*rb*rb)/cosB*y2)))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(1/W6*(1+a/rb)-(y2*y2)/(W6*W6)*(1+a/rb)/rb-(y2*y2)/ W6*a/(rb*rb*rb)-cosB/W7*W2+(y2*y2)*cosB/(W7*W7)*W2/rb+(y2*y2)*cosB/W7* a/(rb*rb*rb))-W8/rb*(a/rb2+1/W6)+(y2*y2)*W8/(rb*rb*rb)*(a/rb2+1/W6)- y2*W8/rb*(-2*a/(rb2*rb2)*y2-1/(W6*W6)/rb*y2)+W8*cosB/rb/W7* (W1/W7*W2+a*y3b/rb2)-(y2*y2)*W8*cosB/(rb*rb*rb)/W7*(W1/W7*W2+a* y3b/rb2)-(y2*y2)*W8*cosB/rb2/(W7*W7)*(W1/W7*W2+a*y3b/rb2)+y2* W8*cosB/rb/W7*(1/rb*cosB*y2/W7*W2-W1/(W7*W7)*W2/rb*y2-W1/ W7*a/(rb*rb*rb)*y2-2*a*y3b/(rb2*rb2)*y2))/M_PI/(1-nu))+ b1/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)-nu)/rb*y1/W6-((2-2*nu)*(cotB*cotB)+1- 2*nu)*cosB*(y1/rb-sinB)/W7)+N1/(W6*W6)*(y1*cotB*(1-W5)+nu*y3b- a+(y2*y2)/W6*W4)/rb*y1-N1/W6*((1-W5)*cotB+a*(y1*y1)*cotB/(rb*rb*rb)- (y2*y2)/(W6*W6)*W4/rb*y1-(y2*y2)/W6*a/(rb*rb*rb)*y1)-N1*cosB*cotB/W7* W2+N1*z1b*cotB/(W7*W7)*W2*(y1/rb-sinB)+N1*z1b*cotB/W7*a/(rb*rb2) *y1-a*W8*cotB/(rb*rb*rb)+3*a*(y1*y1)*W8*cotB/(rb*rb2*rb2)-W8/(W6*W6)*(-2* nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+a*(y2*y2)/(rb*rb*rb))/rb* y1+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB-a)*y1+1/rb*N1*cotB-(y2*y2)/ (rb*rb*rb)/W6*W5*y1-(y2*y2)/rb2/(W6*W6)*W5*y1-(y2*y2)/(rb2*rb2)/W6*a*y1- 3*a*(y2*y2)/(rb*rb2*rb2)*y1)-W8/(W7*W7)*((cosB*cosB)-1/rb*(N1*z1b*cotB+a* cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/rb/W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/ rb*W1))*(y1/rb-sinB)+W8/W7*(1/(rb*rb*rb)*(N1*z1b*cotB+a*cosB)* y1-1/rb*N1*cosB*cotB+a*y3b*cosB*cotB/(rb*rb*rb)-3*a*y3b*z1b*cotB/ (rb*rb2*rb2)*y1+1/(rb*rb*rb)/W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1)*y1+1/ rb/(W7*W7)*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1)*(y1/rb-sinB)-1/rb/ W7*(-a*cosB*cotB/rb*W1+a*z1b*cotB/(rb*rb*rb)*W1*y1-a*z1b*cotB/ rb2*cosB*y1)))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((2-2*nu)*N1*rFib_ry1*(cotB*cotB)-N1*y2/(W6*W6)*((W5-1)*cotB+ y1/W6*W4)/rb*y1+N1*y2/W6*(-a/(rb*rb*rb)*y1*cotB+1/W6*W4-(y1*y1) /(W6*W6)*W4/rb-(y1*y1)/W6*a/(rb*rb*rb))+N1*y2*cotB/(W7*W7)*W9*(y1/ rb-sinB)+N1*y2*cotB/W7*a/(rb*rb*rb)/cosB*y1+3*a*y2*W8*cotB/(rb*rb2*rb2) *y1-y2*W8/(rb*rb*rb)/W6*(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/ W6))*y1-y2*W8/rb2/(W6*W6)*(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/ rb+1/W6))*y1+y2*W8/rb/W6*(-2*nu/W6+2*nu*(y1*y1)/(W6*W6)/rb-a/ rb*(1/rb+1/W6)+a*(y1*y1)/(rb*rb*rb)*(1/rb+1/W6)-a*y1/rb*(-1/ (rb*rb*rb)*y1-1/(W6*W6)/rb*y1))-y2*W8*cotB/(rb*rb*rb)/W7*((-2+2*nu)* cosB+W1/W7*W9+a*y3b/rb2/cosB)*y1-y2*W8*cotB/rb/(W7*W7)*((-2+ 2*nu)*cosB+W1/W7*W9+a*y3b/rb2/cosB)*(y1/rb-sinB)+y2*W8* cotB/rb/W7*(1/rb*cosB*y1/W7*W9-W1/(W7*W7)*W9*(y1/rb-sinB)- W1/W7*a/(rb*rb*rb)/cosB*y1-2*a*y3b/(rb2*rb2)/cosB*y1))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(-sinB*(y1/rb-sinB)/W7-1/W6*(1+a/rb)+(y1*y1)/(W6*W6) *(1+a/rb)/rb+(y1*y1)/W6*a/(rb*rb*rb)+cosB/W7*W2-z1b/(W7*W7)*W2* (y1/rb-sinB)-z1b/W7*a/(rb*rb*rb)*y1)+W8/rb*(a/rb2+1/W6)-(y1*y1)* W8/(rb*rb*rb)*(a/rb2+1/W6)+y1*W8/rb*(-2*a/(rb2*rb2)*y1-1/(W6*W6)/ rb*y1)+W8/(W7*W7)*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/rb2)-1/ rb/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1))*(y1/rb-sinB)-W8/W7* (sinB*a/(rb*rb*rb)*y1+cosB/rb*(1+a*y3b/rb2)-z1b/(rb*rb*rb)*(1+a*y3b/ rb2)*y1-2*z1b/(rb*rb2*rb2)*a*y3b*y1+1/(rb*rb*rb)/W7*((y2*y2)*cosB*sinB-a* z1b/rb*W1)*y1+1/rb/(W7*W7)*((y2*y2)*cosB*sinB-a*z1b/rb*W1)* (y1/rb-sinB)-1/rb/W7*(-a*cosB/rb*W1+a*z1b/(rb*rb*rb)*W1*y1-a* z1b/rb2*cosB*y1)))/M_PI/(1-nu));

    out.b = b1/2*(1.0/4.0*((-2+2*nu)*N1*rFib_ry3*(cotB*cotB)-N1*y2/(W6*W6)*((1-W5)* cotB-y1/W6*W4)*(y3b/rb+1)+N1*y2/W6*(1.0/2.0*a/(rb*rb*rb)*2*y3b*cotB+ y1/(W6*W6)*W4*(y3b/rb+1)+1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b)-N1*y2*cosB* cotB/(W7*W7)*W2*W3-1.0/2.0*N1*y2*cosB*cotB/W7*a/(rb*rb*rb)*2*y3b+a/ (rb*rb*rb)*y2*cotB-3.0/2.0*a*y2*W8*cotB/(rb*rb2*rb2)*2*y3b+y2/rb/W6*(-N1* cotB+y1/W6*W5+a*y1/rb2)-1.0/2.0*y2*W8/(rb*rb*rb)/W6*(-N1*cotB+y1/ W6*W5+a*y1/rb2)*2*y3b-y2*W8/rb/(W6*W6)*(-N1*cotB+y1/W6*W5+ a*y1/rb2)*(y3b/rb+1)+y2*W8/rb/W6*(-y1/(W6*W6)*W5*(y3b/rb+ 1)-1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b-a*y1/(rb2*rb2)*2*y3b)+y2/rb/W7* (cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)- a*y3b*cosB*cotB/rb2)-1.0/2.0*y2*W8/(rb*rb*rb)/W7*(cosB/W7*(W1*(N1* cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/ rb2)*2*y3b-y2*W8/rb/(W7*W7)*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+ (2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/rb2)*W3+y2*W8/rb/ W7*(-cosB/(W7*W7)*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)* cosB)*W3+cosB/W7*((cosB*y3b/rb+1)*(N1*cosB-a/rb)*cotB+1.0/2.0*W1* a/(rb*rb*rb)*2*y3b*cotB+1.0/2.0*(2-2*nu)/rb*sinB*2*y3b*cosB)-a*cosB* cotB/rb2+a*y3b*cosB*cotB/(rb2*rb2)*2*y3b))/M_PI/(1-nu))+ b2/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)+nu)*(y3b/rb+1)/W6-((2-2*nu)*(cotB*cotB) +1)*cosB*W3/W7)-N1/(W6*W6)*(-N1*y1*cotB+nu*y3b-a+a*y1*cotB/ rb+(y1*y1)/W6*W4)*(y3b/rb+1)+N1/W6*(nu-1.0/2.0*a*y1*cotB/(rb*rb*rb)*2* y3b-(y1*y1)/(W6*W6)*W4*(y3b/rb+1)-1.0/2.0*(y1*y1)/W6*a/(rb*rb*rb)*2*y3b)+ N1*cotB/(W7*W7)*(z1b*cosB-a*(rb*sinB-y1)/rb/cosB)*W3-N1*cotB/ W7*(cosB*sinB-1.0/2.0*a/rb2*sinB*2*y3b/cosB+1.0/2.0*a*(rb*sinB-y1)/ (rb*rb*rb)/cosB*2*y3b)-a/(rb*rb*rb)*y1*cotB+3.0/2.0*a*y1*W8*cotB/(rb*rb2*rb2)*2* y3b+1/W6*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6*W5-a*(y1*y1)/ (rb*rb*rb))-W8/(W6*W6)*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6*W5-a* (y1*y1)/(rb*rb*rb))*(y3b/rb+1)+W8/W6*(-1.0/2.0/(rb*rb*rb)*(N1*y1*cotB+a)*2* y3b+1.0/2.0*(y1*y1)/(rb*rb*rb)/W6*W5*2*y3b+(y1*y1)/rb/(W6*W6)*W5*(y3b/rb+ 1)+1.0/2.0*(y1*y1)/(rb2*rb2)/W6*a*2*y3b+3.0/2.0*a*(y1*y1)/(rb*rb2*rb2)*2*y3b)+ cotB/W7*(-cosB*sinB+a*y1*y3b/(rb*rb*rb)/cosB+(rb*sinB-y1)/rb*((2- 2*nu)*cosB-W1/W7*W9))-W8*cotB/(W7*W7)*(-cosB*sinB+a*y1*y3b/(rb*rb2) /cosB+(rb*sinB-y1)/rb*((2-2*nu)*cosB-W1/W7*W9))*W3+W8*cotB/ W7*(a/(rb*rb*rb)/cosB*y1-3.0/2.0*a*y1*y3b/(rb*rb2*rb2)/cosB*2*y3b+1.0/2.0/ rb2*sinB*2*y3b*((2-2*nu)*cosB-W1/W7*W9)-1.0/2.0*(rb*sinB-y1)/(rb*rb2) *((2-2*nu)*cosB-W1/W7*W9)*2*y3b+(rb*sinB-y1)/rb*(-(cosB*y3b/ rb+1)/W7*W9+W1/(W7*W7)*W9*W3+1.0/2.0*W1/W7*a/(rb*rb*rb)/cosB*2* y3b)))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(-y2/(W6*W6)*(1+a/rb)*(y3b/rb+1)-1.0/2.0*y2/W6*a/ (rb*rb*rb)*2*y3b+y2*cosB/(W7*W7)*W2*W3+1.0/2.0*y2*cosB/W7*a/(rb*rb*rb)*2* y3b)-y2/rb*(a/rb2+1/W6)+1.0/2.0*y2*W8/(rb*rb*rb)*(a/rb2+1/W6)*2* y3b-y2*W8/rb*(-a/(rb2*rb2)*2*y3b-1/(W6*W6)*(y3b/rb+1))+y2*cosB/ rb/W7*(W1/W7*W2+a*y3b/rb2)-1.0/2.0*y2*W8*cosB/(rb*rb*rb)/W7*(W1/ W7*W2+a*y3b/rb2)*2*y3b-y2*W8*cosB/rb/(W7*W7)*(W1/W7*W2+a* y3b/rb2)*W3+y2*W8*cosB/rb/W7*((cosB*y3b/rb+1)/W7*W2-W1/ (W7*W7)*W2*W3-1.0/2.0*W1/W7*a/(rb*rb*rb)*2*y3b+a/rb2-a*y3b/(rb2*rb2)*2* y3b))/M_PI/(1-nu))+ b1/2.0*(1.0/4.0*((2-2*nu)*(N1*rFib_ry1*cotB-y1/(W6*W6)*W5/rb*y2-y2/W6* a/(rb*rb*rb)*y1+y2*cosB/(W7*W7)*W2*(y1/rb-sinB)+y2*cosB/W7*a/(rb*rb2) *y1)-y2*W8/(rb*rb*rb)*(2*nu/W6+a/rb2)*y1+y2*W8/rb*(-2*nu/(W6*W6) /rb*y1-2*a/(rb2*rb2)*y1)-y2*W8*cosB/(rb*rb*rb)/W7*(1-2*nu-W1/W7* W2-a*y3b/rb2)*y1-y2*W8*cosB/rb/(W7*W7)*(1-2*nu-W1/W7*W2-a* y3b/rb2)*(y1/rb-sinB)+y2*W8*cosB/rb/W7*(-1/rb*cosB*y1/W7* W2+W1/(W7*W7)*W2*(y1/rb-sinB)+W1/W7*a/(rb*rb*rb)*y1+2*a*y3b/(rb2*rb2) *y1))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((-2+2*nu)*N1*cotB*(1/rb*y1/W6-cosB*(y1/rb-sinB)/W7)- (2-2*nu)/W6*W5+(2-2*nu)*(y1*y1)/(W6*W6)*W5/rb+(2-2*nu)*(y1*y1)/W6* a/(rb*rb*rb)+(2-2*nu)*cosB/W7*W2-(2-2*nu)*z1b/(W7*W7)*W2*(y1/rb- sinB)-(2-2*nu)*z1b/W7*a/(rb*rb*rb)*y1-W8/(rb*rb*rb)*(N1*cotB-2*nu*y1/ W6-a*y1/rb2)*y1+W8/rb*(-2*nu/W6+2*nu*(y1*y1)/(W6*W6)/rb-a/rb2+ 2*a*(y1*y1)/(rb2*rb2))+W8/(W7*W7)*(cosB*sinB+W1*cotB/rb*((2-2*nu)* cosB-W1/W7)+a/rb*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7))*(y1/rb- sinB)-W8/W7*(1/rb2*cosB*y1*cotB*((2-2*nu)*cosB-W1/W7)-W1* cotB/(rb*rb*rb)*((2-2*nu)*cosB-W1/W7)*y1+W1*cotB/rb*(-1/rb*cosB* y1/W7+W1/(W7*W7)*(y1/rb-sinB))-a/(rb*rb*rb)*(sinB-y3b*z1b/rb2- z1b*W1/rb/W7)*y1+a/rb*(-y3b*cosB/rb2+2*y3b*z1b/(rb2*rb2)*y1- cosB*W1/rb/W7-z1b/rb2*cosB*y1/W7+z1b*W1/(rb*rb*rb)/W7*y1+z1b* W1/rb/(W7*W7)*(y1/rb-sinB))))/M_PI/(1-nu))+ b3/2*(1.0/4.0*((2-2*nu)*rFib_ry1-(2-2*nu)*y2*sinB/(W7*W7)*W2*(y1/rb- sinB)-(2-2*nu)*y2*sinB/W7*a/(rb*rb*rb)*y1-y2*W8*sinB/(rb*rb*rb)/W7*(1+ W1/W7*W2+a*y3b/rb2)*y1-y2*W8*sinB/rb/(W7*W7)*(1+W1/W7*W2+ a*y3b/rb2)*(y1/rb-sinB)+y2*W8*sinB/rb/W7*(1/rb*cosB*y1/ W7*W2-W1/(W7*W7)*W2*(y1/rb-sinB)-W1/W7*a/(rb*rb*rb)*y1-2*a*y3b/ (rb2*rb2)*y1))/M_PI/(1-nu));

    out.c = b1/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)-nu)*(y3b/rb+1)/W6-((2-2*nu)* (cotB*cotB)+1-2*nu)*cosB*W3/W7)+N1/(W6*W6)*(y1*cotB*(1-W5)+nu*y3b-a+ (y2*y2)/W6*W4)*(y3b/rb+1)-N1/W6*(1.0/2.0*a*y1*cotB/(rb*rb*rb)*2*y3b+ nu-(y2*y2)/(W6*W6)*W4*(y3b/rb+1)-1.0/2.0*(y2*y2)/W6*a/(rb*rb*rb)*2*y3b)-N1* sinB*cotB/W7*W2+N1*z1b*cotB/(W7*W7)*W2*W3+1.0/2.0*N1*z1b*cotB/W7* a/(rb*rb*rb)*2*y3b-a/(rb*rb*rb)*y1*cotB+3.0/2.0*a*y1*W8*cotB/(rb*rb2*rb2)*2*y3b+ 1/W6*(-2*nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+a*(y2*y2)/ (rb*rb*rb))-W8/(W6*W6)*(-2*nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+ a*(y2*y2)/(rb*rb*rb))*(y3b/rb+1)+W8/W6*(-1.0/2.0/(rb*rb*rb)*(N1*y1*cotB-a)* 2*y3b-1.0/2.0*(y2*y2)/(rb*rb*rb)/W6*W5*2*y3b-(y2*y2)/rb/(W6*W6)*W5*(y3b/ rb+1)-1.0/2.0*(y2*y2)/(rb2*rb2)/W6*a*2*y3b-3.0/2.0*a*(y2*y2)/(rb*rb2*rb2)*2*y3b)+ 1/W7*((cosB*cosB)-1/rb*(N1*z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb2) -1/rb/W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1))-W8/(W7*W7)*((cosB*cosB)- 1/rb*(N1*z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/rb/W7* ((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1))*W3+W8/W7*(1.0/2.0/(rb*rb*rb)*(N1* z1b*cotB+a*cosB)*2*y3b-1/rb*N1*sinB*cotB+a*z1b*cotB/(rb*rb*rb)+a* y3b*sinB*cotB/(rb*rb*rb)-3.0/2.0*a*y3b*z1b*cotB/(rb*rb2*rb2)*2*y3b+1.0/2.0/(rb*rb2) /W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1)*2*y3b+1/rb/(W7*W7)*((y2*y2) *(cosB*cosB)-a*z1b*cotB/rb*W1)*W3-1/rb/W7*(-a*sinB*cotB/rb*W1+ 1.0/2.0*a*z1b*cotB/(rb*rb*rb)*W1*2*y3b-a*z1b*cotB/rb*(cosB*y3b/rb+ 1))))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((2-2*nu)*N1*rFib_ry3*(cotB*cotB)-N1*y2/(W6*W6)*((W5-1)*cotB+ y1/W6*W4)*(y3b/rb+1)+N1*y2/W6*(-1.0/2.0*a/(rb*rb*rb)*2*y3b*cotB-y1/ (W6*W6)*W4*(y3b/rb+1)-1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b)+N1*y2*cotB/ (W7*W7)*W9*W3+1.0/2.0*N1*y2*cotB/W7*a/(rb*rb*rb)/cosB*2*y3b-a/(rb*rb*rb)* y2*cotB+3.0/2.0*a*y2*W8*cotB/(rb*rb2*rb2)*2*y3b+y2/rb/W6*(N1*cotB-2* nu*y1/W6-a*y1/rb*(1/rb+1/W6))-1.0/2.0*y2*W8/(rb*rb*rb)/W6*(N1* cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))*2*y3b-y2*W8/rb/(W6*W6) *(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))*(y3b/rb+1)+y2* W8/rb/W6*(2*nu*y1/(W6*W6)*(y3b/rb+1)+1.0/2.0*a*y1/(rb*rb*rb)*(1/rb+ 1/W6)*2*y3b-a*y1/rb*(-1.0/2.0/(rb*rb*rb)*2*y3b-1/(W6*W6)*(y3b/rb+ 1)))+y2*cotB/rb/W7*((-2+2*nu)*cosB+W1/W7*W9+a*y3b/rb2/cosB)- 1.0/2.0*y2*W8*cotB/(rb*rb*rb)/W7*((-2+2*nu)*cosB+W1/W7*W9+a*y3b/ rb2/cosB)*2*y3b-y2*W8*cotB/rb/(W7*W7)*((-2+2*nu)*cosB+W1/W7* W9+a*y3b/rb2/cosB)*W3+y2*W8*cotB/rb/W7*((cosB*y3b/rb+1)/ W7*W9-W1/(W7*W7)*W9*W3-1.0/2.0*W1/W7*a/(rb*rb*rb)/cosB*2*y3b+a/rb2/ cosB-a*y3b/(rb2*rb2)/cosB*2*y3b))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(-sinB*W3/W7+y1/(W6*W6)*(1+a/rb)*(y3b/rb+1)+ 1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b+sinB/W7*W2-z1b/(W7*W7)*W2*W3-1.0/2.0* z1b/W7*a/(rb*rb*rb)*2*y3b)+y1/rb*(a/rb2+1/W6)-1.0/2.0*y1*W8/(rb*rb2) *(a/rb2+1/W6)*2*y3b+y1*W8/rb*(-a/(rb2*rb2)*2*y3b-1/(W6*W6)* (y3b/rb+1))-1/W7*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/rb2)-1/ rb/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1))+W8/(W7*W7)*(sinB*(cosB- a/rb)+z1b/rb*(1+a*y3b/rb2)-1/rb/W7*((y2*y2)*cosB*sinB-a*z1b/ rb*W1))*W3-W8/W7*(1.0/2.0*sinB*a/(rb*rb*rb)*2*y3b+sinB/rb*(1+a*y3b/ rb2)-1.0/2.0*z1b/(rb*rb*rb)*(1+a*y3b/rb2)*2*y3b+z1b/rb*(a/rb2-a* y3b/(rb2*rb2)*2*y3b)+1.0/2.0/(rb*rb*rb)/W7*((y2*y2)*cosB*sinB-a*z1b/rb* W1)*2*y3b+1/rb/(W7*W7)*((y2*y2)*cosB*sinB-a*z1b/rb*W1)*W3-1/ rb/W7*(-a*sinB/rb*W1+1.0/2.0*a*z1b/(rb*rb*rb)*W1*2*y3b-a*z1b/rb* (cosB*y3b/rb+1))))/M_PI/(1-nu))+ b1/2.0*(1.0/4.0*((2-2*nu)*(N1*rFib_ry2*cotB+1/W6*W5-(y2*y2)/(W6*W6)*W5/ rb-(y2*y2)/W6*a/(rb*rb*rb)-cosB/W7*W2+(y2*y2)*cosB/(W7*W7)*W2/rb+(y2*y2)* cosB/W7*a/(rb*rb*rb))+W8/rb*(2*nu/W6+a/rb2)-(y2*y2)*W8/(rb*rb*rb)*(2* nu/W6+a/rb2)+y2*W8/rb*(-2*nu/(W6*W6)/rb*y2-2*a/(rb2*rb2)*y2)+ W8*cosB/rb/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)-(y2*y2)*W8*cosB/ (rb*rb*rb)/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)-(y2*y2)*W8*cosB/rb2/(W7*W7) *(1-2*nu-W1/W7*W2-a*y3b/rb2)+y2*W8*cosB/rb/W7*(-1/rb* cosB*y2/W7*W2+W1/(W7*W7)*W2/rb*y2+W1/W7*a/(rb*rb*rb)*y2+2*a* y3b/(rb2*rb2)*y2))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((-2+2*nu)*N1*cotB*(1/rb*y2/W6-cosB/rb*y2/W7)+(2- 2*nu)*y1/(W6*W6)*W5/rb*y2+(2-2*nu)*y1/W6*a/(rb*rb*rb)*y2-(2-2* nu)*z1b/(W7*W7)*W2/rb*y2-(2-2*nu)*z1b/W7*a/(rb*rb*rb)*y2-W8/(rb*rb2) *(N1*cotB-2*nu*y1/W6-a*y1/rb2)*y2+W8/rb*(2*nu*y1/(W6*W6)/ rb*y2+2*a*y1/(rb2*rb2)*y2)+W8/(W7*W7)*(cosB*sinB+W1*cotB/rb*((2- 2*nu)*cosB-W1/W7)+a/rb*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7))/ rb*y2-W8/W7*(1/rb2*cosB*y2*cotB*((2-2*nu)*cosB-W1/W7)-W1* cotB/(rb*rb*rb)*((2-2*nu)*cosB-W1/W7)*y2+W1*cotB/rb*(-cosB/rb* y2/W7+W1/(W7*W7)/rb*y2)-a/(rb*rb*rb)*(sinB-y3b*z1b/rb2-z1b*W1/ rb/W7)*y2+a/rb*(2*y3b*z1b/(rb2*rb2)*y2-z1b/rb2*cosB*y2/W7+ z1b*W1/(rb*rb*rb)/W7*y2+z1b*W1/rb2/(W7*W7)*y2)))/M_PI/(1-nu))+ b3/2*(1.0/4.0*((2-2*nu)*rFib_ry2+(2-2*nu)*sinB/W7*W2-(2-2*nu)*(y2*y2)* sinB/(W7*W7)*W2/rb-(2-2*nu)*(y2*y2)*sinB/W7*a/(rb*rb*rb)+W8*sinB/rb/ W7*(1+W1/W7*W2+a*y3b/rb2)-(y2*y2)*W8*sinB/(rb*rb*rb)/W7*(1+W1/ W7*W2+a*y3b/rb2)-(y2*y2)*W8*sinB/rb2/(W7*W7)*(1+W1/W7*W2+a* y3b/rb2)+y2*W8*sinB/rb/W7*(1/rb*cosB*y2/W7*W2-W1/(W7*W7)* W2/rb*y2-W1/W7*a/(rb*rb*rb)*y2-2*a*y3b/(rb2*rb2)*y2))/M_PI/(1-nu));

    return out;
}

WITHIN_KERNEL Real3 AngSetupFSC(Real3 obs, Real3 slip, Real3 PA, Real3 PB, Real nu) {
    Real3 SideVec = sub3(PB, PA);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real beta = acos(-dot3(normalize3(SideVec), eZ));
    if (fabs(beta) < EPS || fabs(M_PI-beta) < EPS) {
        return make3(0.0f, 0.0f, 0.0f);
    }
    Real3 ey1 = SideVec;
    ey1.z = 0;
    ey1 = normalize3(ey1);

    Real3 ey3 = negate3(eZ);
    Real3 ey2 = cross3(ey3, ey1);

    Real3 yA = transform3(ey1, ey2, ey3, sub3(obs, PA));
    Real3 yAB = transform3(ey1, ey2, ey3, SideVec);
    Real3 yB = sub3(yA, yAB);

    Real3 slip_adcs = transform3(ey1, ey2, ey3, slip);

    Real configuration = beta;
    if (beta*yA.x >= 0) {
        configuration = -M_PI+beta;
    }

    Real3 vA = AngDisDispFSC(
        yA.x, yA.y, yA.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PA.z
    );
    Real3 vB = AngDisDispFSC(
        yB.x, yB.y, yB.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PB.z
    );

    Real3 v = sub3(vB, vA);
    return inv_transform3(ey1, ey2, ey3, v);
}

WITHIN_KERNEL Real6 AngSetupFSC_S(Real3 obs, Real3 slip, Real3 PA, Real3 PB, Real nu) {
    Real3 SideVec = sub3(PB, PA);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real beta = acos(-dot3(normalize3(SideVec), eZ));
    if (fabs(beta) < EPS || fabs(M_PI-beta) < EPS) {
        return make6(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
    }
    Real3 ey1 = SideVec;
    ey1.z = 0;
    ey1 = normalize3(ey1);

    Real3 ey3 = negate3(eZ);
    Real3 ey2 = cross3(ey3, ey1);

    Real3 yA = transform3(ey1, ey2, ey3, sub3(obs, PA));
    Real3 yAB = transform3(ey1, ey2, ey3, SideVec);
    Real3 yB = sub3(yA, yAB);

    Real3 slip_adcs = transform3(ey1, ey2, ey3, slip);

    Real configuration = beta;
    if (beta*yA.x >= 0) {
        configuration = -M_PI+beta;
    }

    Real6 vA = AngDisStrainFSC(
        yA.x, yA.y, yA.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PA.z
    );
    Real6 vB = AngDisStrainFSC(
        yB.x, yB.y, yB.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PB.z
    );
    Real6 v = sub6(vB, vA);
    return tensor_transform3(ey1, ey2, ey3, v);
}

#endif





KERNEL
void blocks_disp_fs(GLOBAL_MEM Real* results, 
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM int* obs_start, GLOBAL_MEM int* obs_end,
    GLOBAL_MEM int* src_start, GLOBAL_MEM int* src_end,
    GLOBAL_MEM int* block_start,
    Real nu)
{
    int block_idx = get_group_id(0);
    int team_id = get_local_id(0);
    int team_size = get_local_size(0);

    int os = obs_start[block_idx];
    int oe = obs_end[block_idx];

    int ss = src_start[block_idx];
    int se = src_end[block_idx];
    int n_src = se - ss;

    int bs = block_start[block_idx];

    for (int i = os + team_id; i < oe; i += team_size) {
        int obs_idx = i - os;

        Real3 obs;
            obs.x = obs_pts[i * 3 + 0];
            obs.y = obs_pts[i * 3 + 1];
            obs.z = obs_pts[i * 3 + 2];

        for (int j = ss; j < se; j++) {
            int src_idx = j - ss;

                Real3 tri0;
                    tri0.x = tris[j * 9 + 0 * 3 + 0];
                    tri0.y = tris[j * 9 + 0 * 3 + 1];
                    tri0.z = tris[j * 9 + 0 * 3 + 2];
                Real3 tri1;
                    tri1.x = tris[j * 9 + 1 * 3 + 0];
                    tri1.y = tris[j * 9 + 1 * 3 + 1];
                    tri1.z = tris[j * 9 + 1 * 3 + 2];
                Real3 tri2;
                    tri2.x = tris[j * 9 + 2 * 3 + 0];
                    tri2.y = tris[j * 9 + 2 * 3 + 1];
                    tri2.z = tris[j * 9 + 2 * 3 + 2];
            {
                Real3 slip = make3(0.0, 0.0, 0.0);
                slip.y = 1.0;

                
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


                {
                    int idx = bs + (
                        (obs_idx * 3 + 0) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.x;
                }
                {
                    int idx = bs + (
                        (obs_idx * 3 + 1) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.y;
                }
                {
                    int idx = bs + (
                        (obs_idx * 3 + 2) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.z;
                }
            }
            {
                Real3 slip = make3(0.0, 0.0, 0.0);
                slip.z = 1.0;

                
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


                {
                    int idx = bs + (
                        (obs_idx * 3 + 0) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.x;
                }
                {
                    int idx = bs + (
                        (obs_idx * 3 + 1) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.y;
                }
                {
                    int idx = bs + (
                        (obs_idx * 3 + 2) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.z;
                }
            }
            {
                Real3 slip = make3(0.0, 0.0, 0.0);
                slip.x = 1.0;

                
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


                {
                    int idx = bs + (
                        (obs_idx * 3 + 0) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.x;
                }
                {
                    int idx = bs + (
                        (obs_idx * 3 + 1) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.y;
                }
                {
                    int idx = bs + (
                        (obs_idx * 3 + 2) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.z;
                }
            }
        }
    }
}


KERNEL
void blocks_disp_hs(GLOBAL_MEM Real* results, 
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM int* obs_start, GLOBAL_MEM int* obs_end,
    GLOBAL_MEM int* src_start, GLOBAL_MEM int* src_end,
    GLOBAL_MEM int* block_start,
    Real nu)
{
    int block_idx = get_group_id(0);
    int team_id = get_local_id(0);
    int team_size = get_local_size(0);

    int os = obs_start[block_idx];
    int oe = obs_end[block_idx];

    int ss = src_start[block_idx];
    int se = src_end[block_idx];
    int n_src = se - ss;

    int bs = block_start[block_idx];

    for (int i = os + team_id; i < oe; i += team_size) {
        int obs_idx = i - os;

        Real3 obs;
            obs.x = obs_pts[i * 3 + 0];
            obs.y = obs_pts[i * 3 + 1];
            obs.z = obs_pts[i * 3 + 2];

        for (int j = ss; j < se; j++) {
            int src_idx = j - ss;

                Real3 tri0;
                    tri0.x = tris[j * 9 + 0 * 3 + 0];
                    tri0.y = tris[j * 9 + 0 * 3 + 1];
                    tri0.z = tris[j * 9 + 0 * 3 + 2];
                Real3 tri1;
                    tri1.x = tris[j * 9 + 1 * 3 + 0];
                    tri1.y = tris[j * 9 + 1 * 3 + 1];
                    tri1.z = tris[j * 9 + 1 * 3 + 2];
                Real3 tri2;
                    tri2.x = tris[j * 9 + 2 * 3 + 0];
                    tri2.y = tris[j * 9 + 2 * 3 + 1];
                    tri2.z = tris[j * 9 + 2 * 3 + 2];
            {
                Real3 slip = make3(0.0, 0.0, 0.0);
                slip.y = 1.0;

                
    Real3 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real3 uvw0 = AngSetupFSC(obs, efcs_slip, tri0, tri1, nu);
        Real3 uvw1 = AngSetupFSC(obs, efcs_slip, tri1, tri2, nu);
        Real3 uvw2 = AngSetupFSC(obs, efcs_slip, tri2, tri0, nu);
        Real3 fsc_term = add3(add3(uvw0, uvw1), uvw2);

        summed_terms = add3(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add3(summed_terms, full_out);
    }
    Real3 full_out = summed_terms;


                {
                    int idx = bs + (
                        (obs_idx * 3 + 0) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.x;
                }
                {
                    int idx = bs + (
                        (obs_idx * 3 + 1) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.y;
                }
                {
                    int idx = bs + (
                        (obs_idx * 3 + 2) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.z;
                }
            }
            {
                Real3 slip = make3(0.0, 0.0, 0.0);
                slip.z = 1.0;

                
    Real3 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real3 uvw0 = AngSetupFSC(obs, efcs_slip, tri0, tri1, nu);
        Real3 uvw1 = AngSetupFSC(obs, efcs_slip, tri1, tri2, nu);
        Real3 uvw2 = AngSetupFSC(obs, efcs_slip, tri2, tri0, nu);
        Real3 fsc_term = add3(add3(uvw0, uvw1), uvw2);

        summed_terms = add3(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add3(summed_terms, full_out);
    }
    Real3 full_out = summed_terms;


                {
                    int idx = bs + (
                        (obs_idx * 3 + 0) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.x;
                }
                {
                    int idx = bs + (
                        (obs_idx * 3 + 1) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.y;
                }
                {
                    int idx = bs + (
                        (obs_idx * 3 + 2) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.z;
                }
            }
            {
                Real3 slip = make3(0.0, 0.0, 0.0);
                slip.x = 1.0;

                
    Real3 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real3 uvw0 = AngSetupFSC(obs, efcs_slip, tri0, tri1, nu);
        Real3 uvw1 = AngSetupFSC(obs, efcs_slip, tri1, tri2, nu);
        Real3 uvw2 = AngSetupFSC(obs, efcs_slip, tri2, tri0, nu);
        Real3 fsc_term = add3(add3(uvw0, uvw1), uvw2);

        summed_terms = add3(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add3(summed_terms, full_out);
    }
    Real3 full_out = summed_terms;


                {
                    int idx = bs + (
                        (obs_idx * 3 + 0) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.x;
                }
                {
                    int idx = bs + (
                        (obs_idx * 3 + 1) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.y;
                }
                {
                    int idx = bs + (
                        (obs_idx * 3 + 2) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.z;
                }
            }
        }
    }
}


KERNEL
void blocks_strain_fs(GLOBAL_MEM Real* results, 
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM int* obs_start, GLOBAL_MEM int* obs_end,
    GLOBAL_MEM int* src_start, GLOBAL_MEM int* src_end,
    GLOBAL_MEM int* block_start,
    Real nu)
{
    int block_idx = get_group_id(0);
    int team_id = get_local_id(0);
    int team_size = get_local_size(0);

    int os = obs_start[block_idx];
    int oe = obs_end[block_idx];

    int ss = src_start[block_idx];
    int se = src_end[block_idx];
    int n_src = se - ss;

    int bs = block_start[block_idx];

    for (int i = os + team_id; i < oe; i += team_size) {
        int obs_idx = i - os;

        Real3 obs;
            obs.x = obs_pts[i * 3 + 0];
            obs.y = obs_pts[i * 3 + 1];
            obs.z = obs_pts[i * 3 + 2];

        for (int j = ss; j < se; j++) {
            int src_idx = j - ss;

                Real3 tri0;
                    tri0.x = tris[j * 9 + 0 * 3 + 0];
                    tri0.y = tris[j * 9 + 0 * 3 + 1];
                    tri0.z = tris[j * 9 + 0 * 3 + 2];
                Real3 tri1;
                    tri1.x = tris[j * 9 + 1 * 3 + 0];
                    tri1.y = tris[j * 9 + 1 * 3 + 1];
                    tri1.z = tris[j * 9 + 1 * 3 + 2];
                Real3 tri2;
                    tri2.x = tris[j * 9 + 2 * 3 + 0];
                    tri2.y = tris[j * 9 + 2 * 3 + 1];
                    tri2.z = tris[j * 9 + 2 * 3 + 2];
            {
                Real3 slip = make3(0.0, 0.0, 0.0);
                slip.y = 1.0;

                
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


                {
                    int idx = bs + (
                        (obs_idx * 6 + 0) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.x;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 1) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.y;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 2) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.z;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 3) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.a;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 4) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.b;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 5) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.c;
                }
            }
            {
                Real3 slip = make3(0.0, 0.0, 0.0);
                slip.z = 1.0;

                
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


                {
                    int idx = bs + (
                        (obs_idx * 6 + 0) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.x;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 1) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.y;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 2) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.z;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 3) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.a;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 4) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.b;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 5) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.c;
                }
            }
            {
                Real3 slip = make3(0.0, 0.0, 0.0);
                slip.x = 1.0;

                
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


                {
                    int idx = bs + (
                        (obs_idx * 6 + 0) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.x;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 1) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.y;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 2) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.z;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 3) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.a;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 4) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.b;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 5) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.c;
                }
            }
        }
    }
}


KERNEL
void blocks_strain_hs(GLOBAL_MEM Real* results, 
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM int* obs_start, GLOBAL_MEM int* obs_end,
    GLOBAL_MEM int* src_start, GLOBAL_MEM int* src_end,
    GLOBAL_MEM int* block_start,
    Real nu)
{
    int block_idx = get_group_id(0);
    int team_id = get_local_id(0);
    int team_size = get_local_size(0);

    int os = obs_start[block_idx];
    int oe = obs_end[block_idx];

    int ss = src_start[block_idx];
    int se = src_end[block_idx];
    int n_src = se - ss;

    int bs = block_start[block_idx];

    for (int i = os + team_id; i < oe; i += team_size) {
        int obs_idx = i - os;

        Real3 obs;
            obs.x = obs_pts[i * 3 + 0];
            obs.y = obs_pts[i * 3 + 1];
            obs.z = obs_pts[i * 3 + 2];

        for (int j = ss; j < se; j++) {
            int src_idx = j - ss;

                Real3 tri0;
                    tri0.x = tris[j * 9 + 0 * 3 + 0];
                    tri0.y = tris[j * 9 + 0 * 3 + 1];
                    tri0.z = tris[j * 9 + 0 * 3 + 2];
                Real3 tri1;
                    tri1.x = tris[j * 9 + 1 * 3 + 0];
                    tri1.y = tris[j * 9 + 1 * 3 + 1];
                    tri1.z = tris[j * 9 + 1 * 3 + 2];
                Real3 tri2;
                    tri2.x = tris[j * 9 + 2 * 3 + 0];
                    tri2.y = tris[j * 9 + 2 * 3 + 1];
                    tri2.z = tris[j * 9 + 2 * 3 + 2];
            {
                Real3 slip = make3(0.0, 0.0, 0.0);
                slip.y = 1.0;

                
    Real6 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real6 uvw0 = AngSetupFSC_S(obs, efcs_slip, tri0, tri1, nu);
        Real6 uvw1 = AngSetupFSC_S(obs, efcs_slip, tri1, tri2, nu);
        Real6 uvw2 = AngSetupFSC_S(obs, efcs_slip, tri2, tri0, nu);
        Real6 fsc_term = add6(add6(uvw0, uvw1), uvw2);

        summed_terms = add6(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add6(summed_terms, full_out);
    }
    Real6 full_out = summed_terms;


                {
                    int idx = bs + (
                        (obs_idx * 6 + 0) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.x;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 1) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.y;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 2) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.z;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 3) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.a;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 4) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.b;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 5) * n_src + src_idx
                    ) * 3 + 0;
                    results[idx] = full_out.c;
                }
            }
            {
                Real3 slip = make3(0.0, 0.0, 0.0);
                slip.z = 1.0;

                
    Real6 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real6 uvw0 = AngSetupFSC_S(obs, efcs_slip, tri0, tri1, nu);
        Real6 uvw1 = AngSetupFSC_S(obs, efcs_slip, tri1, tri2, nu);
        Real6 uvw2 = AngSetupFSC_S(obs, efcs_slip, tri2, tri0, nu);
        Real6 fsc_term = add6(add6(uvw0, uvw1), uvw2);

        summed_terms = add6(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add6(summed_terms, full_out);
    }
    Real6 full_out = summed_terms;


                {
                    int idx = bs + (
                        (obs_idx * 6 + 0) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.x;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 1) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.y;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 2) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.z;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 3) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.a;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 4) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.b;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 5) * n_src + src_idx
                    ) * 3 + 1;
                    results[idx] = full_out.c;
                }
            }
            {
                Real3 slip = make3(0.0, 0.0, 0.0);
                slip.x = 1.0;

                
    Real6 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real6 uvw0 = AngSetupFSC_S(obs, efcs_slip, tri0, tri1, nu);
        Real6 uvw1 = AngSetupFSC_S(obs, efcs_slip, tri1, tri2, nu);
        Real6 uvw2 = AngSetupFSC_S(obs, efcs_slip, tri2, tri0, nu);
        Real6 fsc_term = add6(add6(uvw0, uvw1), uvw2);

        summed_terms = add6(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add6(summed_terms, full_out);
    }
    Real6 full_out = summed_terms;


                {
                    int idx = bs + (
                        (obs_idx * 6 + 0) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.x;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 1) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.y;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 2) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.z;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 3) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.a;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 4) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.b;
                }
                {
                    int idx = bs + (
                        (obs_idx * 6 + 5) * n_src + src_idx
                    ) * 3 + 2;
                    results[idx] = full_out.c;
                }
            }
        }
    }
}







#ifndef CUTDE_COMMON
#define CUTDE_COMMON



#define Real double
#define EPS 2.220446049250313e-16

#ifndef M_PI
  #define M_PI   3.14159265358979323846264338327950288
#endif

typedef struct Real2 {
    Real x;
    Real y;
} Real2;

typedef struct Real3 {
    Real x;
    Real y;
    Real z;
} Real3;

typedef struct Real6 {
    Real x;
    Real y;
    Real z;
    Real a;
    Real b;
    Real c;
} Real6;

WITHIN_KERNEL void print(Real x) {
    printf("%f \n", x);
}

    

WITHIN_KERNEL Real2 add2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x + b.x;
        
        out.y = a.y + b.y;
    return out;
}

    

WITHIN_KERNEL Real2 sub2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x - b.x;
        
        out.y = a.y - b.y;
    return out;
}

    

WITHIN_KERNEL Real2 mul2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x * b.x;
        
        out.y = a.y * b.y;
    return out;
}

    

WITHIN_KERNEL Real2 div2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x / b.x;
        
        out.y = a.y / b.y;
    return out;
}


    

WITHIN_KERNEL Real2 add_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x + b;
        
        out.y = a.y + b;
    return out;
}

    

WITHIN_KERNEL Real2 sub_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x - b;
        
        out.y = a.y - b;
    return out;
}

    

WITHIN_KERNEL Real2 mul_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x * b;
        
        out.y = a.y * b;
    return out;
}

    

WITHIN_KERNEL Real2 div_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x / b;
        
        out.y = a.y / b;
    return out;
}


    WITHIN_KERNEL void print2(Real2 x) {
        
        printf("%f %f  \n", x.x,x.y);
    }

    WITHIN_KERNEL Real sum2(Real2 x) {
        Real out = 0.0;
            out += x.x;
            out += x.y;
        return out;
    }

    WITHIN_KERNEL Real dot2(Real2 x, Real2 y) {
        return sum2(mul2(x, y));
    }

    WITHIN_KERNEL Real length2(Real2 x) {
        return sqrt(dot2(x, x));
    }

    WITHIN_KERNEL Real2 negate2(Real2 x) {
        return mul_scalar2(x, -1);
    }

    WITHIN_KERNEL Real2 normalize2(Real2 x) {
        return div_scalar2(x, length2(x));
    }

    WITHIN_KERNEL Real2 make2(
        Real x
            ,
        Real y
    ) {
        Real2 out;
            out.x = x;
            out.y = y;
        return out;
    }
    

WITHIN_KERNEL Real3 add3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x + b.x;
        
        out.y = a.y + b.y;
        
        out.z = a.z + b.z;
    return out;
}

    

WITHIN_KERNEL Real3 sub3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x - b.x;
        
        out.y = a.y - b.y;
        
        out.z = a.z - b.z;
    return out;
}

    

WITHIN_KERNEL Real3 mul3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x * b.x;
        
        out.y = a.y * b.y;
        
        out.z = a.z * b.z;
    return out;
}

    

WITHIN_KERNEL Real3 div3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x / b.x;
        
        out.y = a.y / b.y;
        
        out.z = a.z / b.z;
    return out;
}


    

WITHIN_KERNEL Real3 add_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x + b;
        
        out.y = a.y + b;
        
        out.z = a.z + b;
    return out;
}

    

WITHIN_KERNEL Real3 sub_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x - b;
        
        out.y = a.y - b;
        
        out.z = a.z - b;
    return out;
}

    

WITHIN_KERNEL Real3 mul_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x * b;
        
        out.y = a.y * b;
        
        out.z = a.z * b;
    return out;
}

    

WITHIN_KERNEL Real3 div_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x / b;
        
        out.y = a.y / b;
        
        out.z = a.z / b;
    return out;
}


    WITHIN_KERNEL void print3(Real3 x) {
        
        printf("%f %f %f  \n", x.x,x.y,x.z);
    }

    WITHIN_KERNEL Real sum3(Real3 x) {
        Real out = 0.0;
            out += x.x;
            out += x.y;
            out += x.z;
        return out;
    }

    WITHIN_KERNEL Real dot3(Real3 x, Real3 y) {
        return sum3(mul3(x, y));
    }

    WITHIN_KERNEL Real length3(Real3 x) {
        return sqrt(dot3(x, x));
    }

    WITHIN_KERNEL Real3 negate3(Real3 x) {
        return mul_scalar3(x, -1);
    }

    WITHIN_KERNEL Real3 normalize3(Real3 x) {
        return div_scalar3(x, length3(x));
    }

    WITHIN_KERNEL Real3 make3(
        Real x
            ,
        Real y
            ,
        Real z
    ) {
        Real3 out;
            out.x = x;
            out.y = y;
            out.z = z;
        return out;
    }
    

WITHIN_KERNEL Real6 add6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x + b.x;
        
        out.y = a.y + b.y;
        
        out.z = a.z + b.z;
        
        out.a = a.a + b.a;
        
        out.b = a.b + b.b;
        
        out.c = a.c + b.c;
    return out;
}

    

WITHIN_KERNEL Real6 sub6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x - b.x;
        
        out.y = a.y - b.y;
        
        out.z = a.z - b.z;
        
        out.a = a.a - b.a;
        
        out.b = a.b - b.b;
        
        out.c = a.c - b.c;
    return out;
}

    

WITHIN_KERNEL Real6 mul6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x * b.x;
        
        out.y = a.y * b.y;
        
        out.z = a.z * b.z;
        
        out.a = a.a * b.a;
        
        out.b = a.b * b.b;
        
        out.c = a.c * b.c;
    return out;
}

    

WITHIN_KERNEL Real6 div6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x / b.x;
        
        out.y = a.y / b.y;
        
        out.z = a.z / b.z;
        
        out.a = a.a / b.a;
        
        out.b = a.b / b.b;
        
        out.c = a.c / b.c;
    return out;
}


    

WITHIN_KERNEL Real6 add_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x + b;
        
        out.y = a.y + b;
        
        out.z = a.z + b;
        
        out.a = a.a + b;
        
        out.b = a.b + b;
        
        out.c = a.c + b;
    return out;
}

    

WITHIN_KERNEL Real6 sub_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x - b;
        
        out.y = a.y - b;
        
        out.z = a.z - b;
        
        out.a = a.a - b;
        
        out.b = a.b - b;
        
        out.c = a.c - b;
    return out;
}

    

WITHIN_KERNEL Real6 mul_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x * b;
        
        out.y = a.y * b;
        
        out.z = a.z * b;
        
        out.a = a.a * b;
        
        out.b = a.b * b;
        
        out.c = a.c * b;
    return out;
}

    

WITHIN_KERNEL Real6 div_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x / b;
        
        out.y = a.y / b;
        
        out.z = a.z / b;
        
        out.a = a.a / b;
        
        out.b = a.b / b;
        
        out.c = a.c / b;
    return out;
}


    WITHIN_KERNEL void print6(Real6 x) {
        
        printf("%f %f %f %f %f %f  \n", x.x,x.y,x.z,x.a,x.b,x.c);
    }

    WITHIN_KERNEL Real sum6(Real6 x) {
        Real out = 0.0;
            out += x.x;
            out += x.y;
            out += x.z;
            out += x.a;
            out += x.b;
            out += x.c;
        return out;
    }

    WITHIN_KERNEL Real dot6(Real6 x, Real6 y) {
        return sum6(mul6(x, y));
    }

    WITHIN_KERNEL Real length6(Real6 x) {
        return sqrt(dot6(x, x));
    }

    WITHIN_KERNEL Real6 negate6(Real6 x) {
        return mul_scalar6(x, -1);
    }

    WITHIN_KERNEL Real6 normalize6(Real6 x) {
        return div_scalar6(x, length6(x));
    }

    WITHIN_KERNEL Real6 make6(
        Real x
            ,
        Real y
            ,
        Real z
            ,
        Real a
            ,
        Real b
            ,
        Real c
    ) {
        Real6 out;
            out.x = x;
            out.y = y;
            out.z = z;
            out.a = a;
            out.b = b;
            out.c = c;
        return out;
    }


WITHIN_KERNEL Real2 transform2(Real2 a, Real2 b, Real2 v) {
    Real2 out;
    out.x = dot2(a,v);
    out.y = dot2(b,v);
    return out;
}

WITHIN_KERNEL Real2 inv_transform2(Real2 a, Real2 b, Real2 v) {
    Real2 out;
    out.x = a.x * v.x + b.x * v.y;
    out.y = a.y * v.x + b.y * v.y;
    return out;
}

WITHIN_KERNEL Real3 transform3(Real3 a, Real3 b, Real3 c, Real3 v) {
    Real3 out;
    out.x = dot3(a,v);
    out.y = dot3(b,v);
    out.z = dot3(c,v);
    return out;
}

WITHIN_KERNEL Real3 inv_transform3(Real3 a, Real3 b, Real3 c, Real3 v) {
    Real3 out;
    out.x = a.x * v.x + b.x * v.y + c.x * v.z;
    out.y = a.y * v.x + b.y * v.y + c.y * v.z;
    out.z = a.z * v.x + b.z * v.y + c.z * v.z;
    return out;
}

WITHIN_KERNEL Real3 cross3(Real3 x, Real3 y) {
    Real3 out;
    out.x = x.y * y.z - x.z * y.y;
    out.y = x.z * y.x - x.x * y.z;
    out.z = x.x * y.y - x.y * y.x;
    return out;
}

WITHIN_KERNEL Real6 tensor_transform3(Real3 a, Real3 b, Real3 c, Real6 tensor) {
    Real A[9];
    A[0] = a.x; A[1] = a.y; A[2] = a.z;
    A[3] = b.x; A[4] = b.y; A[5] = b.z;
    A[6] = c.x; A[7] = c.y; A[8] = c.z;
    Real6 out;
    out.x = A[0]*A[0]*tensor.x+2*A[0]*A[3]*tensor.a+2*A[0]*A[6]*tensor.b+
        2*A[3]*A[6]*tensor.c+ A[3]*A[3]*tensor.y+A[6]*A[6]*tensor.z;
    out.y = A[1]*A[1]*tensor.x+2*A[1]*A[4]*tensor.a+2*A[1]*A[7]*tensor.b+
        2*A[4]*A[7]*tensor.c+A[4]*A[4]*tensor.y+A[7]*A[7]*tensor.z;
    out.z = A[2]*A[2]*tensor.x+2*A[2]*A[5]*tensor.a+2*A[2]*A[8]*tensor.b+
        2*A[5]*A[8]*tensor.c+A[5]*A[5]*tensor.y+A[8]*A[8]*tensor.z;
    out.a = A[0]*A[1]*tensor.x+(A[0]*A[4]+A[1]*A[3])*tensor.a+(A[0]*A[7]+
        A[1]*A[6])*tensor.b+(A[7]*A[3]+A[6]*A[4])*tensor.c+A[4]*A[3]*tensor.y+
        A[6]*A[7]*tensor.z;
    out.b = A[0]*A[2]*tensor.x+(A[0]*A[5]+A[2]*A[3])*tensor.a+(A[0]*A[8]+
        A[2]*A[6])*tensor.b+(A[8]*A[3]+A[6]*A[5])*tensor.c+A[5]*A[3]*tensor.y+
        A[6]*A[8]*tensor.z;
    out.c = A[1]*A[2]*tensor.x+(A[2]*A[4]+A[1]*A[5])*tensor.a+(A[2]*A[7]+
        A[1]*A[8])*tensor.b+(A[7]*A[5]+A[8]*A[4])*tensor.c+A[4]*A[5]*tensor.y+
        A[7]*A[8]*tensor.z;
    return out;
}

WITHIN_KERNEL int trimodefinder(Real3 obs, Real3 tri0, Real3 tri1, Real3 tri2) {
    // trimodefinder calculates the normalized barycentric coordinates of
    // the points with respect to the TD vertices and specifies the appropriate
    // artefact-free configuration of the angular dislocations for the
    // calculations. The input matrices x, y and z share the same size and
    // correspond to the y, z and x coordinates in the TDCS, respectively. p1,
    // p2 and p3 are two-component matrices representing the y and z coordinates
    // of the TD vertices in the TDCS, respectively.
    // The components of the output (trimode) corresponding to each calculation
    // points, are 1 for the first configuration, -1 for the second
    // configuration and 0 for the calculation point that lie on the TD sides.

    Real a = ((tri1.z-tri2.z)*(obs.y-tri2.y)+(tri2.y-tri1.y)*(obs.z-tri2.z))/
        ((tri1.z-tri2.z)*(tri0.y-tri2.y)+(tri2.y-tri1.y)*(tri0.z-tri2.z));
    Real b = ((tri2.z-tri0.z)*(obs.y-tri2.y)+(tri0.y-tri2.y)*(obs.z-tri2.z))/
        ((tri1.z-tri2.z)*(tri0.y-tri2.y)+(tri2.y-tri1.y)*(tri0.z-tri2.z));
    Real c = 1-a-b;

    int result = 1;
    if ((a<=0 && b>c && c>a) ||
            (b<=0 && c>a && a>b) ||
            (c<=0 && a>b && b>c)) {
        result = -1;
    }

    if ((a==0 && b>=0 && c>=0) ||
            (a>=0 && b==0 && c>=0) ||
            (a>=0 && b>=0 && c==0)) {
        result = 0;
    }
    if (result == 0 && obs.x != 0) {
        result = 1;
    }

    return result;
}

WITHIN_KERNEL Real3 AngDisDisp(Real x, Real y, Real z, Real alpha, Real bx, Real by, Real bz, Real nu) {

    Real cosA = cos(alpha);
    Real sinA = sin(alpha);
    Real eta = y*cosA-z*sinA;
    Real zeta = y*sinA+z*cosA;
    Real r = sqrt(x * x + y * y + z * z);

    Real ux = bx/8/M_PI/(1-nu)*(x*y/r/(r-z)-x*eta/r/(r-zeta));
    Real vx = bx/8/M_PI/(1-nu)*(eta*sinA/(r-zeta)-y*eta/r/(r-zeta)+        y*y/r/(r-z)+(1-2*nu)*(cosA*log(r-zeta)-log(r-z)));
    Real wx = bx/8/M_PI/(1-nu)*(eta*cosA/(r-zeta)-y/r-eta*z/r/(r-zeta)-        (1-2*nu)*sinA*log(r-zeta));

    Real uy = by/8/M_PI/(1-nu)*(x*x*cosA/r/(r-zeta)-x*x/r/(r-z)-         (1-2*nu)*(cosA*log(r-zeta)-log(r-z)));
    Real vy = by*x/8/M_PI/(1-nu)*(y*cosA/r/(r-zeta)-sinA*cosA/(r-zeta)-y/r/(r-z));
    Real wy = by*x/8/M_PI/(1-nu)*(z*cosA/r/(r-zeta)-cosA*cosA/(r-zeta)+1/r);

    Real uz = bz*sinA/8/M_PI/(1-nu)*((1-2*nu)*log(r-zeta)-x*x/r/(r-zeta));
    Real vz = bz*x*sinA/8/M_PI/(1-nu)*(sinA/(r-zeta)-y/r/(r-zeta));
    Real wz = bz*x*sinA/8/M_PI/(1-nu)*(cosA/(r-zeta)-z/r/(r-zeta));
    return make3(ux+uy+uz, vx+vy+vz, wx+wy+wz);
}

WITHIN_KERNEL Real3 TDSetupD(Real3 obs, Real alpha, Real3 slip, Real nu, Real3 TriVertex, Real3 SideVec) {
    // TDSetupD transforms coordinates of the calculation points as well as
    // slip vector components from ADCS into TDCS. It then calculates the
    // displacements in ADCS and transforms them into TDCS.

    Real2 A1 = make2(SideVec.z, -SideVec.y);
    Real2 A2 = make2(SideVec.y, SideVec.z);
    Real2 r1 = transform2(A1, A2, make2(obs.y - TriVertex.y, obs.z - TriVertex.z));
    Real y1 = r1.x;
    Real z1 = r1.y;

    Real2 r2 = transform2(A1, A2, make2(slip.y, slip.z));
    Real by1 = r2.x;
    Real bz1 = r2.y;

    Real3 uvw = AngDisDisp(obs.x, y1, z1, -M_PI + alpha, slip.x, by1, bz1, nu);

    Real2 r3 = inv_transform2(A1, A2, make2(uvw.y, uvw.z));
    Real v = r3.x;
    Real w = r3.y;
    return make3(uvw.x, v, w);
}

WITHIN_KERNEL Real6 AngDisStrain(Real x, Real y, Real z, Real alpha, Real bx, Real by, Real bz, Real nu) {
    // AngDisStrain calculates the strains associated with an angular 
    // dislocation in an elastic full-space.

    Real cosA = cos(alpha);
    Real sinA = sin(alpha);
    Real eta = y*cosA-z*sinA;
    Real zeta = y*sinA+z*cosA;

    Real x2 = x*x;
    Real y2 = y*y;
    Real z2 = z*z;
    Real r2 = x2+y2+z2;
    Real r = sqrt(r2);
    Real r3 = r*r2;
    Real rz = r*(r-z);
    Real rmz = (r-z);
    Real r2z2 = r2*rmz*rmz;
    Real r3z = r3*rmz;

    Real W = zeta-r;
    Real W2 = W*W;
    Real Wr = W*r;
    Real W2r = W2*r;
    Real Wr3 = W*r3;
    Real W2r2 = W2*r2;

    Real C = (r*cosA-z)/Wr;
    Real S = (r*sinA-y)/Wr;

    // Partial derivatives of the Burgers' function
    Real rFi_rx = (eta/r/(r-zeta)-y/r/(r-z))/4/M_PI;
    Real rFi_ry = (x/r/(r-z)-cosA*x/r/(r-zeta))/4/M_PI;
    Real rFi_rz = (sinA*x/r/(r-zeta))/4/M_PI;

    Real6 out;
    out.x = bx*(rFi_rx)+
        bx/8/M_PI/(1-nu)*(eta/Wr+eta*x2/W2r2-eta*x2/Wr3+y/rz-
        x2*y/r2z2-x2*y/r3z)-
        by*x/8/M_PI/(1-nu)*(((2*nu+1)/Wr+x2/W2r2-x2/Wr3)*cosA+
        (2*nu+1)/rz-x2/r2z2-x2/r3z)+
        bz*x*sinA/8/M_PI/(1-nu)*((2*nu+1)/Wr+x2/W2r2-x2/Wr3);

    out.y = by*(rFi_ry)+
        bx/8/M_PI/(1-nu)*((1/Wr+S*S-y2/Wr3)*eta+(2*nu+1)*y/rz-y*y*y/r2z2-
        y*y*y/r3z-2*nu*cosA*S)-
        by*x/8/M_PI/(1-nu)*(1/rz-y2/r2z2-y2/r3z+
        (1/Wr+S*S-y2/Wr3)*cosA)+
        bz*x*sinA/8/M_PI/(1-nu)*(1/Wr+S*S-y2/Wr3);

    out.z = bz*(rFi_rz)+
        bx/8/M_PI/(1-nu)*(eta/W/r+eta*C*C-eta*z2/Wr3+y*z/r3+
        2*nu*sinA*C)-
        by*x/8/M_PI/(1-nu)*((1/Wr+C*C-z2/Wr3)*cosA+z/r3)+
        bz*x*sinA/8/M_PI/(1-nu)*(1/Wr+C*C-z2/Wr3);

    out.a = bx*(rFi_ry)/2+by*(rFi_rx)/2-
        bx/8/M_PI/(1-nu)*(x*y2/r2z2-nu*x/rz+x*y2/r3z-nu*x*cosA/Wr+
        eta*x*S/Wr+eta*x*y/Wr3)+
        by/8/M_PI/(1-nu)*(x2*y/r2z2-nu*y/rz+x2*y/r3z+nu*cosA*S+
        x2*y*cosA/Wr3+x2*cosA*S/Wr)-
        bz*sinA/8/M_PI/(1-nu)*(nu*S+x2*S/Wr+x2*y/Wr3);

    out.b = bx*(rFi_rz)/2+bz*(rFi_rx)/2-
        bx/8/M_PI/(1-nu)*(-x*y/r3+nu*x*sinA/Wr+eta*x*C/Wr+
        eta*x*z/Wr3)+
        by/8/M_PI/(1-nu)*(-x2/r3+nu/r+nu*cosA*C+x2*z*cosA/Wr3+
        x2*cosA*C/Wr)-
        bz*sinA/8/M_PI/(1-nu)*(nu*C+x2*C/Wr+x2*z/Wr3);

    out.c = by*(rFi_rz)/2+bz*(rFi_ry)/2+
        bx/8/M_PI/(1-nu)*(y2/r3-nu/r-nu*cosA*C+nu*sinA*S+eta*sinA*cosA/W2-
        eta*(y*cosA+z*sinA)/W2r+eta*y*z/W2r2-eta*y*z/Wr3)-
        by*x/8/M_PI/(1-nu)*(y/r3+sinA*cosA*cosA/W2-cosA*(y*cosA+z*sinA)/
        W2r+y*z*cosA/W2r2-y*z*cosA/Wr3)-
        bz*x*sinA/8/M_PI/(1-nu)*(y*z/Wr3-sinA*cosA/W2+(y*cosA+z*sinA)/
        W2r-y*z/W2r2);
    return out;
}


WITHIN_KERNEL Real6 TDSetupS(Real3 obs, Real alpha, Real3 slip, Real nu,
    Real3 TriVertex, Real3 SideVec) 
{
    // TDSetupS transforms coordinates of the calculation points as well as 
    // slip vector components from ADCS into TDCS. It then calculates the 
    // strains in ADCS and transforms them into TDCS.

    // Transformation matrix
    Real2 A1 = make2(SideVec.z, -SideVec.y);
    Real2 A2 = make2(SideVec.y, SideVec.z);

    // Transform coordinates of the calculation points from TDCS into ADCS
    Real2 r1 = transform2(A1, A2, make2(obs.y - TriVertex.y, obs.z - TriVertex.z));
    Real y1 = r1.x;
    Real z1 = r1.y;

    // Transform the in-plane slip vector components from TDCS into ADCS
    Real2 r2 = transform2(A1, A2, make2(slip.y, slip.z));
    Real by1 = r2.x;
    Real bz1 = r2.y;

    // Calculate strains associated with an angular dislocation in ADCS
    Real6 out_adcs = AngDisStrain(obs.x,y1,z1,-M_PI+alpha,slip.x,by1,bz1,nu);

    // Transform strains from ADCS into TDCS
    Real3 B0 = make3(1.0, 0.0, 0.0);
    Real3 B1 = make3(0.0, A1.x, A1.y);
    Real3 B2 = make3(0.0, A2.x, A2.y);
    return tensor_transform3(B0, B1, B2, out_adcs);
}

WITHIN_KERNEL Real3 AngDisDispFSC(Real y1, Real y2, Real y3, Real beta, 
                                  Real b1, Real b2, Real b3, Real nu, Real a) {

    Real sinB = sin(beta);
    Real cosB = cos(beta);
    Real cotB = cosB / sinB;
    Real y3b = y3+2*a;
    Real z1b = y1*cosB+y3b*sinB;
    Real z3b = -y1*sinB+y3b*cosB;
    Real r2b = y1*y1+y2*y2+y3b*y3b;
    Real rb = sqrt(r2b);

    Real Fib = 2*atan(-y2/(-(rb+y3b)*(1.0 / tan(beta/2))+y1)); // The Burgers' function

    Real v1cb1 = b1/4/M_PI/(1-nu)*(-2*(1-nu)*(1-2*nu)*Fib*(cotB*cotB)+(1-2*nu)*y2/
        (rb+y3b)*((1-2*nu-a/rb)*cotB-y1/(rb+y3b)*(nu+a/rb))+(1-2*nu)*
        y2*cosB*cotB/(rb+z3b)*(cosB+a/rb)+a*y2*(y3b-a)*cotB/(rb*rb*rb)+y2*
        (y3b-a)/(rb*(rb+y3b))*(-(1-2*nu)*cotB+y1/(rb+y3b)*(2*nu+a/rb)+
        a*y1/(rb*rb))+y2*(y3b-a)/(rb*(rb+z3b))*(cosB/(rb+z3b)*((rb*
        cosB+y3b)*((1-2*nu)*cosB-a/rb)*cotB+2*(1-nu)*(rb*sinB-y1)*cosB)-
        a*y3b*cosB*cotB/(rb*rb)));

    Real v2cb1 = b1/4/M_PI/(1-nu)*((1-2*nu)*((2*(1-nu)*(cotB*cotB)-nu)*log(rb+y3b)-(2*
        (1-nu)*(cotB*cotB)+1-2*nu)*cosB*log(rb+z3b))-(1-2*nu)/(rb+y3b)*(y1*
        cotB*(1-2*nu-a/rb)+nu*y3b-a+(y2*y2)/(rb+y3b)*(nu+a/rb))-(1-2*
        nu)*z1b*cotB/(rb+z3b)*(cosB+a/rb)-a*y1*(y3b-a)*cotB/(rb*rb*rb)+
        (y3b-a)/(rb+y3b)*(-2*nu+1/rb*((1-2*nu)*y1*cotB-a)+(y2*y2)/(rb*
        (rb+y3b))*(2*nu+a/rb)+a*(y2*y2)/(rb*rb*rb))+(y3b-a)/(rb+z3b)*((cosB*cosB)-
        1/rb*((1-2*nu)*z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/(rb*
        (rb+z3b))*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*(rb*cosB+y3b))));

    Real v3cb1 = b1/4/M_PI/(1-nu)*(2*(1-nu)*(((1-2*nu)*Fib*cotB)+(y2/(rb+y3b)*(2*
        nu+a/rb))-(y2*cosB/(rb+z3b)*(cosB+a/rb)))+y2*(y3b-a)/rb*(2*
        nu/(rb+y3b)+a/(rb*rb))+y2*(y3b-a)*cosB/(rb*(rb+z3b))*(1-2*nu-
        (rb*cosB+y3b)/(rb+z3b)*(cosB+a/rb)-a*y3b/(rb*rb)));

    Real v1cb2 = b2/4/M_PI/(1-nu)*((1-2*nu)*((2*(1-nu)*(cotB*cotB)+nu)*log(rb+y3b)-(2*
        (1-nu)*(cotB*cotB)+1)*cosB*log(rb+z3b))+(1-2*nu)/(rb+y3b)*(-(1-2*nu)*
        y1*cotB+nu*y3b-a+a*y1*cotB/rb+(y1*y1)/(rb+y3b)*(nu+a/rb))-(1-2*
        nu)*cotB/(rb+z3b)*(z1b*cosB-a*(rb*sinB-y1)/(rb*cosB))-a*y1*
        (y3b-a)*cotB/(rb*rb*rb)+(y3b-a)/(rb+y3b)*(2*nu+1/rb*((1-2*nu)*y1*
        cotB+a)-(y1*y1)/(rb*(rb+y3b))*(2*nu+a/rb)-a*(y1*y1)/(rb*rb*rb))+(y3b-a)*
        cotB/(rb+z3b)*(-cosB*sinB+a*y1*y3b/((rb*rb*rb)*cosB)+(rb*sinB-y1)/
        rb*(2*(1-nu)*cosB-(rb*cosB+y3b)/(rb+z3b)*(1+a/(rb*cosB)))));
                    
    Real v2cb2 = b2/4/M_PI/(1-nu)*(2*(1-nu)*(1-2*nu)*Fib*(cotB*cotB)+(1-2*nu)*y2/
        (rb+y3b)*(-(1-2*nu-a/rb)*cotB+y1/(rb+y3b)*(nu+a/rb))-(1-2*nu)*
        y2*cotB/(rb+z3b)*(1+a/(rb*cosB))-a*y2*(y3b-a)*cotB/(rb*rb*rb)+y2*
        (y3b-a)/(rb*(rb+y3b))*((1-2*nu)*cotB-2*nu*y1/(rb+y3b)-a*y1/rb*
        (1/rb+1/(rb+y3b)))+y2*(y3b-a)*cotB/(rb*(rb+z3b))*(-2*(1-nu)*
        cosB+(rb*cosB+y3b)/(rb+z3b)*(1+a/(rb*cosB))+a*y3b/((rb*rb)*cosB)));
                    
    Real v3cb2 = b2/4/M_PI/(1-nu)*(-2*(1-nu)*(1-2*nu)*cotB*(log(rb+y3b)-cosB*
        log(rb+z3b))-2*(1-nu)*y1/(rb+y3b)*(2*nu+a/rb)+2*(1-nu)*z1b/(rb+
        z3b)*(cosB+a/rb)+(y3b-a)/rb*((1-2*nu)*cotB-2*nu*y1/(rb+y3b)-a*
        y1/(rb*rb))-(y3b-a)/(rb+z3b)*(cosB*sinB+(rb*cosB+y3b)*cotB/rb*
        (2*(1-nu)*cosB-(rb*cosB+y3b)/(rb+z3b))+a/rb*(sinB-y3b*z1b/
        (rb*rb)-z1b*(rb*cosB+y3b)/(rb*(rb+z3b)))));

    Real v1cb3 = b3/4/M_PI/(1-nu)*((1-2*nu)*(y2/(rb+y3b)*(1+a/rb)-y2*cosB/(rb+
        z3b)*(cosB+a/rb))-y2*(y3b-a)/rb*(a/(rb*rb)+1/(rb+y3b))+y2*
        (y3b-a)*cosB/(rb*(rb+z3b))*((rb*cosB+y3b)/(rb+z3b)*(cosB+a/
        rb)+a*y3b/(rb*rb)));
                    
    Real v2cb3 = b3/4/M_PI/(1-nu)*((1-2*nu)*(-sinB*log(rb+z3b)-y1/(rb+y3b)*(1+a/
        rb)+z1b/(rb+z3b)*(cosB+a/rb))+y1*(y3b-a)/rb*(a/(rb*rb)+1/(rb+
        y3b))-(y3b-a)/(rb+z3b)*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/
        (rb*rb))-1/(rb*(rb+z3b))*((y2*y2)*cosB*sinB-a*z1b/rb*(rb*cosB+y3b))));
                    
    Real v3cb3 = b3/4/M_PI/(1-nu)*(2*(1-nu)*Fib+2*(1-nu)*(y2*sinB/(rb+z3b)*(cosB+
        a/rb))+y2*(y3b-a)*sinB/(rb*(rb+z3b))*(1+(rb*cosB+y3b)/(rb+
        z3b)*(cosB+a/rb)+a*y3b/(rb*rb)));

    return make3(
        v1cb1+v1cb2+v1cb3,
        v2cb1+v2cb2+v2cb3,
        v3cb1+v3cb2+v3cb3
    );
}

WITHIN_KERNEL Real6 AngDisStrainFSC(Real y1, Real y2, Real y3, Real beta,
                                    Real b1, Real b2, Real b3, Real nu, Real a) {

    Real sinB = sin(beta);
    Real cosB = cos(beta);
    Real cotB = cosB / sinB;
    Real y3b = y3+2*a;
    Real z1b = y1*cosB+y3b*sinB;
    Real z3b = -y1*sinB+y3b*cosB;
    Real rb2 = y1*y1+y2*y2+y3b*y3b;
    Real rb = sqrt(rb2);

    Real W1 = rb*cosB+y3b;
    Real W2 = cosB+a/rb;
    Real W3 = cosB+y3b/rb;
    Real W4 = nu+a/rb;
    Real W5 = 2*nu+a/rb;
    Real W6 = rb+y3b;
    Real W7 = rb+z3b;
    Real W8 = y3+a;
    Real W9 = 1+a/rb/cosB;

    Real N1 = 1-2*nu;

    Real rFib_ry2 = z1b/rb/(rb+z3b)-y1/rb/(rb+y3b);
    Real rFib_ry1 = y2/rb/(rb+y3b)-cosB*y2/rb/(rb+z3b);
    Real rFib_ry3 = -sinB*y2/rb/(rb+z3b);

    Real6 out;
    out.x = b1*(1.0/4.0*((-2+2*nu)*N1*rFib_ry1*(cotB*cotB)-N1*y2/(W6*W6)*((1-W5)*cotB- y1/W6*W4)/rb*y1+N1*y2/W6*(a/(rb*rb*rb)*y1*cotB-1/W6*W4+(y1*y1)/ (W6*W6)*W4/rb+(y1*y1)/W6*a/(rb*rb*rb))-N1*y2*cosB*cotB/(W7*W7)*W2*(y1/ rb-sinB)-N1*y2*cosB*cotB/W7*a/(rb*rb*rb)*y1-3*a*y2*W8*cotB/(rb*rb2*rb2)* y1-y2*W8/(rb*rb*rb)/W6*(-N1*cotB+y1/W6*W5+a*y1/rb2)*y1-y2*W8/ rb2/(W6*W6)*(-N1*cotB+y1/W6*W5+a*y1/rb2)*y1+y2*W8/rb/W6* (1/W6*W5-(y1*y1)/(W6*W6)*W5/rb-(y1*y1)/W6*a/(rb*rb*rb)+a/rb2-2*a*(y1*y1) /(rb2*rb2))-y2*W8/(rb*rb*rb)/W7*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+ (2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/rb2)*y1-y2*W8/rb/ (W7*W7)*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)* cosB)-a*y3b*cosB*cotB/rb2)*(y1/rb-sinB)+y2*W8/rb/W7*(-cosB/ (W7*W7)*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)*(y1/ rb-sinB)+cosB/W7*(1/rb*cosB*y1*(N1*cosB-a/rb)*cotB+W1*a/(rb*rb2) *y1*cotB+(2-2*nu)*(1/rb*sinB*y1-1)*cosB)+2*a*y3b*cosB*cotB/ (rb2*rb2)*y1))/M_PI/(1-nu))+ b2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)+nu)/rb*y1/W6-((2-2*nu)*(cotB*cotB)+1)* cosB*(y1/rb-sinB)/W7)-N1/(W6*W6)*(-N1*y1*cotB+nu*y3b-a+a*y1* cotB/rb+(y1*y1)/W6*W4)/rb*y1+N1/W6*(-N1*cotB+a*cotB/rb-a* (y1*y1)*cotB/(rb*rb*rb)+2*y1/W6*W4-(y1*y1*y1)/(W6*W6)*W4/rb-(y1*y1*y1)/W6*a/ (rb*rb*rb))+N1*cotB/(W7*W7)*(z1b*cosB-a*(rb*sinB-y1)/rb/cosB)*(y1/ rb-sinB)-N1*cotB/W7*((cosB*cosB)-a*(1/rb*sinB*y1-1)/rb/cosB+a* (rb*sinB-y1)/(rb*rb*rb)/cosB*y1)-a*W8*cotB/(rb*rb*rb)+3*a*(y1*y1)*W8* cotB/(rb*rb2*rb2)-W8/(W6*W6)*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6* W5-a*(y1*y1)/(rb*rb*rb))/rb*y1+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB+a)*y1+ 1/rb*N1*cotB-2*y1/rb/W6*W5+(y1*y1*y1)/(rb*rb*rb)/W6*W5+(y1*y1*y1)/rb2/ (W6*W6)*W5+(y1*y1*y1)/(rb2*rb2)/W6*a-2*a/(rb*rb*rb)*y1+3*a*(y1*y1*y1)/(rb*rb2*rb2))-W8* cotB/(W7*W7)*(-cosB*sinB+a*y1*y3b/(rb*rb*rb)/cosB+(rb*sinB-y1)/rb* ((2-2*nu)*cosB-W1/W7*W9))*(y1/rb-sinB)+W8*cotB/W7*(a*y3b/ (rb*rb*rb)/cosB-3*a*(y1*y1)*y3b/(rb*rb2*rb2)/cosB+(1/rb*sinB*y1-1)/rb* ((2-2*nu)*cosB-W1/W7*W9)-(rb*sinB-y1)/(rb*rb*rb)*((2-2*nu)*cosB-W1/ W7*W9)*y1+(rb*sinB-y1)/rb*(-1/rb*cosB*y1/W7*W9+W1/(W7*W7)* W9*(y1/rb-sinB)+W1/W7*a/(rb*rb*rb)/cosB*y1)))/M_PI/(1-nu))+ b3*(1.0/4.0*(N1*(-y2/(W6*W6)*(1+a/rb)/rb*y1-y2/W6*a/(rb*rb*rb)*y1+y2* cosB/(W7*W7)*W2*(y1/rb-sinB)+y2*cosB/W7*a/(rb*rb*rb)*y1)+y2*W8/ (rb*rb*rb)*(a/rb2+1/W6)*y1-y2*W8/rb*(-2*a/(rb2*rb2)*y1-1/(W6*W6)/ rb*y1)-y2*W8*cosB/(rb*rb*rb)/W7*(W1/W7*W2+a*y3b/rb2)*y1-y2*W8* cosB/rb/(W7*W7)*(W1/W7*W2+a*y3b/rb2)*(y1/rb-sinB)+y2*W8* cosB/rb/W7*(1/rb*cosB*y1/W7*W2-W1/(W7*W7)*W2*(y1/rb-sinB)- W1/W7*a/(rb*rb*rb)*y1-2*a*y3b/(rb2*rb2)*y1))/M_PI/(1-nu));

    out.y = b1*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)-nu)/rb*y2/W6-((2-2*nu)*(cotB*cotB)+1- 2*nu)*cosB/rb*y2/W7)+N1/(W6*W6)*(y1*cotB*(1-W5)+nu*y3b-a+(y2*y2) /W6*W4)/rb*y2-N1/W6*(a*y1*cotB/(rb*rb*rb)*y2+2*y2/W6*W4-(y2*y2*y2) /(W6*W6)*W4/rb-(y2*y2*y2)/W6*a/(rb*rb*rb))+N1*z1b*cotB/(W7*W7)*W2/rb* y2+N1*z1b*cotB/W7*a/(rb*rb*rb)*y2+3*a*y2*W8*cotB/(rb*rb2*rb2)*y1-W8/ (W6*W6)*(-2*nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+a*(y2*y2)/ (rb*rb*rb))/rb*y2+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB-a)*y2+2*y2/rb/ W6*W5-(y2*y2*y2)/(rb*rb*rb)/W6*W5-(y2*y2*y2)/rb2/(W6*W6)*W5-(y2*y2*y2)/(rb2*rb2)/W6* a+2*a/(rb*rb*rb)*y2-3*a*(y2*y2*y2)/(rb*rb2*rb2))-W8/(W7*W7)*((cosB*cosB)-1/rb*(N1* z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/rb/W7*((y2*y2)*(cosB*cosB)- a*z1b*cotB/rb*W1))/rb*y2+W8/W7*(1/(rb*rb*rb)*(N1*z1b*cotB+a* cosB)*y2-3*a*y3b*z1b*cotB/(rb*rb2*rb2)*y2+1/(rb*rb*rb)/W7*((y2*y2)*(cosB*cosB)- a*z1b*cotB/rb*W1)*y2+1/rb2/(W7*W7)*((y2*y2)*(cosB*cosB)-a*z1b*cotB/ rb*W1)*y2-1/rb/W7*(2*y2*(cosB*cosB)+a*z1b*cotB/(rb*rb*rb)*W1*y2-a* z1b*cotB/rb2*cosB*y2)))/M_PI/(1-nu))+ b2*(1.0/4.0*((2-2*nu)*N1*rFib_ry2*(cotB*cotB)+N1/W6*((W5-1)*cotB+y1/W6* W4)-N1*(y2*y2)/(W6*W6)*((W5-1)*cotB+y1/W6*W4)/rb+N1*y2/W6*(-a/ (rb*rb*rb)*y2*cotB-y1/(W6*W6)*W4/rb*y2-y2/W6*a/(rb*rb*rb)*y1)-N1*cotB/ W7*W9+N1*(y2*y2)*cotB/(W7*W7)*W9/rb+N1*(y2*y2)*cotB/W7*a/(rb*rb*rb)/ cosB-a*W8*cotB/(rb*rb*rb)+3*a*(y2*y2)*W8*cotB/(rb*rb2*rb2)+W8/rb/W6*(N1* cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))-(y2*y2)*W8/(rb*rb*rb)/W6* (N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))-(y2*y2)*W8/rb2/(W6*W6) *(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))+y2*W8/rb/W6* (2*nu*y1/(W6*W6)/rb*y2+a*y1/(rb*rb*rb)*(1/rb+1/W6)*y2-a*y1/rb* (-1/(rb*rb*rb)*y2-1/(W6*W6)/rb*y2))+W8*cotB/rb/W7*((-2+2*nu)*cosB+ W1/W7*W9+a*y3b/rb2/cosB)-(y2*y2)*W8*cotB/(rb*rb*rb)/W7*((-2+2*nu)* cosB+W1/W7*W9+a*y3b/rb2/cosB)-(y2*y2)*W8*cotB/rb2/(W7*W7)*((-2+ 2*nu)*cosB+W1/W7*W9+a*y3b/rb2/cosB)+y2*W8*cotB/rb/W7*(1/ rb*cosB*y2/W7*W9-W1/(W7*W7)*W9/rb*y2-W1/W7*a/(rb*rb*rb)/cosB*y2- 2*a*y3b/(rb2*rb2)/cosB*y2))/M_PI/(1-nu))+ b3*(1.0/4.0*(N1*(-sinB/rb*y2/W7+y2/(W6*W6)*(1+a/rb)/rb*y1+y2/W6* a/(rb*rb*rb)*y1-z1b/(W7*W7)*W2/rb*y2-z1b/W7*a/(rb*rb*rb)*y2)-y2*W8/ (rb*rb*rb)*(a/rb2+1/W6)*y1+y1*W8/rb*(-2*a/(rb2*rb2)*y2-1/(W6*W6)/ rb*y2)+W8/(W7*W7)*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/rb2)-1/ rb/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1))/rb*y2-W8/W7*(sinB*a/ (rb*rb*rb)*y2-z1b/(rb*rb*rb)*(1+a*y3b/rb2)*y2-2*z1b/(rb*rb2*rb2)*a*y3b*y2+ 1/(rb*rb*rb)/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1)*y2+1/rb2/(W7*W7)* ((y2*y2)*cosB*sinB-a*z1b/rb*W1)*y2-1/rb/W7*(2*y2*cosB*sinB+a* z1b/(rb*rb*rb)*W1*y2-a*z1b/rb2*cosB*y2)))/M_PI/(1-nu));

    out.z = b1*(1.0/4.0*((2-2*nu)*(N1*rFib_ry3*cotB-y2/(W6*W6)*W5*(y3b/rb+1)- 1.0/2.0*y2/W6*a/(rb*rb*rb)*2*y3b+y2*cosB/(W7*W7)*W2*W3+1.0/2.0*y2*cosB/W7* a/(rb*rb*rb)*2*y3b)+y2/rb*(2*nu/W6+a/rb2)-1.0/2.0*y2*W8/(rb*rb*rb)*(2* nu/W6+a/rb2)*2*y3b+y2*W8/rb*(-2*nu/(W6*W6)*(y3b/rb+1)-a/ (rb2*rb2)*2*y3b)+y2*cosB/rb/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)- 1.0/2.0*y2*W8*cosB/(rb*rb*rb)/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)*2* y3b-y2*W8*cosB/rb/(W7*W7)*(1-2*nu-W1/W7*W2-a*y3b/rb2)*W3+y2* W8*cosB/rb/W7*(-(cosB*y3b/rb+1)/W7*W2+W1/(W7*W7)*W2*W3+1.0/2.0* W1/W7*a/(rb*rb*rb)*2*y3b-a/rb2+a*y3b/(rb2*rb2)*2*y3b))/M_PI/(1-nu))+ b2*(1.0/4.0*((-2+2*nu)*N1*cotB*((y3b/rb+1)/W6-cosB*W3/W7)+(2-2*nu)* y1/(W6*W6)*W5*(y3b/rb+1)+1.0/2.0*(2-2*nu)*y1/W6*a/(rb*rb*rb)*2*y3b+(2- 2*nu)*sinB/W7*W2-(2-2*nu)*z1b/(W7*W7)*W2*W3-1.0/2.0*(2-2*nu)*z1b/ W7*a/(rb*rb*rb)*2*y3b+1/rb*(N1*cotB-2*nu*y1/W6-a*y1/rb2)-1.0/2.0* W8/(rb*rb*rb)*(N1*cotB-2*nu*y1/W6-a*y1/rb2)*2*y3b+W8/rb*(2*nu* y1/(W6*W6)*(y3b/rb+1)+a*y1/(rb2*rb2)*2*y3b)-1/W7*(cosB*sinB+W1* cotB/rb*((2-2*nu)*cosB-W1/W7)+a/rb*(sinB-y3b*z1b/rb2-z1b* W1/rb/W7))+W8/(W7*W7)*(cosB*sinB+W1*cotB/rb*((2-2*nu)*cosB-W1/ W7)+a/rb*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7))*W3-W8/W7*((cosB* y3b/rb+1)*cotB/rb*((2-2*nu)*cosB-W1/W7)-1.0/2.0*W1*cotB/(rb*rb*rb)* ((2-2*nu)*cosB-W1/W7)*2*y3b+W1*cotB/rb*(-(cosB*y3b/rb+1)/W7+ W1/(W7*W7)*W3)-1.0/2.0*a/(rb*rb*rb)*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7)* 2*y3b+a/rb*(-z1b/rb2-y3b*sinB/rb2+y3b*z1b/(rb2*rb2)*2*y3b- sinB*W1/rb/W7-z1b*(cosB*y3b/rb+1)/rb/W7+1.0/2.0*z1b*W1/(rb*rb*rb)/ W7*2*y3b+z1b*W1/rb/(W7*W7)*W3)))/M_PI/(1-nu))+ b3*(1.0/4.0*((2-2*nu)*rFib_ry3-(2-2*nu)*y2*sinB/(W7*W7)*W2*W3-1.0/2.0* (2-2*nu)*y2*sinB/W7*a/(rb*rb*rb)*2*y3b+y2*sinB/rb/W7*(1+W1/W7* W2+a*y3b/rb2)-1.0/2.0*y2*W8*sinB/(rb*rb*rb)/W7*(1+W1/W7*W2+a*y3b/ rb2)*2*y3b-y2*W8*sinB/rb/(W7*W7)*(1+W1/W7*W2+a*y3b/rb2)*W3+ y2*W8*sinB/rb/W7*((cosB*y3b/rb+1)/W7*W2-W1/(W7*W7)*W2*W3- 1.0/2.0*W1/W7*a/(rb*rb*rb)*2*y3b+a/rb2-a*y3b/(rb2*rb2)*2*y3b))/M_PI/(1-nu));

    out.a = b1/2*(1.0/4.0*((-2+2*nu)*N1*rFib_ry2*(cotB*cotB)+N1/W6*((1-W5)*cotB-y1/ W6*W4)-N1*(y2*y2)/(W6*W6)*((1-W5)*cotB-y1/W6*W4)/rb+N1*y2/W6* (a/(rb*rb*rb)*y2*cotB+y1/(W6*W6)*W4/rb*y2+y2/W6*a/(rb*rb*rb)*y1)+N1* cosB*cotB/W7*W2-N1*(y2*y2)*cosB*cotB/(W7*W7)*W2/rb-N1*(y2*y2)*cosB* cotB/W7*a/(rb*rb*rb)+a*W8*cotB/(rb*rb*rb)-3*a*(y2*y2)*W8*cotB/(rb*rb2*rb2)+W8/ rb/W6*(-N1*cotB+y1/W6*W5+a*y1/rb2)-(y2*y2)*W8/(rb*rb*rb)/W6*(-N1* cotB+y1/W6*W5+a*y1/rb2)-(y2*y2)*W8/rb2/(W6*W6)*(-N1*cotB+y1/ W6*W5+a*y1/rb2)+y2*W8/rb/W6*(-y1/(W6*W6)*W5/rb*y2-y2/W6* a/(rb*rb*rb)*y1-2*a*y1/(rb2*rb2)*y2)+W8/rb/W7*(cosB/W7*(W1*(N1* cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/ rb2)-(y2*y2)*W8/(rb*rb*rb)/W7*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2- 2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/rb2)-(y2*y2)*W8/rb2/ (W7*W7)*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)* cosB)-a*y3b*cosB*cotB/rb2)+y2*W8/rb/W7*(-cosB/(W7*W7)*(W1* (N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)/rb*y2+cosB/ W7*(1/rb*cosB*y2*(N1*cosB-a/rb)*cotB+W1*a/(rb*rb*rb)*y2*cotB+(2-2* nu)/rb*sinB*y2*cosB)+2*a*y3b*cosB*cotB/(rb2*rb2)*y2))/M_PI/(1-nu))+ b2/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)+nu)/rb*y2/W6-((2-2*nu)*(cotB*cotB)+1)* cosB/rb*y2/W7)-N1/(W6*W6)*(-N1*y1*cotB+nu*y3b-a+a*y1*cotB/rb+ (y1*y1)/W6*W4)/rb*y2+N1/W6*(-a*y1*cotB/(rb*rb*rb)*y2-(y1*y1)/(W6*W6) *W4/rb*y2-(y1*y1)/W6*a/(rb*rb*rb)*y2)+N1*cotB/(W7*W7)*(z1b*cosB-a* (rb*sinB-y1)/rb/cosB)/rb*y2-N1*cotB/W7*(-a/rb2*sinB*y2/ cosB+a*(rb*sinB-y1)/(rb*rb*rb)/cosB*y2)+3*a*y2*W8*cotB/(rb*rb2*rb2)*y1- W8/(W6*W6)*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6*W5-a*(y1*y1)/ (rb*rb*rb))/rb*y2+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB+a)*y2+(y1*y1)/(rb*rb2) /W6*W5*y2+(y1*y1)/rb2/(W6*W6)*W5*y2+(y1*y1)/(rb2*rb2)/W6*a*y2+3* a*(y1*y1)/(rb*rb2*rb2)*y2)-W8*cotB/(W7*W7)*(-cosB*sinB+a*y1*y3b/(rb*rb*rb)/ cosB+(rb*sinB-y1)/rb*((2-2*nu)*cosB-W1/W7*W9))/rb*y2+W8*cotB/ W7*(-3*a*y1*y3b/(rb*rb2*rb2)/cosB*y2+1/rb2*sinB*y2*((2-2*nu)*cosB- W1/W7*W9)-(rb*sinB-y1)/(rb*rb*rb)*((2-2*nu)*cosB-W1/W7*W9)*y2+(rb* sinB-y1)/rb*(-1/rb*cosB*y2/W7*W9+W1/(W7*W7)*W9/rb*y2+W1/W7* a/(rb*rb*rb)/cosB*y2)))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(1/W6*(1+a/rb)-(y2*y2)/(W6*W6)*(1+a/rb)/rb-(y2*y2)/ W6*a/(rb*rb*rb)-cosB/W7*W2+(y2*y2)*cosB/(W7*W7)*W2/rb+(y2*y2)*cosB/W7* a/(rb*rb*rb))-W8/rb*(a/rb2+1/W6)+(y2*y2)*W8/(rb*rb*rb)*(a/rb2+1/W6)- y2*W8/rb*(-2*a/(rb2*rb2)*y2-1/(W6*W6)/rb*y2)+W8*cosB/rb/W7* (W1/W7*W2+a*y3b/rb2)-(y2*y2)*W8*cosB/(rb*rb*rb)/W7*(W1/W7*W2+a* y3b/rb2)-(y2*y2)*W8*cosB/rb2/(W7*W7)*(W1/W7*W2+a*y3b/rb2)+y2* W8*cosB/rb/W7*(1/rb*cosB*y2/W7*W2-W1/(W7*W7)*W2/rb*y2-W1/ W7*a/(rb*rb*rb)*y2-2*a*y3b/(rb2*rb2)*y2))/M_PI/(1-nu))+ b1/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)-nu)/rb*y1/W6-((2-2*nu)*(cotB*cotB)+1- 2*nu)*cosB*(y1/rb-sinB)/W7)+N1/(W6*W6)*(y1*cotB*(1-W5)+nu*y3b- a+(y2*y2)/W6*W4)/rb*y1-N1/W6*((1-W5)*cotB+a*(y1*y1)*cotB/(rb*rb*rb)- (y2*y2)/(W6*W6)*W4/rb*y1-(y2*y2)/W6*a/(rb*rb*rb)*y1)-N1*cosB*cotB/W7* W2+N1*z1b*cotB/(W7*W7)*W2*(y1/rb-sinB)+N1*z1b*cotB/W7*a/(rb*rb2) *y1-a*W8*cotB/(rb*rb*rb)+3*a*(y1*y1)*W8*cotB/(rb*rb2*rb2)-W8/(W6*W6)*(-2* nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+a*(y2*y2)/(rb*rb*rb))/rb* y1+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB-a)*y1+1/rb*N1*cotB-(y2*y2)/ (rb*rb*rb)/W6*W5*y1-(y2*y2)/rb2/(W6*W6)*W5*y1-(y2*y2)/(rb2*rb2)/W6*a*y1- 3*a*(y2*y2)/(rb*rb2*rb2)*y1)-W8/(W7*W7)*((cosB*cosB)-1/rb*(N1*z1b*cotB+a* cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/rb/W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/ rb*W1))*(y1/rb-sinB)+W8/W7*(1/(rb*rb*rb)*(N1*z1b*cotB+a*cosB)* y1-1/rb*N1*cosB*cotB+a*y3b*cosB*cotB/(rb*rb*rb)-3*a*y3b*z1b*cotB/ (rb*rb2*rb2)*y1+1/(rb*rb*rb)/W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1)*y1+1/ rb/(W7*W7)*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1)*(y1/rb-sinB)-1/rb/ W7*(-a*cosB*cotB/rb*W1+a*z1b*cotB/(rb*rb*rb)*W1*y1-a*z1b*cotB/ rb2*cosB*y1)))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((2-2*nu)*N1*rFib_ry1*(cotB*cotB)-N1*y2/(W6*W6)*((W5-1)*cotB+ y1/W6*W4)/rb*y1+N1*y2/W6*(-a/(rb*rb*rb)*y1*cotB+1/W6*W4-(y1*y1) /(W6*W6)*W4/rb-(y1*y1)/W6*a/(rb*rb*rb))+N1*y2*cotB/(W7*W7)*W9*(y1/ rb-sinB)+N1*y2*cotB/W7*a/(rb*rb*rb)/cosB*y1+3*a*y2*W8*cotB/(rb*rb2*rb2) *y1-y2*W8/(rb*rb*rb)/W6*(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/ W6))*y1-y2*W8/rb2/(W6*W6)*(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/ rb+1/W6))*y1+y2*W8/rb/W6*(-2*nu/W6+2*nu*(y1*y1)/(W6*W6)/rb-a/ rb*(1/rb+1/W6)+a*(y1*y1)/(rb*rb*rb)*(1/rb+1/W6)-a*y1/rb*(-1/ (rb*rb*rb)*y1-1/(W6*W6)/rb*y1))-y2*W8*cotB/(rb*rb*rb)/W7*((-2+2*nu)* cosB+W1/W7*W9+a*y3b/rb2/cosB)*y1-y2*W8*cotB/rb/(W7*W7)*((-2+ 2*nu)*cosB+W1/W7*W9+a*y3b/rb2/cosB)*(y1/rb-sinB)+y2*W8* cotB/rb/W7*(1/rb*cosB*y1/W7*W9-W1/(W7*W7)*W9*(y1/rb-sinB)- W1/W7*a/(rb*rb*rb)/cosB*y1-2*a*y3b/(rb2*rb2)/cosB*y1))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(-sinB*(y1/rb-sinB)/W7-1/W6*(1+a/rb)+(y1*y1)/(W6*W6) *(1+a/rb)/rb+(y1*y1)/W6*a/(rb*rb*rb)+cosB/W7*W2-z1b/(W7*W7)*W2* (y1/rb-sinB)-z1b/W7*a/(rb*rb*rb)*y1)+W8/rb*(a/rb2+1/W6)-(y1*y1)* W8/(rb*rb*rb)*(a/rb2+1/W6)+y1*W8/rb*(-2*a/(rb2*rb2)*y1-1/(W6*W6)/ rb*y1)+W8/(W7*W7)*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/rb2)-1/ rb/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1))*(y1/rb-sinB)-W8/W7* (sinB*a/(rb*rb*rb)*y1+cosB/rb*(1+a*y3b/rb2)-z1b/(rb*rb*rb)*(1+a*y3b/ rb2)*y1-2*z1b/(rb*rb2*rb2)*a*y3b*y1+1/(rb*rb*rb)/W7*((y2*y2)*cosB*sinB-a* z1b/rb*W1)*y1+1/rb/(W7*W7)*((y2*y2)*cosB*sinB-a*z1b/rb*W1)* (y1/rb-sinB)-1/rb/W7*(-a*cosB/rb*W1+a*z1b/(rb*rb*rb)*W1*y1-a* z1b/rb2*cosB*y1)))/M_PI/(1-nu));

    out.b = b1/2*(1.0/4.0*((-2+2*nu)*N1*rFib_ry3*(cotB*cotB)-N1*y2/(W6*W6)*((1-W5)* cotB-y1/W6*W4)*(y3b/rb+1)+N1*y2/W6*(1.0/2.0*a/(rb*rb*rb)*2*y3b*cotB+ y1/(W6*W6)*W4*(y3b/rb+1)+1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b)-N1*y2*cosB* cotB/(W7*W7)*W2*W3-1.0/2.0*N1*y2*cosB*cotB/W7*a/(rb*rb*rb)*2*y3b+a/ (rb*rb*rb)*y2*cotB-3.0/2.0*a*y2*W8*cotB/(rb*rb2*rb2)*2*y3b+y2/rb/W6*(-N1* cotB+y1/W6*W5+a*y1/rb2)-1.0/2.0*y2*W8/(rb*rb*rb)/W6*(-N1*cotB+y1/ W6*W5+a*y1/rb2)*2*y3b-y2*W8/rb/(W6*W6)*(-N1*cotB+y1/W6*W5+ a*y1/rb2)*(y3b/rb+1)+y2*W8/rb/W6*(-y1/(W6*W6)*W5*(y3b/rb+ 1)-1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b-a*y1/(rb2*rb2)*2*y3b)+y2/rb/W7* (cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)- a*y3b*cosB*cotB/rb2)-1.0/2.0*y2*W8/(rb*rb*rb)/W7*(cosB/W7*(W1*(N1* cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/ rb2)*2*y3b-y2*W8/rb/(W7*W7)*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+ (2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/rb2)*W3+y2*W8/rb/ W7*(-cosB/(W7*W7)*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)* cosB)*W3+cosB/W7*((cosB*y3b/rb+1)*(N1*cosB-a/rb)*cotB+1.0/2.0*W1* a/(rb*rb*rb)*2*y3b*cotB+1.0/2.0*(2-2*nu)/rb*sinB*2*y3b*cosB)-a*cosB* cotB/rb2+a*y3b*cosB*cotB/(rb2*rb2)*2*y3b))/M_PI/(1-nu))+ b2/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)+nu)*(y3b/rb+1)/W6-((2-2*nu)*(cotB*cotB) +1)*cosB*W3/W7)-N1/(W6*W6)*(-N1*y1*cotB+nu*y3b-a+a*y1*cotB/ rb+(y1*y1)/W6*W4)*(y3b/rb+1)+N1/W6*(nu-1.0/2.0*a*y1*cotB/(rb*rb*rb)*2* y3b-(y1*y1)/(W6*W6)*W4*(y3b/rb+1)-1.0/2.0*(y1*y1)/W6*a/(rb*rb*rb)*2*y3b)+ N1*cotB/(W7*W7)*(z1b*cosB-a*(rb*sinB-y1)/rb/cosB)*W3-N1*cotB/ W7*(cosB*sinB-1.0/2.0*a/rb2*sinB*2*y3b/cosB+1.0/2.0*a*(rb*sinB-y1)/ (rb*rb*rb)/cosB*2*y3b)-a/(rb*rb*rb)*y1*cotB+3.0/2.0*a*y1*W8*cotB/(rb*rb2*rb2)*2* y3b+1/W6*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6*W5-a*(y1*y1)/ (rb*rb*rb))-W8/(W6*W6)*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6*W5-a* (y1*y1)/(rb*rb*rb))*(y3b/rb+1)+W8/W6*(-1.0/2.0/(rb*rb*rb)*(N1*y1*cotB+a)*2* y3b+1.0/2.0*(y1*y1)/(rb*rb*rb)/W6*W5*2*y3b+(y1*y1)/rb/(W6*W6)*W5*(y3b/rb+ 1)+1.0/2.0*(y1*y1)/(rb2*rb2)/W6*a*2*y3b+3.0/2.0*a*(y1*y1)/(rb*rb2*rb2)*2*y3b)+ cotB/W7*(-cosB*sinB+a*y1*y3b/(rb*rb*rb)/cosB+(rb*sinB-y1)/rb*((2- 2*nu)*cosB-W1/W7*W9))-W8*cotB/(W7*W7)*(-cosB*sinB+a*y1*y3b/(rb*rb2) /cosB+(rb*sinB-y1)/rb*((2-2*nu)*cosB-W1/W7*W9))*W3+W8*cotB/ W7*(a/(rb*rb*rb)/cosB*y1-3.0/2.0*a*y1*y3b/(rb*rb2*rb2)/cosB*2*y3b+1.0/2.0/ rb2*sinB*2*y3b*((2-2*nu)*cosB-W1/W7*W9)-1.0/2.0*(rb*sinB-y1)/(rb*rb2) *((2-2*nu)*cosB-W1/W7*W9)*2*y3b+(rb*sinB-y1)/rb*(-(cosB*y3b/ rb+1)/W7*W9+W1/(W7*W7)*W9*W3+1.0/2.0*W1/W7*a/(rb*rb*rb)/cosB*2* y3b)))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(-y2/(W6*W6)*(1+a/rb)*(y3b/rb+1)-1.0/2.0*y2/W6*a/ (rb*rb*rb)*2*y3b+y2*cosB/(W7*W7)*W2*W3+1.0/2.0*y2*cosB/W7*a/(rb*rb*rb)*2* y3b)-y2/rb*(a/rb2+1/W6)+1.0/2.0*y2*W8/(rb*rb*rb)*(a/rb2+1/W6)*2* y3b-y2*W8/rb*(-a/(rb2*rb2)*2*y3b-1/(W6*W6)*(y3b/rb+1))+y2*cosB/ rb/W7*(W1/W7*W2+a*y3b/rb2)-1.0/2.0*y2*W8*cosB/(rb*rb*rb)/W7*(W1/ W7*W2+a*y3b/rb2)*2*y3b-y2*W8*cosB/rb/(W7*W7)*(W1/W7*W2+a* y3b/rb2)*W3+y2*W8*cosB/rb/W7*((cosB*y3b/rb+1)/W7*W2-W1/ (W7*W7)*W2*W3-1.0/2.0*W1/W7*a/(rb*rb*rb)*2*y3b+a/rb2-a*y3b/(rb2*rb2)*2* y3b))/M_PI/(1-nu))+ b1/2.0*(1.0/4.0*((2-2*nu)*(N1*rFib_ry1*cotB-y1/(W6*W6)*W5/rb*y2-y2/W6* a/(rb*rb*rb)*y1+y2*cosB/(W7*W7)*W2*(y1/rb-sinB)+y2*cosB/W7*a/(rb*rb2) *y1)-y2*W8/(rb*rb*rb)*(2*nu/W6+a/rb2)*y1+y2*W8/rb*(-2*nu/(W6*W6) /rb*y1-2*a/(rb2*rb2)*y1)-y2*W8*cosB/(rb*rb*rb)/W7*(1-2*nu-W1/W7* W2-a*y3b/rb2)*y1-y2*W8*cosB/rb/(W7*W7)*(1-2*nu-W1/W7*W2-a* y3b/rb2)*(y1/rb-sinB)+y2*W8*cosB/rb/W7*(-1/rb*cosB*y1/W7* W2+W1/(W7*W7)*W2*(y1/rb-sinB)+W1/W7*a/(rb*rb*rb)*y1+2*a*y3b/(rb2*rb2) *y1))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((-2+2*nu)*N1*cotB*(1/rb*y1/W6-cosB*(y1/rb-sinB)/W7)- (2-2*nu)/W6*W5+(2-2*nu)*(y1*y1)/(W6*W6)*W5/rb+(2-2*nu)*(y1*y1)/W6* a/(rb*rb*rb)+(2-2*nu)*cosB/W7*W2-(2-2*nu)*z1b/(W7*W7)*W2*(y1/rb- sinB)-(2-2*nu)*z1b/W7*a/(rb*rb*rb)*y1-W8/(rb*rb*rb)*(N1*cotB-2*nu*y1/ W6-a*y1/rb2)*y1+W8/rb*(-2*nu/W6+2*nu*(y1*y1)/(W6*W6)/rb-a/rb2+ 2*a*(y1*y1)/(rb2*rb2))+W8/(W7*W7)*(cosB*sinB+W1*cotB/rb*((2-2*nu)* cosB-W1/W7)+a/rb*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7))*(y1/rb- sinB)-W8/W7*(1/rb2*cosB*y1*cotB*((2-2*nu)*cosB-W1/W7)-W1* cotB/(rb*rb*rb)*((2-2*nu)*cosB-W1/W7)*y1+W1*cotB/rb*(-1/rb*cosB* y1/W7+W1/(W7*W7)*(y1/rb-sinB))-a/(rb*rb*rb)*(sinB-y3b*z1b/rb2- z1b*W1/rb/W7)*y1+a/rb*(-y3b*cosB/rb2+2*y3b*z1b/(rb2*rb2)*y1- cosB*W1/rb/W7-z1b/rb2*cosB*y1/W7+z1b*W1/(rb*rb*rb)/W7*y1+z1b* W1/rb/(W7*W7)*(y1/rb-sinB))))/M_PI/(1-nu))+ b3/2*(1.0/4.0*((2-2*nu)*rFib_ry1-(2-2*nu)*y2*sinB/(W7*W7)*W2*(y1/rb- sinB)-(2-2*nu)*y2*sinB/W7*a/(rb*rb*rb)*y1-y2*W8*sinB/(rb*rb*rb)/W7*(1+ W1/W7*W2+a*y3b/rb2)*y1-y2*W8*sinB/rb/(W7*W7)*(1+W1/W7*W2+ a*y3b/rb2)*(y1/rb-sinB)+y2*W8*sinB/rb/W7*(1/rb*cosB*y1/ W7*W2-W1/(W7*W7)*W2*(y1/rb-sinB)-W1/W7*a/(rb*rb*rb)*y1-2*a*y3b/ (rb2*rb2)*y1))/M_PI/(1-nu));

    out.c = b1/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)-nu)*(y3b/rb+1)/W6-((2-2*nu)* (cotB*cotB)+1-2*nu)*cosB*W3/W7)+N1/(W6*W6)*(y1*cotB*(1-W5)+nu*y3b-a+ (y2*y2)/W6*W4)*(y3b/rb+1)-N1/W6*(1.0/2.0*a*y1*cotB/(rb*rb*rb)*2*y3b+ nu-(y2*y2)/(W6*W6)*W4*(y3b/rb+1)-1.0/2.0*(y2*y2)/W6*a/(rb*rb*rb)*2*y3b)-N1* sinB*cotB/W7*W2+N1*z1b*cotB/(W7*W7)*W2*W3+1.0/2.0*N1*z1b*cotB/W7* a/(rb*rb*rb)*2*y3b-a/(rb*rb*rb)*y1*cotB+3.0/2.0*a*y1*W8*cotB/(rb*rb2*rb2)*2*y3b+ 1/W6*(-2*nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+a*(y2*y2)/ (rb*rb*rb))-W8/(W6*W6)*(-2*nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+ a*(y2*y2)/(rb*rb*rb))*(y3b/rb+1)+W8/W6*(-1.0/2.0/(rb*rb*rb)*(N1*y1*cotB-a)* 2*y3b-1.0/2.0*(y2*y2)/(rb*rb*rb)/W6*W5*2*y3b-(y2*y2)/rb/(W6*W6)*W5*(y3b/ rb+1)-1.0/2.0*(y2*y2)/(rb2*rb2)/W6*a*2*y3b-3.0/2.0*a*(y2*y2)/(rb*rb2*rb2)*2*y3b)+ 1/W7*((cosB*cosB)-1/rb*(N1*z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb2) -1/rb/W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1))-W8/(W7*W7)*((cosB*cosB)- 1/rb*(N1*z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/rb/W7* ((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1))*W3+W8/W7*(1.0/2.0/(rb*rb*rb)*(N1* z1b*cotB+a*cosB)*2*y3b-1/rb*N1*sinB*cotB+a*z1b*cotB/(rb*rb*rb)+a* y3b*sinB*cotB/(rb*rb*rb)-3.0/2.0*a*y3b*z1b*cotB/(rb*rb2*rb2)*2*y3b+1.0/2.0/(rb*rb2) /W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1)*2*y3b+1/rb/(W7*W7)*((y2*y2) *(cosB*cosB)-a*z1b*cotB/rb*W1)*W3-1/rb/W7*(-a*sinB*cotB/rb*W1+ 1.0/2.0*a*z1b*cotB/(rb*rb*rb)*W1*2*y3b-a*z1b*cotB/rb*(cosB*y3b/rb+ 1))))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((2-2*nu)*N1*rFib_ry3*(cotB*cotB)-N1*y2/(W6*W6)*((W5-1)*cotB+ y1/W6*W4)*(y3b/rb+1)+N1*y2/W6*(-1.0/2.0*a/(rb*rb*rb)*2*y3b*cotB-y1/ (W6*W6)*W4*(y3b/rb+1)-1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b)+N1*y2*cotB/ (W7*W7)*W9*W3+1.0/2.0*N1*y2*cotB/W7*a/(rb*rb*rb)/cosB*2*y3b-a/(rb*rb*rb)* y2*cotB+3.0/2.0*a*y2*W8*cotB/(rb*rb2*rb2)*2*y3b+y2/rb/W6*(N1*cotB-2* nu*y1/W6-a*y1/rb*(1/rb+1/W6))-1.0/2.0*y2*W8/(rb*rb*rb)/W6*(N1* cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))*2*y3b-y2*W8/rb/(W6*W6) *(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))*(y3b/rb+1)+y2* W8/rb/W6*(2*nu*y1/(W6*W6)*(y3b/rb+1)+1.0/2.0*a*y1/(rb*rb*rb)*(1/rb+ 1/W6)*2*y3b-a*y1/rb*(-1.0/2.0/(rb*rb*rb)*2*y3b-1/(W6*W6)*(y3b/rb+ 1)))+y2*cotB/rb/W7*((-2+2*nu)*cosB+W1/W7*W9+a*y3b/rb2/cosB)- 1.0/2.0*y2*W8*cotB/(rb*rb*rb)/W7*((-2+2*nu)*cosB+W1/W7*W9+a*y3b/ rb2/cosB)*2*y3b-y2*W8*cotB/rb/(W7*W7)*((-2+2*nu)*cosB+W1/W7* W9+a*y3b/rb2/cosB)*W3+y2*W8*cotB/rb/W7*((cosB*y3b/rb+1)/ W7*W9-W1/(W7*W7)*W9*W3-1.0/2.0*W1/W7*a/(rb*rb*rb)/cosB*2*y3b+a/rb2/ cosB-a*y3b/(rb2*rb2)/cosB*2*y3b))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(-sinB*W3/W7+y1/(W6*W6)*(1+a/rb)*(y3b/rb+1)+ 1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b+sinB/W7*W2-z1b/(W7*W7)*W2*W3-1.0/2.0* z1b/W7*a/(rb*rb*rb)*2*y3b)+y1/rb*(a/rb2+1/W6)-1.0/2.0*y1*W8/(rb*rb2) *(a/rb2+1/W6)*2*y3b+y1*W8/rb*(-a/(rb2*rb2)*2*y3b-1/(W6*W6)* (y3b/rb+1))-1/W7*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/rb2)-1/ rb/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1))+W8/(W7*W7)*(sinB*(cosB- a/rb)+z1b/rb*(1+a*y3b/rb2)-1/rb/W7*((y2*y2)*cosB*sinB-a*z1b/ rb*W1))*W3-W8/W7*(1.0/2.0*sinB*a/(rb*rb*rb)*2*y3b+sinB/rb*(1+a*y3b/ rb2)-1.0/2.0*z1b/(rb*rb*rb)*(1+a*y3b/rb2)*2*y3b+z1b/rb*(a/rb2-a* y3b/(rb2*rb2)*2*y3b)+1.0/2.0/(rb*rb*rb)/W7*((y2*y2)*cosB*sinB-a*z1b/rb* W1)*2*y3b+1/rb/(W7*W7)*((y2*y2)*cosB*sinB-a*z1b/rb*W1)*W3-1/ rb/W7*(-a*sinB/rb*W1+1.0/2.0*a*z1b/(rb*rb*rb)*W1*2*y3b-a*z1b/rb* (cosB*y3b/rb+1))))/M_PI/(1-nu))+ b1/2.0*(1.0/4.0*((2-2*nu)*(N1*rFib_ry2*cotB+1/W6*W5-(y2*y2)/(W6*W6)*W5/ rb-(y2*y2)/W6*a/(rb*rb*rb)-cosB/W7*W2+(y2*y2)*cosB/(W7*W7)*W2/rb+(y2*y2)* cosB/W7*a/(rb*rb*rb))+W8/rb*(2*nu/W6+a/rb2)-(y2*y2)*W8/(rb*rb*rb)*(2* nu/W6+a/rb2)+y2*W8/rb*(-2*nu/(W6*W6)/rb*y2-2*a/(rb2*rb2)*y2)+ W8*cosB/rb/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)-(y2*y2)*W8*cosB/ (rb*rb*rb)/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)-(y2*y2)*W8*cosB/rb2/(W7*W7) *(1-2*nu-W1/W7*W2-a*y3b/rb2)+y2*W8*cosB/rb/W7*(-1/rb* cosB*y2/W7*W2+W1/(W7*W7)*W2/rb*y2+W1/W7*a/(rb*rb*rb)*y2+2*a* y3b/(rb2*rb2)*y2))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((-2+2*nu)*N1*cotB*(1/rb*y2/W6-cosB/rb*y2/W7)+(2- 2*nu)*y1/(W6*W6)*W5/rb*y2+(2-2*nu)*y1/W6*a/(rb*rb*rb)*y2-(2-2* nu)*z1b/(W7*W7)*W2/rb*y2-(2-2*nu)*z1b/W7*a/(rb*rb*rb)*y2-W8/(rb*rb2) *(N1*cotB-2*nu*y1/W6-a*y1/rb2)*y2+W8/rb*(2*nu*y1/(W6*W6)/ rb*y2+2*a*y1/(rb2*rb2)*y2)+W8/(W7*W7)*(cosB*sinB+W1*cotB/rb*((2- 2*nu)*cosB-W1/W7)+a/rb*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7))/ rb*y2-W8/W7*(1/rb2*cosB*y2*cotB*((2-2*nu)*cosB-W1/W7)-W1* cotB/(rb*rb*rb)*((2-2*nu)*cosB-W1/W7)*y2+W1*cotB/rb*(-cosB/rb* y2/W7+W1/(W7*W7)/rb*y2)-a/(rb*rb*rb)*(sinB-y3b*z1b/rb2-z1b*W1/ rb/W7)*y2+a/rb*(2*y3b*z1b/(rb2*rb2)*y2-z1b/rb2*cosB*y2/W7+ z1b*W1/(rb*rb*rb)/W7*y2+z1b*W1/rb2/(W7*W7)*y2)))/M_PI/(1-nu))+ b3/2*(1.0/4.0*((2-2*nu)*rFib_ry2+(2-2*nu)*sinB/W7*W2-(2-2*nu)*(y2*y2)* sinB/(W7*W7)*W2/rb-(2-2*nu)*(y2*y2)*sinB/W7*a/(rb*rb*rb)+W8*sinB/rb/ W7*(1+W1/W7*W2+a*y3b/rb2)-(y2*y2)*W8*sinB/(rb*rb*rb)/W7*(1+W1/ W7*W2+a*y3b/rb2)-(y2*y2)*W8*sinB/rb2/(W7*W7)*(1+W1/W7*W2+a* y3b/rb2)+y2*W8*sinB/rb/W7*(1/rb*cosB*y2/W7*W2-W1/(W7*W7)* W2/rb*y2-W1/W7*a/(rb*rb*rb)*y2-2*a*y3b/(rb2*rb2)*y2))/M_PI/(1-nu));

    return out;
}

WITHIN_KERNEL Real3 AngSetupFSC(Real3 obs, Real3 slip, Real3 PA, Real3 PB, Real nu) {
    Real3 SideVec = sub3(PB, PA);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real beta = acos(-dot3(normalize3(SideVec), eZ));
    if (fabs(beta) < EPS || fabs(M_PI-beta) < EPS) {
        return make3(0.0f, 0.0f, 0.0f);
    }
    Real3 ey1 = SideVec;
    ey1.z = 0;
    ey1 = normalize3(ey1);

    Real3 ey3 = negate3(eZ);
    Real3 ey2 = cross3(ey3, ey1);

    Real3 yA = transform3(ey1, ey2, ey3, sub3(obs, PA));
    Real3 yAB = transform3(ey1, ey2, ey3, SideVec);
    Real3 yB = sub3(yA, yAB);

    Real3 slip_adcs = transform3(ey1, ey2, ey3, slip);

    Real configuration = beta;
    if (beta*yA.x >= 0) {
        configuration = -M_PI+beta;
    }

    Real3 vA = AngDisDispFSC(
        yA.x, yA.y, yA.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PA.z
    );
    Real3 vB = AngDisDispFSC(
        yB.x, yB.y, yB.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PB.z
    );

    Real3 v = sub3(vB, vA);
    return inv_transform3(ey1, ey2, ey3, v);
}

WITHIN_KERNEL Real6 AngSetupFSC_S(Real3 obs, Real3 slip, Real3 PA, Real3 PB, Real nu) {
    Real3 SideVec = sub3(PB, PA);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real beta = acos(-dot3(normalize3(SideVec), eZ));
    if (fabs(beta) < EPS || fabs(M_PI-beta) < EPS) {
        return make6(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
    }
    Real3 ey1 = SideVec;
    ey1.z = 0;
    ey1 = normalize3(ey1);

    Real3 ey3 = negate3(eZ);
    Real3 ey2 = cross3(ey3, ey1);

    Real3 yA = transform3(ey1, ey2, ey3, sub3(obs, PA));
    Real3 yAB = transform3(ey1, ey2, ey3, SideVec);
    Real3 yB = sub3(yA, yAB);

    Real3 slip_adcs = transform3(ey1, ey2, ey3, slip);

    Real configuration = beta;
    if (beta*yA.x >= 0) {
        configuration = -M_PI+beta;
    }

    Real6 vA = AngDisStrainFSC(
        yA.x, yA.y, yA.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PA.z
    );
    Real6 vB = AngDisStrainFSC(
        yB.x, yB.y, yB.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PB.z
    );
    Real6 v = sub6(vB, vA);
    return tensor_transform3(ey1, ey2, ey3, v);
}

#endif





KERNEL
void matrix_disp_fs(GLOBAL_MEM Real* results, 
    int n_obs, int n_src,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    Real nu)
{
    int i = get_global_id(0);
    int j = get_global_id(1);

    if (i >= n_obs) {
        return;
    }

    if (j >= n_src) {
        return;
    }

    Real3 obs;
        Real3 tri0;
        obs.x = obs_pts[i * 3 + 0];
            tri0.x = tris[j * 9 + 0 * 3 + 0];
            tri0.y = tris[j * 9 + 0 * 3 + 1];
            tri0.z = tris[j * 9 + 0 * 3 + 2];
        Real3 tri1;
        obs.y = obs_pts[i * 3 + 1];
            tri1.x = tris[j * 9 + 1 * 3 + 0];
            tri1.y = tris[j * 9 + 1 * 3 + 1];
            tri1.z = tris[j * 9 + 1 * 3 + 2];
        Real3 tri2;
        obs.z = obs_pts[i * 3 + 2];
            tri2.x = tris[j * 9 + 2 * 3 + 0];
            tri2.y = tris[j * 9 + 2 * 3 + 1];
            tri2.z = tris[j * 9 + 2 * 3 + 2];

    {
        Real3 slip = make3(0.0, 0.0, 0.0);
        slip.y = 1.0;

        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        {
            int idx = ((i * 3 + 0) * n_src + j) * 3 + 0;
            results[idx] = full_out.x;
        }
        {
            int idx = ((i * 3 + 1) * n_src + j) * 3 + 0;
            results[idx] = full_out.y;
        }
        {
            int idx = ((i * 3 + 2) * n_src + j) * 3 + 0;
            results[idx] = full_out.z;
        }
    }
    {
        Real3 slip = make3(0.0, 0.0, 0.0);
        slip.z = 1.0;

        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        {
            int idx = ((i * 3 + 0) * n_src + j) * 3 + 1;
            results[idx] = full_out.x;
        }
        {
            int idx = ((i * 3 + 1) * n_src + j) * 3 + 1;
            results[idx] = full_out.y;
        }
        {
            int idx = ((i * 3 + 2) * n_src + j) * 3 + 1;
            results[idx] = full_out.z;
        }
    }
    {
        Real3 slip = make3(0.0, 0.0, 0.0);
        slip.x = 1.0;

        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        {
            int idx = ((i * 3 + 0) * n_src + j) * 3 + 2;
            results[idx] = full_out.x;
        }
        {
            int idx = ((i * 3 + 1) * n_src + j) * 3 + 2;
            results[idx] = full_out.y;
        }
        {
            int idx = ((i * 3 + 2) * n_src + j) * 3 + 2;
            results[idx] = full_out.z;
        }
    }
}


KERNEL
void matrix_disp_hs(GLOBAL_MEM Real* results, 
    int n_obs, int n_src,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    Real nu)
{
    int i = get_global_id(0);
    int j = get_global_id(1);

    if (i >= n_obs) {
        return;
    }

    if (j >= n_src) {
        return;
    }

    Real3 obs;
        Real3 tri0;
        obs.x = obs_pts[i * 3 + 0];
            tri0.x = tris[j * 9 + 0 * 3 + 0];
            tri0.y = tris[j * 9 + 0 * 3 + 1];
            tri0.z = tris[j * 9 + 0 * 3 + 2];
        Real3 tri1;
        obs.y = obs_pts[i * 3 + 1];
            tri1.x = tris[j * 9 + 1 * 3 + 0];
            tri1.y = tris[j * 9 + 1 * 3 + 1];
            tri1.z = tris[j * 9 + 1 * 3 + 2];
        Real3 tri2;
        obs.z = obs_pts[i * 3 + 2];
            tri2.x = tris[j * 9 + 2 * 3 + 0];
            tri2.y = tris[j * 9 + 2 * 3 + 1];
            tri2.z = tris[j * 9 + 2 * 3 + 2];

    {
        Real3 slip = make3(0.0, 0.0, 0.0);
        slip.y = 1.0;

        
    Real3 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real3 uvw0 = AngSetupFSC(obs, efcs_slip, tri0, tri1, nu);
        Real3 uvw1 = AngSetupFSC(obs, efcs_slip, tri1, tri2, nu);
        Real3 uvw2 = AngSetupFSC(obs, efcs_slip, tri2, tri0, nu);
        Real3 fsc_term = add3(add3(uvw0, uvw1), uvw2);

        summed_terms = add3(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add3(summed_terms, full_out);
    }
    Real3 full_out = summed_terms;


        {
            int idx = ((i * 3 + 0) * n_src + j) * 3 + 0;
            results[idx] = full_out.x;
        }
        {
            int idx = ((i * 3 + 1) * n_src + j) * 3 + 0;
            results[idx] = full_out.y;
        }
        {
            int idx = ((i * 3 + 2) * n_src + j) * 3 + 0;
            results[idx] = full_out.z;
        }
    }
    {
        Real3 slip = make3(0.0, 0.0, 0.0);
        slip.z = 1.0;

        
    Real3 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real3 uvw0 = AngSetupFSC(obs, efcs_slip, tri0, tri1, nu);
        Real3 uvw1 = AngSetupFSC(obs, efcs_slip, tri1, tri2, nu);
        Real3 uvw2 = AngSetupFSC(obs, efcs_slip, tri2, tri0, nu);
        Real3 fsc_term = add3(add3(uvw0, uvw1), uvw2);

        summed_terms = add3(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add3(summed_terms, full_out);
    }
    Real3 full_out = summed_terms;


        {
            int idx = ((i * 3 + 0) * n_src + j) * 3 + 1;
            results[idx] = full_out.x;
        }
        {
            int idx = ((i * 3 + 1) * n_src + j) * 3 + 1;
            results[idx] = full_out.y;
        }
        {
            int idx = ((i * 3 + 2) * n_src + j) * 3 + 1;
            results[idx] = full_out.z;
        }
    }
    {
        Real3 slip = make3(0.0, 0.0, 0.0);
        slip.x = 1.0;

        
    Real3 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real3 uvw0 = AngSetupFSC(obs, efcs_slip, tri0, tri1, nu);
        Real3 uvw1 = AngSetupFSC(obs, efcs_slip, tri1, tri2, nu);
        Real3 uvw2 = AngSetupFSC(obs, efcs_slip, tri2, tri0, nu);
        Real3 fsc_term = add3(add3(uvw0, uvw1), uvw2);

        summed_terms = add3(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add3(summed_terms, full_out);
    }
    Real3 full_out = summed_terms;


        {
            int idx = ((i * 3 + 0) * n_src + j) * 3 + 2;
            results[idx] = full_out.x;
        }
        {
            int idx = ((i * 3 + 1) * n_src + j) * 3 + 2;
            results[idx] = full_out.y;
        }
        {
            int idx = ((i * 3 + 2) * n_src + j) * 3 + 2;
            results[idx] = full_out.z;
        }
    }
}


KERNEL
void matrix_strain_fs(GLOBAL_MEM Real* results, 
    int n_obs, int n_src,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    Real nu)
{
    int i = get_global_id(0);
    int j = get_global_id(1);

    if (i >= n_obs) {
        return;
    }

    if (j >= n_src) {
        return;
    }

    Real3 obs;
        Real3 tri0;
        obs.x = obs_pts[i * 3 + 0];
            tri0.x = tris[j * 9 + 0 * 3 + 0];
            tri0.y = tris[j * 9 + 0 * 3 + 1];
            tri0.z = tris[j * 9 + 0 * 3 + 2];
        Real3 tri1;
        obs.y = obs_pts[i * 3 + 1];
            tri1.x = tris[j * 9 + 1 * 3 + 0];
            tri1.y = tris[j * 9 + 1 * 3 + 1];
            tri1.z = tris[j * 9 + 1 * 3 + 2];
        Real3 tri2;
        obs.z = obs_pts[i * 3 + 2];
            tri2.x = tris[j * 9 + 2 * 3 + 0];
            tri2.y = tris[j * 9 + 2 * 3 + 1];
            tri2.z = tris[j * 9 + 2 * 3 + 2];

    {
        Real3 slip = make3(0.0, 0.0, 0.0);
        slip.y = 1.0;

        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        {
            int idx = ((i * 6 + 0) * n_src + j) * 3 + 0;
            results[idx] = full_out.x;
        }
        {
            int idx = ((i * 6 + 1) * n_src + j) * 3 + 0;
            results[idx] = full_out.y;
        }
        {
            int idx = ((i * 6 + 2) * n_src + j) * 3 + 0;
            results[idx] = full_out.z;
        }
        {
            int idx = ((i * 6 + 3) * n_src + j) * 3 + 0;
            results[idx] = full_out.a;
        }
        {
            int idx = ((i * 6 + 4) * n_src + j) * 3 + 0;
            results[idx] = full_out.b;
        }
        {
            int idx = ((i * 6 + 5) * n_src + j) * 3 + 0;
            results[idx] = full_out.c;
        }
    }
    {
        Real3 slip = make3(0.0, 0.0, 0.0);
        slip.z = 1.0;

        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        {
            int idx = ((i * 6 + 0) * n_src + j) * 3 + 1;
            results[idx] = full_out.x;
        }
        {
            int idx = ((i * 6 + 1) * n_src + j) * 3 + 1;
            results[idx] = full_out.y;
        }
        {
            int idx = ((i * 6 + 2) * n_src + j) * 3 + 1;
            results[idx] = full_out.z;
        }
        {
            int idx = ((i * 6 + 3) * n_src + j) * 3 + 1;
            results[idx] = full_out.a;
        }
        {
            int idx = ((i * 6 + 4) * n_src + j) * 3 + 1;
            results[idx] = full_out.b;
        }
        {
            int idx = ((i * 6 + 5) * n_src + j) * 3 + 1;
            results[idx] = full_out.c;
        }
    }
    {
        Real3 slip = make3(0.0, 0.0, 0.0);
        slip.x = 1.0;

        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        {
            int idx = ((i * 6 + 0) * n_src + j) * 3 + 2;
            results[idx] = full_out.x;
        }
        {
            int idx = ((i * 6 + 1) * n_src + j) * 3 + 2;
            results[idx] = full_out.y;
        }
        {
            int idx = ((i * 6 + 2) * n_src + j) * 3 + 2;
            results[idx] = full_out.z;
        }
        {
            int idx = ((i * 6 + 3) * n_src + j) * 3 + 2;
            results[idx] = full_out.a;
        }
        {
            int idx = ((i * 6 + 4) * n_src + j) * 3 + 2;
            results[idx] = full_out.b;
        }
        {
            int idx = ((i * 6 + 5) * n_src + j) * 3 + 2;
            results[idx] = full_out.c;
        }
    }
}


KERNEL
void matrix_strain_hs(GLOBAL_MEM Real* results, 
    int n_obs, int n_src,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    Real nu)
{
    int i = get_global_id(0);
    int j = get_global_id(1);

    if (i >= n_obs) {
        return;
    }

    if (j >= n_src) {
        return;
    }

    Real3 obs;
        Real3 tri0;
        obs.x = obs_pts[i * 3 + 0];
            tri0.x = tris[j * 9 + 0 * 3 + 0];
            tri0.y = tris[j * 9 + 0 * 3 + 1];
            tri0.z = tris[j * 9 + 0 * 3 + 2];
        Real3 tri1;
        obs.y = obs_pts[i * 3 + 1];
            tri1.x = tris[j * 9 + 1 * 3 + 0];
            tri1.y = tris[j * 9 + 1 * 3 + 1];
            tri1.z = tris[j * 9 + 1 * 3 + 2];
        Real3 tri2;
        obs.z = obs_pts[i * 3 + 2];
            tri2.x = tris[j * 9 + 2 * 3 + 0];
            tri2.y = tris[j * 9 + 2 * 3 + 1];
            tri2.z = tris[j * 9 + 2 * 3 + 2];

    {
        Real3 slip = make3(0.0, 0.0, 0.0);
        slip.y = 1.0;

        
    Real6 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real6 uvw0 = AngSetupFSC_S(obs, efcs_slip, tri0, tri1, nu);
        Real6 uvw1 = AngSetupFSC_S(obs, efcs_slip, tri1, tri2, nu);
        Real6 uvw2 = AngSetupFSC_S(obs, efcs_slip, tri2, tri0, nu);
        Real6 fsc_term = add6(add6(uvw0, uvw1), uvw2);

        summed_terms = add6(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add6(summed_terms, full_out);
    }
    Real6 full_out = summed_terms;


        {
            int idx = ((i * 6 + 0) * n_src + j) * 3 + 0;
            results[idx] = full_out.x;
        }
        {
            int idx = ((i * 6 + 1) * n_src + j) * 3 + 0;
            results[idx] = full_out.y;
        }
        {
            int idx = ((i * 6 + 2) * n_src + j) * 3 + 0;
            results[idx] = full_out.z;
        }
        {
            int idx = ((i * 6 + 3) * n_src + j) * 3 + 0;
            results[idx] = full_out.a;
        }
        {
            int idx = ((i * 6 + 4) * n_src + j) * 3 + 0;
            results[idx] = full_out.b;
        }
        {
            int idx = ((i * 6 + 5) * n_src + j) * 3 + 0;
            results[idx] = full_out.c;
        }
    }
    {
        Real3 slip = make3(0.0, 0.0, 0.0);
        slip.z = 1.0;

        
    Real6 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real6 uvw0 = AngSetupFSC_S(obs, efcs_slip, tri0, tri1, nu);
        Real6 uvw1 = AngSetupFSC_S(obs, efcs_slip, tri1, tri2, nu);
        Real6 uvw2 = AngSetupFSC_S(obs, efcs_slip, tri2, tri0, nu);
        Real6 fsc_term = add6(add6(uvw0, uvw1), uvw2);

        summed_terms = add6(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add6(summed_terms, full_out);
    }
    Real6 full_out = summed_terms;


        {
            int idx = ((i * 6 + 0) * n_src + j) * 3 + 1;
            results[idx] = full_out.x;
        }
        {
            int idx = ((i * 6 + 1) * n_src + j) * 3 + 1;
            results[idx] = full_out.y;
        }
        {
            int idx = ((i * 6 + 2) * n_src + j) * 3 + 1;
            results[idx] = full_out.z;
        }
        {
            int idx = ((i * 6 + 3) * n_src + j) * 3 + 1;
            results[idx] = full_out.a;
        }
        {
            int idx = ((i * 6 + 4) * n_src + j) * 3 + 1;
            results[idx] = full_out.b;
        }
        {
            int idx = ((i * 6 + 5) * n_src + j) * 3 + 1;
            results[idx] = full_out.c;
        }
    }
    {
        Real3 slip = make3(0.0, 0.0, 0.0);
        slip.x = 1.0;

        
    Real6 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real6 uvw0 = AngSetupFSC_S(obs, efcs_slip, tri0, tri1, nu);
        Real6 uvw1 = AngSetupFSC_S(obs, efcs_slip, tri1, tri2, nu);
        Real6 uvw2 = AngSetupFSC_S(obs, efcs_slip, tri2, tri0, nu);
        Real6 fsc_term = add6(add6(uvw0, uvw1), uvw2);

        summed_terms = add6(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add6(summed_terms, full_out);
    }
    Real6 full_out = summed_terms;


        {
            int idx = ((i * 6 + 0) * n_src + j) * 3 + 2;
            results[idx] = full_out.x;
        }
        {
            int idx = ((i * 6 + 1) * n_src + j) * 3 + 2;
            results[idx] = full_out.y;
        }
        {
            int idx = ((i * 6 + 2) * n_src + j) * 3 + 2;
            results[idx] = full_out.z;
        }
        {
            int idx = ((i * 6 + 3) * n_src + j) * 3 + 2;
            results[idx] = full_out.a;
        }
        {
            int idx = ((i * 6 + 4) * n_src + j) * 3 + 2;
            results[idx] = full_out.b;
        }
        {
            int idx = ((i * 6 + 5) * n_src + j) * 3 + 2;
            results[idx] = full_out.c;
        }
    }
}







#ifndef CUTDE_COMMON
#define CUTDE_COMMON



#define Real double
#define EPS 2.220446049250313e-16

#ifndef M_PI
  #define M_PI   3.14159265358979323846264338327950288
#endif

typedef struct Real2 {
    Real x;
    Real y;
} Real2;

typedef struct Real3 {
    Real x;
    Real y;
    Real z;
} Real3;

typedef struct Real6 {
    Real x;
    Real y;
    Real z;
    Real a;
    Real b;
    Real c;
} Real6;

WITHIN_KERNEL void print(Real x) {
    printf("%f \n", x);
}

    

WITHIN_KERNEL Real2 add2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x + b.x;
        
        out.y = a.y + b.y;
    return out;
}

    

WITHIN_KERNEL Real2 sub2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x - b.x;
        
        out.y = a.y - b.y;
    return out;
}

    

WITHIN_KERNEL Real2 mul2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x * b.x;
        
        out.y = a.y * b.y;
    return out;
}

    

WITHIN_KERNEL Real2 div2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x / b.x;
        
        out.y = a.y / b.y;
    return out;
}


    

WITHIN_KERNEL Real2 add_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x + b;
        
        out.y = a.y + b;
    return out;
}

    

WITHIN_KERNEL Real2 sub_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x - b;
        
        out.y = a.y - b;
    return out;
}

    

WITHIN_KERNEL Real2 mul_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x * b;
        
        out.y = a.y * b;
    return out;
}

    

WITHIN_KERNEL Real2 div_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x / b;
        
        out.y = a.y / b;
    return out;
}


    WITHIN_KERNEL void print2(Real2 x) {
        
        printf("%f %f  \n", x.x,x.y);
    }

    WITHIN_KERNEL Real sum2(Real2 x) {
        Real out = 0.0;
            out += x.x;
            out += x.y;
        return out;
    }

    WITHIN_KERNEL Real dot2(Real2 x, Real2 y) {
        return sum2(mul2(x, y));
    }

    WITHIN_KERNEL Real length2(Real2 x) {
        return sqrt(dot2(x, x));
    }

    WITHIN_KERNEL Real2 negate2(Real2 x) {
        return mul_scalar2(x, -1);
    }

    WITHIN_KERNEL Real2 normalize2(Real2 x) {
        return div_scalar2(x, length2(x));
    }

    WITHIN_KERNEL Real2 make2(
        Real x
            ,
        Real y
    ) {
        Real2 out;
            out.x = x;
            out.y = y;
        return out;
    }
    

WITHIN_KERNEL Real3 add3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x + b.x;
        
        out.y = a.y + b.y;
        
        out.z = a.z + b.z;
    return out;
}

    

WITHIN_KERNEL Real3 sub3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x - b.x;
        
        out.y = a.y - b.y;
        
        out.z = a.z - b.z;
    return out;
}

    

WITHIN_KERNEL Real3 mul3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x * b.x;
        
        out.y = a.y * b.y;
        
        out.z = a.z * b.z;
    return out;
}

    

WITHIN_KERNEL Real3 div3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x / b.x;
        
        out.y = a.y / b.y;
        
        out.z = a.z / b.z;
    return out;
}


    

WITHIN_KERNEL Real3 add_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x + b;
        
        out.y = a.y + b;
        
        out.z = a.z + b;
    return out;
}

    

WITHIN_KERNEL Real3 sub_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x - b;
        
        out.y = a.y - b;
        
        out.z = a.z - b;
    return out;
}

    

WITHIN_KERNEL Real3 mul_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x * b;
        
        out.y = a.y * b;
        
        out.z = a.z * b;
    return out;
}

    

WITHIN_KERNEL Real3 div_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x / b;
        
        out.y = a.y / b;
        
        out.z = a.z / b;
    return out;
}


    WITHIN_KERNEL void print3(Real3 x) {
        
        printf("%f %f %f  \n", x.x,x.y,x.z);
    }

    WITHIN_KERNEL Real sum3(Real3 x) {
        Real out = 0.0;
            out += x.x;
            out += x.y;
            out += x.z;
        return out;
    }

    WITHIN_KERNEL Real dot3(Real3 x, Real3 y) {
        return sum3(mul3(x, y));
    }

    WITHIN_KERNEL Real length3(Real3 x) {
        return sqrt(dot3(x, x));
    }

    WITHIN_KERNEL Real3 negate3(Real3 x) {
        return mul_scalar3(x, -1);
    }

    WITHIN_KERNEL Real3 normalize3(Real3 x) {
        return div_scalar3(x, length3(x));
    }

    WITHIN_KERNEL Real3 make3(
        Real x
            ,
        Real y
            ,
        Real z
    ) {
        Real3 out;
            out.x = x;
            out.y = y;
            out.z = z;
        return out;
    }
    

WITHIN_KERNEL Real6 add6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x + b.x;
        
        out.y = a.y + b.y;
        
        out.z = a.z + b.z;
        
        out.a = a.a + b.a;
        
        out.b = a.b + b.b;
        
        out.c = a.c + b.c;
    return out;
}

    

WITHIN_KERNEL Real6 sub6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x - b.x;
        
        out.y = a.y - b.y;
        
        out.z = a.z - b.z;
        
        out.a = a.a - b.a;
        
        out.b = a.b - b.b;
        
        out.c = a.c - b.c;
    return out;
}

    

WITHIN_KERNEL Real6 mul6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x * b.x;
        
        out.y = a.y * b.y;
        
        out.z = a.z * b.z;
        
        out.a = a.a * b.a;
        
        out.b = a.b * b.b;
        
        out.c = a.c * b.c;
    return out;
}

    

WITHIN_KERNEL Real6 div6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x / b.x;
        
        out.y = a.y / b.y;
        
        out.z = a.z / b.z;
        
        out.a = a.a / b.a;
        
        out.b = a.b / b.b;
        
        out.c = a.c / b.c;
    return out;
}


    

WITHIN_KERNEL Real6 add_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x + b;
        
        out.y = a.y + b;
        
        out.z = a.z + b;
        
        out.a = a.a + b;
        
        out.b = a.b + b;
        
        out.c = a.c + b;
    return out;
}

    

WITHIN_KERNEL Real6 sub_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x - b;
        
        out.y = a.y - b;
        
        out.z = a.z - b;
        
        out.a = a.a - b;
        
        out.b = a.b - b;
        
        out.c = a.c - b;
    return out;
}

    

WITHIN_KERNEL Real6 mul_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x * b;
        
        out.y = a.y * b;
        
        out.z = a.z * b;
        
        out.a = a.a * b;
        
        out.b = a.b * b;
        
        out.c = a.c * b;
    return out;
}

    

WITHIN_KERNEL Real6 div_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x / b;
        
        out.y = a.y / b;
        
        out.z = a.z / b;
        
        out.a = a.a / b;
        
        out.b = a.b / b;
        
        out.c = a.c / b;
    return out;
}


    WITHIN_KERNEL void print6(Real6 x) {
        
        printf("%f %f %f %f %f %f  \n", x.x,x.y,x.z,x.a,x.b,x.c);
    }

    WITHIN_KERNEL Real sum6(Real6 x) {
        Real out = 0.0;
            out += x.x;
            out += x.y;
            out += x.z;
            out += x.a;
            out += x.b;
            out += x.c;
        return out;
    }

    WITHIN_KERNEL Real dot6(Real6 x, Real6 y) {
        return sum6(mul6(x, y));
    }

    WITHIN_KERNEL Real length6(Real6 x) {
        return sqrt(dot6(x, x));
    }

    WITHIN_KERNEL Real6 negate6(Real6 x) {
        return mul_scalar6(x, -1);
    }

    WITHIN_KERNEL Real6 normalize6(Real6 x) {
        return div_scalar6(x, length6(x));
    }

    WITHIN_KERNEL Real6 make6(
        Real x
            ,
        Real y
            ,
        Real z
            ,
        Real a
            ,
        Real b
            ,
        Real c
    ) {
        Real6 out;
            out.x = x;
            out.y = y;
            out.z = z;
            out.a = a;
            out.b = b;
            out.c = c;
        return out;
    }


WITHIN_KERNEL Real2 transform2(Real2 a, Real2 b, Real2 v) {
    Real2 out;
    out.x = dot2(a,v);
    out.y = dot2(b,v);
    return out;
}

WITHIN_KERNEL Real2 inv_transform2(Real2 a, Real2 b, Real2 v) {
    Real2 out;
    out.x = a.x * v.x + b.x * v.y;
    out.y = a.y * v.x + b.y * v.y;
    return out;
}

WITHIN_KERNEL Real3 transform3(Real3 a, Real3 b, Real3 c, Real3 v) {
    Real3 out;
    out.x = dot3(a,v);
    out.y = dot3(b,v);
    out.z = dot3(c,v);
    return out;
}

WITHIN_KERNEL Real3 inv_transform3(Real3 a, Real3 b, Real3 c, Real3 v) {
    Real3 out;
    out.x = a.x * v.x + b.x * v.y + c.x * v.z;
    out.y = a.y * v.x + b.y * v.y + c.y * v.z;
    out.z = a.z * v.x + b.z * v.y + c.z * v.z;
    return out;
}

WITHIN_KERNEL Real3 cross3(Real3 x, Real3 y) {
    Real3 out;
    out.x = x.y * y.z - x.z * y.y;
    out.y = x.z * y.x - x.x * y.z;
    out.z = x.x * y.y - x.y * y.x;
    return out;
}

WITHIN_KERNEL Real6 tensor_transform3(Real3 a, Real3 b, Real3 c, Real6 tensor) {
    Real A[9];
    A[0] = a.x; A[1] = a.y; A[2] = a.z;
    A[3] = b.x; A[4] = b.y; A[5] = b.z;
    A[6] = c.x; A[7] = c.y; A[8] = c.z;
    Real6 out;
    out.x = A[0]*A[0]*tensor.x+2*A[0]*A[3]*tensor.a+2*A[0]*A[6]*tensor.b+
        2*A[3]*A[6]*tensor.c+ A[3]*A[3]*tensor.y+A[6]*A[6]*tensor.z;
    out.y = A[1]*A[1]*tensor.x+2*A[1]*A[4]*tensor.a+2*A[1]*A[7]*tensor.b+
        2*A[4]*A[7]*tensor.c+A[4]*A[4]*tensor.y+A[7]*A[7]*tensor.z;
    out.z = A[2]*A[2]*tensor.x+2*A[2]*A[5]*tensor.a+2*A[2]*A[8]*tensor.b+
        2*A[5]*A[8]*tensor.c+A[5]*A[5]*tensor.y+A[8]*A[8]*tensor.z;
    out.a = A[0]*A[1]*tensor.x+(A[0]*A[4]+A[1]*A[3])*tensor.a+(A[0]*A[7]+
        A[1]*A[6])*tensor.b+(A[7]*A[3]+A[6]*A[4])*tensor.c+A[4]*A[3]*tensor.y+
        A[6]*A[7]*tensor.z;
    out.b = A[0]*A[2]*tensor.x+(A[0]*A[5]+A[2]*A[3])*tensor.a+(A[0]*A[8]+
        A[2]*A[6])*tensor.b+(A[8]*A[3]+A[6]*A[5])*tensor.c+A[5]*A[3]*tensor.y+
        A[6]*A[8]*tensor.z;
    out.c = A[1]*A[2]*tensor.x+(A[2]*A[4]+A[1]*A[5])*tensor.a+(A[2]*A[7]+
        A[1]*A[8])*tensor.b+(A[7]*A[5]+A[8]*A[4])*tensor.c+A[4]*A[5]*tensor.y+
        A[7]*A[8]*tensor.z;
    return out;
}

WITHIN_KERNEL int trimodefinder(Real3 obs, Real3 tri0, Real3 tri1, Real3 tri2) {
    // trimodefinder calculates the normalized barycentric coordinates of
    // the points with respect to the TD vertices and specifies the appropriate
    // artefact-free configuration of the angular dislocations for the
    // calculations. The input matrices x, y and z share the same size and
    // correspond to the y, z and x coordinates in the TDCS, respectively. p1,
    // p2 and p3 are two-component matrices representing the y and z coordinates
    // of the TD vertices in the TDCS, respectively.
    // The components of the output (trimode) corresponding to each calculation
    // points, are 1 for the first configuration, -1 for the second
    // configuration and 0 for the calculation point that lie on the TD sides.

    Real a = ((tri1.z-tri2.z)*(obs.y-tri2.y)+(tri2.y-tri1.y)*(obs.z-tri2.z))/
        ((tri1.z-tri2.z)*(tri0.y-tri2.y)+(tri2.y-tri1.y)*(tri0.z-tri2.z));
    Real b = ((tri2.z-tri0.z)*(obs.y-tri2.y)+(tri0.y-tri2.y)*(obs.z-tri2.z))/
        ((tri1.z-tri2.z)*(tri0.y-tri2.y)+(tri2.y-tri1.y)*(tri0.z-tri2.z));
    Real c = 1-a-b;

    int result = 1;
    if ((a<=0 && b>c && c>a) ||
            (b<=0 && c>a && a>b) ||
            (c<=0 && a>b && b>c)) {
        result = -1;
    }

    if ((a==0 && b>=0 && c>=0) ||
            (a>=0 && b==0 && c>=0) ||
            (a>=0 && b>=0 && c==0)) {
        result = 0;
    }
    if (result == 0 && obs.x != 0) {
        result = 1;
    }

    return result;
}

WITHIN_KERNEL Real3 AngDisDisp(Real x, Real y, Real z, Real alpha, Real bx, Real by, Real bz, Real nu) {

    Real cosA = cos(alpha);
    Real sinA = sin(alpha);
    Real eta = y*cosA-z*sinA;
    Real zeta = y*sinA+z*cosA;
    Real r = sqrt(x * x + y * y + z * z);

    Real ux = bx/8/M_PI/(1-nu)*(x*y/r/(r-z)-x*eta/r/(r-zeta));
    Real vx = bx/8/M_PI/(1-nu)*(eta*sinA/(r-zeta)-y*eta/r/(r-zeta)+        y*y/r/(r-z)+(1-2*nu)*(cosA*log(r-zeta)-log(r-z)));
    Real wx = bx/8/M_PI/(1-nu)*(eta*cosA/(r-zeta)-y/r-eta*z/r/(r-zeta)-        (1-2*nu)*sinA*log(r-zeta));

    Real uy = by/8/M_PI/(1-nu)*(x*x*cosA/r/(r-zeta)-x*x/r/(r-z)-         (1-2*nu)*(cosA*log(r-zeta)-log(r-z)));
    Real vy = by*x/8/M_PI/(1-nu)*(y*cosA/r/(r-zeta)-sinA*cosA/(r-zeta)-y/r/(r-z));
    Real wy = by*x/8/M_PI/(1-nu)*(z*cosA/r/(r-zeta)-cosA*cosA/(r-zeta)+1/r);

    Real uz = bz*sinA/8/M_PI/(1-nu)*((1-2*nu)*log(r-zeta)-x*x/r/(r-zeta));
    Real vz = bz*x*sinA/8/M_PI/(1-nu)*(sinA/(r-zeta)-y/r/(r-zeta));
    Real wz = bz*x*sinA/8/M_PI/(1-nu)*(cosA/(r-zeta)-z/r/(r-zeta));
    return make3(ux+uy+uz, vx+vy+vz, wx+wy+wz);
}

WITHIN_KERNEL Real3 TDSetupD(Real3 obs, Real alpha, Real3 slip, Real nu, Real3 TriVertex, Real3 SideVec) {
    // TDSetupD transforms coordinates of the calculation points as well as
    // slip vector components from ADCS into TDCS. It then calculates the
    // displacements in ADCS and transforms them into TDCS.

    Real2 A1 = make2(SideVec.z, -SideVec.y);
    Real2 A2 = make2(SideVec.y, SideVec.z);
    Real2 r1 = transform2(A1, A2, make2(obs.y - TriVertex.y, obs.z - TriVertex.z));
    Real y1 = r1.x;
    Real z1 = r1.y;

    Real2 r2 = transform2(A1, A2, make2(slip.y, slip.z));
    Real by1 = r2.x;
    Real bz1 = r2.y;

    Real3 uvw = AngDisDisp(obs.x, y1, z1, -M_PI + alpha, slip.x, by1, bz1, nu);

    Real2 r3 = inv_transform2(A1, A2, make2(uvw.y, uvw.z));
    Real v = r3.x;
    Real w = r3.y;
    return make3(uvw.x, v, w);
}

WITHIN_KERNEL Real6 AngDisStrain(Real x, Real y, Real z, Real alpha, Real bx, Real by, Real bz, Real nu) {
    // AngDisStrain calculates the strains associated with an angular 
    // dislocation in an elastic full-space.

    Real cosA = cos(alpha);
    Real sinA = sin(alpha);
    Real eta = y*cosA-z*sinA;
    Real zeta = y*sinA+z*cosA;

    Real x2 = x*x;
    Real y2 = y*y;
    Real z2 = z*z;
    Real r2 = x2+y2+z2;
    Real r = sqrt(r2);
    Real r3 = r*r2;
    Real rz = r*(r-z);
    Real rmz = (r-z);
    Real r2z2 = r2*rmz*rmz;
    Real r3z = r3*rmz;

    Real W = zeta-r;
    Real W2 = W*W;
    Real Wr = W*r;
    Real W2r = W2*r;
    Real Wr3 = W*r3;
    Real W2r2 = W2*r2;

    Real C = (r*cosA-z)/Wr;
    Real S = (r*sinA-y)/Wr;

    // Partial derivatives of the Burgers' function
    Real rFi_rx = (eta/r/(r-zeta)-y/r/(r-z))/4/M_PI;
    Real rFi_ry = (x/r/(r-z)-cosA*x/r/(r-zeta))/4/M_PI;
    Real rFi_rz = (sinA*x/r/(r-zeta))/4/M_PI;

    Real6 out;
    out.x = bx*(rFi_rx)+
        bx/8/M_PI/(1-nu)*(eta/Wr+eta*x2/W2r2-eta*x2/Wr3+y/rz-
        x2*y/r2z2-x2*y/r3z)-
        by*x/8/M_PI/(1-nu)*(((2*nu+1)/Wr+x2/W2r2-x2/Wr3)*cosA+
        (2*nu+1)/rz-x2/r2z2-x2/r3z)+
        bz*x*sinA/8/M_PI/(1-nu)*((2*nu+1)/Wr+x2/W2r2-x2/Wr3);

    out.y = by*(rFi_ry)+
        bx/8/M_PI/(1-nu)*((1/Wr+S*S-y2/Wr3)*eta+(2*nu+1)*y/rz-y*y*y/r2z2-
        y*y*y/r3z-2*nu*cosA*S)-
        by*x/8/M_PI/(1-nu)*(1/rz-y2/r2z2-y2/r3z+
        (1/Wr+S*S-y2/Wr3)*cosA)+
        bz*x*sinA/8/M_PI/(1-nu)*(1/Wr+S*S-y2/Wr3);

    out.z = bz*(rFi_rz)+
        bx/8/M_PI/(1-nu)*(eta/W/r+eta*C*C-eta*z2/Wr3+y*z/r3+
        2*nu*sinA*C)-
        by*x/8/M_PI/(1-nu)*((1/Wr+C*C-z2/Wr3)*cosA+z/r3)+
        bz*x*sinA/8/M_PI/(1-nu)*(1/Wr+C*C-z2/Wr3);

    out.a = bx*(rFi_ry)/2+by*(rFi_rx)/2-
        bx/8/M_PI/(1-nu)*(x*y2/r2z2-nu*x/rz+x*y2/r3z-nu*x*cosA/Wr+
        eta*x*S/Wr+eta*x*y/Wr3)+
        by/8/M_PI/(1-nu)*(x2*y/r2z2-nu*y/rz+x2*y/r3z+nu*cosA*S+
        x2*y*cosA/Wr3+x2*cosA*S/Wr)-
        bz*sinA/8/M_PI/(1-nu)*(nu*S+x2*S/Wr+x2*y/Wr3);

    out.b = bx*(rFi_rz)/2+bz*(rFi_rx)/2-
        bx/8/M_PI/(1-nu)*(-x*y/r3+nu*x*sinA/Wr+eta*x*C/Wr+
        eta*x*z/Wr3)+
        by/8/M_PI/(1-nu)*(-x2/r3+nu/r+nu*cosA*C+x2*z*cosA/Wr3+
        x2*cosA*C/Wr)-
        bz*sinA/8/M_PI/(1-nu)*(nu*C+x2*C/Wr+x2*z/Wr3);

    out.c = by*(rFi_rz)/2+bz*(rFi_ry)/2+
        bx/8/M_PI/(1-nu)*(y2/r3-nu/r-nu*cosA*C+nu*sinA*S+eta*sinA*cosA/W2-
        eta*(y*cosA+z*sinA)/W2r+eta*y*z/W2r2-eta*y*z/Wr3)-
        by*x/8/M_PI/(1-nu)*(y/r3+sinA*cosA*cosA/W2-cosA*(y*cosA+z*sinA)/
        W2r+y*z*cosA/W2r2-y*z*cosA/Wr3)-
        bz*x*sinA/8/M_PI/(1-nu)*(y*z/Wr3-sinA*cosA/W2+(y*cosA+z*sinA)/
        W2r-y*z/W2r2);
    return out;
}


WITHIN_KERNEL Real6 TDSetupS(Real3 obs, Real alpha, Real3 slip, Real nu,
    Real3 TriVertex, Real3 SideVec) 
{
    // TDSetupS transforms coordinates of the calculation points as well as 
    // slip vector components from ADCS into TDCS. It then calculates the 
    // strains in ADCS and transforms them into TDCS.

    // Transformation matrix
    Real2 A1 = make2(SideVec.z, -SideVec.y);
    Real2 A2 = make2(SideVec.y, SideVec.z);

    // Transform coordinates of the calculation points from TDCS into ADCS
    Real2 r1 = transform2(A1, A2, make2(obs.y - TriVertex.y, obs.z - TriVertex.z));
    Real y1 = r1.x;
    Real z1 = r1.y;

    // Transform the in-plane slip vector components from TDCS into ADCS
    Real2 r2 = transform2(A1, A2, make2(slip.y, slip.z));
    Real by1 = r2.x;
    Real bz1 = r2.y;

    // Calculate strains associated with an angular dislocation in ADCS
    Real6 out_adcs = AngDisStrain(obs.x,y1,z1,-M_PI+alpha,slip.x,by1,bz1,nu);

    // Transform strains from ADCS into TDCS
    Real3 B0 = make3(1.0, 0.0, 0.0);
    Real3 B1 = make3(0.0, A1.x, A1.y);
    Real3 B2 = make3(0.0, A2.x, A2.y);
    return tensor_transform3(B0, B1, B2, out_adcs);
}

WITHIN_KERNEL Real3 AngDisDispFSC(Real y1, Real y2, Real y3, Real beta, 
                                  Real b1, Real b2, Real b3, Real nu, Real a) {

    Real sinB = sin(beta);
    Real cosB = cos(beta);
    Real cotB = cosB / sinB;
    Real y3b = y3+2*a;
    Real z1b = y1*cosB+y3b*sinB;
    Real z3b = -y1*sinB+y3b*cosB;
    Real r2b = y1*y1+y2*y2+y3b*y3b;
    Real rb = sqrt(r2b);

    Real Fib = 2*atan(-y2/(-(rb+y3b)*(1.0 / tan(beta/2))+y1)); // The Burgers' function

    Real v1cb1 = b1/4/M_PI/(1-nu)*(-2*(1-nu)*(1-2*nu)*Fib*(cotB*cotB)+(1-2*nu)*y2/
        (rb+y3b)*((1-2*nu-a/rb)*cotB-y1/(rb+y3b)*(nu+a/rb))+(1-2*nu)*
        y2*cosB*cotB/(rb+z3b)*(cosB+a/rb)+a*y2*(y3b-a)*cotB/(rb*rb*rb)+y2*
        (y3b-a)/(rb*(rb+y3b))*(-(1-2*nu)*cotB+y1/(rb+y3b)*(2*nu+a/rb)+
        a*y1/(rb*rb))+y2*(y3b-a)/(rb*(rb+z3b))*(cosB/(rb+z3b)*((rb*
        cosB+y3b)*((1-2*nu)*cosB-a/rb)*cotB+2*(1-nu)*(rb*sinB-y1)*cosB)-
        a*y3b*cosB*cotB/(rb*rb)));

    Real v2cb1 = b1/4/M_PI/(1-nu)*((1-2*nu)*((2*(1-nu)*(cotB*cotB)-nu)*log(rb+y3b)-(2*
        (1-nu)*(cotB*cotB)+1-2*nu)*cosB*log(rb+z3b))-(1-2*nu)/(rb+y3b)*(y1*
        cotB*(1-2*nu-a/rb)+nu*y3b-a+(y2*y2)/(rb+y3b)*(nu+a/rb))-(1-2*
        nu)*z1b*cotB/(rb+z3b)*(cosB+a/rb)-a*y1*(y3b-a)*cotB/(rb*rb*rb)+
        (y3b-a)/(rb+y3b)*(-2*nu+1/rb*((1-2*nu)*y1*cotB-a)+(y2*y2)/(rb*
        (rb+y3b))*(2*nu+a/rb)+a*(y2*y2)/(rb*rb*rb))+(y3b-a)/(rb+z3b)*((cosB*cosB)-
        1/rb*((1-2*nu)*z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/(rb*
        (rb+z3b))*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*(rb*cosB+y3b))));

    Real v3cb1 = b1/4/M_PI/(1-nu)*(2*(1-nu)*(((1-2*nu)*Fib*cotB)+(y2/(rb+y3b)*(2*
        nu+a/rb))-(y2*cosB/(rb+z3b)*(cosB+a/rb)))+y2*(y3b-a)/rb*(2*
        nu/(rb+y3b)+a/(rb*rb))+y2*(y3b-a)*cosB/(rb*(rb+z3b))*(1-2*nu-
        (rb*cosB+y3b)/(rb+z3b)*(cosB+a/rb)-a*y3b/(rb*rb)));

    Real v1cb2 = b2/4/M_PI/(1-nu)*((1-2*nu)*((2*(1-nu)*(cotB*cotB)+nu)*log(rb+y3b)-(2*
        (1-nu)*(cotB*cotB)+1)*cosB*log(rb+z3b))+(1-2*nu)/(rb+y3b)*(-(1-2*nu)*
        y1*cotB+nu*y3b-a+a*y1*cotB/rb+(y1*y1)/(rb+y3b)*(nu+a/rb))-(1-2*
        nu)*cotB/(rb+z3b)*(z1b*cosB-a*(rb*sinB-y1)/(rb*cosB))-a*y1*
        (y3b-a)*cotB/(rb*rb*rb)+(y3b-a)/(rb+y3b)*(2*nu+1/rb*((1-2*nu)*y1*
        cotB+a)-(y1*y1)/(rb*(rb+y3b))*(2*nu+a/rb)-a*(y1*y1)/(rb*rb*rb))+(y3b-a)*
        cotB/(rb+z3b)*(-cosB*sinB+a*y1*y3b/((rb*rb*rb)*cosB)+(rb*sinB-y1)/
        rb*(2*(1-nu)*cosB-(rb*cosB+y3b)/(rb+z3b)*(1+a/(rb*cosB)))));
                    
    Real v2cb2 = b2/4/M_PI/(1-nu)*(2*(1-nu)*(1-2*nu)*Fib*(cotB*cotB)+(1-2*nu)*y2/
        (rb+y3b)*(-(1-2*nu-a/rb)*cotB+y1/(rb+y3b)*(nu+a/rb))-(1-2*nu)*
        y2*cotB/(rb+z3b)*(1+a/(rb*cosB))-a*y2*(y3b-a)*cotB/(rb*rb*rb)+y2*
        (y3b-a)/(rb*(rb+y3b))*((1-2*nu)*cotB-2*nu*y1/(rb+y3b)-a*y1/rb*
        (1/rb+1/(rb+y3b)))+y2*(y3b-a)*cotB/(rb*(rb+z3b))*(-2*(1-nu)*
        cosB+(rb*cosB+y3b)/(rb+z3b)*(1+a/(rb*cosB))+a*y3b/((rb*rb)*cosB)));
                    
    Real v3cb2 = b2/4/M_PI/(1-nu)*(-2*(1-nu)*(1-2*nu)*cotB*(log(rb+y3b)-cosB*
        log(rb+z3b))-2*(1-nu)*y1/(rb+y3b)*(2*nu+a/rb)+2*(1-nu)*z1b/(rb+
        z3b)*(cosB+a/rb)+(y3b-a)/rb*((1-2*nu)*cotB-2*nu*y1/(rb+y3b)-a*
        y1/(rb*rb))-(y3b-a)/(rb+z3b)*(cosB*sinB+(rb*cosB+y3b)*cotB/rb*
        (2*(1-nu)*cosB-(rb*cosB+y3b)/(rb+z3b))+a/rb*(sinB-y3b*z1b/
        (rb*rb)-z1b*(rb*cosB+y3b)/(rb*(rb+z3b)))));

    Real v1cb3 = b3/4/M_PI/(1-nu)*((1-2*nu)*(y2/(rb+y3b)*(1+a/rb)-y2*cosB/(rb+
        z3b)*(cosB+a/rb))-y2*(y3b-a)/rb*(a/(rb*rb)+1/(rb+y3b))+y2*
        (y3b-a)*cosB/(rb*(rb+z3b))*((rb*cosB+y3b)/(rb+z3b)*(cosB+a/
        rb)+a*y3b/(rb*rb)));
                    
    Real v2cb3 = b3/4/M_PI/(1-nu)*((1-2*nu)*(-sinB*log(rb+z3b)-y1/(rb+y3b)*(1+a/
        rb)+z1b/(rb+z3b)*(cosB+a/rb))+y1*(y3b-a)/rb*(a/(rb*rb)+1/(rb+
        y3b))-(y3b-a)/(rb+z3b)*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/
        (rb*rb))-1/(rb*(rb+z3b))*((y2*y2)*cosB*sinB-a*z1b/rb*(rb*cosB+y3b))));
                    
    Real v3cb3 = b3/4/M_PI/(1-nu)*(2*(1-nu)*Fib+2*(1-nu)*(y2*sinB/(rb+z3b)*(cosB+
        a/rb))+y2*(y3b-a)*sinB/(rb*(rb+z3b))*(1+(rb*cosB+y3b)/(rb+
        z3b)*(cosB+a/rb)+a*y3b/(rb*rb)));

    return make3(
        v1cb1+v1cb2+v1cb3,
        v2cb1+v2cb2+v2cb3,
        v3cb1+v3cb2+v3cb3
    );
}

WITHIN_KERNEL Real6 AngDisStrainFSC(Real y1, Real y2, Real y3, Real beta,
                                    Real b1, Real b2, Real b3, Real nu, Real a) {

    Real sinB = sin(beta);
    Real cosB = cos(beta);
    Real cotB = cosB / sinB;
    Real y3b = y3+2*a;
    Real z1b = y1*cosB+y3b*sinB;
    Real z3b = -y1*sinB+y3b*cosB;
    Real rb2 = y1*y1+y2*y2+y3b*y3b;
    Real rb = sqrt(rb2);

    Real W1 = rb*cosB+y3b;
    Real W2 = cosB+a/rb;
    Real W3 = cosB+y3b/rb;
    Real W4 = nu+a/rb;
    Real W5 = 2*nu+a/rb;
    Real W6 = rb+y3b;
    Real W7 = rb+z3b;
    Real W8 = y3+a;
    Real W9 = 1+a/rb/cosB;

    Real N1 = 1-2*nu;

    Real rFib_ry2 = z1b/rb/(rb+z3b)-y1/rb/(rb+y3b);
    Real rFib_ry1 = y2/rb/(rb+y3b)-cosB*y2/rb/(rb+z3b);
    Real rFib_ry3 = -sinB*y2/rb/(rb+z3b);

    Real6 out;
    out.x = b1*(1.0/4.0*((-2+2*nu)*N1*rFib_ry1*(cotB*cotB)-N1*y2/(W6*W6)*((1-W5)*cotB- y1/W6*W4)/rb*y1+N1*y2/W6*(a/(rb*rb*rb)*y1*cotB-1/W6*W4+(y1*y1)/ (W6*W6)*W4/rb+(y1*y1)/W6*a/(rb*rb*rb))-N1*y2*cosB*cotB/(W7*W7)*W2*(y1/ rb-sinB)-N1*y2*cosB*cotB/W7*a/(rb*rb*rb)*y1-3*a*y2*W8*cotB/(rb*rb2*rb2)* y1-y2*W8/(rb*rb*rb)/W6*(-N1*cotB+y1/W6*W5+a*y1/rb2)*y1-y2*W8/ rb2/(W6*W6)*(-N1*cotB+y1/W6*W5+a*y1/rb2)*y1+y2*W8/rb/W6* (1/W6*W5-(y1*y1)/(W6*W6)*W5/rb-(y1*y1)/W6*a/(rb*rb*rb)+a/rb2-2*a*(y1*y1) /(rb2*rb2))-y2*W8/(rb*rb*rb)/W7*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+ (2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/rb2)*y1-y2*W8/rb/ (W7*W7)*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)* cosB)-a*y3b*cosB*cotB/rb2)*(y1/rb-sinB)+y2*W8/rb/W7*(-cosB/ (W7*W7)*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)*(y1/ rb-sinB)+cosB/W7*(1/rb*cosB*y1*(N1*cosB-a/rb)*cotB+W1*a/(rb*rb2) *y1*cotB+(2-2*nu)*(1/rb*sinB*y1-1)*cosB)+2*a*y3b*cosB*cotB/ (rb2*rb2)*y1))/M_PI/(1-nu))+ b2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)+nu)/rb*y1/W6-((2-2*nu)*(cotB*cotB)+1)* cosB*(y1/rb-sinB)/W7)-N1/(W6*W6)*(-N1*y1*cotB+nu*y3b-a+a*y1* cotB/rb+(y1*y1)/W6*W4)/rb*y1+N1/W6*(-N1*cotB+a*cotB/rb-a* (y1*y1)*cotB/(rb*rb*rb)+2*y1/W6*W4-(y1*y1*y1)/(W6*W6)*W4/rb-(y1*y1*y1)/W6*a/ (rb*rb*rb))+N1*cotB/(W7*W7)*(z1b*cosB-a*(rb*sinB-y1)/rb/cosB)*(y1/ rb-sinB)-N1*cotB/W7*((cosB*cosB)-a*(1/rb*sinB*y1-1)/rb/cosB+a* (rb*sinB-y1)/(rb*rb*rb)/cosB*y1)-a*W8*cotB/(rb*rb*rb)+3*a*(y1*y1)*W8* cotB/(rb*rb2*rb2)-W8/(W6*W6)*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6* W5-a*(y1*y1)/(rb*rb*rb))/rb*y1+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB+a)*y1+ 1/rb*N1*cotB-2*y1/rb/W6*W5+(y1*y1*y1)/(rb*rb*rb)/W6*W5+(y1*y1*y1)/rb2/ (W6*W6)*W5+(y1*y1*y1)/(rb2*rb2)/W6*a-2*a/(rb*rb*rb)*y1+3*a*(y1*y1*y1)/(rb*rb2*rb2))-W8* cotB/(W7*W7)*(-cosB*sinB+a*y1*y3b/(rb*rb*rb)/cosB+(rb*sinB-y1)/rb* ((2-2*nu)*cosB-W1/W7*W9))*(y1/rb-sinB)+W8*cotB/W7*(a*y3b/ (rb*rb*rb)/cosB-3*a*(y1*y1)*y3b/(rb*rb2*rb2)/cosB+(1/rb*sinB*y1-1)/rb* ((2-2*nu)*cosB-W1/W7*W9)-(rb*sinB-y1)/(rb*rb*rb)*((2-2*nu)*cosB-W1/ W7*W9)*y1+(rb*sinB-y1)/rb*(-1/rb*cosB*y1/W7*W9+W1/(W7*W7)* W9*(y1/rb-sinB)+W1/W7*a/(rb*rb*rb)/cosB*y1)))/M_PI/(1-nu))+ b3*(1.0/4.0*(N1*(-y2/(W6*W6)*(1+a/rb)/rb*y1-y2/W6*a/(rb*rb*rb)*y1+y2* cosB/(W7*W7)*W2*(y1/rb-sinB)+y2*cosB/W7*a/(rb*rb*rb)*y1)+y2*W8/ (rb*rb*rb)*(a/rb2+1/W6)*y1-y2*W8/rb*(-2*a/(rb2*rb2)*y1-1/(W6*W6)/ rb*y1)-y2*W8*cosB/(rb*rb*rb)/W7*(W1/W7*W2+a*y3b/rb2)*y1-y2*W8* cosB/rb/(W7*W7)*(W1/W7*W2+a*y3b/rb2)*(y1/rb-sinB)+y2*W8* cosB/rb/W7*(1/rb*cosB*y1/W7*W2-W1/(W7*W7)*W2*(y1/rb-sinB)- W1/W7*a/(rb*rb*rb)*y1-2*a*y3b/(rb2*rb2)*y1))/M_PI/(1-nu));

    out.y = b1*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)-nu)/rb*y2/W6-((2-2*nu)*(cotB*cotB)+1- 2*nu)*cosB/rb*y2/W7)+N1/(W6*W6)*(y1*cotB*(1-W5)+nu*y3b-a+(y2*y2) /W6*W4)/rb*y2-N1/W6*(a*y1*cotB/(rb*rb*rb)*y2+2*y2/W6*W4-(y2*y2*y2) /(W6*W6)*W4/rb-(y2*y2*y2)/W6*a/(rb*rb*rb))+N1*z1b*cotB/(W7*W7)*W2/rb* y2+N1*z1b*cotB/W7*a/(rb*rb*rb)*y2+3*a*y2*W8*cotB/(rb*rb2*rb2)*y1-W8/ (W6*W6)*(-2*nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+a*(y2*y2)/ (rb*rb*rb))/rb*y2+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB-a)*y2+2*y2/rb/ W6*W5-(y2*y2*y2)/(rb*rb*rb)/W6*W5-(y2*y2*y2)/rb2/(W6*W6)*W5-(y2*y2*y2)/(rb2*rb2)/W6* a+2*a/(rb*rb*rb)*y2-3*a*(y2*y2*y2)/(rb*rb2*rb2))-W8/(W7*W7)*((cosB*cosB)-1/rb*(N1* z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/rb/W7*((y2*y2)*(cosB*cosB)- a*z1b*cotB/rb*W1))/rb*y2+W8/W7*(1/(rb*rb*rb)*(N1*z1b*cotB+a* cosB)*y2-3*a*y3b*z1b*cotB/(rb*rb2*rb2)*y2+1/(rb*rb*rb)/W7*((y2*y2)*(cosB*cosB)- a*z1b*cotB/rb*W1)*y2+1/rb2/(W7*W7)*((y2*y2)*(cosB*cosB)-a*z1b*cotB/ rb*W1)*y2-1/rb/W7*(2*y2*(cosB*cosB)+a*z1b*cotB/(rb*rb*rb)*W1*y2-a* z1b*cotB/rb2*cosB*y2)))/M_PI/(1-nu))+ b2*(1.0/4.0*((2-2*nu)*N1*rFib_ry2*(cotB*cotB)+N1/W6*((W5-1)*cotB+y1/W6* W4)-N1*(y2*y2)/(W6*W6)*((W5-1)*cotB+y1/W6*W4)/rb+N1*y2/W6*(-a/ (rb*rb*rb)*y2*cotB-y1/(W6*W6)*W4/rb*y2-y2/W6*a/(rb*rb*rb)*y1)-N1*cotB/ W7*W9+N1*(y2*y2)*cotB/(W7*W7)*W9/rb+N1*(y2*y2)*cotB/W7*a/(rb*rb*rb)/ cosB-a*W8*cotB/(rb*rb*rb)+3*a*(y2*y2)*W8*cotB/(rb*rb2*rb2)+W8/rb/W6*(N1* cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))-(y2*y2)*W8/(rb*rb*rb)/W6* (N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))-(y2*y2)*W8/rb2/(W6*W6) *(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))+y2*W8/rb/W6* (2*nu*y1/(W6*W6)/rb*y2+a*y1/(rb*rb*rb)*(1/rb+1/W6)*y2-a*y1/rb* (-1/(rb*rb*rb)*y2-1/(W6*W6)/rb*y2))+W8*cotB/rb/W7*((-2+2*nu)*cosB+ W1/W7*W9+a*y3b/rb2/cosB)-(y2*y2)*W8*cotB/(rb*rb*rb)/W7*((-2+2*nu)* cosB+W1/W7*W9+a*y3b/rb2/cosB)-(y2*y2)*W8*cotB/rb2/(W7*W7)*((-2+ 2*nu)*cosB+W1/W7*W9+a*y3b/rb2/cosB)+y2*W8*cotB/rb/W7*(1/ rb*cosB*y2/W7*W9-W1/(W7*W7)*W9/rb*y2-W1/W7*a/(rb*rb*rb)/cosB*y2- 2*a*y3b/(rb2*rb2)/cosB*y2))/M_PI/(1-nu))+ b3*(1.0/4.0*(N1*(-sinB/rb*y2/W7+y2/(W6*W6)*(1+a/rb)/rb*y1+y2/W6* a/(rb*rb*rb)*y1-z1b/(W7*W7)*W2/rb*y2-z1b/W7*a/(rb*rb*rb)*y2)-y2*W8/ (rb*rb*rb)*(a/rb2+1/W6)*y1+y1*W8/rb*(-2*a/(rb2*rb2)*y2-1/(W6*W6)/ rb*y2)+W8/(W7*W7)*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/rb2)-1/ rb/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1))/rb*y2-W8/W7*(sinB*a/ (rb*rb*rb)*y2-z1b/(rb*rb*rb)*(1+a*y3b/rb2)*y2-2*z1b/(rb*rb2*rb2)*a*y3b*y2+ 1/(rb*rb*rb)/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1)*y2+1/rb2/(W7*W7)* ((y2*y2)*cosB*sinB-a*z1b/rb*W1)*y2-1/rb/W7*(2*y2*cosB*sinB+a* z1b/(rb*rb*rb)*W1*y2-a*z1b/rb2*cosB*y2)))/M_PI/(1-nu));

    out.z = b1*(1.0/4.0*((2-2*nu)*(N1*rFib_ry3*cotB-y2/(W6*W6)*W5*(y3b/rb+1)- 1.0/2.0*y2/W6*a/(rb*rb*rb)*2*y3b+y2*cosB/(W7*W7)*W2*W3+1.0/2.0*y2*cosB/W7* a/(rb*rb*rb)*2*y3b)+y2/rb*(2*nu/W6+a/rb2)-1.0/2.0*y2*W8/(rb*rb*rb)*(2* nu/W6+a/rb2)*2*y3b+y2*W8/rb*(-2*nu/(W6*W6)*(y3b/rb+1)-a/ (rb2*rb2)*2*y3b)+y2*cosB/rb/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)- 1.0/2.0*y2*W8*cosB/(rb*rb*rb)/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)*2* y3b-y2*W8*cosB/rb/(W7*W7)*(1-2*nu-W1/W7*W2-a*y3b/rb2)*W3+y2* W8*cosB/rb/W7*(-(cosB*y3b/rb+1)/W7*W2+W1/(W7*W7)*W2*W3+1.0/2.0* W1/W7*a/(rb*rb*rb)*2*y3b-a/rb2+a*y3b/(rb2*rb2)*2*y3b))/M_PI/(1-nu))+ b2*(1.0/4.0*((-2+2*nu)*N1*cotB*((y3b/rb+1)/W6-cosB*W3/W7)+(2-2*nu)* y1/(W6*W6)*W5*(y3b/rb+1)+1.0/2.0*(2-2*nu)*y1/W6*a/(rb*rb*rb)*2*y3b+(2- 2*nu)*sinB/W7*W2-(2-2*nu)*z1b/(W7*W7)*W2*W3-1.0/2.0*(2-2*nu)*z1b/ W7*a/(rb*rb*rb)*2*y3b+1/rb*(N1*cotB-2*nu*y1/W6-a*y1/rb2)-1.0/2.0* W8/(rb*rb*rb)*(N1*cotB-2*nu*y1/W6-a*y1/rb2)*2*y3b+W8/rb*(2*nu* y1/(W6*W6)*(y3b/rb+1)+a*y1/(rb2*rb2)*2*y3b)-1/W7*(cosB*sinB+W1* cotB/rb*((2-2*nu)*cosB-W1/W7)+a/rb*(sinB-y3b*z1b/rb2-z1b* W1/rb/W7))+W8/(W7*W7)*(cosB*sinB+W1*cotB/rb*((2-2*nu)*cosB-W1/ W7)+a/rb*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7))*W3-W8/W7*((cosB* y3b/rb+1)*cotB/rb*((2-2*nu)*cosB-W1/W7)-1.0/2.0*W1*cotB/(rb*rb*rb)* ((2-2*nu)*cosB-W1/W7)*2*y3b+W1*cotB/rb*(-(cosB*y3b/rb+1)/W7+ W1/(W7*W7)*W3)-1.0/2.0*a/(rb*rb*rb)*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7)* 2*y3b+a/rb*(-z1b/rb2-y3b*sinB/rb2+y3b*z1b/(rb2*rb2)*2*y3b- sinB*W1/rb/W7-z1b*(cosB*y3b/rb+1)/rb/W7+1.0/2.0*z1b*W1/(rb*rb*rb)/ W7*2*y3b+z1b*W1/rb/(W7*W7)*W3)))/M_PI/(1-nu))+ b3*(1.0/4.0*((2-2*nu)*rFib_ry3-(2-2*nu)*y2*sinB/(W7*W7)*W2*W3-1.0/2.0* (2-2*nu)*y2*sinB/W7*a/(rb*rb*rb)*2*y3b+y2*sinB/rb/W7*(1+W1/W7* W2+a*y3b/rb2)-1.0/2.0*y2*W8*sinB/(rb*rb*rb)/W7*(1+W1/W7*W2+a*y3b/ rb2)*2*y3b-y2*W8*sinB/rb/(W7*W7)*(1+W1/W7*W2+a*y3b/rb2)*W3+ y2*W8*sinB/rb/W7*((cosB*y3b/rb+1)/W7*W2-W1/(W7*W7)*W2*W3- 1.0/2.0*W1/W7*a/(rb*rb*rb)*2*y3b+a/rb2-a*y3b/(rb2*rb2)*2*y3b))/M_PI/(1-nu));

    out.a = b1/2*(1.0/4.0*((-2+2*nu)*N1*rFib_ry2*(cotB*cotB)+N1/W6*((1-W5)*cotB-y1/ W6*W4)-N1*(y2*y2)/(W6*W6)*((1-W5)*cotB-y1/W6*W4)/rb+N1*y2/W6* (a/(rb*rb*rb)*y2*cotB+y1/(W6*W6)*W4/rb*y2+y2/W6*a/(rb*rb*rb)*y1)+N1* cosB*cotB/W7*W2-N1*(y2*y2)*cosB*cotB/(W7*W7)*W2/rb-N1*(y2*y2)*cosB* cotB/W7*a/(rb*rb*rb)+a*W8*cotB/(rb*rb*rb)-3*a*(y2*y2)*W8*cotB/(rb*rb2*rb2)+W8/ rb/W6*(-N1*cotB+y1/W6*W5+a*y1/rb2)-(y2*y2)*W8/(rb*rb*rb)/W6*(-N1* cotB+y1/W6*W5+a*y1/rb2)-(y2*y2)*W8/rb2/(W6*W6)*(-N1*cotB+y1/ W6*W5+a*y1/rb2)+y2*W8/rb/W6*(-y1/(W6*W6)*W5/rb*y2-y2/W6* a/(rb*rb*rb)*y1-2*a*y1/(rb2*rb2)*y2)+W8/rb/W7*(cosB/W7*(W1*(N1* cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/ rb2)-(y2*y2)*W8/(rb*rb*rb)/W7*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2- 2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/rb2)-(y2*y2)*W8/rb2/ (W7*W7)*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)* cosB)-a*y3b*cosB*cotB/rb2)+y2*W8/rb/W7*(-cosB/(W7*W7)*(W1* (N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)/rb*y2+cosB/ W7*(1/rb*cosB*y2*(N1*cosB-a/rb)*cotB+W1*a/(rb*rb*rb)*y2*cotB+(2-2* nu)/rb*sinB*y2*cosB)+2*a*y3b*cosB*cotB/(rb2*rb2)*y2))/M_PI/(1-nu))+ b2/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)+nu)/rb*y2/W6-((2-2*nu)*(cotB*cotB)+1)* cosB/rb*y2/W7)-N1/(W6*W6)*(-N1*y1*cotB+nu*y3b-a+a*y1*cotB/rb+ (y1*y1)/W6*W4)/rb*y2+N1/W6*(-a*y1*cotB/(rb*rb*rb)*y2-(y1*y1)/(W6*W6) *W4/rb*y2-(y1*y1)/W6*a/(rb*rb*rb)*y2)+N1*cotB/(W7*W7)*(z1b*cosB-a* (rb*sinB-y1)/rb/cosB)/rb*y2-N1*cotB/W7*(-a/rb2*sinB*y2/ cosB+a*(rb*sinB-y1)/(rb*rb*rb)/cosB*y2)+3*a*y2*W8*cotB/(rb*rb2*rb2)*y1- W8/(W6*W6)*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6*W5-a*(y1*y1)/ (rb*rb*rb))/rb*y2+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB+a)*y2+(y1*y1)/(rb*rb2) /W6*W5*y2+(y1*y1)/rb2/(W6*W6)*W5*y2+(y1*y1)/(rb2*rb2)/W6*a*y2+3* a*(y1*y1)/(rb*rb2*rb2)*y2)-W8*cotB/(W7*W7)*(-cosB*sinB+a*y1*y3b/(rb*rb*rb)/ cosB+(rb*sinB-y1)/rb*((2-2*nu)*cosB-W1/W7*W9))/rb*y2+W8*cotB/ W7*(-3*a*y1*y3b/(rb*rb2*rb2)/cosB*y2+1/rb2*sinB*y2*((2-2*nu)*cosB- W1/W7*W9)-(rb*sinB-y1)/(rb*rb*rb)*((2-2*nu)*cosB-W1/W7*W9)*y2+(rb* sinB-y1)/rb*(-1/rb*cosB*y2/W7*W9+W1/(W7*W7)*W9/rb*y2+W1/W7* a/(rb*rb*rb)/cosB*y2)))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(1/W6*(1+a/rb)-(y2*y2)/(W6*W6)*(1+a/rb)/rb-(y2*y2)/ W6*a/(rb*rb*rb)-cosB/W7*W2+(y2*y2)*cosB/(W7*W7)*W2/rb+(y2*y2)*cosB/W7* a/(rb*rb*rb))-W8/rb*(a/rb2+1/W6)+(y2*y2)*W8/(rb*rb*rb)*(a/rb2+1/W6)- y2*W8/rb*(-2*a/(rb2*rb2)*y2-1/(W6*W6)/rb*y2)+W8*cosB/rb/W7* (W1/W7*W2+a*y3b/rb2)-(y2*y2)*W8*cosB/(rb*rb*rb)/W7*(W1/W7*W2+a* y3b/rb2)-(y2*y2)*W8*cosB/rb2/(W7*W7)*(W1/W7*W2+a*y3b/rb2)+y2* W8*cosB/rb/W7*(1/rb*cosB*y2/W7*W2-W1/(W7*W7)*W2/rb*y2-W1/ W7*a/(rb*rb*rb)*y2-2*a*y3b/(rb2*rb2)*y2))/M_PI/(1-nu))+ b1/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)-nu)/rb*y1/W6-((2-2*nu)*(cotB*cotB)+1- 2*nu)*cosB*(y1/rb-sinB)/W7)+N1/(W6*W6)*(y1*cotB*(1-W5)+nu*y3b- a+(y2*y2)/W6*W4)/rb*y1-N1/W6*((1-W5)*cotB+a*(y1*y1)*cotB/(rb*rb*rb)- (y2*y2)/(W6*W6)*W4/rb*y1-(y2*y2)/W6*a/(rb*rb*rb)*y1)-N1*cosB*cotB/W7* W2+N1*z1b*cotB/(W7*W7)*W2*(y1/rb-sinB)+N1*z1b*cotB/W7*a/(rb*rb2) *y1-a*W8*cotB/(rb*rb*rb)+3*a*(y1*y1)*W8*cotB/(rb*rb2*rb2)-W8/(W6*W6)*(-2* nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+a*(y2*y2)/(rb*rb*rb))/rb* y1+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB-a)*y1+1/rb*N1*cotB-(y2*y2)/ (rb*rb*rb)/W6*W5*y1-(y2*y2)/rb2/(W6*W6)*W5*y1-(y2*y2)/(rb2*rb2)/W6*a*y1- 3*a*(y2*y2)/(rb*rb2*rb2)*y1)-W8/(W7*W7)*((cosB*cosB)-1/rb*(N1*z1b*cotB+a* cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/rb/W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/ rb*W1))*(y1/rb-sinB)+W8/W7*(1/(rb*rb*rb)*(N1*z1b*cotB+a*cosB)* y1-1/rb*N1*cosB*cotB+a*y3b*cosB*cotB/(rb*rb*rb)-3*a*y3b*z1b*cotB/ (rb*rb2*rb2)*y1+1/(rb*rb*rb)/W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1)*y1+1/ rb/(W7*W7)*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1)*(y1/rb-sinB)-1/rb/ W7*(-a*cosB*cotB/rb*W1+a*z1b*cotB/(rb*rb*rb)*W1*y1-a*z1b*cotB/ rb2*cosB*y1)))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((2-2*nu)*N1*rFib_ry1*(cotB*cotB)-N1*y2/(W6*W6)*((W5-1)*cotB+ y1/W6*W4)/rb*y1+N1*y2/W6*(-a/(rb*rb*rb)*y1*cotB+1/W6*W4-(y1*y1) /(W6*W6)*W4/rb-(y1*y1)/W6*a/(rb*rb*rb))+N1*y2*cotB/(W7*W7)*W9*(y1/ rb-sinB)+N1*y2*cotB/W7*a/(rb*rb*rb)/cosB*y1+3*a*y2*W8*cotB/(rb*rb2*rb2) *y1-y2*W8/(rb*rb*rb)/W6*(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/ W6))*y1-y2*W8/rb2/(W6*W6)*(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/ rb+1/W6))*y1+y2*W8/rb/W6*(-2*nu/W6+2*nu*(y1*y1)/(W6*W6)/rb-a/ rb*(1/rb+1/W6)+a*(y1*y1)/(rb*rb*rb)*(1/rb+1/W6)-a*y1/rb*(-1/ (rb*rb*rb)*y1-1/(W6*W6)/rb*y1))-y2*W8*cotB/(rb*rb*rb)/W7*((-2+2*nu)* cosB+W1/W7*W9+a*y3b/rb2/cosB)*y1-y2*W8*cotB/rb/(W7*W7)*((-2+ 2*nu)*cosB+W1/W7*W9+a*y3b/rb2/cosB)*(y1/rb-sinB)+y2*W8* cotB/rb/W7*(1/rb*cosB*y1/W7*W9-W1/(W7*W7)*W9*(y1/rb-sinB)- W1/W7*a/(rb*rb*rb)/cosB*y1-2*a*y3b/(rb2*rb2)/cosB*y1))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(-sinB*(y1/rb-sinB)/W7-1/W6*(1+a/rb)+(y1*y1)/(W6*W6) *(1+a/rb)/rb+(y1*y1)/W6*a/(rb*rb*rb)+cosB/W7*W2-z1b/(W7*W7)*W2* (y1/rb-sinB)-z1b/W7*a/(rb*rb*rb)*y1)+W8/rb*(a/rb2+1/W6)-(y1*y1)* W8/(rb*rb*rb)*(a/rb2+1/W6)+y1*W8/rb*(-2*a/(rb2*rb2)*y1-1/(W6*W6)/ rb*y1)+W8/(W7*W7)*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/rb2)-1/ rb/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1))*(y1/rb-sinB)-W8/W7* (sinB*a/(rb*rb*rb)*y1+cosB/rb*(1+a*y3b/rb2)-z1b/(rb*rb*rb)*(1+a*y3b/ rb2)*y1-2*z1b/(rb*rb2*rb2)*a*y3b*y1+1/(rb*rb*rb)/W7*((y2*y2)*cosB*sinB-a* z1b/rb*W1)*y1+1/rb/(W7*W7)*((y2*y2)*cosB*sinB-a*z1b/rb*W1)* (y1/rb-sinB)-1/rb/W7*(-a*cosB/rb*W1+a*z1b/(rb*rb*rb)*W1*y1-a* z1b/rb2*cosB*y1)))/M_PI/(1-nu));

    out.b = b1/2*(1.0/4.0*((-2+2*nu)*N1*rFib_ry3*(cotB*cotB)-N1*y2/(W6*W6)*((1-W5)* cotB-y1/W6*W4)*(y3b/rb+1)+N1*y2/W6*(1.0/2.0*a/(rb*rb*rb)*2*y3b*cotB+ y1/(W6*W6)*W4*(y3b/rb+1)+1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b)-N1*y2*cosB* cotB/(W7*W7)*W2*W3-1.0/2.0*N1*y2*cosB*cotB/W7*a/(rb*rb*rb)*2*y3b+a/ (rb*rb*rb)*y2*cotB-3.0/2.0*a*y2*W8*cotB/(rb*rb2*rb2)*2*y3b+y2/rb/W6*(-N1* cotB+y1/W6*W5+a*y1/rb2)-1.0/2.0*y2*W8/(rb*rb*rb)/W6*(-N1*cotB+y1/ W6*W5+a*y1/rb2)*2*y3b-y2*W8/rb/(W6*W6)*(-N1*cotB+y1/W6*W5+ a*y1/rb2)*(y3b/rb+1)+y2*W8/rb/W6*(-y1/(W6*W6)*W5*(y3b/rb+ 1)-1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b-a*y1/(rb2*rb2)*2*y3b)+y2/rb/W7* (cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)- a*y3b*cosB*cotB/rb2)-1.0/2.0*y2*W8/(rb*rb*rb)/W7*(cosB/W7*(W1*(N1* cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/ rb2)*2*y3b-y2*W8/rb/(W7*W7)*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+ (2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/rb2)*W3+y2*W8/rb/ W7*(-cosB/(W7*W7)*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)* cosB)*W3+cosB/W7*((cosB*y3b/rb+1)*(N1*cosB-a/rb)*cotB+1.0/2.0*W1* a/(rb*rb*rb)*2*y3b*cotB+1.0/2.0*(2-2*nu)/rb*sinB*2*y3b*cosB)-a*cosB* cotB/rb2+a*y3b*cosB*cotB/(rb2*rb2)*2*y3b))/M_PI/(1-nu))+ b2/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)+nu)*(y3b/rb+1)/W6-((2-2*nu)*(cotB*cotB) +1)*cosB*W3/W7)-N1/(W6*W6)*(-N1*y1*cotB+nu*y3b-a+a*y1*cotB/ rb+(y1*y1)/W6*W4)*(y3b/rb+1)+N1/W6*(nu-1.0/2.0*a*y1*cotB/(rb*rb*rb)*2* y3b-(y1*y1)/(W6*W6)*W4*(y3b/rb+1)-1.0/2.0*(y1*y1)/W6*a/(rb*rb*rb)*2*y3b)+ N1*cotB/(W7*W7)*(z1b*cosB-a*(rb*sinB-y1)/rb/cosB)*W3-N1*cotB/ W7*(cosB*sinB-1.0/2.0*a/rb2*sinB*2*y3b/cosB+1.0/2.0*a*(rb*sinB-y1)/ (rb*rb*rb)/cosB*2*y3b)-a/(rb*rb*rb)*y1*cotB+3.0/2.0*a*y1*W8*cotB/(rb*rb2*rb2)*2* y3b+1/W6*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6*W5-a*(y1*y1)/ (rb*rb*rb))-W8/(W6*W6)*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6*W5-a* (y1*y1)/(rb*rb*rb))*(y3b/rb+1)+W8/W6*(-1.0/2.0/(rb*rb*rb)*(N1*y1*cotB+a)*2* y3b+1.0/2.0*(y1*y1)/(rb*rb*rb)/W6*W5*2*y3b+(y1*y1)/rb/(W6*W6)*W5*(y3b/rb+ 1)+1.0/2.0*(y1*y1)/(rb2*rb2)/W6*a*2*y3b+3.0/2.0*a*(y1*y1)/(rb*rb2*rb2)*2*y3b)+ cotB/W7*(-cosB*sinB+a*y1*y3b/(rb*rb*rb)/cosB+(rb*sinB-y1)/rb*((2- 2*nu)*cosB-W1/W7*W9))-W8*cotB/(W7*W7)*(-cosB*sinB+a*y1*y3b/(rb*rb2) /cosB+(rb*sinB-y1)/rb*((2-2*nu)*cosB-W1/W7*W9))*W3+W8*cotB/ W7*(a/(rb*rb*rb)/cosB*y1-3.0/2.0*a*y1*y3b/(rb*rb2*rb2)/cosB*2*y3b+1.0/2.0/ rb2*sinB*2*y3b*((2-2*nu)*cosB-W1/W7*W9)-1.0/2.0*(rb*sinB-y1)/(rb*rb2) *((2-2*nu)*cosB-W1/W7*W9)*2*y3b+(rb*sinB-y1)/rb*(-(cosB*y3b/ rb+1)/W7*W9+W1/(W7*W7)*W9*W3+1.0/2.0*W1/W7*a/(rb*rb*rb)/cosB*2* y3b)))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(-y2/(W6*W6)*(1+a/rb)*(y3b/rb+1)-1.0/2.0*y2/W6*a/ (rb*rb*rb)*2*y3b+y2*cosB/(W7*W7)*W2*W3+1.0/2.0*y2*cosB/W7*a/(rb*rb*rb)*2* y3b)-y2/rb*(a/rb2+1/W6)+1.0/2.0*y2*W8/(rb*rb*rb)*(a/rb2+1/W6)*2* y3b-y2*W8/rb*(-a/(rb2*rb2)*2*y3b-1/(W6*W6)*(y3b/rb+1))+y2*cosB/ rb/W7*(W1/W7*W2+a*y3b/rb2)-1.0/2.0*y2*W8*cosB/(rb*rb*rb)/W7*(W1/ W7*W2+a*y3b/rb2)*2*y3b-y2*W8*cosB/rb/(W7*W7)*(W1/W7*W2+a* y3b/rb2)*W3+y2*W8*cosB/rb/W7*((cosB*y3b/rb+1)/W7*W2-W1/ (W7*W7)*W2*W3-1.0/2.0*W1/W7*a/(rb*rb*rb)*2*y3b+a/rb2-a*y3b/(rb2*rb2)*2* y3b))/M_PI/(1-nu))+ b1/2.0*(1.0/4.0*((2-2*nu)*(N1*rFib_ry1*cotB-y1/(W6*W6)*W5/rb*y2-y2/W6* a/(rb*rb*rb)*y1+y2*cosB/(W7*W7)*W2*(y1/rb-sinB)+y2*cosB/W7*a/(rb*rb2) *y1)-y2*W8/(rb*rb*rb)*(2*nu/W6+a/rb2)*y1+y2*W8/rb*(-2*nu/(W6*W6) /rb*y1-2*a/(rb2*rb2)*y1)-y2*W8*cosB/(rb*rb*rb)/W7*(1-2*nu-W1/W7* W2-a*y3b/rb2)*y1-y2*W8*cosB/rb/(W7*W7)*(1-2*nu-W1/W7*W2-a* y3b/rb2)*(y1/rb-sinB)+y2*W8*cosB/rb/W7*(-1/rb*cosB*y1/W7* W2+W1/(W7*W7)*W2*(y1/rb-sinB)+W1/W7*a/(rb*rb*rb)*y1+2*a*y3b/(rb2*rb2) *y1))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((-2+2*nu)*N1*cotB*(1/rb*y1/W6-cosB*(y1/rb-sinB)/W7)- (2-2*nu)/W6*W5+(2-2*nu)*(y1*y1)/(W6*W6)*W5/rb+(2-2*nu)*(y1*y1)/W6* a/(rb*rb*rb)+(2-2*nu)*cosB/W7*W2-(2-2*nu)*z1b/(W7*W7)*W2*(y1/rb- sinB)-(2-2*nu)*z1b/W7*a/(rb*rb*rb)*y1-W8/(rb*rb*rb)*(N1*cotB-2*nu*y1/ W6-a*y1/rb2)*y1+W8/rb*(-2*nu/W6+2*nu*(y1*y1)/(W6*W6)/rb-a/rb2+ 2*a*(y1*y1)/(rb2*rb2))+W8/(W7*W7)*(cosB*sinB+W1*cotB/rb*((2-2*nu)* cosB-W1/W7)+a/rb*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7))*(y1/rb- sinB)-W8/W7*(1/rb2*cosB*y1*cotB*((2-2*nu)*cosB-W1/W7)-W1* cotB/(rb*rb*rb)*((2-2*nu)*cosB-W1/W7)*y1+W1*cotB/rb*(-1/rb*cosB* y1/W7+W1/(W7*W7)*(y1/rb-sinB))-a/(rb*rb*rb)*(sinB-y3b*z1b/rb2- z1b*W1/rb/W7)*y1+a/rb*(-y3b*cosB/rb2+2*y3b*z1b/(rb2*rb2)*y1- cosB*W1/rb/W7-z1b/rb2*cosB*y1/W7+z1b*W1/(rb*rb*rb)/W7*y1+z1b* W1/rb/(W7*W7)*(y1/rb-sinB))))/M_PI/(1-nu))+ b3/2*(1.0/4.0*((2-2*nu)*rFib_ry1-(2-2*nu)*y2*sinB/(W7*W7)*W2*(y1/rb- sinB)-(2-2*nu)*y2*sinB/W7*a/(rb*rb*rb)*y1-y2*W8*sinB/(rb*rb*rb)/W7*(1+ W1/W7*W2+a*y3b/rb2)*y1-y2*W8*sinB/rb/(W7*W7)*(1+W1/W7*W2+ a*y3b/rb2)*(y1/rb-sinB)+y2*W8*sinB/rb/W7*(1/rb*cosB*y1/ W7*W2-W1/(W7*W7)*W2*(y1/rb-sinB)-W1/W7*a/(rb*rb*rb)*y1-2*a*y3b/ (rb2*rb2)*y1))/M_PI/(1-nu));

    out.c = b1/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)-nu)*(y3b/rb+1)/W6-((2-2*nu)* (cotB*cotB)+1-2*nu)*cosB*W3/W7)+N1/(W6*W6)*(y1*cotB*(1-W5)+nu*y3b-a+ (y2*y2)/W6*W4)*(y3b/rb+1)-N1/W6*(1.0/2.0*a*y1*cotB/(rb*rb*rb)*2*y3b+ nu-(y2*y2)/(W6*W6)*W4*(y3b/rb+1)-1.0/2.0*(y2*y2)/W6*a/(rb*rb*rb)*2*y3b)-N1* sinB*cotB/W7*W2+N1*z1b*cotB/(W7*W7)*W2*W3+1.0/2.0*N1*z1b*cotB/W7* a/(rb*rb*rb)*2*y3b-a/(rb*rb*rb)*y1*cotB+3.0/2.0*a*y1*W8*cotB/(rb*rb2*rb2)*2*y3b+ 1/W6*(-2*nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+a*(y2*y2)/ (rb*rb*rb))-W8/(W6*W6)*(-2*nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+ a*(y2*y2)/(rb*rb*rb))*(y3b/rb+1)+W8/W6*(-1.0/2.0/(rb*rb*rb)*(N1*y1*cotB-a)* 2*y3b-1.0/2.0*(y2*y2)/(rb*rb*rb)/W6*W5*2*y3b-(y2*y2)/rb/(W6*W6)*W5*(y3b/ rb+1)-1.0/2.0*(y2*y2)/(rb2*rb2)/W6*a*2*y3b-3.0/2.0*a*(y2*y2)/(rb*rb2*rb2)*2*y3b)+ 1/W7*((cosB*cosB)-1/rb*(N1*z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb2) -1/rb/W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1))-W8/(W7*W7)*((cosB*cosB)- 1/rb*(N1*z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/rb/W7* ((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1))*W3+W8/W7*(1.0/2.0/(rb*rb*rb)*(N1* z1b*cotB+a*cosB)*2*y3b-1/rb*N1*sinB*cotB+a*z1b*cotB/(rb*rb*rb)+a* y3b*sinB*cotB/(rb*rb*rb)-3.0/2.0*a*y3b*z1b*cotB/(rb*rb2*rb2)*2*y3b+1.0/2.0/(rb*rb2) /W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1)*2*y3b+1/rb/(W7*W7)*((y2*y2) *(cosB*cosB)-a*z1b*cotB/rb*W1)*W3-1/rb/W7*(-a*sinB*cotB/rb*W1+ 1.0/2.0*a*z1b*cotB/(rb*rb*rb)*W1*2*y3b-a*z1b*cotB/rb*(cosB*y3b/rb+ 1))))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((2-2*nu)*N1*rFib_ry3*(cotB*cotB)-N1*y2/(W6*W6)*((W5-1)*cotB+ y1/W6*W4)*(y3b/rb+1)+N1*y2/W6*(-1.0/2.0*a/(rb*rb*rb)*2*y3b*cotB-y1/ (W6*W6)*W4*(y3b/rb+1)-1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b)+N1*y2*cotB/ (W7*W7)*W9*W3+1.0/2.0*N1*y2*cotB/W7*a/(rb*rb*rb)/cosB*2*y3b-a/(rb*rb*rb)* y2*cotB+3.0/2.0*a*y2*W8*cotB/(rb*rb2*rb2)*2*y3b+y2/rb/W6*(N1*cotB-2* nu*y1/W6-a*y1/rb*(1/rb+1/W6))-1.0/2.0*y2*W8/(rb*rb*rb)/W6*(N1* cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))*2*y3b-y2*W8/rb/(W6*W6) *(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))*(y3b/rb+1)+y2* W8/rb/W6*(2*nu*y1/(W6*W6)*(y3b/rb+1)+1.0/2.0*a*y1/(rb*rb*rb)*(1/rb+ 1/W6)*2*y3b-a*y1/rb*(-1.0/2.0/(rb*rb*rb)*2*y3b-1/(W6*W6)*(y3b/rb+ 1)))+y2*cotB/rb/W7*((-2+2*nu)*cosB+W1/W7*W9+a*y3b/rb2/cosB)- 1.0/2.0*y2*W8*cotB/(rb*rb*rb)/W7*((-2+2*nu)*cosB+W1/W7*W9+a*y3b/ rb2/cosB)*2*y3b-y2*W8*cotB/rb/(W7*W7)*((-2+2*nu)*cosB+W1/W7* W9+a*y3b/rb2/cosB)*W3+y2*W8*cotB/rb/W7*((cosB*y3b/rb+1)/ W7*W9-W1/(W7*W7)*W9*W3-1.0/2.0*W1/W7*a/(rb*rb*rb)/cosB*2*y3b+a/rb2/ cosB-a*y3b/(rb2*rb2)/cosB*2*y3b))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(-sinB*W3/W7+y1/(W6*W6)*(1+a/rb)*(y3b/rb+1)+ 1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b+sinB/W7*W2-z1b/(W7*W7)*W2*W3-1.0/2.0* z1b/W7*a/(rb*rb*rb)*2*y3b)+y1/rb*(a/rb2+1/W6)-1.0/2.0*y1*W8/(rb*rb2) *(a/rb2+1/W6)*2*y3b+y1*W8/rb*(-a/(rb2*rb2)*2*y3b-1/(W6*W6)* (y3b/rb+1))-1/W7*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/rb2)-1/ rb/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1))+W8/(W7*W7)*(sinB*(cosB- a/rb)+z1b/rb*(1+a*y3b/rb2)-1/rb/W7*((y2*y2)*cosB*sinB-a*z1b/ rb*W1))*W3-W8/W7*(1.0/2.0*sinB*a/(rb*rb*rb)*2*y3b+sinB/rb*(1+a*y3b/ rb2)-1.0/2.0*z1b/(rb*rb*rb)*(1+a*y3b/rb2)*2*y3b+z1b/rb*(a/rb2-a* y3b/(rb2*rb2)*2*y3b)+1.0/2.0/(rb*rb*rb)/W7*((y2*y2)*cosB*sinB-a*z1b/rb* W1)*2*y3b+1/rb/(W7*W7)*((y2*y2)*cosB*sinB-a*z1b/rb*W1)*W3-1/ rb/W7*(-a*sinB/rb*W1+1.0/2.0*a*z1b/(rb*rb*rb)*W1*2*y3b-a*z1b/rb* (cosB*y3b/rb+1))))/M_PI/(1-nu))+ b1/2.0*(1.0/4.0*((2-2*nu)*(N1*rFib_ry2*cotB+1/W6*W5-(y2*y2)/(W6*W6)*W5/ rb-(y2*y2)/W6*a/(rb*rb*rb)-cosB/W7*W2+(y2*y2)*cosB/(W7*W7)*W2/rb+(y2*y2)* cosB/W7*a/(rb*rb*rb))+W8/rb*(2*nu/W6+a/rb2)-(y2*y2)*W8/(rb*rb*rb)*(2* nu/W6+a/rb2)+y2*W8/rb*(-2*nu/(W6*W6)/rb*y2-2*a/(rb2*rb2)*y2)+ W8*cosB/rb/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)-(y2*y2)*W8*cosB/ (rb*rb*rb)/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)-(y2*y2)*W8*cosB/rb2/(W7*W7) *(1-2*nu-W1/W7*W2-a*y3b/rb2)+y2*W8*cosB/rb/W7*(-1/rb* cosB*y2/W7*W2+W1/(W7*W7)*W2/rb*y2+W1/W7*a/(rb*rb*rb)*y2+2*a* y3b/(rb2*rb2)*y2))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((-2+2*nu)*N1*cotB*(1/rb*y2/W6-cosB/rb*y2/W7)+(2- 2*nu)*y1/(W6*W6)*W5/rb*y2+(2-2*nu)*y1/W6*a/(rb*rb*rb)*y2-(2-2* nu)*z1b/(W7*W7)*W2/rb*y2-(2-2*nu)*z1b/W7*a/(rb*rb*rb)*y2-W8/(rb*rb2) *(N1*cotB-2*nu*y1/W6-a*y1/rb2)*y2+W8/rb*(2*nu*y1/(W6*W6)/ rb*y2+2*a*y1/(rb2*rb2)*y2)+W8/(W7*W7)*(cosB*sinB+W1*cotB/rb*((2- 2*nu)*cosB-W1/W7)+a/rb*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7))/ rb*y2-W8/W7*(1/rb2*cosB*y2*cotB*((2-2*nu)*cosB-W1/W7)-W1* cotB/(rb*rb*rb)*((2-2*nu)*cosB-W1/W7)*y2+W1*cotB/rb*(-cosB/rb* y2/W7+W1/(W7*W7)/rb*y2)-a/(rb*rb*rb)*(sinB-y3b*z1b/rb2-z1b*W1/ rb/W7)*y2+a/rb*(2*y3b*z1b/(rb2*rb2)*y2-z1b/rb2*cosB*y2/W7+ z1b*W1/(rb*rb*rb)/W7*y2+z1b*W1/rb2/(W7*W7)*y2)))/M_PI/(1-nu))+ b3/2*(1.0/4.0*((2-2*nu)*rFib_ry2+(2-2*nu)*sinB/W7*W2-(2-2*nu)*(y2*y2)* sinB/(W7*W7)*W2/rb-(2-2*nu)*(y2*y2)*sinB/W7*a/(rb*rb*rb)+W8*sinB/rb/ W7*(1+W1/W7*W2+a*y3b/rb2)-(y2*y2)*W8*sinB/(rb*rb*rb)/W7*(1+W1/ W7*W2+a*y3b/rb2)-(y2*y2)*W8*sinB/rb2/(W7*W7)*(1+W1/W7*W2+a* y3b/rb2)+y2*W8*sinB/rb/W7*(1/rb*cosB*y2/W7*W2-W1/(W7*W7)* W2/rb*y2-W1/W7*a/(rb*rb*rb)*y2-2*a*y3b/(rb2*rb2)*y2))/M_PI/(1-nu));

    return out;
}

WITHIN_KERNEL Real3 AngSetupFSC(Real3 obs, Real3 slip, Real3 PA, Real3 PB, Real nu) {
    Real3 SideVec = sub3(PB, PA);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real beta = acos(-dot3(normalize3(SideVec), eZ));
    if (fabs(beta) < EPS || fabs(M_PI-beta) < EPS) {
        return make3(0.0f, 0.0f, 0.0f);
    }
    Real3 ey1 = SideVec;
    ey1.z = 0;
    ey1 = normalize3(ey1);

    Real3 ey3 = negate3(eZ);
    Real3 ey2 = cross3(ey3, ey1);

    Real3 yA = transform3(ey1, ey2, ey3, sub3(obs, PA));
    Real3 yAB = transform3(ey1, ey2, ey3, SideVec);
    Real3 yB = sub3(yA, yAB);

    Real3 slip_adcs = transform3(ey1, ey2, ey3, slip);

    Real configuration = beta;
    if (beta*yA.x >= 0) {
        configuration = -M_PI+beta;
    }

    Real3 vA = AngDisDispFSC(
        yA.x, yA.y, yA.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PA.z
    );
    Real3 vB = AngDisDispFSC(
        yB.x, yB.y, yB.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PB.z
    );

    Real3 v = sub3(vB, vA);
    return inv_transform3(ey1, ey2, ey3, v);
}

WITHIN_KERNEL Real6 AngSetupFSC_S(Real3 obs, Real3 slip, Real3 PA, Real3 PB, Real nu) {
    Real3 SideVec = sub3(PB, PA);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real beta = acos(-dot3(normalize3(SideVec), eZ));
    if (fabs(beta) < EPS || fabs(M_PI-beta) < EPS) {
        return make6(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
    }
    Real3 ey1 = SideVec;
    ey1.z = 0;
    ey1 = normalize3(ey1);

    Real3 ey3 = negate3(eZ);
    Real3 ey2 = cross3(ey3, ey1);

    Real3 yA = transform3(ey1, ey2, ey3, sub3(obs, PA));
    Real3 yAB = transform3(ey1, ey2, ey3, SideVec);
    Real3 yB = sub3(yA, yAB);

    Real3 slip_adcs = transform3(ey1, ey2, ey3, slip);

    Real configuration = beta;
    if (beta*yA.x >= 0) {
        configuration = -M_PI+beta;
    }

    Real6 vA = AngDisStrainFSC(
        yA.x, yA.y, yA.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PA.z
    );
    Real6 vB = AngDisStrainFSC(
        yB.x, yB.y, yB.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PB.z
    );
    Real6 v = sub6(vB, vA);
    return tensor_transform3(ey1, ey2, ey3, v);
}

#endif





KERNEL
void free_disp_fs(GLOBAL_MEM Real* results, 
    int n_obs, int n_src, int src_start, int src_end,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM Real* slips,
    Real nu)
{
    int i = get_global_id(0);
    int group_id = get_local_id(0);
    int block_size = get_local_size(0);

        Real sum0 = 0.0;
        Real kahanc0 = 0.0;
        Real sum1 = 0.0;
        Real kahanc1 = 0.0;
        Real sum2 = 0.0;
        Real kahanc2 = 0.0;

    Real3 obs;
    if (i < n_obs) {
            obs.x = obs_pts[i * 3 + 0];
            obs.y = obs_pts[i * 3 + 1];
            obs.z = obs_pts[i * 3 + 2];
    }

        LOCAL_MEM Real3 sh_tri0[256];
        LOCAL_MEM Real3 sh_tri1[256];
        LOCAL_MEM Real3 sh_tri2[256];
    LOCAL_MEM Real3 sh_slips[256];

    // NOTE: The blocking scheme set up here seems to be irrelevant because the
    // runtime is totally dominated by the floating point operations inside the
    // TDE evaluation.
    for (int block_start = src_start; block_start < src_end; block_start += block_size) {
        int j = block_start + group_id;
        if (j < src_end) {
                    sh_tri0[group_id].x = tris[j * 9 + 0 * 3 + 0];
                    sh_tri0[group_id].y = tris[j * 9 + 0 * 3 + 1];
                    sh_tri0[group_id].z = tris[j * 9 + 0 * 3 + 2];
                sh_slips[group_id].y = slips[j * 3 + 0];
                    sh_tri1[group_id].x = tris[j * 9 + 1 * 3 + 0];
                    sh_tri1[group_id].y = tris[j * 9 + 1 * 3 + 1];
                    sh_tri1[group_id].z = tris[j * 9 + 1 * 3 + 2];
                sh_slips[group_id].z = slips[j * 3 + 1];
                    sh_tri2[group_id].x = tris[j * 9 + 2 * 3 + 0];
                    sh_tri2[group_id].y = tris[j * 9 + 2 * 3 + 1];
                    sh_tri2[group_id].z = tris[j * 9 + 2 * 3 + 2];
                sh_slips[group_id].x = slips[j * 3 + 2];
        }

        


        if (i < n_obs) {
            int block_end = min(src_end, block_start + block_size);
            int block_length = block_end - block_start;
            for (int block_idx = 0; block_idx < block_length; block_idx++) {
                    Real3 tri0 = sh_tri0[block_idx];
                    Real3 tri1 = sh_tri1[block_idx];
                    Real3 tri2 = sh_tri2[block_idx];

                Real3 slip = sh_slips[block_idx];

                
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


                {
                    Real input = full_out.x;
                    Real y = input - kahanc0;
                    Real t = sum0 + y;
                    kahanc0 = (t - sum0) - y;
                    sum0 = t;
                }
                {
                    Real input = full_out.y;
                    Real y = input - kahanc1;
                    Real t = sum1 + y;
                    kahanc1 = (t - sum1) - y;
                    sum1 = t;
                }
                {
                    Real input = full_out.z;
                    Real y = input - kahanc2;
                    Real t = sum2 + y;
                    kahanc2 = (t - sum2) - y;
                    sum2 = t;
                }
            }
        }
    }

    if (i < n_obs) {
            results[i * 3 + 0] = sum0;
            results[i * 3 + 1] = sum1;
            results[i * 3 + 2] = sum2;
    }
}


KERNEL
void free_disp_hs(GLOBAL_MEM Real* results, 
    int n_obs, int n_src, int src_start, int src_end,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM Real* slips,
    Real nu)
{
    int i = get_global_id(0);
    int group_id = get_local_id(0);
    int block_size = get_local_size(0);

        Real sum0 = 0.0;
        Real kahanc0 = 0.0;
        Real sum1 = 0.0;
        Real kahanc1 = 0.0;
        Real sum2 = 0.0;
        Real kahanc2 = 0.0;

    Real3 obs;
    if (i < n_obs) {
            obs.x = obs_pts[i * 3 + 0];
            obs.y = obs_pts[i * 3 + 1];
            obs.z = obs_pts[i * 3 + 2];
    }

        LOCAL_MEM Real3 sh_tri0[256];
        LOCAL_MEM Real3 sh_tri1[256];
        LOCAL_MEM Real3 sh_tri2[256];
    LOCAL_MEM Real3 sh_slips[256];

    // NOTE: The blocking scheme set up here seems to be irrelevant because the
    // runtime is totally dominated by the floating point operations inside the
    // TDE evaluation.
    for (int block_start = src_start; block_start < src_end; block_start += block_size) {
        int j = block_start + group_id;
        if (j < src_end) {
                    sh_tri0[group_id].x = tris[j * 9 + 0 * 3 + 0];
                    sh_tri0[group_id].y = tris[j * 9 + 0 * 3 + 1];
                    sh_tri0[group_id].z = tris[j * 9 + 0 * 3 + 2];
                sh_slips[group_id].y = slips[j * 3 + 0];
                    sh_tri1[group_id].x = tris[j * 9 + 1 * 3 + 0];
                    sh_tri1[group_id].y = tris[j * 9 + 1 * 3 + 1];
                    sh_tri1[group_id].z = tris[j * 9 + 1 * 3 + 2];
                sh_slips[group_id].z = slips[j * 3 + 1];
                    sh_tri2[group_id].x = tris[j * 9 + 2 * 3 + 0];
                    sh_tri2[group_id].y = tris[j * 9 + 2 * 3 + 1];
                    sh_tri2[group_id].z = tris[j * 9 + 2 * 3 + 2];
                sh_slips[group_id].x = slips[j * 3 + 2];
        }

        


        if (i < n_obs) {
            int block_end = min(src_end, block_start + block_size);
            int block_length = block_end - block_start;
            for (int block_idx = 0; block_idx < block_length; block_idx++) {
                    Real3 tri0 = sh_tri0[block_idx];
                    Real3 tri1 = sh_tri1[block_idx];
                    Real3 tri2 = sh_tri2[block_idx];

                Real3 slip = sh_slips[block_idx];

                
    Real3 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real3 uvw0 = AngSetupFSC(obs, efcs_slip, tri0, tri1, nu);
        Real3 uvw1 = AngSetupFSC(obs, efcs_slip, tri1, tri2, nu);
        Real3 uvw2 = AngSetupFSC(obs, efcs_slip, tri2, tri0, nu);
        Real3 fsc_term = add3(add3(uvw0, uvw1), uvw2);

        summed_terms = add3(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add3(summed_terms, full_out);
    }
    Real3 full_out = summed_terms;


                {
                    Real input = full_out.x;
                    Real y = input - kahanc0;
                    Real t = sum0 + y;
                    kahanc0 = (t - sum0) - y;
                    sum0 = t;
                }
                {
                    Real input = full_out.y;
                    Real y = input - kahanc1;
                    Real t = sum1 + y;
                    kahanc1 = (t - sum1) - y;
                    sum1 = t;
                }
                {
                    Real input = full_out.z;
                    Real y = input - kahanc2;
                    Real t = sum2 + y;
                    kahanc2 = (t - sum2) - y;
                    sum2 = t;
                }
            }
        }
    }

    if (i < n_obs) {
            results[i * 3 + 0] = sum0;
            results[i * 3 + 1] = sum1;
            results[i * 3 + 2] = sum2;
    }
}


KERNEL
void free_strain_fs(GLOBAL_MEM Real* results, 
    int n_obs, int n_src, int src_start, int src_end,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM Real* slips,
    Real nu)
{
    int i = get_global_id(0);
    int group_id = get_local_id(0);
    int block_size = get_local_size(0);

        Real sum0 = 0.0;
        Real kahanc0 = 0.0;
        Real sum1 = 0.0;
        Real kahanc1 = 0.0;
        Real sum2 = 0.0;
        Real kahanc2 = 0.0;
        Real sum3 = 0.0;
        Real kahanc3 = 0.0;
        Real sum4 = 0.0;
        Real kahanc4 = 0.0;
        Real sum5 = 0.0;
        Real kahanc5 = 0.0;

    Real3 obs;
    if (i < n_obs) {
            obs.x = obs_pts[i * 3 + 0];
            obs.y = obs_pts[i * 3 + 1];
            obs.z = obs_pts[i * 3 + 2];
    }

        LOCAL_MEM Real3 sh_tri0[256];
        LOCAL_MEM Real3 sh_tri1[256];
        LOCAL_MEM Real3 sh_tri2[256];
    LOCAL_MEM Real3 sh_slips[256];

    // NOTE: The blocking scheme set up here seems to be irrelevant because the
    // runtime is totally dominated by the floating point operations inside the
    // TDE evaluation.
    for (int block_start = src_start; block_start < src_end; block_start += block_size) {
        int j = block_start + group_id;
        if (j < src_end) {
                    sh_tri0[group_id].x = tris[j * 9 + 0 * 3 + 0];
                    sh_tri0[group_id].y = tris[j * 9 + 0 * 3 + 1];
                    sh_tri0[group_id].z = tris[j * 9 + 0 * 3 + 2];
                sh_slips[group_id].y = slips[j * 3 + 0];
                    sh_tri1[group_id].x = tris[j * 9 + 1 * 3 + 0];
                    sh_tri1[group_id].y = tris[j * 9 + 1 * 3 + 1];
                    sh_tri1[group_id].z = tris[j * 9 + 1 * 3 + 2];
                sh_slips[group_id].z = slips[j * 3 + 1];
                    sh_tri2[group_id].x = tris[j * 9 + 2 * 3 + 0];
                    sh_tri2[group_id].y = tris[j * 9 + 2 * 3 + 1];
                    sh_tri2[group_id].z = tris[j * 9 + 2 * 3 + 2];
                sh_slips[group_id].x = slips[j * 3 + 2];
        }

        


        if (i < n_obs) {
            int block_end = min(src_end, block_start + block_size);
            int block_length = block_end - block_start;
            for (int block_idx = 0; block_idx < block_length; block_idx++) {
                    Real3 tri0 = sh_tri0[block_idx];
                    Real3 tri1 = sh_tri1[block_idx];
                    Real3 tri2 = sh_tri2[block_idx];

                Real3 slip = sh_slips[block_idx];

                
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


                {
                    Real input = full_out.x;
                    Real y = input - kahanc0;
                    Real t = sum0 + y;
                    kahanc0 = (t - sum0) - y;
                    sum0 = t;
                }
                {
                    Real input = full_out.y;
                    Real y = input - kahanc1;
                    Real t = sum1 + y;
                    kahanc1 = (t - sum1) - y;
                    sum1 = t;
                }
                {
                    Real input = full_out.z;
                    Real y = input - kahanc2;
                    Real t = sum2 + y;
                    kahanc2 = (t - sum2) - y;
                    sum2 = t;
                }
                {
                    Real input = full_out.a;
                    Real y = input - kahanc3;
                    Real t = sum3 + y;
                    kahanc3 = (t - sum3) - y;
                    sum3 = t;
                }
                {
                    Real input = full_out.b;
                    Real y = input - kahanc4;
                    Real t = sum4 + y;
                    kahanc4 = (t - sum4) - y;
                    sum4 = t;
                }
                {
                    Real input = full_out.c;
                    Real y = input - kahanc5;
                    Real t = sum5 + y;
                    kahanc5 = (t - sum5) - y;
                    sum5 = t;
                }
            }
        }
    }

    if (i < n_obs) {
            results[i * 6 + 0] = sum0;
            results[i * 6 + 1] = sum1;
            results[i * 6 + 2] = sum2;
            results[i * 6 + 3] = sum3;
            results[i * 6 + 4] = sum4;
            results[i * 6 + 5] = sum5;
    }
}


KERNEL
void free_strain_hs(GLOBAL_MEM Real* results, 
    int n_obs, int n_src, int src_start, int src_end,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM Real* slips,
    Real nu)
{
    int i = get_global_id(0);
    int group_id = get_local_id(0);
    int block_size = get_local_size(0);

        Real sum0 = 0.0;
        Real kahanc0 = 0.0;
        Real sum1 = 0.0;
        Real kahanc1 = 0.0;
        Real sum2 = 0.0;
        Real kahanc2 = 0.0;
        Real sum3 = 0.0;
        Real kahanc3 = 0.0;
        Real sum4 = 0.0;
        Real kahanc4 = 0.0;
        Real sum5 = 0.0;
        Real kahanc5 = 0.0;

    Real3 obs;
    if (i < n_obs) {
            obs.x = obs_pts[i * 3 + 0];
            obs.y = obs_pts[i * 3 + 1];
            obs.z = obs_pts[i * 3 + 2];
    }

        LOCAL_MEM Real3 sh_tri0[256];
        LOCAL_MEM Real3 sh_tri1[256];
        LOCAL_MEM Real3 sh_tri2[256];
    LOCAL_MEM Real3 sh_slips[256];

    // NOTE: The blocking scheme set up here seems to be irrelevant because the
    // runtime is totally dominated by the floating point operations inside the
    // TDE evaluation.
    for (int block_start = src_start; block_start < src_end; block_start += block_size) {
        int j = block_start + group_id;
        if (j < src_end) {
                    sh_tri0[group_id].x = tris[j * 9 + 0 * 3 + 0];
                    sh_tri0[group_id].y = tris[j * 9 + 0 * 3 + 1];
                    sh_tri0[group_id].z = tris[j * 9 + 0 * 3 + 2];
                sh_slips[group_id].y = slips[j * 3 + 0];
                    sh_tri1[group_id].x = tris[j * 9 + 1 * 3 + 0];
                    sh_tri1[group_id].y = tris[j * 9 + 1 * 3 + 1];
                    sh_tri1[group_id].z = tris[j * 9 + 1 * 3 + 2];
                sh_slips[group_id].z = slips[j * 3 + 1];
                    sh_tri2[group_id].x = tris[j * 9 + 2 * 3 + 0];
                    sh_tri2[group_id].y = tris[j * 9 + 2 * 3 + 1];
                    sh_tri2[group_id].z = tris[j * 9 + 2 * 3 + 2];
                sh_slips[group_id].x = slips[j * 3 + 2];
        }

        


        if (i < n_obs) {
            int block_end = min(src_end, block_start + block_size);
            int block_length = block_end - block_start;
            for (int block_idx = 0; block_idx < block_length; block_idx++) {
                    Real3 tri0 = sh_tri0[block_idx];
                    Real3 tri1 = sh_tri1[block_idx];
                    Real3 tri2 = sh_tri2[block_idx];

                Real3 slip = sh_slips[block_idx];

                
    Real6 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real6 uvw0 = AngSetupFSC_S(obs, efcs_slip, tri0, tri1, nu);
        Real6 uvw1 = AngSetupFSC_S(obs, efcs_slip, tri1, tri2, nu);
        Real6 uvw2 = AngSetupFSC_S(obs, efcs_slip, tri2, tri0, nu);
        Real6 fsc_term = add6(add6(uvw0, uvw1), uvw2);

        summed_terms = add6(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add6(summed_terms, full_out);
    }
    Real6 full_out = summed_terms;


                {
                    Real input = full_out.x;
                    Real y = input - kahanc0;
                    Real t = sum0 + y;
                    kahanc0 = (t - sum0) - y;
                    sum0 = t;
                }
                {
                    Real input = full_out.y;
                    Real y = input - kahanc1;
                    Real t = sum1 + y;
                    kahanc1 = (t - sum1) - y;
                    sum1 = t;
                }
                {
                    Real input = full_out.z;
                    Real y = input - kahanc2;
                    Real t = sum2 + y;
                    kahanc2 = (t - sum2) - y;
                    sum2 = t;
                }
                {
                    Real input = full_out.a;
                    Real y = input - kahanc3;
                    Real t = sum3 + y;
                    kahanc3 = (t - sum3) - y;
                    sum3 = t;
                }
                {
                    Real input = full_out.b;
                    Real y = input - kahanc4;
                    Real t = sum4 + y;
                    kahanc4 = (t - sum4) - y;
                    sum4 = t;
                }
                {
                    Real input = full_out.c;
                    Real y = input - kahanc5;
                    Real t = sum5 + y;
                    kahanc5 = (t - sum5) - y;
                    sum5 = t;
                }
            }
        }
    }

    if (i < n_obs) {
            results[i * 6 + 0] = sum0;
            results[i * 6 + 1] = sum1;
            results[i * 6 + 2] = sum2;
            results[i * 6 + 3] = sum3;
            results[i * 6 + 4] = sum4;
            results[i * 6 + 5] = sum5;
    }
}







#ifndef CUTDE_COMMON
#define CUTDE_COMMON



#define Real double
#define EPS 2.220446049250313e-16

#ifndef M_PI
  #define M_PI   3.14159265358979323846264338327950288
#endif

typedef struct Real2 {
    Real x;
    Real y;
} Real2;

typedef struct Real3 {
    Real x;
    Real y;
    Real z;
} Real3;

typedef struct Real6 {
    Real x;
    Real y;
    Real z;
    Real a;
    Real b;
    Real c;
} Real6;

WITHIN_KERNEL void print(Real x) {
    printf("%f \n", x);
}

    

WITHIN_KERNEL Real2 add2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x + b.x;
        
        out.y = a.y + b.y;
    return out;
}

    

WITHIN_KERNEL Real2 sub2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x - b.x;
        
        out.y = a.y - b.y;
    return out;
}

    

WITHIN_KERNEL Real2 mul2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x * b.x;
        
        out.y = a.y * b.y;
    return out;
}

    

WITHIN_KERNEL Real2 div2(Real2 a, Real2 b) {
    Real2 out;
        
        out.x = a.x / b.x;
        
        out.y = a.y / b.y;
    return out;
}


    

WITHIN_KERNEL Real2 add_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x + b;
        
        out.y = a.y + b;
    return out;
}

    

WITHIN_KERNEL Real2 sub_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x - b;
        
        out.y = a.y - b;
    return out;
}

    

WITHIN_KERNEL Real2 mul_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x * b;
        
        out.y = a.y * b;
    return out;
}

    

WITHIN_KERNEL Real2 div_scalar2(Real2 a, Real b) {
    Real2 out;
        
        out.x = a.x / b;
        
        out.y = a.y / b;
    return out;
}


    WITHIN_KERNEL void print2(Real2 x) {
        
        printf("%f %f  \n", x.x,x.y);
    }

    WITHIN_KERNEL Real sum2(Real2 x) {
        Real out = 0.0;
            out += x.x;
            out += x.y;
        return out;
    }

    WITHIN_KERNEL Real dot2(Real2 x, Real2 y) {
        return sum2(mul2(x, y));
    }

    WITHIN_KERNEL Real length2(Real2 x) {
        return sqrt(dot2(x, x));
    }

    WITHIN_KERNEL Real2 negate2(Real2 x) {
        return mul_scalar2(x, -1);
    }

    WITHIN_KERNEL Real2 normalize2(Real2 x) {
        return div_scalar2(x, length2(x));
    }

    WITHIN_KERNEL Real2 make2(
        Real x
            ,
        Real y
    ) {
        Real2 out;
            out.x = x;
            out.y = y;
        return out;
    }
    

WITHIN_KERNEL Real3 add3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x + b.x;
        
        out.y = a.y + b.y;
        
        out.z = a.z + b.z;
    return out;
}

    

WITHIN_KERNEL Real3 sub3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x - b.x;
        
        out.y = a.y - b.y;
        
        out.z = a.z - b.z;
    return out;
}

    

WITHIN_KERNEL Real3 mul3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x * b.x;
        
        out.y = a.y * b.y;
        
        out.z = a.z * b.z;
    return out;
}

    

WITHIN_KERNEL Real3 div3(Real3 a, Real3 b) {
    Real3 out;
        
        out.x = a.x / b.x;
        
        out.y = a.y / b.y;
        
        out.z = a.z / b.z;
    return out;
}


    

WITHIN_KERNEL Real3 add_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x + b;
        
        out.y = a.y + b;
        
        out.z = a.z + b;
    return out;
}

    

WITHIN_KERNEL Real3 sub_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x - b;
        
        out.y = a.y - b;
        
        out.z = a.z - b;
    return out;
}

    

WITHIN_KERNEL Real3 mul_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x * b;
        
        out.y = a.y * b;
        
        out.z = a.z * b;
    return out;
}

    

WITHIN_KERNEL Real3 div_scalar3(Real3 a, Real b) {
    Real3 out;
        
        out.x = a.x / b;
        
        out.y = a.y / b;
        
        out.z = a.z / b;
    return out;
}


    WITHIN_KERNEL void print3(Real3 x) {
        
        printf("%f %f %f  \n", x.x,x.y,x.z);
    }

    WITHIN_KERNEL Real sum3(Real3 x) {
        Real out = 0.0;
            out += x.x;
            out += x.y;
            out += x.z;
        return out;
    }

    WITHIN_KERNEL Real dot3(Real3 x, Real3 y) {
        return sum3(mul3(x, y));
    }

    WITHIN_KERNEL Real length3(Real3 x) {
        return sqrt(dot3(x, x));
    }

    WITHIN_KERNEL Real3 negate3(Real3 x) {
        return mul_scalar3(x, -1);
    }

    WITHIN_KERNEL Real3 normalize3(Real3 x) {
        return div_scalar3(x, length3(x));
    }

    WITHIN_KERNEL Real3 make3(
        Real x
            ,
        Real y
            ,
        Real z
    ) {
        Real3 out;
            out.x = x;
            out.y = y;
            out.z = z;
        return out;
    }
    

WITHIN_KERNEL Real6 add6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x + b.x;
        
        out.y = a.y + b.y;
        
        out.z = a.z + b.z;
        
        out.a = a.a + b.a;
        
        out.b = a.b + b.b;
        
        out.c = a.c + b.c;
    return out;
}

    

WITHIN_KERNEL Real6 sub6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x - b.x;
        
        out.y = a.y - b.y;
        
        out.z = a.z - b.z;
        
        out.a = a.a - b.a;
        
        out.b = a.b - b.b;
        
        out.c = a.c - b.c;
    return out;
}

    

WITHIN_KERNEL Real6 mul6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x * b.x;
        
        out.y = a.y * b.y;
        
        out.z = a.z * b.z;
        
        out.a = a.a * b.a;
        
        out.b = a.b * b.b;
        
        out.c = a.c * b.c;
    return out;
}

    

WITHIN_KERNEL Real6 div6(Real6 a, Real6 b) {
    Real6 out;
        
        out.x = a.x / b.x;
        
        out.y = a.y / b.y;
        
        out.z = a.z / b.z;
        
        out.a = a.a / b.a;
        
        out.b = a.b / b.b;
        
        out.c = a.c / b.c;
    return out;
}


    

WITHIN_KERNEL Real6 add_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x + b;
        
        out.y = a.y + b;
        
        out.z = a.z + b;
        
        out.a = a.a + b;
        
        out.b = a.b + b;
        
        out.c = a.c + b;
    return out;
}

    

WITHIN_KERNEL Real6 sub_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x - b;
        
        out.y = a.y - b;
        
        out.z = a.z - b;
        
        out.a = a.a - b;
        
        out.b = a.b - b;
        
        out.c = a.c - b;
    return out;
}

    

WITHIN_KERNEL Real6 mul_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x * b;
        
        out.y = a.y * b;
        
        out.z = a.z * b;
        
        out.a = a.a * b;
        
        out.b = a.b * b;
        
        out.c = a.c * b;
    return out;
}

    

WITHIN_KERNEL Real6 div_scalar6(Real6 a, Real b) {
    Real6 out;
        
        out.x = a.x / b;
        
        out.y = a.y / b;
        
        out.z = a.z / b;
        
        out.a = a.a / b;
        
        out.b = a.b / b;
        
        out.c = a.c / b;
    return out;
}


    WITHIN_KERNEL void print6(Real6 x) {
        
        printf("%f %f %f %f %f %f  \n", x.x,x.y,x.z,x.a,x.b,x.c);
    }

    WITHIN_KERNEL Real sum6(Real6 x) {
        Real out = 0.0;
            out += x.x;
            out += x.y;
            out += x.z;
            out += x.a;
            out += x.b;
            out += x.c;
        return out;
    }

    WITHIN_KERNEL Real dot6(Real6 x, Real6 y) {
        return sum6(mul6(x, y));
    }

    WITHIN_KERNEL Real length6(Real6 x) {
        return sqrt(dot6(x, x));
    }

    WITHIN_KERNEL Real6 negate6(Real6 x) {
        return mul_scalar6(x, -1);
    }

    WITHIN_KERNEL Real6 normalize6(Real6 x) {
        return div_scalar6(x, length6(x));
    }

    WITHIN_KERNEL Real6 make6(
        Real x
            ,
        Real y
            ,
        Real z
            ,
        Real a
            ,
        Real b
            ,
        Real c
    ) {
        Real6 out;
            out.x = x;
            out.y = y;
            out.z = z;
            out.a = a;
            out.b = b;
            out.c = c;
        return out;
    }


WITHIN_KERNEL Real2 transform2(Real2 a, Real2 b, Real2 v) {
    Real2 out;
    out.x = dot2(a,v);
    out.y = dot2(b,v);
    return out;
}

WITHIN_KERNEL Real2 inv_transform2(Real2 a, Real2 b, Real2 v) {
    Real2 out;
    out.x = a.x * v.x + b.x * v.y;
    out.y = a.y * v.x + b.y * v.y;
    return out;
}

WITHIN_KERNEL Real3 transform3(Real3 a, Real3 b, Real3 c, Real3 v) {
    Real3 out;
    out.x = dot3(a,v);
    out.y = dot3(b,v);
    out.z = dot3(c,v);
    return out;
}

WITHIN_KERNEL Real3 inv_transform3(Real3 a, Real3 b, Real3 c, Real3 v) {
    Real3 out;
    out.x = a.x * v.x + b.x * v.y + c.x * v.z;
    out.y = a.y * v.x + b.y * v.y + c.y * v.z;
    out.z = a.z * v.x + b.z * v.y + c.z * v.z;
    return out;
}

WITHIN_KERNEL Real3 cross3(Real3 x, Real3 y) {
    Real3 out;
    out.x = x.y * y.z - x.z * y.y;
    out.y = x.z * y.x - x.x * y.z;
    out.z = x.x * y.y - x.y * y.x;
    return out;
}

WITHIN_KERNEL Real6 tensor_transform3(Real3 a, Real3 b, Real3 c, Real6 tensor) {
    Real A[9];
    A[0] = a.x; A[1] = a.y; A[2] = a.z;
    A[3] = b.x; A[4] = b.y; A[5] = b.z;
    A[6] = c.x; A[7] = c.y; A[8] = c.z;
    Real6 out;
    out.x = A[0]*A[0]*tensor.x+2*A[0]*A[3]*tensor.a+2*A[0]*A[6]*tensor.b+
        2*A[3]*A[6]*tensor.c+ A[3]*A[3]*tensor.y+A[6]*A[6]*tensor.z;
    out.y = A[1]*A[1]*tensor.x+2*A[1]*A[4]*tensor.a+2*A[1]*A[7]*tensor.b+
        2*A[4]*A[7]*tensor.c+A[4]*A[4]*tensor.y+A[7]*A[7]*tensor.z;
    out.z = A[2]*A[2]*tensor.x+2*A[2]*A[5]*tensor.a+2*A[2]*A[8]*tensor.b+
        2*A[5]*A[8]*tensor.c+A[5]*A[5]*tensor.y+A[8]*A[8]*tensor.z;
    out.a = A[0]*A[1]*tensor.x+(A[0]*A[4]+A[1]*A[3])*tensor.a+(A[0]*A[7]+
        A[1]*A[6])*tensor.b+(A[7]*A[3]+A[6]*A[4])*tensor.c+A[4]*A[3]*tensor.y+
        A[6]*A[7]*tensor.z;
    out.b = A[0]*A[2]*tensor.x+(A[0]*A[5]+A[2]*A[3])*tensor.a+(A[0]*A[8]+
        A[2]*A[6])*tensor.b+(A[8]*A[3]+A[6]*A[5])*tensor.c+A[5]*A[3]*tensor.y+
        A[6]*A[8]*tensor.z;
    out.c = A[1]*A[2]*tensor.x+(A[2]*A[4]+A[1]*A[5])*tensor.a+(A[2]*A[7]+
        A[1]*A[8])*tensor.b+(A[7]*A[5]+A[8]*A[4])*tensor.c+A[4]*A[5]*tensor.y+
        A[7]*A[8]*tensor.z;
    return out;
}

WITHIN_KERNEL int trimodefinder(Real3 obs, Real3 tri0, Real3 tri1, Real3 tri2) {
    // trimodefinder calculates the normalized barycentric coordinates of
    // the points with respect to the TD vertices and specifies the appropriate
    // artefact-free configuration of the angular dislocations for the
    // calculations. The input matrices x, y and z share the same size and
    // correspond to the y, z and x coordinates in the TDCS, respectively. p1,
    // p2 and p3 are two-component matrices representing the y and z coordinates
    // of the TD vertices in the TDCS, respectively.
    // The components of the output (trimode) corresponding to each calculation
    // points, are 1 for the first configuration, -1 for the second
    // configuration and 0 for the calculation point that lie on the TD sides.

    Real a = ((tri1.z-tri2.z)*(obs.y-tri2.y)+(tri2.y-tri1.y)*(obs.z-tri2.z))/
        ((tri1.z-tri2.z)*(tri0.y-tri2.y)+(tri2.y-tri1.y)*(tri0.z-tri2.z));
    Real b = ((tri2.z-tri0.z)*(obs.y-tri2.y)+(tri0.y-tri2.y)*(obs.z-tri2.z))/
        ((tri1.z-tri2.z)*(tri0.y-tri2.y)+(tri2.y-tri1.y)*(tri0.z-tri2.z));
    Real c = 1-a-b;

    int result = 1;
    if ((a<=0 && b>c && c>a) ||
            (b<=0 && c>a && a>b) ||
            (c<=0 && a>b && b>c)) {
        result = -1;
    }

    if ((a==0 && b>=0 && c>=0) ||
            (a>=0 && b==0 && c>=0) ||
            (a>=0 && b>=0 && c==0)) {
        result = 0;
    }
    if (result == 0 && obs.x != 0) {
        result = 1;
    }

    return result;
}

WITHIN_KERNEL Real3 AngDisDisp(Real x, Real y, Real z, Real alpha, Real bx, Real by, Real bz, Real nu) {

    Real cosA = cos(alpha);
    Real sinA = sin(alpha);
    Real eta = y*cosA-z*sinA;
    Real zeta = y*sinA+z*cosA;
    Real r = sqrt(x * x + y * y + z * z);

    Real ux = bx/8/M_PI/(1-nu)*(x*y/r/(r-z)-x*eta/r/(r-zeta));
    Real vx = bx/8/M_PI/(1-nu)*(eta*sinA/(r-zeta)-y*eta/r/(r-zeta)+        y*y/r/(r-z)+(1-2*nu)*(cosA*log(r-zeta)-log(r-z)));
    Real wx = bx/8/M_PI/(1-nu)*(eta*cosA/(r-zeta)-y/r-eta*z/r/(r-zeta)-        (1-2*nu)*sinA*log(r-zeta));

    Real uy = by/8/M_PI/(1-nu)*(x*x*cosA/r/(r-zeta)-x*x/r/(r-z)-         (1-2*nu)*(cosA*log(r-zeta)-log(r-z)));
    Real vy = by*x/8/M_PI/(1-nu)*(y*cosA/r/(r-zeta)-sinA*cosA/(r-zeta)-y/r/(r-z));
    Real wy = by*x/8/M_PI/(1-nu)*(z*cosA/r/(r-zeta)-cosA*cosA/(r-zeta)+1/r);

    Real uz = bz*sinA/8/M_PI/(1-nu)*((1-2*nu)*log(r-zeta)-x*x/r/(r-zeta));
    Real vz = bz*x*sinA/8/M_PI/(1-nu)*(sinA/(r-zeta)-y/r/(r-zeta));
    Real wz = bz*x*sinA/8/M_PI/(1-nu)*(cosA/(r-zeta)-z/r/(r-zeta));
    return make3(ux+uy+uz, vx+vy+vz, wx+wy+wz);
}

WITHIN_KERNEL Real3 TDSetupD(Real3 obs, Real alpha, Real3 slip, Real nu, Real3 TriVertex, Real3 SideVec) {
    // TDSetupD transforms coordinates of the calculation points as well as
    // slip vector components from ADCS into TDCS. It then calculates the
    // displacements in ADCS and transforms them into TDCS.

    Real2 A1 = make2(SideVec.z, -SideVec.y);
    Real2 A2 = make2(SideVec.y, SideVec.z);
    Real2 r1 = transform2(A1, A2, make2(obs.y - TriVertex.y, obs.z - TriVertex.z));
    Real y1 = r1.x;
    Real z1 = r1.y;

    Real2 r2 = transform2(A1, A2, make2(slip.y, slip.z));
    Real by1 = r2.x;
    Real bz1 = r2.y;

    Real3 uvw = AngDisDisp(obs.x, y1, z1, -M_PI + alpha, slip.x, by1, bz1, nu);

    Real2 r3 = inv_transform2(A1, A2, make2(uvw.y, uvw.z));
    Real v = r3.x;
    Real w = r3.y;
    return make3(uvw.x, v, w);
}

WITHIN_KERNEL Real6 AngDisStrain(Real x, Real y, Real z, Real alpha, Real bx, Real by, Real bz, Real nu) {
    // AngDisStrain calculates the strains associated with an angular 
    // dislocation in an elastic full-space.

    Real cosA = cos(alpha);
    Real sinA = sin(alpha);
    Real eta = y*cosA-z*sinA;
    Real zeta = y*sinA+z*cosA;

    Real x2 = x*x;
    Real y2 = y*y;
    Real z2 = z*z;
    Real r2 = x2+y2+z2;
    Real r = sqrt(r2);
    Real r3 = r*r2;
    Real rz = r*(r-z);
    Real rmz = (r-z);
    Real r2z2 = r2*rmz*rmz;
    Real r3z = r3*rmz;

    Real W = zeta-r;
    Real W2 = W*W;
    Real Wr = W*r;
    Real W2r = W2*r;
    Real Wr3 = W*r3;
    Real W2r2 = W2*r2;

    Real C = (r*cosA-z)/Wr;
    Real S = (r*sinA-y)/Wr;

    // Partial derivatives of the Burgers' function
    Real rFi_rx = (eta/r/(r-zeta)-y/r/(r-z))/4/M_PI;
    Real rFi_ry = (x/r/(r-z)-cosA*x/r/(r-zeta))/4/M_PI;
    Real rFi_rz = (sinA*x/r/(r-zeta))/4/M_PI;

    Real6 out;
    out.x = bx*(rFi_rx)+
        bx/8/M_PI/(1-nu)*(eta/Wr+eta*x2/W2r2-eta*x2/Wr3+y/rz-
        x2*y/r2z2-x2*y/r3z)-
        by*x/8/M_PI/(1-nu)*(((2*nu+1)/Wr+x2/W2r2-x2/Wr3)*cosA+
        (2*nu+1)/rz-x2/r2z2-x2/r3z)+
        bz*x*sinA/8/M_PI/(1-nu)*((2*nu+1)/Wr+x2/W2r2-x2/Wr3);

    out.y = by*(rFi_ry)+
        bx/8/M_PI/(1-nu)*((1/Wr+S*S-y2/Wr3)*eta+(2*nu+1)*y/rz-y*y*y/r2z2-
        y*y*y/r3z-2*nu*cosA*S)-
        by*x/8/M_PI/(1-nu)*(1/rz-y2/r2z2-y2/r3z+
        (1/Wr+S*S-y2/Wr3)*cosA)+
        bz*x*sinA/8/M_PI/(1-nu)*(1/Wr+S*S-y2/Wr3);

    out.z = bz*(rFi_rz)+
        bx/8/M_PI/(1-nu)*(eta/W/r+eta*C*C-eta*z2/Wr3+y*z/r3+
        2*nu*sinA*C)-
        by*x/8/M_PI/(1-nu)*((1/Wr+C*C-z2/Wr3)*cosA+z/r3)+
        bz*x*sinA/8/M_PI/(1-nu)*(1/Wr+C*C-z2/Wr3);

    out.a = bx*(rFi_ry)/2+by*(rFi_rx)/2-
        bx/8/M_PI/(1-nu)*(x*y2/r2z2-nu*x/rz+x*y2/r3z-nu*x*cosA/Wr+
        eta*x*S/Wr+eta*x*y/Wr3)+
        by/8/M_PI/(1-nu)*(x2*y/r2z2-nu*y/rz+x2*y/r3z+nu*cosA*S+
        x2*y*cosA/Wr3+x2*cosA*S/Wr)-
        bz*sinA/8/M_PI/(1-nu)*(nu*S+x2*S/Wr+x2*y/Wr3);

    out.b = bx*(rFi_rz)/2+bz*(rFi_rx)/2-
        bx/8/M_PI/(1-nu)*(-x*y/r3+nu*x*sinA/Wr+eta*x*C/Wr+
        eta*x*z/Wr3)+
        by/8/M_PI/(1-nu)*(-x2/r3+nu/r+nu*cosA*C+x2*z*cosA/Wr3+
        x2*cosA*C/Wr)-
        bz*sinA/8/M_PI/(1-nu)*(nu*C+x2*C/Wr+x2*z/Wr3);

    out.c = by*(rFi_rz)/2+bz*(rFi_ry)/2+
        bx/8/M_PI/(1-nu)*(y2/r3-nu/r-nu*cosA*C+nu*sinA*S+eta*sinA*cosA/W2-
        eta*(y*cosA+z*sinA)/W2r+eta*y*z/W2r2-eta*y*z/Wr3)-
        by*x/8/M_PI/(1-nu)*(y/r3+sinA*cosA*cosA/W2-cosA*(y*cosA+z*sinA)/
        W2r+y*z*cosA/W2r2-y*z*cosA/Wr3)-
        bz*x*sinA/8/M_PI/(1-nu)*(y*z/Wr3-sinA*cosA/W2+(y*cosA+z*sinA)/
        W2r-y*z/W2r2);
    return out;
}


WITHIN_KERNEL Real6 TDSetupS(Real3 obs, Real alpha, Real3 slip, Real nu,
    Real3 TriVertex, Real3 SideVec) 
{
    // TDSetupS transforms coordinates of the calculation points as well as 
    // slip vector components from ADCS into TDCS. It then calculates the 
    // strains in ADCS and transforms them into TDCS.

    // Transformation matrix
    Real2 A1 = make2(SideVec.z, -SideVec.y);
    Real2 A2 = make2(SideVec.y, SideVec.z);

    // Transform coordinates of the calculation points from TDCS into ADCS
    Real2 r1 = transform2(A1, A2, make2(obs.y - TriVertex.y, obs.z - TriVertex.z));
    Real y1 = r1.x;
    Real z1 = r1.y;

    // Transform the in-plane slip vector components from TDCS into ADCS
    Real2 r2 = transform2(A1, A2, make2(slip.y, slip.z));
    Real by1 = r2.x;
    Real bz1 = r2.y;

    // Calculate strains associated with an angular dislocation in ADCS
    Real6 out_adcs = AngDisStrain(obs.x,y1,z1,-M_PI+alpha,slip.x,by1,bz1,nu);

    // Transform strains from ADCS into TDCS
    Real3 B0 = make3(1.0, 0.0, 0.0);
    Real3 B1 = make3(0.0, A1.x, A1.y);
    Real3 B2 = make3(0.0, A2.x, A2.y);
    return tensor_transform3(B0, B1, B2, out_adcs);
}

WITHIN_KERNEL Real3 AngDisDispFSC(Real y1, Real y2, Real y3, Real beta, 
                                  Real b1, Real b2, Real b3, Real nu, Real a) {

    Real sinB = sin(beta);
    Real cosB = cos(beta);
    Real cotB = cosB / sinB;
    Real y3b = y3+2*a;
    Real z1b = y1*cosB+y3b*sinB;
    Real z3b = -y1*sinB+y3b*cosB;
    Real r2b = y1*y1+y2*y2+y3b*y3b;
    Real rb = sqrt(r2b);

    Real Fib = 2*atan(-y2/(-(rb+y3b)*(1.0 / tan(beta/2))+y1)); // The Burgers' function

    Real v1cb1 = b1/4/M_PI/(1-nu)*(-2*(1-nu)*(1-2*nu)*Fib*(cotB*cotB)+(1-2*nu)*y2/
        (rb+y3b)*((1-2*nu-a/rb)*cotB-y1/(rb+y3b)*(nu+a/rb))+(1-2*nu)*
        y2*cosB*cotB/(rb+z3b)*(cosB+a/rb)+a*y2*(y3b-a)*cotB/(rb*rb*rb)+y2*
        (y3b-a)/(rb*(rb+y3b))*(-(1-2*nu)*cotB+y1/(rb+y3b)*(2*nu+a/rb)+
        a*y1/(rb*rb))+y2*(y3b-a)/(rb*(rb+z3b))*(cosB/(rb+z3b)*((rb*
        cosB+y3b)*((1-2*nu)*cosB-a/rb)*cotB+2*(1-nu)*(rb*sinB-y1)*cosB)-
        a*y3b*cosB*cotB/(rb*rb)));

    Real v2cb1 = b1/4/M_PI/(1-nu)*((1-2*nu)*((2*(1-nu)*(cotB*cotB)-nu)*log(rb+y3b)-(2*
        (1-nu)*(cotB*cotB)+1-2*nu)*cosB*log(rb+z3b))-(1-2*nu)/(rb+y3b)*(y1*
        cotB*(1-2*nu-a/rb)+nu*y3b-a+(y2*y2)/(rb+y3b)*(nu+a/rb))-(1-2*
        nu)*z1b*cotB/(rb+z3b)*(cosB+a/rb)-a*y1*(y3b-a)*cotB/(rb*rb*rb)+
        (y3b-a)/(rb+y3b)*(-2*nu+1/rb*((1-2*nu)*y1*cotB-a)+(y2*y2)/(rb*
        (rb+y3b))*(2*nu+a/rb)+a*(y2*y2)/(rb*rb*rb))+(y3b-a)/(rb+z3b)*((cosB*cosB)-
        1/rb*((1-2*nu)*z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/(rb*
        (rb+z3b))*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*(rb*cosB+y3b))));

    Real v3cb1 = b1/4/M_PI/(1-nu)*(2*(1-nu)*(((1-2*nu)*Fib*cotB)+(y2/(rb+y3b)*(2*
        nu+a/rb))-(y2*cosB/(rb+z3b)*(cosB+a/rb)))+y2*(y3b-a)/rb*(2*
        nu/(rb+y3b)+a/(rb*rb))+y2*(y3b-a)*cosB/(rb*(rb+z3b))*(1-2*nu-
        (rb*cosB+y3b)/(rb+z3b)*(cosB+a/rb)-a*y3b/(rb*rb)));

    Real v1cb2 = b2/4/M_PI/(1-nu)*((1-2*nu)*((2*(1-nu)*(cotB*cotB)+nu)*log(rb+y3b)-(2*
        (1-nu)*(cotB*cotB)+1)*cosB*log(rb+z3b))+(1-2*nu)/(rb+y3b)*(-(1-2*nu)*
        y1*cotB+nu*y3b-a+a*y1*cotB/rb+(y1*y1)/(rb+y3b)*(nu+a/rb))-(1-2*
        nu)*cotB/(rb+z3b)*(z1b*cosB-a*(rb*sinB-y1)/(rb*cosB))-a*y1*
        (y3b-a)*cotB/(rb*rb*rb)+(y3b-a)/(rb+y3b)*(2*nu+1/rb*((1-2*nu)*y1*
        cotB+a)-(y1*y1)/(rb*(rb+y3b))*(2*nu+a/rb)-a*(y1*y1)/(rb*rb*rb))+(y3b-a)*
        cotB/(rb+z3b)*(-cosB*sinB+a*y1*y3b/((rb*rb*rb)*cosB)+(rb*sinB-y1)/
        rb*(2*(1-nu)*cosB-(rb*cosB+y3b)/(rb+z3b)*(1+a/(rb*cosB)))));
                    
    Real v2cb2 = b2/4/M_PI/(1-nu)*(2*(1-nu)*(1-2*nu)*Fib*(cotB*cotB)+(1-2*nu)*y2/
        (rb+y3b)*(-(1-2*nu-a/rb)*cotB+y1/(rb+y3b)*(nu+a/rb))-(1-2*nu)*
        y2*cotB/(rb+z3b)*(1+a/(rb*cosB))-a*y2*(y3b-a)*cotB/(rb*rb*rb)+y2*
        (y3b-a)/(rb*(rb+y3b))*((1-2*nu)*cotB-2*nu*y1/(rb+y3b)-a*y1/rb*
        (1/rb+1/(rb+y3b)))+y2*(y3b-a)*cotB/(rb*(rb+z3b))*(-2*(1-nu)*
        cosB+(rb*cosB+y3b)/(rb+z3b)*(1+a/(rb*cosB))+a*y3b/((rb*rb)*cosB)));
                    
    Real v3cb2 = b2/4/M_PI/(1-nu)*(-2*(1-nu)*(1-2*nu)*cotB*(log(rb+y3b)-cosB*
        log(rb+z3b))-2*(1-nu)*y1/(rb+y3b)*(2*nu+a/rb)+2*(1-nu)*z1b/(rb+
        z3b)*(cosB+a/rb)+(y3b-a)/rb*((1-2*nu)*cotB-2*nu*y1/(rb+y3b)-a*
        y1/(rb*rb))-(y3b-a)/(rb+z3b)*(cosB*sinB+(rb*cosB+y3b)*cotB/rb*
        (2*(1-nu)*cosB-(rb*cosB+y3b)/(rb+z3b))+a/rb*(sinB-y3b*z1b/
        (rb*rb)-z1b*(rb*cosB+y3b)/(rb*(rb+z3b)))));

    Real v1cb3 = b3/4/M_PI/(1-nu)*((1-2*nu)*(y2/(rb+y3b)*(1+a/rb)-y2*cosB/(rb+
        z3b)*(cosB+a/rb))-y2*(y3b-a)/rb*(a/(rb*rb)+1/(rb+y3b))+y2*
        (y3b-a)*cosB/(rb*(rb+z3b))*((rb*cosB+y3b)/(rb+z3b)*(cosB+a/
        rb)+a*y3b/(rb*rb)));
                    
    Real v2cb3 = b3/4/M_PI/(1-nu)*((1-2*nu)*(-sinB*log(rb+z3b)-y1/(rb+y3b)*(1+a/
        rb)+z1b/(rb+z3b)*(cosB+a/rb))+y1*(y3b-a)/rb*(a/(rb*rb)+1/(rb+
        y3b))-(y3b-a)/(rb+z3b)*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/
        (rb*rb))-1/(rb*(rb+z3b))*((y2*y2)*cosB*sinB-a*z1b/rb*(rb*cosB+y3b))));
                    
    Real v3cb3 = b3/4/M_PI/(1-nu)*(2*(1-nu)*Fib+2*(1-nu)*(y2*sinB/(rb+z3b)*(cosB+
        a/rb))+y2*(y3b-a)*sinB/(rb*(rb+z3b))*(1+(rb*cosB+y3b)/(rb+
        z3b)*(cosB+a/rb)+a*y3b/(rb*rb)));

    return make3(
        v1cb1+v1cb2+v1cb3,
        v2cb1+v2cb2+v2cb3,
        v3cb1+v3cb2+v3cb3
    );
}

WITHIN_KERNEL Real6 AngDisStrainFSC(Real y1, Real y2, Real y3, Real beta,
                                    Real b1, Real b2, Real b3, Real nu, Real a) {

    Real sinB = sin(beta);
    Real cosB = cos(beta);
    Real cotB = cosB / sinB;
    Real y3b = y3+2*a;
    Real z1b = y1*cosB+y3b*sinB;
    Real z3b = -y1*sinB+y3b*cosB;
    Real rb2 = y1*y1+y2*y2+y3b*y3b;
    Real rb = sqrt(rb2);

    Real W1 = rb*cosB+y3b;
    Real W2 = cosB+a/rb;
    Real W3 = cosB+y3b/rb;
    Real W4 = nu+a/rb;
    Real W5 = 2*nu+a/rb;
    Real W6 = rb+y3b;
    Real W7 = rb+z3b;
    Real W8 = y3+a;
    Real W9 = 1+a/rb/cosB;

    Real N1 = 1-2*nu;

    Real rFib_ry2 = z1b/rb/(rb+z3b)-y1/rb/(rb+y3b);
    Real rFib_ry1 = y2/rb/(rb+y3b)-cosB*y2/rb/(rb+z3b);
    Real rFib_ry3 = -sinB*y2/rb/(rb+z3b);

    Real6 out;
    out.x = b1*(1.0/4.0*((-2+2*nu)*N1*rFib_ry1*(cotB*cotB)-N1*y2/(W6*W6)*((1-W5)*cotB- y1/W6*W4)/rb*y1+N1*y2/W6*(a/(rb*rb*rb)*y1*cotB-1/W6*W4+(y1*y1)/ (W6*W6)*W4/rb+(y1*y1)/W6*a/(rb*rb*rb))-N1*y2*cosB*cotB/(W7*W7)*W2*(y1/ rb-sinB)-N1*y2*cosB*cotB/W7*a/(rb*rb*rb)*y1-3*a*y2*W8*cotB/(rb*rb2*rb2)* y1-y2*W8/(rb*rb*rb)/W6*(-N1*cotB+y1/W6*W5+a*y1/rb2)*y1-y2*W8/ rb2/(W6*W6)*(-N1*cotB+y1/W6*W5+a*y1/rb2)*y1+y2*W8/rb/W6* (1/W6*W5-(y1*y1)/(W6*W6)*W5/rb-(y1*y1)/W6*a/(rb*rb*rb)+a/rb2-2*a*(y1*y1) /(rb2*rb2))-y2*W8/(rb*rb*rb)/W7*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+ (2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/rb2)*y1-y2*W8/rb/ (W7*W7)*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)* cosB)-a*y3b*cosB*cotB/rb2)*(y1/rb-sinB)+y2*W8/rb/W7*(-cosB/ (W7*W7)*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)*(y1/ rb-sinB)+cosB/W7*(1/rb*cosB*y1*(N1*cosB-a/rb)*cotB+W1*a/(rb*rb2) *y1*cotB+(2-2*nu)*(1/rb*sinB*y1-1)*cosB)+2*a*y3b*cosB*cotB/ (rb2*rb2)*y1))/M_PI/(1-nu))+ b2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)+nu)/rb*y1/W6-((2-2*nu)*(cotB*cotB)+1)* cosB*(y1/rb-sinB)/W7)-N1/(W6*W6)*(-N1*y1*cotB+nu*y3b-a+a*y1* cotB/rb+(y1*y1)/W6*W4)/rb*y1+N1/W6*(-N1*cotB+a*cotB/rb-a* (y1*y1)*cotB/(rb*rb*rb)+2*y1/W6*W4-(y1*y1*y1)/(W6*W6)*W4/rb-(y1*y1*y1)/W6*a/ (rb*rb*rb))+N1*cotB/(W7*W7)*(z1b*cosB-a*(rb*sinB-y1)/rb/cosB)*(y1/ rb-sinB)-N1*cotB/W7*((cosB*cosB)-a*(1/rb*sinB*y1-1)/rb/cosB+a* (rb*sinB-y1)/(rb*rb*rb)/cosB*y1)-a*W8*cotB/(rb*rb*rb)+3*a*(y1*y1)*W8* cotB/(rb*rb2*rb2)-W8/(W6*W6)*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6* W5-a*(y1*y1)/(rb*rb*rb))/rb*y1+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB+a)*y1+ 1/rb*N1*cotB-2*y1/rb/W6*W5+(y1*y1*y1)/(rb*rb*rb)/W6*W5+(y1*y1*y1)/rb2/ (W6*W6)*W5+(y1*y1*y1)/(rb2*rb2)/W6*a-2*a/(rb*rb*rb)*y1+3*a*(y1*y1*y1)/(rb*rb2*rb2))-W8* cotB/(W7*W7)*(-cosB*sinB+a*y1*y3b/(rb*rb*rb)/cosB+(rb*sinB-y1)/rb* ((2-2*nu)*cosB-W1/W7*W9))*(y1/rb-sinB)+W8*cotB/W7*(a*y3b/ (rb*rb*rb)/cosB-3*a*(y1*y1)*y3b/(rb*rb2*rb2)/cosB+(1/rb*sinB*y1-1)/rb* ((2-2*nu)*cosB-W1/W7*W9)-(rb*sinB-y1)/(rb*rb*rb)*((2-2*nu)*cosB-W1/ W7*W9)*y1+(rb*sinB-y1)/rb*(-1/rb*cosB*y1/W7*W9+W1/(W7*W7)* W9*(y1/rb-sinB)+W1/W7*a/(rb*rb*rb)/cosB*y1)))/M_PI/(1-nu))+ b3*(1.0/4.0*(N1*(-y2/(W6*W6)*(1+a/rb)/rb*y1-y2/W6*a/(rb*rb*rb)*y1+y2* cosB/(W7*W7)*W2*(y1/rb-sinB)+y2*cosB/W7*a/(rb*rb*rb)*y1)+y2*W8/ (rb*rb*rb)*(a/rb2+1/W6)*y1-y2*W8/rb*(-2*a/(rb2*rb2)*y1-1/(W6*W6)/ rb*y1)-y2*W8*cosB/(rb*rb*rb)/W7*(W1/W7*W2+a*y3b/rb2)*y1-y2*W8* cosB/rb/(W7*W7)*(W1/W7*W2+a*y3b/rb2)*(y1/rb-sinB)+y2*W8* cosB/rb/W7*(1/rb*cosB*y1/W7*W2-W1/(W7*W7)*W2*(y1/rb-sinB)- W1/W7*a/(rb*rb*rb)*y1-2*a*y3b/(rb2*rb2)*y1))/M_PI/(1-nu));

    out.y = b1*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)-nu)/rb*y2/W6-((2-2*nu)*(cotB*cotB)+1- 2*nu)*cosB/rb*y2/W7)+N1/(W6*W6)*(y1*cotB*(1-W5)+nu*y3b-a+(y2*y2) /W6*W4)/rb*y2-N1/W6*(a*y1*cotB/(rb*rb*rb)*y2+2*y2/W6*W4-(y2*y2*y2) /(W6*W6)*W4/rb-(y2*y2*y2)/W6*a/(rb*rb*rb))+N1*z1b*cotB/(W7*W7)*W2/rb* y2+N1*z1b*cotB/W7*a/(rb*rb*rb)*y2+3*a*y2*W8*cotB/(rb*rb2*rb2)*y1-W8/ (W6*W6)*(-2*nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+a*(y2*y2)/ (rb*rb*rb))/rb*y2+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB-a)*y2+2*y2/rb/ W6*W5-(y2*y2*y2)/(rb*rb*rb)/W6*W5-(y2*y2*y2)/rb2/(W6*W6)*W5-(y2*y2*y2)/(rb2*rb2)/W6* a+2*a/(rb*rb*rb)*y2-3*a*(y2*y2*y2)/(rb*rb2*rb2))-W8/(W7*W7)*((cosB*cosB)-1/rb*(N1* z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/rb/W7*((y2*y2)*(cosB*cosB)- a*z1b*cotB/rb*W1))/rb*y2+W8/W7*(1/(rb*rb*rb)*(N1*z1b*cotB+a* cosB)*y2-3*a*y3b*z1b*cotB/(rb*rb2*rb2)*y2+1/(rb*rb*rb)/W7*((y2*y2)*(cosB*cosB)- a*z1b*cotB/rb*W1)*y2+1/rb2/(W7*W7)*((y2*y2)*(cosB*cosB)-a*z1b*cotB/ rb*W1)*y2-1/rb/W7*(2*y2*(cosB*cosB)+a*z1b*cotB/(rb*rb*rb)*W1*y2-a* z1b*cotB/rb2*cosB*y2)))/M_PI/(1-nu))+ b2*(1.0/4.0*((2-2*nu)*N1*rFib_ry2*(cotB*cotB)+N1/W6*((W5-1)*cotB+y1/W6* W4)-N1*(y2*y2)/(W6*W6)*((W5-1)*cotB+y1/W6*W4)/rb+N1*y2/W6*(-a/ (rb*rb*rb)*y2*cotB-y1/(W6*W6)*W4/rb*y2-y2/W6*a/(rb*rb*rb)*y1)-N1*cotB/ W7*W9+N1*(y2*y2)*cotB/(W7*W7)*W9/rb+N1*(y2*y2)*cotB/W7*a/(rb*rb*rb)/ cosB-a*W8*cotB/(rb*rb*rb)+3*a*(y2*y2)*W8*cotB/(rb*rb2*rb2)+W8/rb/W6*(N1* cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))-(y2*y2)*W8/(rb*rb*rb)/W6* (N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))-(y2*y2)*W8/rb2/(W6*W6) *(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))+y2*W8/rb/W6* (2*nu*y1/(W6*W6)/rb*y2+a*y1/(rb*rb*rb)*(1/rb+1/W6)*y2-a*y1/rb* (-1/(rb*rb*rb)*y2-1/(W6*W6)/rb*y2))+W8*cotB/rb/W7*((-2+2*nu)*cosB+ W1/W7*W9+a*y3b/rb2/cosB)-(y2*y2)*W8*cotB/(rb*rb*rb)/W7*((-2+2*nu)* cosB+W1/W7*W9+a*y3b/rb2/cosB)-(y2*y2)*W8*cotB/rb2/(W7*W7)*((-2+ 2*nu)*cosB+W1/W7*W9+a*y3b/rb2/cosB)+y2*W8*cotB/rb/W7*(1/ rb*cosB*y2/W7*W9-W1/(W7*W7)*W9/rb*y2-W1/W7*a/(rb*rb*rb)/cosB*y2- 2*a*y3b/(rb2*rb2)/cosB*y2))/M_PI/(1-nu))+ b3*(1.0/4.0*(N1*(-sinB/rb*y2/W7+y2/(W6*W6)*(1+a/rb)/rb*y1+y2/W6* a/(rb*rb*rb)*y1-z1b/(W7*W7)*W2/rb*y2-z1b/W7*a/(rb*rb*rb)*y2)-y2*W8/ (rb*rb*rb)*(a/rb2+1/W6)*y1+y1*W8/rb*(-2*a/(rb2*rb2)*y2-1/(W6*W6)/ rb*y2)+W8/(W7*W7)*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/rb2)-1/ rb/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1))/rb*y2-W8/W7*(sinB*a/ (rb*rb*rb)*y2-z1b/(rb*rb*rb)*(1+a*y3b/rb2)*y2-2*z1b/(rb*rb2*rb2)*a*y3b*y2+ 1/(rb*rb*rb)/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1)*y2+1/rb2/(W7*W7)* ((y2*y2)*cosB*sinB-a*z1b/rb*W1)*y2-1/rb/W7*(2*y2*cosB*sinB+a* z1b/(rb*rb*rb)*W1*y2-a*z1b/rb2*cosB*y2)))/M_PI/(1-nu));

    out.z = b1*(1.0/4.0*((2-2*nu)*(N1*rFib_ry3*cotB-y2/(W6*W6)*W5*(y3b/rb+1)- 1.0/2.0*y2/W6*a/(rb*rb*rb)*2*y3b+y2*cosB/(W7*W7)*W2*W3+1.0/2.0*y2*cosB/W7* a/(rb*rb*rb)*2*y3b)+y2/rb*(2*nu/W6+a/rb2)-1.0/2.0*y2*W8/(rb*rb*rb)*(2* nu/W6+a/rb2)*2*y3b+y2*W8/rb*(-2*nu/(W6*W6)*(y3b/rb+1)-a/ (rb2*rb2)*2*y3b)+y2*cosB/rb/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)- 1.0/2.0*y2*W8*cosB/(rb*rb*rb)/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)*2* y3b-y2*W8*cosB/rb/(W7*W7)*(1-2*nu-W1/W7*W2-a*y3b/rb2)*W3+y2* W8*cosB/rb/W7*(-(cosB*y3b/rb+1)/W7*W2+W1/(W7*W7)*W2*W3+1.0/2.0* W1/W7*a/(rb*rb*rb)*2*y3b-a/rb2+a*y3b/(rb2*rb2)*2*y3b))/M_PI/(1-nu))+ b2*(1.0/4.0*((-2+2*nu)*N1*cotB*((y3b/rb+1)/W6-cosB*W3/W7)+(2-2*nu)* y1/(W6*W6)*W5*(y3b/rb+1)+1.0/2.0*(2-2*nu)*y1/W6*a/(rb*rb*rb)*2*y3b+(2- 2*nu)*sinB/W7*W2-(2-2*nu)*z1b/(W7*W7)*W2*W3-1.0/2.0*(2-2*nu)*z1b/ W7*a/(rb*rb*rb)*2*y3b+1/rb*(N1*cotB-2*nu*y1/W6-a*y1/rb2)-1.0/2.0* W8/(rb*rb*rb)*(N1*cotB-2*nu*y1/W6-a*y1/rb2)*2*y3b+W8/rb*(2*nu* y1/(W6*W6)*(y3b/rb+1)+a*y1/(rb2*rb2)*2*y3b)-1/W7*(cosB*sinB+W1* cotB/rb*((2-2*nu)*cosB-W1/W7)+a/rb*(sinB-y3b*z1b/rb2-z1b* W1/rb/W7))+W8/(W7*W7)*(cosB*sinB+W1*cotB/rb*((2-2*nu)*cosB-W1/ W7)+a/rb*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7))*W3-W8/W7*((cosB* y3b/rb+1)*cotB/rb*((2-2*nu)*cosB-W1/W7)-1.0/2.0*W1*cotB/(rb*rb*rb)* ((2-2*nu)*cosB-W1/W7)*2*y3b+W1*cotB/rb*(-(cosB*y3b/rb+1)/W7+ W1/(W7*W7)*W3)-1.0/2.0*a/(rb*rb*rb)*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7)* 2*y3b+a/rb*(-z1b/rb2-y3b*sinB/rb2+y3b*z1b/(rb2*rb2)*2*y3b- sinB*W1/rb/W7-z1b*(cosB*y3b/rb+1)/rb/W7+1.0/2.0*z1b*W1/(rb*rb*rb)/ W7*2*y3b+z1b*W1/rb/(W7*W7)*W3)))/M_PI/(1-nu))+ b3*(1.0/4.0*((2-2*nu)*rFib_ry3-(2-2*nu)*y2*sinB/(W7*W7)*W2*W3-1.0/2.0* (2-2*nu)*y2*sinB/W7*a/(rb*rb*rb)*2*y3b+y2*sinB/rb/W7*(1+W1/W7* W2+a*y3b/rb2)-1.0/2.0*y2*W8*sinB/(rb*rb*rb)/W7*(1+W1/W7*W2+a*y3b/ rb2)*2*y3b-y2*W8*sinB/rb/(W7*W7)*(1+W1/W7*W2+a*y3b/rb2)*W3+ y2*W8*sinB/rb/W7*((cosB*y3b/rb+1)/W7*W2-W1/(W7*W7)*W2*W3- 1.0/2.0*W1/W7*a/(rb*rb*rb)*2*y3b+a/rb2-a*y3b/(rb2*rb2)*2*y3b))/M_PI/(1-nu));

    out.a = b1/2*(1.0/4.0*((-2+2*nu)*N1*rFib_ry2*(cotB*cotB)+N1/W6*((1-W5)*cotB-y1/ W6*W4)-N1*(y2*y2)/(W6*W6)*((1-W5)*cotB-y1/W6*W4)/rb+N1*y2/W6* (a/(rb*rb*rb)*y2*cotB+y1/(W6*W6)*W4/rb*y2+y2/W6*a/(rb*rb*rb)*y1)+N1* cosB*cotB/W7*W2-N1*(y2*y2)*cosB*cotB/(W7*W7)*W2/rb-N1*(y2*y2)*cosB* cotB/W7*a/(rb*rb*rb)+a*W8*cotB/(rb*rb*rb)-3*a*(y2*y2)*W8*cotB/(rb*rb2*rb2)+W8/ rb/W6*(-N1*cotB+y1/W6*W5+a*y1/rb2)-(y2*y2)*W8/(rb*rb*rb)/W6*(-N1* cotB+y1/W6*W5+a*y1/rb2)-(y2*y2)*W8/rb2/(W6*W6)*(-N1*cotB+y1/ W6*W5+a*y1/rb2)+y2*W8/rb/W6*(-y1/(W6*W6)*W5/rb*y2-y2/W6* a/(rb*rb*rb)*y1-2*a*y1/(rb2*rb2)*y2)+W8/rb/W7*(cosB/W7*(W1*(N1* cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/ rb2)-(y2*y2)*W8/(rb*rb*rb)/W7*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2- 2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/rb2)-(y2*y2)*W8/rb2/ (W7*W7)*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)* cosB)-a*y3b*cosB*cotB/rb2)+y2*W8/rb/W7*(-cosB/(W7*W7)*(W1* (N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)/rb*y2+cosB/ W7*(1/rb*cosB*y2*(N1*cosB-a/rb)*cotB+W1*a/(rb*rb*rb)*y2*cotB+(2-2* nu)/rb*sinB*y2*cosB)+2*a*y3b*cosB*cotB/(rb2*rb2)*y2))/M_PI/(1-nu))+ b2/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)+nu)/rb*y2/W6-((2-2*nu)*(cotB*cotB)+1)* cosB/rb*y2/W7)-N1/(W6*W6)*(-N1*y1*cotB+nu*y3b-a+a*y1*cotB/rb+ (y1*y1)/W6*W4)/rb*y2+N1/W6*(-a*y1*cotB/(rb*rb*rb)*y2-(y1*y1)/(W6*W6) *W4/rb*y2-(y1*y1)/W6*a/(rb*rb*rb)*y2)+N1*cotB/(W7*W7)*(z1b*cosB-a* (rb*sinB-y1)/rb/cosB)/rb*y2-N1*cotB/W7*(-a/rb2*sinB*y2/ cosB+a*(rb*sinB-y1)/(rb*rb*rb)/cosB*y2)+3*a*y2*W8*cotB/(rb*rb2*rb2)*y1- W8/(W6*W6)*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6*W5-a*(y1*y1)/ (rb*rb*rb))/rb*y2+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB+a)*y2+(y1*y1)/(rb*rb2) /W6*W5*y2+(y1*y1)/rb2/(W6*W6)*W5*y2+(y1*y1)/(rb2*rb2)/W6*a*y2+3* a*(y1*y1)/(rb*rb2*rb2)*y2)-W8*cotB/(W7*W7)*(-cosB*sinB+a*y1*y3b/(rb*rb*rb)/ cosB+(rb*sinB-y1)/rb*((2-2*nu)*cosB-W1/W7*W9))/rb*y2+W8*cotB/ W7*(-3*a*y1*y3b/(rb*rb2*rb2)/cosB*y2+1/rb2*sinB*y2*((2-2*nu)*cosB- W1/W7*W9)-(rb*sinB-y1)/(rb*rb*rb)*((2-2*nu)*cosB-W1/W7*W9)*y2+(rb* sinB-y1)/rb*(-1/rb*cosB*y2/W7*W9+W1/(W7*W7)*W9/rb*y2+W1/W7* a/(rb*rb*rb)/cosB*y2)))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(1/W6*(1+a/rb)-(y2*y2)/(W6*W6)*(1+a/rb)/rb-(y2*y2)/ W6*a/(rb*rb*rb)-cosB/W7*W2+(y2*y2)*cosB/(W7*W7)*W2/rb+(y2*y2)*cosB/W7* a/(rb*rb*rb))-W8/rb*(a/rb2+1/W6)+(y2*y2)*W8/(rb*rb*rb)*(a/rb2+1/W6)- y2*W8/rb*(-2*a/(rb2*rb2)*y2-1/(W6*W6)/rb*y2)+W8*cosB/rb/W7* (W1/W7*W2+a*y3b/rb2)-(y2*y2)*W8*cosB/(rb*rb*rb)/W7*(W1/W7*W2+a* y3b/rb2)-(y2*y2)*W8*cosB/rb2/(W7*W7)*(W1/W7*W2+a*y3b/rb2)+y2* W8*cosB/rb/W7*(1/rb*cosB*y2/W7*W2-W1/(W7*W7)*W2/rb*y2-W1/ W7*a/(rb*rb*rb)*y2-2*a*y3b/(rb2*rb2)*y2))/M_PI/(1-nu))+ b1/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)-nu)/rb*y1/W6-((2-2*nu)*(cotB*cotB)+1- 2*nu)*cosB*(y1/rb-sinB)/W7)+N1/(W6*W6)*(y1*cotB*(1-W5)+nu*y3b- a+(y2*y2)/W6*W4)/rb*y1-N1/W6*((1-W5)*cotB+a*(y1*y1)*cotB/(rb*rb*rb)- (y2*y2)/(W6*W6)*W4/rb*y1-(y2*y2)/W6*a/(rb*rb*rb)*y1)-N1*cosB*cotB/W7* W2+N1*z1b*cotB/(W7*W7)*W2*(y1/rb-sinB)+N1*z1b*cotB/W7*a/(rb*rb2) *y1-a*W8*cotB/(rb*rb*rb)+3*a*(y1*y1)*W8*cotB/(rb*rb2*rb2)-W8/(W6*W6)*(-2* nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+a*(y2*y2)/(rb*rb*rb))/rb* y1+W8/W6*(-1/(rb*rb*rb)*(N1*y1*cotB-a)*y1+1/rb*N1*cotB-(y2*y2)/ (rb*rb*rb)/W6*W5*y1-(y2*y2)/rb2/(W6*W6)*W5*y1-(y2*y2)/(rb2*rb2)/W6*a*y1- 3*a*(y2*y2)/(rb*rb2*rb2)*y1)-W8/(W7*W7)*((cosB*cosB)-1/rb*(N1*z1b*cotB+a* cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/rb/W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/ rb*W1))*(y1/rb-sinB)+W8/W7*(1/(rb*rb*rb)*(N1*z1b*cotB+a*cosB)* y1-1/rb*N1*cosB*cotB+a*y3b*cosB*cotB/(rb*rb*rb)-3*a*y3b*z1b*cotB/ (rb*rb2*rb2)*y1+1/(rb*rb*rb)/W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1)*y1+1/ rb/(W7*W7)*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1)*(y1/rb-sinB)-1/rb/ W7*(-a*cosB*cotB/rb*W1+a*z1b*cotB/(rb*rb*rb)*W1*y1-a*z1b*cotB/ rb2*cosB*y1)))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((2-2*nu)*N1*rFib_ry1*(cotB*cotB)-N1*y2/(W6*W6)*((W5-1)*cotB+ y1/W6*W4)/rb*y1+N1*y2/W6*(-a/(rb*rb*rb)*y1*cotB+1/W6*W4-(y1*y1) /(W6*W6)*W4/rb-(y1*y1)/W6*a/(rb*rb*rb))+N1*y2*cotB/(W7*W7)*W9*(y1/ rb-sinB)+N1*y2*cotB/W7*a/(rb*rb*rb)/cosB*y1+3*a*y2*W8*cotB/(rb*rb2*rb2) *y1-y2*W8/(rb*rb*rb)/W6*(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/ W6))*y1-y2*W8/rb2/(W6*W6)*(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/ rb+1/W6))*y1+y2*W8/rb/W6*(-2*nu/W6+2*nu*(y1*y1)/(W6*W6)/rb-a/ rb*(1/rb+1/W6)+a*(y1*y1)/(rb*rb*rb)*(1/rb+1/W6)-a*y1/rb*(-1/ (rb*rb*rb)*y1-1/(W6*W6)/rb*y1))-y2*W8*cotB/(rb*rb*rb)/W7*((-2+2*nu)* cosB+W1/W7*W9+a*y3b/rb2/cosB)*y1-y2*W8*cotB/rb/(W7*W7)*((-2+ 2*nu)*cosB+W1/W7*W9+a*y3b/rb2/cosB)*(y1/rb-sinB)+y2*W8* cotB/rb/W7*(1/rb*cosB*y1/W7*W9-W1/(W7*W7)*W9*(y1/rb-sinB)- W1/W7*a/(rb*rb*rb)/cosB*y1-2*a*y3b/(rb2*rb2)/cosB*y1))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(-sinB*(y1/rb-sinB)/W7-1/W6*(1+a/rb)+(y1*y1)/(W6*W6) *(1+a/rb)/rb+(y1*y1)/W6*a/(rb*rb*rb)+cosB/W7*W2-z1b/(W7*W7)*W2* (y1/rb-sinB)-z1b/W7*a/(rb*rb*rb)*y1)+W8/rb*(a/rb2+1/W6)-(y1*y1)* W8/(rb*rb*rb)*(a/rb2+1/W6)+y1*W8/rb*(-2*a/(rb2*rb2)*y1-1/(W6*W6)/ rb*y1)+W8/(W7*W7)*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/rb2)-1/ rb/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1))*(y1/rb-sinB)-W8/W7* (sinB*a/(rb*rb*rb)*y1+cosB/rb*(1+a*y3b/rb2)-z1b/(rb*rb*rb)*(1+a*y3b/ rb2)*y1-2*z1b/(rb*rb2*rb2)*a*y3b*y1+1/(rb*rb*rb)/W7*((y2*y2)*cosB*sinB-a* z1b/rb*W1)*y1+1/rb/(W7*W7)*((y2*y2)*cosB*sinB-a*z1b/rb*W1)* (y1/rb-sinB)-1/rb/W7*(-a*cosB/rb*W1+a*z1b/(rb*rb*rb)*W1*y1-a* z1b/rb2*cosB*y1)))/M_PI/(1-nu));

    out.b = b1/2*(1.0/4.0*((-2+2*nu)*N1*rFib_ry3*(cotB*cotB)-N1*y2/(W6*W6)*((1-W5)* cotB-y1/W6*W4)*(y3b/rb+1)+N1*y2/W6*(1.0/2.0*a/(rb*rb*rb)*2*y3b*cotB+ y1/(W6*W6)*W4*(y3b/rb+1)+1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b)-N1*y2*cosB* cotB/(W7*W7)*W2*W3-1.0/2.0*N1*y2*cosB*cotB/W7*a/(rb*rb*rb)*2*y3b+a/ (rb*rb*rb)*y2*cotB-3.0/2.0*a*y2*W8*cotB/(rb*rb2*rb2)*2*y3b+y2/rb/W6*(-N1* cotB+y1/W6*W5+a*y1/rb2)-1.0/2.0*y2*W8/(rb*rb*rb)/W6*(-N1*cotB+y1/ W6*W5+a*y1/rb2)*2*y3b-y2*W8/rb/(W6*W6)*(-N1*cotB+y1/W6*W5+ a*y1/rb2)*(y3b/rb+1)+y2*W8/rb/W6*(-y1/(W6*W6)*W5*(y3b/rb+ 1)-1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b-a*y1/(rb2*rb2)*2*y3b)+y2/rb/W7* (cosB/W7*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)- a*y3b*cosB*cotB/rb2)-1.0/2.0*y2*W8/(rb*rb*rb)/W7*(cosB/W7*(W1*(N1* cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/ rb2)*2*y3b-y2*W8/rb/(W7*W7)*(cosB/W7*(W1*(N1*cosB-a/rb)*cotB+ (2-2*nu)*(rb*sinB-y1)*cosB)-a*y3b*cosB*cotB/rb2)*W3+y2*W8/rb/ W7*(-cosB/(W7*W7)*(W1*(N1*cosB-a/rb)*cotB+(2-2*nu)*(rb*sinB-y1)* cosB)*W3+cosB/W7*((cosB*y3b/rb+1)*(N1*cosB-a/rb)*cotB+1.0/2.0*W1* a/(rb*rb*rb)*2*y3b*cotB+1.0/2.0*(2-2*nu)/rb*sinB*2*y3b*cosB)-a*cosB* cotB/rb2+a*y3b*cosB*cotB/(rb2*rb2)*2*y3b))/M_PI/(1-nu))+ b2/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)+nu)*(y3b/rb+1)/W6-((2-2*nu)*(cotB*cotB) +1)*cosB*W3/W7)-N1/(W6*W6)*(-N1*y1*cotB+nu*y3b-a+a*y1*cotB/ rb+(y1*y1)/W6*W4)*(y3b/rb+1)+N1/W6*(nu-1.0/2.0*a*y1*cotB/(rb*rb*rb)*2* y3b-(y1*y1)/(W6*W6)*W4*(y3b/rb+1)-1.0/2.0*(y1*y1)/W6*a/(rb*rb*rb)*2*y3b)+ N1*cotB/(W7*W7)*(z1b*cosB-a*(rb*sinB-y1)/rb/cosB)*W3-N1*cotB/ W7*(cosB*sinB-1.0/2.0*a/rb2*sinB*2*y3b/cosB+1.0/2.0*a*(rb*sinB-y1)/ (rb*rb*rb)/cosB*2*y3b)-a/(rb*rb*rb)*y1*cotB+3.0/2.0*a*y1*W8*cotB/(rb*rb2*rb2)*2* y3b+1/W6*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6*W5-a*(y1*y1)/ (rb*rb*rb))-W8/(W6*W6)*(2*nu+1/rb*(N1*y1*cotB+a)-(y1*y1)/rb/W6*W5-a* (y1*y1)/(rb*rb*rb))*(y3b/rb+1)+W8/W6*(-1.0/2.0/(rb*rb*rb)*(N1*y1*cotB+a)*2* y3b+1.0/2.0*(y1*y1)/(rb*rb*rb)/W6*W5*2*y3b+(y1*y1)/rb/(W6*W6)*W5*(y3b/rb+ 1)+1.0/2.0*(y1*y1)/(rb2*rb2)/W6*a*2*y3b+3.0/2.0*a*(y1*y1)/(rb*rb2*rb2)*2*y3b)+ cotB/W7*(-cosB*sinB+a*y1*y3b/(rb*rb*rb)/cosB+(rb*sinB-y1)/rb*((2- 2*nu)*cosB-W1/W7*W9))-W8*cotB/(W7*W7)*(-cosB*sinB+a*y1*y3b/(rb*rb2) /cosB+(rb*sinB-y1)/rb*((2-2*nu)*cosB-W1/W7*W9))*W3+W8*cotB/ W7*(a/(rb*rb*rb)/cosB*y1-3.0/2.0*a*y1*y3b/(rb*rb2*rb2)/cosB*2*y3b+1.0/2.0/ rb2*sinB*2*y3b*((2-2*nu)*cosB-W1/W7*W9)-1.0/2.0*(rb*sinB-y1)/(rb*rb2) *((2-2*nu)*cosB-W1/W7*W9)*2*y3b+(rb*sinB-y1)/rb*(-(cosB*y3b/ rb+1)/W7*W9+W1/(W7*W7)*W9*W3+1.0/2.0*W1/W7*a/(rb*rb*rb)/cosB*2* y3b)))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(-y2/(W6*W6)*(1+a/rb)*(y3b/rb+1)-1.0/2.0*y2/W6*a/ (rb*rb*rb)*2*y3b+y2*cosB/(W7*W7)*W2*W3+1.0/2.0*y2*cosB/W7*a/(rb*rb*rb)*2* y3b)-y2/rb*(a/rb2+1/W6)+1.0/2.0*y2*W8/(rb*rb*rb)*(a/rb2+1/W6)*2* y3b-y2*W8/rb*(-a/(rb2*rb2)*2*y3b-1/(W6*W6)*(y3b/rb+1))+y2*cosB/ rb/W7*(W1/W7*W2+a*y3b/rb2)-1.0/2.0*y2*W8*cosB/(rb*rb*rb)/W7*(W1/ W7*W2+a*y3b/rb2)*2*y3b-y2*W8*cosB/rb/(W7*W7)*(W1/W7*W2+a* y3b/rb2)*W3+y2*W8*cosB/rb/W7*((cosB*y3b/rb+1)/W7*W2-W1/ (W7*W7)*W2*W3-1.0/2.0*W1/W7*a/(rb*rb*rb)*2*y3b+a/rb2-a*y3b/(rb2*rb2)*2* y3b))/M_PI/(1-nu))+ b1/2.0*(1.0/4.0*((2-2*nu)*(N1*rFib_ry1*cotB-y1/(W6*W6)*W5/rb*y2-y2/W6* a/(rb*rb*rb)*y1+y2*cosB/(W7*W7)*W2*(y1/rb-sinB)+y2*cosB/W7*a/(rb*rb2) *y1)-y2*W8/(rb*rb*rb)*(2*nu/W6+a/rb2)*y1+y2*W8/rb*(-2*nu/(W6*W6) /rb*y1-2*a/(rb2*rb2)*y1)-y2*W8*cosB/(rb*rb*rb)/W7*(1-2*nu-W1/W7* W2-a*y3b/rb2)*y1-y2*W8*cosB/rb/(W7*W7)*(1-2*nu-W1/W7*W2-a* y3b/rb2)*(y1/rb-sinB)+y2*W8*cosB/rb/W7*(-1/rb*cosB*y1/W7* W2+W1/(W7*W7)*W2*(y1/rb-sinB)+W1/W7*a/(rb*rb*rb)*y1+2*a*y3b/(rb2*rb2) *y1))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((-2+2*nu)*N1*cotB*(1/rb*y1/W6-cosB*(y1/rb-sinB)/W7)- (2-2*nu)/W6*W5+(2-2*nu)*(y1*y1)/(W6*W6)*W5/rb+(2-2*nu)*(y1*y1)/W6* a/(rb*rb*rb)+(2-2*nu)*cosB/W7*W2-(2-2*nu)*z1b/(W7*W7)*W2*(y1/rb- sinB)-(2-2*nu)*z1b/W7*a/(rb*rb*rb)*y1-W8/(rb*rb*rb)*(N1*cotB-2*nu*y1/ W6-a*y1/rb2)*y1+W8/rb*(-2*nu/W6+2*nu*(y1*y1)/(W6*W6)/rb-a/rb2+ 2*a*(y1*y1)/(rb2*rb2))+W8/(W7*W7)*(cosB*sinB+W1*cotB/rb*((2-2*nu)* cosB-W1/W7)+a/rb*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7))*(y1/rb- sinB)-W8/W7*(1/rb2*cosB*y1*cotB*((2-2*nu)*cosB-W1/W7)-W1* cotB/(rb*rb*rb)*((2-2*nu)*cosB-W1/W7)*y1+W1*cotB/rb*(-1/rb*cosB* y1/W7+W1/(W7*W7)*(y1/rb-sinB))-a/(rb*rb*rb)*(sinB-y3b*z1b/rb2- z1b*W1/rb/W7)*y1+a/rb*(-y3b*cosB/rb2+2*y3b*z1b/(rb2*rb2)*y1- cosB*W1/rb/W7-z1b/rb2*cosB*y1/W7+z1b*W1/(rb*rb*rb)/W7*y1+z1b* W1/rb/(W7*W7)*(y1/rb-sinB))))/M_PI/(1-nu))+ b3/2*(1.0/4.0*((2-2*nu)*rFib_ry1-(2-2*nu)*y2*sinB/(W7*W7)*W2*(y1/rb- sinB)-(2-2*nu)*y2*sinB/W7*a/(rb*rb*rb)*y1-y2*W8*sinB/(rb*rb*rb)/W7*(1+ W1/W7*W2+a*y3b/rb2)*y1-y2*W8*sinB/rb/(W7*W7)*(1+W1/W7*W2+ a*y3b/rb2)*(y1/rb-sinB)+y2*W8*sinB/rb/W7*(1/rb*cosB*y1/ W7*W2-W1/(W7*W7)*W2*(y1/rb-sinB)-W1/W7*a/(rb*rb*rb)*y1-2*a*y3b/ (rb2*rb2)*y1))/M_PI/(1-nu));

    out.c = b1/2*(1.0/4.0*(N1*(((2-2*nu)*(cotB*cotB)-nu)*(y3b/rb+1)/W6-((2-2*nu)* (cotB*cotB)+1-2*nu)*cosB*W3/W7)+N1/(W6*W6)*(y1*cotB*(1-W5)+nu*y3b-a+ (y2*y2)/W6*W4)*(y3b/rb+1)-N1/W6*(1.0/2.0*a*y1*cotB/(rb*rb*rb)*2*y3b+ nu-(y2*y2)/(W6*W6)*W4*(y3b/rb+1)-1.0/2.0*(y2*y2)/W6*a/(rb*rb*rb)*2*y3b)-N1* sinB*cotB/W7*W2+N1*z1b*cotB/(W7*W7)*W2*W3+1.0/2.0*N1*z1b*cotB/W7* a/(rb*rb*rb)*2*y3b-a/(rb*rb*rb)*y1*cotB+3.0/2.0*a*y1*W8*cotB/(rb*rb2*rb2)*2*y3b+ 1/W6*(-2*nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+a*(y2*y2)/ (rb*rb*rb))-W8/(W6*W6)*(-2*nu+1/rb*(N1*y1*cotB-a)+(y2*y2)/rb/W6*W5+ a*(y2*y2)/(rb*rb*rb))*(y3b/rb+1)+W8/W6*(-1.0/2.0/(rb*rb*rb)*(N1*y1*cotB-a)* 2*y3b-1.0/2.0*(y2*y2)/(rb*rb*rb)/W6*W5*2*y3b-(y2*y2)/rb/(W6*W6)*W5*(y3b/ rb+1)-1.0/2.0*(y2*y2)/(rb2*rb2)/W6*a*2*y3b-3.0/2.0*a*(y2*y2)/(rb*rb2*rb2)*2*y3b)+ 1/W7*((cosB*cosB)-1/rb*(N1*z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb2) -1/rb/W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1))-W8/(W7*W7)*((cosB*cosB)- 1/rb*(N1*z1b*cotB+a*cosB)+a*y3b*z1b*cotB/(rb*rb*rb)-1/rb/W7* ((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1))*W3+W8/W7*(1.0/2.0/(rb*rb*rb)*(N1* z1b*cotB+a*cosB)*2*y3b-1/rb*N1*sinB*cotB+a*z1b*cotB/(rb*rb*rb)+a* y3b*sinB*cotB/(rb*rb*rb)-3.0/2.0*a*y3b*z1b*cotB/(rb*rb2*rb2)*2*y3b+1.0/2.0/(rb*rb2) /W7*((y2*y2)*(cosB*cosB)-a*z1b*cotB/rb*W1)*2*y3b+1/rb/(W7*W7)*((y2*y2) *(cosB*cosB)-a*z1b*cotB/rb*W1)*W3-1/rb/W7*(-a*sinB*cotB/rb*W1+ 1.0/2.0*a*z1b*cotB/(rb*rb*rb)*W1*2*y3b-a*z1b*cotB/rb*(cosB*y3b/rb+ 1))))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((2-2*nu)*N1*rFib_ry3*(cotB*cotB)-N1*y2/(W6*W6)*((W5-1)*cotB+ y1/W6*W4)*(y3b/rb+1)+N1*y2/W6*(-1.0/2.0*a/(rb*rb*rb)*2*y3b*cotB-y1/ (W6*W6)*W4*(y3b/rb+1)-1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b)+N1*y2*cotB/ (W7*W7)*W9*W3+1.0/2.0*N1*y2*cotB/W7*a/(rb*rb*rb)/cosB*2*y3b-a/(rb*rb*rb)* y2*cotB+3.0/2.0*a*y2*W8*cotB/(rb*rb2*rb2)*2*y3b+y2/rb/W6*(N1*cotB-2* nu*y1/W6-a*y1/rb*(1/rb+1/W6))-1.0/2.0*y2*W8/(rb*rb*rb)/W6*(N1* cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))*2*y3b-y2*W8/rb/(W6*W6) *(N1*cotB-2*nu*y1/W6-a*y1/rb*(1/rb+1/W6))*(y3b/rb+1)+y2* W8/rb/W6*(2*nu*y1/(W6*W6)*(y3b/rb+1)+1.0/2.0*a*y1/(rb*rb*rb)*(1/rb+ 1/W6)*2*y3b-a*y1/rb*(-1.0/2.0/(rb*rb*rb)*2*y3b-1/(W6*W6)*(y3b/rb+ 1)))+y2*cotB/rb/W7*((-2+2*nu)*cosB+W1/W7*W9+a*y3b/rb2/cosB)- 1.0/2.0*y2*W8*cotB/(rb*rb*rb)/W7*((-2+2*nu)*cosB+W1/W7*W9+a*y3b/ rb2/cosB)*2*y3b-y2*W8*cotB/rb/(W7*W7)*((-2+2*nu)*cosB+W1/W7* W9+a*y3b/rb2/cosB)*W3+y2*W8*cotB/rb/W7*((cosB*y3b/rb+1)/ W7*W9-W1/(W7*W7)*W9*W3-1.0/2.0*W1/W7*a/(rb*rb*rb)/cosB*2*y3b+a/rb2/ cosB-a*y3b/(rb2*rb2)/cosB*2*y3b))/M_PI/(1-nu))+ b3/2*(1.0/4.0*(N1*(-sinB*W3/W7+y1/(W6*W6)*(1+a/rb)*(y3b/rb+1)+ 1.0/2.0*y1/W6*a/(rb*rb*rb)*2*y3b+sinB/W7*W2-z1b/(W7*W7)*W2*W3-1.0/2.0* z1b/W7*a/(rb*rb*rb)*2*y3b)+y1/rb*(a/rb2+1/W6)-1.0/2.0*y1*W8/(rb*rb2) *(a/rb2+1/W6)*2*y3b+y1*W8/rb*(-a/(rb2*rb2)*2*y3b-1/(W6*W6)* (y3b/rb+1))-1/W7*(sinB*(cosB-a/rb)+z1b/rb*(1+a*y3b/rb2)-1/ rb/W7*((y2*y2)*cosB*sinB-a*z1b/rb*W1))+W8/(W7*W7)*(sinB*(cosB- a/rb)+z1b/rb*(1+a*y3b/rb2)-1/rb/W7*((y2*y2)*cosB*sinB-a*z1b/ rb*W1))*W3-W8/W7*(1.0/2.0*sinB*a/(rb*rb*rb)*2*y3b+sinB/rb*(1+a*y3b/ rb2)-1.0/2.0*z1b/(rb*rb*rb)*(1+a*y3b/rb2)*2*y3b+z1b/rb*(a/rb2-a* y3b/(rb2*rb2)*2*y3b)+1.0/2.0/(rb*rb*rb)/W7*((y2*y2)*cosB*sinB-a*z1b/rb* W1)*2*y3b+1/rb/(W7*W7)*((y2*y2)*cosB*sinB-a*z1b/rb*W1)*W3-1/ rb/W7*(-a*sinB/rb*W1+1.0/2.0*a*z1b/(rb*rb*rb)*W1*2*y3b-a*z1b/rb* (cosB*y3b/rb+1))))/M_PI/(1-nu))+ b1/2.0*(1.0/4.0*((2-2*nu)*(N1*rFib_ry2*cotB+1/W6*W5-(y2*y2)/(W6*W6)*W5/ rb-(y2*y2)/W6*a/(rb*rb*rb)-cosB/W7*W2+(y2*y2)*cosB/(W7*W7)*W2/rb+(y2*y2)* cosB/W7*a/(rb*rb*rb))+W8/rb*(2*nu/W6+a/rb2)-(y2*y2)*W8/(rb*rb*rb)*(2* nu/W6+a/rb2)+y2*W8/rb*(-2*nu/(W6*W6)/rb*y2-2*a/(rb2*rb2)*y2)+ W8*cosB/rb/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)-(y2*y2)*W8*cosB/ (rb*rb*rb)/W7*(1-2*nu-W1/W7*W2-a*y3b/rb2)-(y2*y2)*W8*cosB/rb2/(W7*W7) *(1-2*nu-W1/W7*W2-a*y3b/rb2)+y2*W8*cosB/rb/W7*(-1/rb* cosB*y2/W7*W2+W1/(W7*W7)*W2/rb*y2+W1/W7*a/(rb*rb*rb)*y2+2*a* y3b/(rb2*rb2)*y2))/M_PI/(1-nu))+ b2/2*(1.0/4.0*((-2+2*nu)*N1*cotB*(1/rb*y2/W6-cosB/rb*y2/W7)+(2- 2*nu)*y1/(W6*W6)*W5/rb*y2+(2-2*nu)*y1/W6*a/(rb*rb*rb)*y2-(2-2* nu)*z1b/(W7*W7)*W2/rb*y2-(2-2*nu)*z1b/W7*a/(rb*rb*rb)*y2-W8/(rb*rb2) *(N1*cotB-2*nu*y1/W6-a*y1/rb2)*y2+W8/rb*(2*nu*y1/(W6*W6)/ rb*y2+2*a*y1/(rb2*rb2)*y2)+W8/(W7*W7)*(cosB*sinB+W1*cotB/rb*((2- 2*nu)*cosB-W1/W7)+a/rb*(sinB-y3b*z1b/rb2-z1b*W1/rb/W7))/ rb*y2-W8/W7*(1/rb2*cosB*y2*cotB*((2-2*nu)*cosB-W1/W7)-W1* cotB/(rb*rb*rb)*((2-2*nu)*cosB-W1/W7)*y2+W1*cotB/rb*(-cosB/rb* y2/W7+W1/(W7*W7)/rb*y2)-a/(rb*rb*rb)*(sinB-y3b*z1b/rb2-z1b*W1/ rb/W7)*y2+a/rb*(2*y3b*z1b/(rb2*rb2)*y2-z1b/rb2*cosB*y2/W7+ z1b*W1/(rb*rb*rb)/W7*y2+z1b*W1/rb2/(W7*W7)*y2)))/M_PI/(1-nu))+ b3/2*(1.0/4.0*((2-2*nu)*rFib_ry2+(2-2*nu)*sinB/W7*W2-(2-2*nu)*(y2*y2)* sinB/(W7*W7)*W2/rb-(2-2*nu)*(y2*y2)*sinB/W7*a/(rb*rb*rb)+W8*sinB/rb/ W7*(1+W1/W7*W2+a*y3b/rb2)-(y2*y2)*W8*sinB/(rb*rb*rb)/W7*(1+W1/ W7*W2+a*y3b/rb2)-(y2*y2)*W8*sinB/rb2/(W7*W7)*(1+W1/W7*W2+a* y3b/rb2)+y2*W8*sinB/rb/W7*(1/rb*cosB*y2/W7*W2-W1/(W7*W7)* W2/rb*y2-W1/W7*a/(rb*rb*rb)*y2-2*a*y3b/(rb2*rb2)*y2))/M_PI/(1-nu));

    return out;
}

WITHIN_KERNEL Real3 AngSetupFSC(Real3 obs, Real3 slip, Real3 PA, Real3 PB, Real nu) {
    Real3 SideVec = sub3(PB, PA);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real beta = acos(-dot3(normalize3(SideVec), eZ));
    if (fabs(beta) < EPS || fabs(M_PI-beta) < EPS) {
        return make3(0.0f, 0.0f, 0.0f);
    }
    Real3 ey1 = SideVec;
    ey1.z = 0;
    ey1 = normalize3(ey1);

    Real3 ey3 = negate3(eZ);
    Real3 ey2 = cross3(ey3, ey1);

    Real3 yA = transform3(ey1, ey2, ey3, sub3(obs, PA));
    Real3 yAB = transform3(ey1, ey2, ey3, SideVec);
    Real3 yB = sub3(yA, yAB);

    Real3 slip_adcs = transform3(ey1, ey2, ey3, slip);

    Real configuration = beta;
    if (beta*yA.x >= 0) {
        configuration = -M_PI+beta;
    }

    Real3 vA = AngDisDispFSC(
        yA.x, yA.y, yA.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PA.z
    );
    Real3 vB = AngDisDispFSC(
        yB.x, yB.y, yB.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PB.z
    );

    Real3 v = sub3(vB, vA);
    return inv_transform3(ey1, ey2, ey3, v);
}

WITHIN_KERNEL Real6 AngSetupFSC_S(Real3 obs, Real3 slip, Real3 PA, Real3 PB, Real nu) {
    Real3 SideVec = sub3(PB, PA);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real beta = acos(-dot3(normalize3(SideVec), eZ));
    if (fabs(beta) < EPS || fabs(M_PI-beta) < EPS) {
        return make6(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
    }
    Real3 ey1 = SideVec;
    ey1.z = 0;
    ey1 = normalize3(ey1);

    Real3 ey3 = negate3(eZ);
    Real3 ey2 = cross3(ey3, ey1);

    Real3 yA = transform3(ey1, ey2, ey3, sub3(obs, PA));
    Real3 yAB = transform3(ey1, ey2, ey3, SideVec);
    Real3 yB = sub3(yA, yAB);

    Real3 slip_adcs = transform3(ey1, ey2, ey3, slip);

    Real configuration = beta;
    if (beta*yA.x >= 0) {
        configuration = -M_PI+beta;
    }

    Real6 vA = AngDisStrainFSC(
        yA.x, yA.y, yA.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PA.z
    );
    Real6 vB = AngDisStrainFSC(
        yB.x, yB.y, yB.z, configuration,
        slip_adcs.x, slip_adcs.y, slip_adcs.z, 
        nu, -PB.z
    );
    Real6 v = sub6(vB, vA);
    return tensor_transform3(ey1, ey2, ey3, v);
}

#endif


WITHIN_KERNEL 
int buffer_alloc(GLOBAL_MEM int* next_ptr, int n_values) {
    int out;
        #pragma omp critical 
        {
            out = *next_ptr;
            *next_ptr += n_values;
        }
    return out;
}

WITHIN_KERNEL
bool in(int target, GLOBAL_MEM int* arr, int n_arr) {
    // Could be faster by keeping arr sorted and doing binary search. 
    // but that is probably premature optimization.
    for (int i = 0; i < n_arr; i++) {
        if (target == arr[i]) {
            return true;
        }
    }
    return false;
}

struct MatrixIndex {
    int row;
    int col;
};

WITHIN_KERNEL struct MatrixIndex argmax_abs_not_in_list_rows(GLOBAL_MEM Real* data, int n_data_rows, int n_data_cols, GLOBAL_MEM int* prev, int n_prev) 
{
    struct MatrixIndex max_idx;
    Real max_val = -1;
    for (int i = 0; i < n_data_rows; i++) {
        for (int j = 0; j < n_data_cols; j++) {
            Real v = fabs(data[i * n_data_cols + j]);
            int relevant_idx = i;
            if (v > max_val && !in(relevant_idx, prev, n_prev)) {
                max_idx.row = i;
                max_idx.col = j;
                max_val = v;
            }
        }
    }
    return max_idx;
}
WITHIN_KERNEL struct MatrixIndex argmax_abs_not_in_list_cols(GLOBAL_MEM Real* data, int n_data_rows, int n_data_cols, GLOBAL_MEM int* prev, int n_prev) 
{
    struct MatrixIndex max_idx;
    Real max_val = -1;
    for (int i = 0; i < n_data_rows; i++) {
        for (int j = 0; j < n_data_cols; j++) {
            Real v = fabs(data[i * n_data_cols + j]);
            int relevant_idx = j;
            if (v > max_val && !in(relevant_idx, prev, n_prev)) {
                max_idx.row = i;
                max_idx.col = j;
                max_val = v;
            }
        }
    }
    return max_idx;
}







WITHIN_KERNEL 
void calc_rows_disp_fs(
    GLOBAL_MEM Real* output, 
    int rowcol_start, int rowcol_end,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    int os, int oe, int ss, int se,
    Real nu, int team_idx, int team_size)
{
    /*
    * In the calc_rows/cols function, we will calculate a batch of rows or
    * columns corresponding to a particular triangular dislocation element. 
    * In most cases, this will be three rows/cols corresponding to the x/y/z
    * components of displacement or slip. But in the case of calculating rows
    * for a strain matrix, we will be calculating six components. See the use
    * of "vec_dim" below to specify the number of rows. 
    * 
    * But, we specify the element in terms of the rowcol_start and rowcol_end.
    * This allows grabbing just a subset of the rows when that is desirable.
    */

    int block_i = floor(((float)rowcol_start) / 3);
    int obs_dim_start = rowcol_start - block_i * 3;
    int obs_dim_end = rowcol_end - block_i * 3;
    int i = os + block_i;
    int obs_idx = 0;

    Real3 obs;
        obs.x = obs_pts[i * 3 + 0];
        obs.y = obs_pts[i * 3 + 1];
        obs.z = obs_pts[i * 3 + 2];

    int src_dim_start = 0;
    int src_dim_end = 3;
    int n_output_src = se - ss;
    for (int j = ss + team_idx; j < se; j += team_size) {
        int src_idx = j - ss;

            Real3 tri0;
                tri0.x = tris[j * 9 + 0 * 3 + 0];
                tri0.y = tris[j * 9 + 0 * 3 + 1];
                tri0.z = tris[j * 9 + 0 * 3 + 2];
            Real3 tri1;
                tri1.x = tris[j * 9 + 1 * 3 + 0];
                tri1.y = tris[j * 9 + 1 * 3 + 1];
                tri1.z = tris[j * 9 + 1 * 3 + 2];
            Real3 tri2;
                tri2.x = tris[j * 9 + 2 * 3 + 0];
                tri2.y = tris[j * 9 + 2 * 3 + 1];
                tri2.z = tris[j * 9 + 2 * 3 + 2];

        for (int d_src = src_dim_start; d_src < src_dim_end; d_src++) {
            Real3 slip = make3(0.0, 0.0, 0.0);
            if (d_src == 0) {
                slip.y = 1.0;
            } else if (d_src == 1) {
                slip.z = 1.0;
            } else {
                slip.x = 1.0;
            }

            
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


            {
                if (0 >= obs_dim_start && 0 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 3 + (0 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.x;
                }
            }
            {
                if (1 >= obs_dim_start && 1 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 3 + (1 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.y;
                }
            }
            {
                if (2 >= obs_dim_start && 2 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 3 + (2 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.z;
                }
            }
        }
    }
}
WITHIN_KERNEL 
void calc_cols_disp_fs(
    GLOBAL_MEM Real* output, 
    int rowcol_start, int rowcol_end,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    int os, int oe, int ss, int se,
    Real nu, int team_idx, int team_size)
{
    /*
    * In the calc_rows/cols function, we will calculate a batch of rows or
    * columns corresponding to a particular triangular dislocation element. 
    * In most cases, this will be three rows/cols corresponding to the x/y/z
    * components of displacement or slip. But in the case of calculating rows
    * for a strain matrix, we will be calculating six components. See the use
    * of "vec_dim" below to specify the number of rows. 
    * 
    * But, we specify the element in terms of the rowcol_start and rowcol_end.
    * This allows grabbing just a subset of the rows when that is desirable.
    */

    int block_j = floor(((float)rowcol_start) / 3);
    int src_dim_start = rowcol_start - block_j * 3;
    int src_dim_end = rowcol_end - block_j * 3;
    int j = ss + block_j;
    int src_idx = 0;
    int n_output_src = 1;

        Real3 tri0;
            tri0.x = tris[j * 9 + 0 * 3 + 0];
            tri0.y = tris[j * 9 + 0 * 3 + 1];
            tri0.z = tris[j * 9 + 0 * 3 + 2];
        Real3 tri1;
            tri1.x = tris[j * 9 + 1 * 3 + 0];
            tri1.y = tris[j * 9 + 1 * 3 + 1];
            tri1.z = tris[j * 9 + 1 * 3 + 2];
        Real3 tri2;
            tri2.x = tris[j * 9 + 2 * 3 + 0];
            tri2.y = tris[j * 9 + 2 * 3 + 1];
            tri2.z = tris[j * 9 + 2 * 3 + 2];

    int obs_dim_start = 0;
    int obs_dim_end = 3;
    for (int i = os + team_idx; i < oe; i += team_size) {
        int obs_idx = i - os;

        Real3 obs;
            obs.x = obs_pts[i * 3 + 0];
            obs.y = obs_pts[i * 3 + 1];
            obs.z = obs_pts[i * 3 + 2];

        for (int d_src = src_dim_start; d_src < src_dim_end; d_src++) {
            Real3 slip = make3(0.0, 0.0, 0.0);
            if (d_src == 0) {
                slip.y = 1.0;
            } else if (d_src == 1) {
                slip.z = 1.0;
            } else {
                slip.x = 1.0;
            }

            
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


            {
                if (0 >= obs_dim_start && 0 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 3 + (0 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.x;
                }
            }
            {
                if (1 >= obs_dim_start && 1 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 3 + (1 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.y;
                }
            }
            {
                if (2 >= obs_dim_start && 2 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 3 + (2 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.z;
                }
            }
        }
    }
}

KERNEL
void aca_disp_fs(
    // out parameters here
    GLOBAL_MEM Real* buffer,
    GLOBAL_MEM int* uv_ptrs, 
    GLOBAL_MEM int* n_terms,
    // mutable workspace parameters
    GLOBAL_MEM int* next_buffer_ptr,
    GLOBAL_MEM Real* fworkspace,
    GLOBAL_MEM int* iworkspace,
    // immutable parameters below here
    GLOBAL_MEM int* uv_ptrs_starts,
    GLOBAL_MEM int* fworkspace_starts,
    GLOBAL_MEM int* Iref0, GLOBAL_MEM int* Jref0,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM int* obs_start, GLOBAL_MEM int* obs_end,
    GLOBAL_MEM int* src_start, GLOBAL_MEM int* src_end,
    GLOBAL_MEM Real* p_tol,
    GLOBAL_MEM int* p_max_iter,
    Real nu)
{
    int block_idx = get_group_id(0);
    int team_idx = get_local_id(0);
    int team_size = get_local_size(0);
    int os = obs_start[block_idx];
    int oe = obs_end[block_idx];
    int n_obs = oe - os;
    int n_rows = n_obs * 3;

    int ss = src_start[block_idx];
    int se = src_end[block_idx];
    int n_src = se - ss;
    int n_cols = n_src * 3;

    int uv_ptr0 = uv_ptrs_starts[block_idx];

    GLOBAL_MEM int* block_iworkspace = &iworkspace[uv_ptr0];
    GLOBAL_MEM int* prevIstar = block_iworkspace;
    GLOBAL_MEM int* prevJstar = &block_iworkspace[min(n_cols, n_rows) / 2];

    GLOBAL_MEM Real* block_fworkspace = &fworkspace[fworkspace_starts[block_idx]];
    GLOBAL_MEM Real* RIstar = block_fworkspace;
    GLOBAL_MEM Real* RJstar = &block_fworkspace[n_cols];

    GLOBAL_MEM Real* RIref = &block_fworkspace[n_cols + n_rows];
    GLOBAL_MEM Real* RJref = &block_fworkspace[n_cols + n_rows + 3 * n_cols];

    int Iref = Iref0[block_idx];
    Iref -= Iref % 3;
    int Jref = Jref0[block_idx];
    Jref -= Jref % 3;

    calc_rows_disp_fs(
        RIref, Iref, Iref + 3,
        obs_pts, tris, os, oe, ss, se, nu,
        team_idx, team_size
    );

    calc_cols_disp_fs(
        RJref, Jref, Jref + 3, 
        obs_pts, tris, os, oe, ss, se, nu,
        team_idx, team_size
    );

    int max_iter = min(p_max_iter[block_idx], min(n_rows / 2, n_cols / 2));
    Real tol = p_tol[block_idx];

    // Some OpenCL implementations require LOCAL_MEM to be defined at the
    // outermost scope of a function. Otherwise this would be defined next to
    // the single-threaded section that uses it.
    LOCAL_MEM bool done[1];
    done[0] = true;

    Real frob_est = 0;
    int k = 0;
    for (; k < max_iter; k++) {
        


        struct MatrixIndex Istar_entry = argmax_abs_not_in_list_rows(RJref, n_rows, 3, prevIstar, k);
        struct MatrixIndex Jstar_entry = argmax_abs_not_in_list_cols(RIref, 3, n_cols, prevJstar, k);
        int Istar = Istar_entry.row;
        int Jstar = Jstar_entry.col;

        Real Istar_val = fabs(RJref[Istar_entry.row * 3 + Istar_entry.col]);
        Real Jstar_val = fabs(RIref[Jstar_entry.row * n_cols + Jstar_entry.col]);


        if (Istar_val > Jstar_val) {
            calc_rows_disp_fs(
                RIstar, Istar, Istar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Istar + 1) - (Istar);

            for (int i = 0; i < n_rowcol; i++) {
                Real uv = U_term[i + Istar];
                for (int j = team_idx; j < n_cols; j += team_size) {
                    Real vv = V_term[j];
                    RIstar[i * n_cols + j] -= uv * vv;
                }
            }
    }
}

            


            Jstar_entry = argmax_abs_not_in_list_cols(RIstar, 1, n_cols, prevJstar, k);
            Jstar = Jstar_entry.col;

            calc_cols_disp_fs(
                RJstar, Jstar, Jstar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Jstar + 1) - (Jstar);

            for (int i = team_idx; i < n_rows; i += team_size) {
                Real uv = U_term[i];
                for (int j = 0; j < n_rowcol; j++) {
                    Real vv = V_term[j + Jstar];
                    RJstar[i * n_rowcol + j] -= uv * vv;
                }
            }
    }
}

        } else {
            calc_cols_disp_fs(
                RJstar, Jstar, Jstar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Jstar + 1) - (Jstar);

            for (int i = team_idx; i < n_rows; i += team_size) {
                Real uv = U_term[i];
                for (int j = 0; j < n_rowcol; j++) {
                    Real vv = V_term[j + Jstar];
                    RJstar[i * n_rowcol + j] -= uv * vv;
                }
            }
    }
}

            



            Istar_entry = argmax_abs_not_in_list_rows(RJstar, n_rows, 1, prevIstar, k);
            Istar = Istar_entry.row;

            calc_rows_disp_fs(
                RIstar, Istar, Istar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Istar + 1) - (Istar);

            for (int i = 0; i < n_rowcol; i++) {
                Real uv = U_term[i + Istar];
                for (int j = team_idx; j < n_cols; j += team_size) {
                    Real vv = V_term[j];
                    RIstar[i * n_cols + j] -= uv * vv;
                }
            }
    }
}

        }
        


        // claim a block of space for the first U and first V vectors and collect
        // the corresponding Real* pointers
        if (team_idx == 0) {
            done[0] = false;

            prevIstar[k] = Istar;
            prevJstar[k] = Jstar;

            int next_buffer_u_ptr = buffer_alloc(next_buffer_ptr, n_rows + n_cols);
            int next_buffer_v_ptr = next_buffer_u_ptr + n_rows;
            GLOBAL_MEM Real* next_buffer_u = &buffer[next_buffer_u_ptr];
            GLOBAL_MEM Real* next_buffer_v = &buffer[next_buffer_v_ptr];

            // Assign our uv_ptr to point to the u,v buffer location.
            uv_ptrs[uv_ptr0 + k] = next_buffer_u_ptr;

            Real v2 = 0;
            // TODO: team_idx!!!
            for (int i = 0; i < n_cols; i++) {
                next_buffer_v[i] = RIstar[i] / RIstar[Jstar];
                v2 += next_buffer_v[i] * next_buffer_v[i];
            }

            Real u2 = 0;
            for (int j = 0; j < n_rows; j++) {
                next_buffer_u[j] = RJstar[j];
                u2 += next_buffer_u[j] * next_buffer_u[j];
            }


            Real step_size = sqrt(u2 * v2);

            frob_est += step_size;

            if (step_size < tol) {
                done[0] = true;
            }

            if (k == max_iter - 1) {
                done[0] = true;
            }
        }
        


        if (done[0]) {
            break;
        }

        if (Iref <= Istar && Istar < Iref + 3) {
            while (true) {
                Iref = (Iref + 3) % n_rows;
                Iref -= Iref % 3;
                if (!in(Iref, prevIstar, k + 1)) {
                    break; 
                }
            }
            calc_rows_disp_fs(
                RIref, Iref, Iref + 3,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k + 1; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Iref + 3) - (Iref);

            for (int i = 0; i < n_rowcol; i++) {
                Real uv = U_term[i + Iref];
                for (int j = team_idx; j < n_cols; j += team_size) {
                    Real vv = V_term[j];
                    RIref[i * n_cols + j] -= uv * vv;
                }
            }
    }
}

        } else {
            GLOBAL_MEM Real* next_buffer_u = &buffer[uv_ptrs[uv_ptr0 + k]];
            GLOBAL_MEM Real* next_buffer_v = &buffer[uv_ptrs[uv_ptr0 + k] + n_rows];
            for (int i = 0; i < 3; i++) {
                for (int j = team_idx; j < n_cols; j += team_size) {
                    RIref[i * n_cols + j] -= next_buffer_u[i + Iref] * next_buffer_v[j];
                }
            }
        }

        if (Jref <= Jstar && Jstar < Jref + 3) {
            while (true) {
                Jref = (Jref + 3) % n_cols;
                Jref -= Jref % 3;
                if (!in(Jref, prevJstar, k + 1)) {
                    break; 
                }
            }
            calc_cols_disp_fs(
                RJref, Jref, Jref + 3, 
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k + 1; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Jref + 3) - (Jref);

            for (int i = team_idx; i < n_rows; i += team_size) {
                Real uv = U_term[i];
                for (int j = 0; j < n_rowcol; j++) {
                    Real vv = V_term[j + Jref];
                    RJref[i * n_rowcol + j] -= uv * vv;
                }
            }
    }
}

        } else {
            GLOBAL_MEM Real* next_buffer_u = &buffer[uv_ptrs[uv_ptr0 + k]];
            GLOBAL_MEM Real* next_buffer_v = &buffer[uv_ptrs[uv_ptr0 + k] + n_rows];
            for (int i = team_idx; i < n_rows; i += team_size) {
                for (int j = 0; j < 3; j++) {
                    RJref[i * 3 + j] -= next_buffer_u[i] * next_buffer_v[j + Jref];
                }
            }
        }
    }

    if (team_idx == 0) {
        n_terms[block_idx] = k + 1;
    }
}



WITHIN_KERNEL 
void calc_rows_disp_hs(
    GLOBAL_MEM Real* output, 
    int rowcol_start, int rowcol_end,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    int os, int oe, int ss, int se,
    Real nu, int team_idx, int team_size)
{
    /*
    * In the calc_rows/cols function, we will calculate a batch of rows or
    * columns corresponding to a particular triangular dislocation element. 
    * In most cases, this will be three rows/cols corresponding to the x/y/z
    * components of displacement or slip. But in the case of calculating rows
    * for a strain matrix, we will be calculating six components. See the use
    * of "vec_dim" below to specify the number of rows. 
    * 
    * But, we specify the element in terms of the rowcol_start and rowcol_end.
    * This allows grabbing just a subset of the rows when that is desirable.
    */

    int block_i = floor(((float)rowcol_start) / 3);
    int obs_dim_start = rowcol_start - block_i * 3;
    int obs_dim_end = rowcol_end - block_i * 3;
    int i = os + block_i;
    int obs_idx = 0;

    Real3 obs;
        obs.x = obs_pts[i * 3 + 0];
        obs.y = obs_pts[i * 3 + 1];
        obs.z = obs_pts[i * 3 + 2];

    int src_dim_start = 0;
    int src_dim_end = 3;
    int n_output_src = se - ss;
    for (int j = ss + team_idx; j < se; j += team_size) {
        int src_idx = j - ss;

            Real3 tri0;
                tri0.x = tris[j * 9 + 0 * 3 + 0];
                tri0.y = tris[j * 9 + 0 * 3 + 1];
                tri0.z = tris[j * 9 + 0 * 3 + 2];
            Real3 tri1;
                tri1.x = tris[j * 9 + 1 * 3 + 0];
                tri1.y = tris[j * 9 + 1 * 3 + 1];
                tri1.z = tris[j * 9 + 1 * 3 + 2];
            Real3 tri2;
                tri2.x = tris[j * 9 + 2 * 3 + 0];
                tri2.y = tris[j * 9 + 2 * 3 + 1];
                tri2.z = tris[j * 9 + 2 * 3 + 2];

        for (int d_src = src_dim_start; d_src < src_dim_end; d_src++) {
            Real3 slip = make3(0.0, 0.0, 0.0);
            if (d_src == 0) {
                slip.y = 1.0;
            } else if (d_src == 1) {
                slip.z = 1.0;
            } else {
                slip.x = 1.0;
            }

            
    Real3 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real3 uvw0 = AngSetupFSC(obs, efcs_slip, tri0, tri1, nu);
        Real3 uvw1 = AngSetupFSC(obs, efcs_slip, tri1, tri2, nu);
        Real3 uvw2 = AngSetupFSC(obs, efcs_slip, tri2, tri0, nu);
        Real3 fsc_term = add3(add3(uvw0, uvw1), uvw2);

        summed_terms = add3(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add3(summed_terms, full_out);
    }
    Real3 full_out = summed_terms;


            {
                if (0 >= obs_dim_start && 0 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 3 + (0 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.x;
                }
            }
            {
                if (1 >= obs_dim_start && 1 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 3 + (1 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.y;
                }
            }
            {
                if (2 >= obs_dim_start && 2 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 3 + (2 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.z;
                }
            }
        }
    }
}
WITHIN_KERNEL 
void calc_cols_disp_hs(
    GLOBAL_MEM Real* output, 
    int rowcol_start, int rowcol_end,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    int os, int oe, int ss, int se,
    Real nu, int team_idx, int team_size)
{
    /*
    * In the calc_rows/cols function, we will calculate a batch of rows or
    * columns corresponding to a particular triangular dislocation element. 
    * In most cases, this will be three rows/cols corresponding to the x/y/z
    * components of displacement or slip. But in the case of calculating rows
    * for a strain matrix, we will be calculating six components. See the use
    * of "vec_dim" below to specify the number of rows. 
    * 
    * But, we specify the element in terms of the rowcol_start and rowcol_end.
    * This allows grabbing just a subset of the rows when that is desirable.
    */

    int block_j = floor(((float)rowcol_start) / 3);
    int src_dim_start = rowcol_start - block_j * 3;
    int src_dim_end = rowcol_end - block_j * 3;
    int j = ss + block_j;
    int src_idx = 0;
    int n_output_src = 1;

        Real3 tri0;
            tri0.x = tris[j * 9 + 0 * 3 + 0];
            tri0.y = tris[j * 9 + 0 * 3 + 1];
            tri0.z = tris[j * 9 + 0 * 3 + 2];
        Real3 tri1;
            tri1.x = tris[j * 9 + 1 * 3 + 0];
            tri1.y = tris[j * 9 + 1 * 3 + 1];
            tri1.z = tris[j * 9 + 1 * 3 + 2];
        Real3 tri2;
            tri2.x = tris[j * 9 + 2 * 3 + 0];
            tri2.y = tris[j * 9 + 2 * 3 + 1];
            tri2.z = tris[j * 9 + 2 * 3 + 2];

    int obs_dim_start = 0;
    int obs_dim_end = 3;
    for (int i = os + team_idx; i < oe; i += team_size) {
        int obs_idx = i - os;

        Real3 obs;
            obs.x = obs_pts[i * 3 + 0];
            obs.y = obs_pts[i * 3 + 1];
            obs.z = obs_pts[i * 3 + 2];

        for (int d_src = src_dim_start; d_src < src_dim_end; d_src++) {
            Real3 slip = make3(0.0, 0.0, 0.0);
            if (d_src == 0) {
                slip.y = 1.0;
            } else if (d_src == 1) {
                slip.z = 1.0;
            } else {
                slip.x = 1.0;
            }

            
    Real3 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real3 uvw0 = AngSetupFSC(obs, efcs_slip, tri0, tri1, nu);
        Real3 uvw1 = AngSetupFSC(obs, efcs_slip, tri1, tri2, nu);
        Real3 uvw2 = AngSetupFSC(obs, efcs_slip, tri2, tri0, nu);
        Real3 fsc_term = add3(add3(uvw0, uvw1), uvw2);

        summed_terms = add3(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real3 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tp = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real3 r2Tp = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real3 r3Tp = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add3(add3(r1Tp, r2Tp), r3Tp);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real3 r1Tn = TDSetupD(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real3 r2Tn = TDSetupD(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real3 r3Tn = TDSetupD(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add3(add3(r1Tn, r2Tn), r3Tn);
    } else {
        out = make3(NAN, NAN, NAN);
    }

    Real3 a = make3(
        -transformed_obs.x,
        transformed_tri0.y - transformed_obs.y,
        transformed_tri0.z - transformed_obs.z
    );
    Real3 b = negate3(transformed_obs);
    Real3 c = make3(
        -transformed_obs.x,
        transformed_tri2.y - transformed_obs.y,
        transformed_tri2.z - transformed_obs.z
    );
    Real na = length3(a);
    Real nb = length3(b);
    Real nc = length3(c);

    Real FiN = (a.x*(b.y*c.z-b.z*c.y)-            a.y*(b.x*c.z-b.z*c.x)+            a.z*(b.x*c.y-b.y*c.x));
    Real FiD = (na*nb*nc+dot3(a,b)*nc+dot3(a,c)*nb+dot3(b,c)*na);
    Real Fi = -2*atan2(FiN,FiD)/4/M_PI;

    // Calculate the complete displacement vector components in TDCS
    out = add3(out, mul_scalar3(slip,Fi));

    // Transform the complete displacement vector components from TDCS into EFCS
    Real3 full_out = inv_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add3(summed_terms, full_out);
    }
    Real3 full_out = summed_terms;


            {
                if (0 >= obs_dim_start && 0 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 3 + (0 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.x;
                }
            }
            {
                if (1 >= obs_dim_start && 1 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 3 + (1 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.y;
                }
            }
            {
                if (2 >= obs_dim_start && 2 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 3 + (2 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.z;
                }
            }
        }
    }
}

KERNEL
void aca_disp_hs(
    // out parameters here
    GLOBAL_MEM Real* buffer,
    GLOBAL_MEM int* uv_ptrs, 
    GLOBAL_MEM int* n_terms,
    // mutable workspace parameters
    GLOBAL_MEM int* next_buffer_ptr,
    GLOBAL_MEM Real* fworkspace,
    GLOBAL_MEM int* iworkspace,
    // immutable parameters below here
    GLOBAL_MEM int* uv_ptrs_starts,
    GLOBAL_MEM int* fworkspace_starts,
    GLOBAL_MEM int* Iref0, GLOBAL_MEM int* Jref0,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM int* obs_start, GLOBAL_MEM int* obs_end,
    GLOBAL_MEM int* src_start, GLOBAL_MEM int* src_end,
    GLOBAL_MEM Real* p_tol,
    GLOBAL_MEM int* p_max_iter,
    Real nu)
{
    int block_idx = get_group_id(0);
    int team_idx = get_local_id(0);
    int team_size = get_local_size(0);
    int os = obs_start[block_idx];
    int oe = obs_end[block_idx];
    int n_obs = oe - os;
    int n_rows = n_obs * 3;

    int ss = src_start[block_idx];
    int se = src_end[block_idx];
    int n_src = se - ss;
    int n_cols = n_src * 3;

    int uv_ptr0 = uv_ptrs_starts[block_idx];

    GLOBAL_MEM int* block_iworkspace = &iworkspace[uv_ptr0];
    GLOBAL_MEM int* prevIstar = block_iworkspace;
    GLOBAL_MEM int* prevJstar = &block_iworkspace[min(n_cols, n_rows) / 2];

    GLOBAL_MEM Real* block_fworkspace = &fworkspace[fworkspace_starts[block_idx]];
    GLOBAL_MEM Real* RIstar = block_fworkspace;
    GLOBAL_MEM Real* RJstar = &block_fworkspace[n_cols];

    GLOBAL_MEM Real* RIref = &block_fworkspace[n_cols + n_rows];
    GLOBAL_MEM Real* RJref = &block_fworkspace[n_cols + n_rows + 3 * n_cols];

    int Iref = Iref0[block_idx];
    Iref -= Iref % 3;
    int Jref = Jref0[block_idx];
    Jref -= Jref % 3;

    calc_rows_disp_hs(
        RIref, Iref, Iref + 3,
        obs_pts, tris, os, oe, ss, se, nu,
        team_idx, team_size
    );

    calc_cols_disp_hs(
        RJref, Jref, Jref + 3, 
        obs_pts, tris, os, oe, ss, se, nu,
        team_idx, team_size
    );

    int max_iter = min(p_max_iter[block_idx], min(n_rows / 2, n_cols / 2));
    Real tol = p_tol[block_idx];

    // Some OpenCL implementations require LOCAL_MEM to be defined at the
    // outermost scope of a function. Otherwise this would be defined next to
    // the single-threaded section that uses it.
    LOCAL_MEM bool done[1];
    done[0] = true;

    Real frob_est = 0;
    int k = 0;
    for (; k < max_iter; k++) {
        


        struct MatrixIndex Istar_entry = argmax_abs_not_in_list_rows(RJref, n_rows, 3, prevIstar, k);
        struct MatrixIndex Jstar_entry = argmax_abs_not_in_list_cols(RIref, 3, n_cols, prevJstar, k);
        int Istar = Istar_entry.row;
        int Jstar = Jstar_entry.col;

        Real Istar_val = fabs(RJref[Istar_entry.row * 3 + Istar_entry.col]);
        Real Jstar_val = fabs(RIref[Jstar_entry.row * n_cols + Jstar_entry.col]);


        if (Istar_val > Jstar_val) {
            calc_rows_disp_hs(
                RIstar, Istar, Istar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Istar + 1) - (Istar);

            for (int i = 0; i < n_rowcol; i++) {
                Real uv = U_term[i + Istar];
                for (int j = team_idx; j < n_cols; j += team_size) {
                    Real vv = V_term[j];
                    RIstar[i * n_cols + j] -= uv * vv;
                }
            }
    }
}

            


            Jstar_entry = argmax_abs_not_in_list_cols(RIstar, 1, n_cols, prevJstar, k);
            Jstar = Jstar_entry.col;

            calc_cols_disp_hs(
                RJstar, Jstar, Jstar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Jstar + 1) - (Jstar);

            for (int i = team_idx; i < n_rows; i += team_size) {
                Real uv = U_term[i];
                for (int j = 0; j < n_rowcol; j++) {
                    Real vv = V_term[j + Jstar];
                    RJstar[i * n_rowcol + j] -= uv * vv;
                }
            }
    }
}

        } else {
            calc_cols_disp_hs(
                RJstar, Jstar, Jstar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Jstar + 1) - (Jstar);

            for (int i = team_idx; i < n_rows; i += team_size) {
                Real uv = U_term[i];
                for (int j = 0; j < n_rowcol; j++) {
                    Real vv = V_term[j + Jstar];
                    RJstar[i * n_rowcol + j] -= uv * vv;
                }
            }
    }
}

            



            Istar_entry = argmax_abs_not_in_list_rows(RJstar, n_rows, 1, prevIstar, k);
            Istar = Istar_entry.row;

            calc_rows_disp_hs(
                RIstar, Istar, Istar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Istar + 1) - (Istar);

            for (int i = 0; i < n_rowcol; i++) {
                Real uv = U_term[i + Istar];
                for (int j = team_idx; j < n_cols; j += team_size) {
                    Real vv = V_term[j];
                    RIstar[i * n_cols + j] -= uv * vv;
                }
            }
    }
}

        }
        


        // claim a block of space for the first U and first V vectors and collect
        // the corresponding Real* pointers
        if (team_idx == 0) {
            done[0] = false;

            prevIstar[k] = Istar;
            prevJstar[k] = Jstar;

            int next_buffer_u_ptr = buffer_alloc(next_buffer_ptr, n_rows + n_cols);
            int next_buffer_v_ptr = next_buffer_u_ptr + n_rows;
            GLOBAL_MEM Real* next_buffer_u = &buffer[next_buffer_u_ptr];
            GLOBAL_MEM Real* next_buffer_v = &buffer[next_buffer_v_ptr];

            // Assign our uv_ptr to point to the u,v buffer location.
            uv_ptrs[uv_ptr0 + k] = next_buffer_u_ptr;

            Real v2 = 0;
            // TODO: team_idx!!!
            for (int i = 0; i < n_cols; i++) {
                next_buffer_v[i] = RIstar[i] / RIstar[Jstar];
                v2 += next_buffer_v[i] * next_buffer_v[i];
            }

            Real u2 = 0;
            for (int j = 0; j < n_rows; j++) {
                next_buffer_u[j] = RJstar[j];
                u2 += next_buffer_u[j] * next_buffer_u[j];
            }


            Real step_size = sqrt(u2 * v2);

            frob_est += step_size;

            if (step_size < tol) {
                done[0] = true;
            }

            if (k == max_iter - 1) {
                done[0] = true;
            }
        }
        


        if (done[0]) {
            break;
        }

        if (Iref <= Istar && Istar < Iref + 3) {
            while (true) {
                Iref = (Iref + 3) % n_rows;
                Iref -= Iref % 3;
                if (!in(Iref, prevIstar, k + 1)) {
                    break; 
                }
            }
            calc_rows_disp_hs(
                RIref, Iref, Iref + 3,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k + 1; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Iref + 3) - (Iref);

            for (int i = 0; i < n_rowcol; i++) {
                Real uv = U_term[i + Iref];
                for (int j = team_idx; j < n_cols; j += team_size) {
                    Real vv = V_term[j];
                    RIref[i * n_cols + j] -= uv * vv;
                }
            }
    }
}

        } else {
            GLOBAL_MEM Real* next_buffer_u = &buffer[uv_ptrs[uv_ptr0 + k]];
            GLOBAL_MEM Real* next_buffer_v = &buffer[uv_ptrs[uv_ptr0 + k] + n_rows];
            for (int i = 0; i < 3; i++) {
                for (int j = team_idx; j < n_cols; j += team_size) {
                    RIref[i * n_cols + j] -= next_buffer_u[i + Iref] * next_buffer_v[j];
                }
            }
        }

        if (Jref <= Jstar && Jstar < Jref + 3) {
            while (true) {
                Jref = (Jref + 3) % n_cols;
                Jref -= Jref % 3;
                if (!in(Jref, prevJstar, k + 1)) {
                    break; 
                }
            }
            calc_cols_disp_hs(
                RJref, Jref, Jref + 3, 
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k + 1; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Jref + 3) - (Jref);

            for (int i = team_idx; i < n_rows; i += team_size) {
                Real uv = U_term[i];
                for (int j = 0; j < n_rowcol; j++) {
                    Real vv = V_term[j + Jref];
                    RJref[i * n_rowcol + j] -= uv * vv;
                }
            }
    }
}

        } else {
            GLOBAL_MEM Real* next_buffer_u = &buffer[uv_ptrs[uv_ptr0 + k]];
            GLOBAL_MEM Real* next_buffer_v = &buffer[uv_ptrs[uv_ptr0 + k] + n_rows];
            for (int i = team_idx; i < n_rows; i += team_size) {
                for (int j = 0; j < 3; j++) {
                    RJref[i * 3 + j] -= next_buffer_u[i] * next_buffer_v[j + Jref];
                }
            }
        }
    }

    if (team_idx == 0) {
        n_terms[block_idx] = k + 1;
    }
}



WITHIN_KERNEL 
void calc_rows_strain_fs(
    GLOBAL_MEM Real* output, 
    int rowcol_start, int rowcol_end,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    int os, int oe, int ss, int se,
    Real nu, int team_idx, int team_size)
{
    /*
    * In the calc_rows/cols function, we will calculate a batch of rows or
    * columns corresponding to a particular triangular dislocation element. 
    * In most cases, this will be three rows/cols corresponding to the x/y/z
    * components of displacement or slip. But in the case of calculating rows
    * for a strain matrix, we will be calculating six components. See the use
    * of "vec_dim" below to specify the number of rows. 
    * 
    * But, we specify the element in terms of the rowcol_start and rowcol_end.
    * This allows grabbing just a subset of the rows when that is desirable.
    */

    int block_i = floor(((float)rowcol_start) / 6);
    int obs_dim_start = rowcol_start - block_i * 6;
    int obs_dim_end = rowcol_end - block_i * 6;
    int i = os + block_i;
    int obs_idx = 0;

    Real3 obs;
        obs.x = obs_pts[i * 3 + 0];
        obs.y = obs_pts[i * 3 + 1];
        obs.z = obs_pts[i * 3 + 2];

    int src_dim_start = 0;
    int src_dim_end = 3;
    int n_output_src = se - ss;
    for (int j = ss + team_idx; j < se; j += team_size) {
        int src_idx = j - ss;

            Real3 tri0;
                tri0.x = tris[j * 9 + 0 * 3 + 0];
                tri0.y = tris[j * 9 + 0 * 3 + 1];
                tri0.z = tris[j * 9 + 0 * 3 + 2];
            Real3 tri1;
                tri1.x = tris[j * 9 + 1 * 3 + 0];
                tri1.y = tris[j * 9 + 1 * 3 + 1];
                tri1.z = tris[j * 9 + 1 * 3 + 2];
            Real3 tri2;
                tri2.x = tris[j * 9 + 2 * 3 + 0];
                tri2.y = tris[j * 9 + 2 * 3 + 1];
                tri2.z = tris[j * 9 + 2 * 3 + 2];

        for (int d_src = src_dim_start; d_src < src_dim_end; d_src++) {
            Real3 slip = make3(0.0, 0.0, 0.0);
            if (d_src == 0) {
                slip.y = 1.0;
            } else if (d_src == 1) {
                slip.z = 1.0;
            } else {
                slip.x = 1.0;
            }

            
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


            {
                if (0 >= obs_dim_start && 0 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (0 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.x;
                }
            }
            {
                if (1 >= obs_dim_start && 1 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (1 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.y;
                }
            }
            {
                if (2 >= obs_dim_start && 2 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (2 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.z;
                }
            }
            {
                if (3 >= obs_dim_start && 3 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (3 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.a;
                }
            }
            {
                if (4 >= obs_dim_start && 4 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (4 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.b;
                }
            }
            {
                if (5 >= obs_dim_start && 5 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (5 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.c;
                }
            }
        }
    }
}
WITHIN_KERNEL 
void calc_cols_strain_fs(
    GLOBAL_MEM Real* output, 
    int rowcol_start, int rowcol_end,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    int os, int oe, int ss, int se,
    Real nu, int team_idx, int team_size)
{
    /*
    * In the calc_rows/cols function, we will calculate a batch of rows or
    * columns corresponding to a particular triangular dislocation element. 
    * In most cases, this will be three rows/cols corresponding to the x/y/z
    * components of displacement or slip. But in the case of calculating rows
    * for a strain matrix, we will be calculating six components. See the use
    * of "vec_dim" below to specify the number of rows. 
    * 
    * But, we specify the element in terms of the rowcol_start and rowcol_end.
    * This allows grabbing just a subset of the rows when that is desirable.
    */

    int block_j = floor(((float)rowcol_start) / 3);
    int src_dim_start = rowcol_start - block_j * 3;
    int src_dim_end = rowcol_end - block_j * 3;
    int j = ss + block_j;
    int src_idx = 0;
    int n_output_src = 1;

        Real3 tri0;
            tri0.x = tris[j * 9 + 0 * 3 + 0];
            tri0.y = tris[j * 9 + 0 * 3 + 1];
            tri0.z = tris[j * 9 + 0 * 3 + 2];
        Real3 tri1;
            tri1.x = tris[j * 9 + 1 * 3 + 0];
            tri1.y = tris[j * 9 + 1 * 3 + 1];
            tri1.z = tris[j * 9 + 1 * 3 + 2];
        Real3 tri2;
            tri2.x = tris[j * 9 + 2 * 3 + 0];
            tri2.y = tris[j * 9 + 2 * 3 + 1];
            tri2.z = tris[j * 9 + 2 * 3 + 2];

    int obs_dim_start = 0;
    int obs_dim_end = 6;
    for (int i = os + team_idx; i < oe; i += team_size) {
        int obs_idx = i - os;

        Real3 obs;
            obs.x = obs_pts[i * 3 + 0];
            obs.y = obs_pts[i * 3 + 1];
            obs.z = obs_pts[i * 3 + 2];

        for (int d_src = src_dim_start; d_src < src_dim_end; d_src++) {
            Real3 slip = make3(0.0, 0.0, 0.0);
            if (d_src == 0) {
                slip.y = 1.0;
            } else if (d_src == 1) {
                slip.z = 1.0;
            } else {
                slip.x = 1.0;
            }

            
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


            {
                if (0 >= obs_dim_start && 0 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (0 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.x;
                }
            }
            {
                if (1 >= obs_dim_start && 1 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (1 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.y;
                }
            }
            {
                if (2 >= obs_dim_start && 2 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (2 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.z;
                }
            }
            {
                if (3 >= obs_dim_start && 3 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (3 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.a;
                }
            }
            {
                if (4 >= obs_dim_start && 4 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (4 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.b;
                }
            }
            {
                if (5 >= obs_dim_start && 5 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (5 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.c;
                }
            }
        }
    }
}

KERNEL
void aca_strain_fs(
    // out parameters here
    GLOBAL_MEM Real* buffer,
    GLOBAL_MEM int* uv_ptrs, 
    GLOBAL_MEM int* n_terms,
    // mutable workspace parameters
    GLOBAL_MEM int* next_buffer_ptr,
    GLOBAL_MEM Real* fworkspace,
    GLOBAL_MEM int* iworkspace,
    // immutable parameters below here
    GLOBAL_MEM int* uv_ptrs_starts,
    GLOBAL_MEM int* fworkspace_starts,
    GLOBAL_MEM int* Iref0, GLOBAL_MEM int* Jref0,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM int* obs_start, GLOBAL_MEM int* obs_end,
    GLOBAL_MEM int* src_start, GLOBAL_MEM int* src_end,
    GLOBAL_MEM Real* p_tol,
    GLOBAL_MEM int* p_max_iter,
    Real nu)
{
    int block_idx = get_group_id(0);
    int team_idx = get_local_id(0);
    int team_size = get_local_size(0);
    int os = obs_start[block_idx];
    int oe = obs_end[block_idx];
    int n_obs = oe - os;
    int n_rows = n_obs * 6;

    int ss = src_start[block_idx];
    int se = src_end[block_idx];
    int n_src = se - ss;
    int n_cols = n_src * 3;

    int uv_ptr0 = uv_ptrs_starts[block_idx];

    GLOBAL_MEM int* block_iworkspace = &iworkspace[uv_ptr0];
    GLOBAL_MEM int* prevIstar = block_iworkspace;
    GLOBAL_MEM int* prevJstar = &block_iworkspace[min(n_cols, n_rows) / 2];

    GLOBAL_MEM Real* block_fworkspace = &fworkspace[fworkspace_starts[block_idx]];
    GLOBAL_MEM Real* RIstar = block_fworkspace;
    GLOBAL_MEM Real* RJstar = &block_fworkspace[n_cols];

    GLOBAL_MEM Real* RIref = &block_fworkspace[n_cols + n_rows];
    GLOBAL_MEM Real* RJref = &block_fworkspace[n_cols + n_rows + 6 * n_cols];

    int Iref = Iref0[block_idx];
    Iref -= Iref % 6;
    int Jref = Jref0[block_idx];
    Jref -= Jref % 3;

    calc_rows_strain_fs(
        RIref, Iref, Iref + 6,
        obs_pts, tris, os, oe, ss, se, nu,
        team_idx, team_size
    );

    calc_cols_strain_fs(
        RJref, Jref, Jref + 3, 
        obs_pts, tris, os, oe, ss, se, nu,
        team_idx, team_size
    );

    int max_iter = min(p_max_iter[block_idx], min(n_rows / 2, n_cols / 2));
    Real tol = p_tol[block_idx];

    // Some OpenCL implementations require LOCAL_MEM to be defined at the
    // outermost scope of a function. Otherwise this would be defined next to
    // the single-threaded section that uses it.
    LOCAL_MEM bool done[1];
    done[0] = true;

    Real frob_est = 0;
    int k = 0;
    for (; k < max_iter; k++) {
        


        struct MatrixIndex Istar_entry = argmax_abs_not_in_list_rows(RJref, n_rows, 3, prevIstar, k);
        struct MatrixIndex Jstar_entry = argmax_abs_not_in_list_cols(RIref, 6, n_cols, prevJstar, k);
        int Istar = Istar_entry.row;
        int Jstar = Jstar_entry.col;

        Real Istar_val = fabs(RJref[Istar_entry.row * 3 + Istar_entry.col]);
        Real Jstar_val = fabs(RIref[Jstar_entry.row * n_cols + Jstar_entry.col]);


        if (Istar_val > Jstar_val) {
            calc_rows_strain_fs(
                RIstar, Istar, Istar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Istar + 1) - (Istar);

            for (int i = 0; i < n_rowcol; i++) {
                Real uv = U_term[i + Istar];
                for (int j = team_idx; j < n_cols; j += team_size) {
                    Real vv = V_term[j];
                    RIstar[i * n_cols + j] -= uv * vv;
                }
            }
    }
}

            


            Jstar_entry = argmax_abs_not_in_list_cols(RIstar, 1, n_cols, prevJstar, k);
            Jstar = Jstar_entry.col;

            calc_cols_strain_fs(
                RJstar, Jstar, Jstar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Jstar + 1) - (Jstar);

            for (int i = team_idx; i < n_rows; i += team_size) {
                Real uv = U_term[i];
                for (int j = 0; j < n_rowcol; j++) {
                    Real vv = V_term[j + Jstar];
                    RJstar[i * n_rowcol + j] -= uv * vv;
                }
            }
    }
}

        } else {
            calc_cols_strain_fs(
                RJstar, Jstar, Jstar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Jstar + 1) - (Jstar);

            for (int i = team_idx; i < n_rows; i += team_size) {
                Real uv = U_term[i];
                for (int j = 0; j < n_rowcol; j++) {
                    Real vv = V_term[j + Jstar];
                    RJstar[i * n_rowcol + j] -= uv * vv;
                }
            }
    }
}

            



            Istar_entry = argmax_abs_not_in_list_rows(RJstar, n_rows, 1, prevIstar, k);
            Istar = Istar_entry.row;

            calc_rows_strain_fs(
                RIstar, Istar, Istar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Istar + 1) - (Istar);

            for (int i = 0; i < n_rowcol; i++) {
                Real uv = U_term[i + Istar];
                for (int j = team_idx; j < n_cols; j += team_size) {
                    Real vv = V_term[j];
                    RIstar[i * n_cols + j] -= uv * vv;
                }
            }
    }
}

        }
        


        // claim a block of space for the first U and first V vectors and collect
        // the corresponding Real* pointers
        if (team_idx == 0) {
            done[0] = false;

            prevIstar[k] = Istar;
            prevJstar[k] = Jstar;

            int next_buffer_u_ptr = buffer_alloc(next_buffer_ptr, n_rows + n_cols);
            int next_buffer_v_ptr = next_buffer_u_ptr + n_rows;
            GLOBAL_MEM Real* next_buffer_u = &buffer[next_buffer_u_ptr];
            GLOBAL_MEM Real* next_buffer_v = &buffer[next_buffer_v_ptr];

            // Assign our uv_ptr to point to the u,v buffer location.
            uv_ptrs[uv_ptr0 + k] = next_buffer_u_ptr;

            Real v2 = 0;
            // TODO: team_idx!!!
            for (int i = 0; i < n_cols; i++) {
                next_buffer_v[i] = RIstar[i] / RIstar[Jstar];
                v2 += next_buffer_v[i] * next_buffer_v[i];
            }

            Real u2 = 0;
            for (int j = 0; j < n_rows; j++) {
                next_buffer_u[j] = RJstar[j];
                u2 += next_buffer_u[j] * next_buffer_u[j];
            }


            Real step_size = sqrt(u2 * v2);

            frob_est += step_size;

            if (step_size < tol) {
                done[0] = true;
            }

            if (k == max_iter - 1) {
                done[0] = true;
            }
        }
        


        if (done[0]) {
            break;
        }

        if (Iref <= Istar && Istar < Iref + 6) {
            while (true) {
                Iref = (Iref + 6) % n_rows;
                Iref -= Iref % 6;
                if (!in(Iref, prevIstar, k + 1)) {
                    break; 
                }
            }
            calc_rows_strain_fs(
                RIref, Iref, Iref + 6,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k + 1; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Iref + 6) - (Iref);

            for (int i = 0; i < n_rowcol; i++) {
                Real uv = U_term[i + Iref];
                for (int j = team_idx; j < n_cols; j += team_size) {
                    Real vv = V_term[j];
                    RIref[i * n_cols + j] -= uv * vv;
                }
            }
    }
}

        } else {
            GLOBAL_MEM Real* next_buffer_u = &buffer[uv_ptrs[uv_ptr0 + k]];
            GLOBAL_MEM Real* next_buffer_v = &buffer[uv_ptrs[uv_ptr0 + k] + n_rows];
            for (int i = 0; i < 6; i++) {
                for (int j = team_idx; j < n_cols; j += team_size) {
                    RIref[i * n_cols + j] -= next_buffer_u[i + Iref] * next_buffer_v[j];
                }
            }
        }

        if (Jref <= Jstar && Jstar < Jref + 3) {
            while (true) {
                Jref = (Jref + 3) % n_cols;
                Jref -= Jref % 3;
                if (!in(Jref, prevJstar, k + 1)) {
                    break; 
                }
            }
            calc_cols_strain_fs(
                RJref, Jref, Jref + 3, 
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k + 1; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Jref + 3) - (Jref);

            for (int i = team_idx; i < n_rows; i += team_size) {
                Real uv = U_term[i];
                for (int j = 0; j < n_rowcol; j++) {
                    Real vv = V_term[j + Jref];
                    RJref[i * n_rowcol + j] -= uv * vv;
                }
            }
    }
}

        } else {
            GLOBAL_MEM Real* next_buffer_u = &buffer[uv_ptrs[uv_ptr0 + k]];
            GLOBAL_MEM Real* next_buffer_v = &buffer[uv_ptrs[uv_ptr0 + k] + n_rows];
            for (int i = team_idx; i < n_rows; i += team_size) {
                for (int j = 0; j < 3; j++) {
                    RJref[i * 3 + j] -= next_buffer_u[i] * next_buffer_v[j + Jref];
                }
            }
        }
    }

    if (team_idx == 0) {
        n_terms[block_idx] = k + 1;
    }
}



WITHIN_KERNEL 
void calc_rows_strain_hs(
    GLOBAL_MEM Real* output, 
    int rowcol_start, int rowcol_end,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    int os, int oe, int ss, int se,
    Real nu, int team_idx, int team_size)
{
    /*
    * In the calc_rows/cols function, we will calculate a batch of rows or
    * columns corresponding to a particular triangular dislocation element. 
    * In most cases, this will be three rows/cols corresponding to the x/y/z
    * components of displacement or slip. But in the case of calculating rows
    * for a strain matrix, we will be calculating six components. See the use
    * of "vec_dim" below to specify the number of rows. 
    * 
    * But, we specify the element in terms of the rowcol_start and rowcol_end.
    * This allows grabbing just a subset of the rows when that is desirable.
    */

    int block_i = floor(((float)rowcol_start) / 6);
    int obs_dim_start = rowcol_start - block_i * 6;
    int obs_dim_end = rowcol_end - block_i * 6;
    int i = os + block_i;
    int obs_idx = 0;

    Real3 obs;
        obs.x = obs_pts[i * 3 + 0];
        obs.y = obs_pts[i * 3 + 1];
        obs.z = obs_pts[i * 3 + 2];

    int src_dim_start = 0;
    int src_dim_end = 3;
    int n_output_src = se - ss;
    for (int j = ss + team_idx; j < se; j += team_size) {
        int src_idx = j - ss;

            Real3 tri0;
                tri0.x = tris[j * 9 + 0 * 3 + 0];
                tri0.y = tris[j * 9 + 0 * 3 + 1];
                tri0.z = tris[j * 9 + 0 * 3 + 2];
            Real3 tri1;
                tri1.x = tris[j * 9 + 1 * 3 + 0];
                tri1.y = tris[j * 9 + 1 * 3 + 1];
                tri1.z = tris[j * 9 + 1 * 3 + 2];
            Real3 tri2;
                tri2.x = tris[j * 9 + 2 * 3 + 0];
                tri2.y = tris[j * 9 + 2 * 3 + 1];
                tri2.z = tris[j * 9 + 2 * 3 + 2];

        for (int d_src = src_dim_start; d_src < src_dim_end; d_src++) {
            Real3 slip = make3(0.0, 0.0, 0.0);
            if (d_src == 0) {
                slip.y = 1.0;
            } else if (d_src == 1) {
                slip.z = 1.0;
            } else {
                slip.x = 1.0;
            }

            
    Real6 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real6 uvw0 = AngSetupFSC_S(obs, efcs_slip, tri0, tri1, nu);
        Real6 uvw1 = AngSetupFSC_S(obs, efcs_slip, tri1, tri2, nu);
        Real6 uvw2 = AngSetupFSC_S(obs, efcs_slip, tri2, tri0, nu);
        Real6 fsc_term = add6(add6(uvw0, uvw1), uvw2);

        summed_terms = add6(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add6(summed_terms, full_out);
    }
    Real6 full_out = summed_terms;


            {
                if (0 >= obs_dim_start && 0 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (0 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.x;
                }
            }
            {
                if (1 >= obs_dim_start && 1 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (1 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.y;
                }
            }
            {
                if (2 >= obs_dim_start && 2 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (2 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.z;
                }
            }
            {
                if (3 >= obs_dim_start && 3 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (3 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.a;
                }
            }
            {
                if (4 >= obs_dim_start && 4 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (4 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.b;
                }
            }
            {
                if (5 >= obs_dim_start && 5 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (5 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.c;
                }
            }
        }
    }
}
WITHIN_KERNEL 
void calc_cols_strain_hs(
    GLOBAL_MEM Real* output, 
    int rowcol_start, int rowcol_end,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    int os, int oe, int ss, int se,
    Real nu, int team_idx, int team_size)
{
    /*
    * In the calc_rows/cols function, we will calculate a batch of rows or
    * columns corresponding to a particular triangular dislocation element. 
    * In most cases, this will be three rows/cols corresponding to the x/y/z
    * components of displacement or slip. But in the case of calculating rows
    * for a strain matrix, we will be calculating six components. See the use
    * of "vec_dim" below to specify the number of rows. 
    * 
    * But, we specify the element in terms of the rowcol_start and rowcol_end.
    * This allows grabbing just a subset of the rows when that is desirable.
    */

    int block_j = floor(((float)rowcol_start) / 3);
    int src_dim_start = rowcol_start - block_j * 3;
    int src_dim_end = rowcol_end - block_j * 3;
    int j = ss + block_j;
    int src_idx = 0;
    int n_output_src = 1;

        Real3 tri0;
            tri0.x = tris[j * 9 + 0 * 3 + 0];
            tri0.y = tris[j * 9 + 0 * 3 + 1];
            tri0.z = tris[j * 9 + 0 * 3 + 2];
        Real3 tri1;
            tri1.x = tris[j * 9 + 1 * 3 + 0];
            tri1.y = tris[j * 9 + 1 * 3 + 1];
            tri1.z = tris[j * 9 + 1 * 3 + 2];
        Real3 tri2;
            tri2.x = tris[j * 9 + 2 * 3 + 0];
            tri2.y = tris[j * 9 + 2 * 3 + 1];
            tri2.z = tris[j * 9 + 2 * 3 + 2];

    int obs_dim_start = 0;
    int obs_dim_end = 6;
    for (int i = os + team_idx; i < oe; i += team_size) {
        int obs_idx = i - os;

        Real3 obs;
            obs.x = obs_pts[i * 3 + 0];
            obs.y = obs_pts[i * 3 + 1];
            obs.z = obs_pts[i * 3 + 2];

        for (int d_src = src_dim_start; d_src < src_dim_end; d_src++) {
            Real3 slip = make3(0.0, 0.0, 0.0);
            if (d_src == 0) {
                slip.y = 1.0;
            } else if (d_src == 1) {
                slip.z = 1.0;
            } else {
                slip.x = 1.0;
            }

            
    Real6 summed_terms;
    {
        // Main dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(tri1, tri0),
        sub3(tri2, tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri0, tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(tri2, tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = full_out;

        // Harmonic free surface correction
        Real3 efcs_slip = inv_transform3(Vnorm, Vstrike, Vdip, slip);
        Real6 uvw0 = AngSetupFSC_S(obs, efcs_slip, tri0, tri1, nu);
        Real6 uvw1 = AngSetupFSC_S(obs, efcs_slip, tri1, tri2, nu);
        Real6 uvw2 = AngSetupFSC_S(obs, efcs_slip, tri2, tri0, nu);
        Real6 fsc_term = add6(add6(uvw0, uvw1), uvw2);

        summed_terms = add6(fsc_term, summed_terms);
    }
    {
        Real3 image_tri0 = tri0;
        Real3 image_tri1 = tri1;
        Real3 image_tri2 = tri2;

        image_tri0.z *= -1;
        image_tri1.z *= -1;
        image_tri2.z *= -1;

        // Image dislocation
        
    
    Real3 Vnorm = normalize3(cross3(
        sub3(image_tri1, image_tri0),
        sub3(image_tri2, image_tri0)
    ));
    Real3 eY = make3(0.0f, 1.0f, 0.0f);
    Real3 eZ = make3(0.0f,0.0f,1.0f);
    Real3 Vstrike = cross3(eZ, Vnorm);
    if (length3(Vstrike) == 0) {
        Vstrike = mul_scalar3(eY, Vnorm.z);
            // For horizontal elements in case of half-space calculation!!!
            // Correct the strike vector of image dislocation only
            if (image_tri0.z > 0) {
                Vstrike = negate3(Vstrike);
            }
    }
    Vstrike = normalize3(Vstrike); 
    Real3 Vdip = cross3(Vnorm, Vstrike);

    Real3 transformed_obs = transform3(
        Vnorm, Vstrike, Vdip, sub3(obs, image_tri1)
    );
    Real3 transformed_tri0 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri0, image_tri1)
    );
    Real3 transformed_tri1 = make3(0.0f, 0.0f, 0.0f);
    Real3 transformed_tri2 = transform3(
        Vnorm, Vstrike, Vdip, sub3(image_tri2, image_tri1)
    );

    Real3 e12 = normalize3(sub3(transformed_tri1, transformed_tri0));
    Real3 e13 = normalize3(sub3(transformed_tri2, transformed_tri0));
    Real3 e23 = normalize3(sub3(transformed_tri2, transformed_tri1));

    Real A = acos(dot3(e12, e13));
    Real B = acos(dot3(negate3(e12), e23));
    Real C = acos(dot3(e23, e13));

    int mode = trimodefinder(
        transformed_obs,
        transformed_tri0, transformed_tri1, transformed_tri2
    );


    Real6 out;
    if (mode == 1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0, negate3(e13));
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1, e12);
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2, e23);
        out = add6(add6(comp1, comp2), comp3);
    } else if (mode == -1) {
        // Calculate first angular dislocation contribution
        Real6 comp1 = TDSetupS(transformed_obs,A,slip,nu,transformed_tri0,e13);
        // Calculate second angular dislocation contribution
        Real6 comp2 = TDSetupS(transformed_obs,B,slip,nu,transformed_tri1,negate3(e12));
        // Calculate third angular dislocation contribution
        Real6 comp3 = TDSetupS(transformed_obs,C,slip,nu,transformed_tri2,negate3(e23));
        out = add6(add6(comp1, comp2), comp3);
    } else {
        out = make6(NAN, NAN, NAN, NAN, NAN, NAN);
    }


    Real6 full_out = tensor_transform3(Vnorm, Vstrike, Vdip, out);


        summed_terms = add6(summed_terms, full_out);
    }
    Real6 full_out = summed_terms;


            {
                if (0 >= obs_dim_start && 0 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (0 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.x;
                }
            }
            {
                if (1 >= obs_dim_start && 1 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (1 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.y;
                }
            }
            {
                if (2 >= obs_dim_start && 2 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (2 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.z;
                }
            }
            {
                if (3 >= obs_dim_start && 3 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (3 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.a;
                }
            }
            {
                if (4 >= obs_dim_start && 4 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (4 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.b;
                }
            }
            {
                if (5 >= obs_dim_start && 5 < obs_dim_end) {
                    int idx = (
                        (obs_idx * 6 + (5 - obs_dim_start)) * n_output_src + src_idx
                    ) * (src_dim_end - src_dim_start) + (d_src - src_dim_start);
                    output[idx] = full_out.c;
                }
            }
        }
    }
}

KERNEL
void aca_strain_hs(
    // out parameters here
    GLOBAL_MEM Real* buffer,
    GLOBAL_MEM int* uv_ptrs, 
    GLOBAL_MEM int* n_terms,
    // mutable workspace parameters
    GLOBAL_MEM int* next_buffer_ptr,
    GLOBAL_MEM Real* fworkspace,
    GLOBAL_MEM int* iworkspace,
    // immutable parameters below here
    GLOBAL_MEM int* uv_ptrs_starts,
    GLOBAL_MEM int* fworkspace_starts,
    GLOBAL_MEM int* Iref0, GLOBAL_MEM int* Jref0,
    GLOBAL_MEM Real* obs_pts, GLOBAL_MEM Real* tris,
    GLOBAL_MEM int* obs_start, GLOBAL_MEM int* obs_end,
    GLOBAL_MEM int* src_start, GLOBAL_MEM int* src_end,
    GLOBAL_MEM Real* p_tol,
    GLOBAL_MEM int* p_max_iter,
    Real nu)
{
    int block_idx = get_group_id(0);
    int team_idx = get_local_id(0);
    int team_size = get_local_size(0);
    int os = obs_start[block_idx];
    int oe = obs_end[block_idx];
    int n_obs = oe - os;
    int n_rows = n_obs * 6;

    int ss = src_start[block_idx];
    int se = src_end[block_idx];
    int n_src = se - ss;
    int n_cols = n_src * 3;

    int uv_ptr0 = uv_ptrs_starts[block_idx];

    GLOBAL_MEM int* block_iworkspace = &iworkspace[uv_ptr0];
    GLOBAL_MEM int* prevIstar = block_iworkspace;
    GLOBAL_MEM int* prevJstar = &block_iworkspace[min(n_cols, n_rows) / 2];

    GLOBAL_MEM Real* block_fworkspace = &fworkspace[fworkspace_starts[block_idx]];
    GLOBAL_MEM Real* RIstar = block_fworkspace;
    GLOBAL_MEM Real* RJstar = &block_fworkspace[n_cols];

    GLOBAL_MEM Real* RIref = &block_fworkspace[n_cols + n_rows];
    GLOBAL_MEM Real* RJref = &block_fworkspace[n_cols + n_rows + 6 * n_cols];

    int Iref = Iref0[block_idx];
    Iref -= Iref % 6;
    int Jref = Jref0[block_idx];
    Jref -= Jref % 3;

    calc_rows_strain_hs(
        RIref, Iref, Iref + 6,
        obs_pts, tris, os, oe, ss, se, nu,
        team_idx, team_size
    );

    calc_cols_strain_hs(
        RJref, Jref, Jref + 3, 
        obs_pts, tris, os, oe, ss, se, nu,
        team_idx, team_size
    );

    int max_iter = min(p_max_iter[block_idx], min(n_rows / 2, n_cols / 2));
    Real tol = p_tol[block_idx];

    // Some OpenCL implementations require LOCAL_MEM to be defined at the
    // outermost scope of a function. Otherwise this would be defined next to
    // the single-threaded section that uses it.
    LOCAL_MEM bool done[1];
    done[0] = true;

    Real frob_est = 0;
    int k = 0;
    for (; k < max_iter; k++) {
        


        struct MatrixIndex Istar_entry = argmax_abs_not_in_list_rows(RJref, n_rows, 3, prevIstar, k);
        struct MatrixIndex Jstar_entry = argmax_abs_not_in_list_cols(RIref, 6, n_cols, prevJstar, k);
        int Istar = Istar_entry.row;
        int Jstar = Jstar_entry.col;

        Real Istar_val = fabs(RJref[Istar_entry.row * 3 + Istar_entry.col]);
        Real Jstar_val = fabs(RIref[Jstar_entry.row * n_cols + Jstar_entry.col]);


        if (Istar_val > Jstar_val) {
            calc_rows_strain_hs(
                RIstar, Istar, Istar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Istar + 1) - (Istar);

            for (int i = 0; i < n_rowcol; i++) {
                Real uv = U_term[i + Istar];
                for (int j = team_idx; j < n_cols; j += team_size) {
                    Real vv = V_term[j];
                    RIstar[i * n_cols + j] -= uv * vv;
                }
            }
    }
}

            


            Jstar_entry = argmax_abs_not_in_list_cols(RIstar, 1, n_cols, prevJstar, k);
            Jstar = Jstar_entry.col;

            calc_cols_strain_hs(
                RJstar, Jstar, Jstar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Jstar + 1) - (Jstar);

            for (int i = team_idx; i < n_rows; i += team_size) {
                Real uv = U_term[i];
                for (int j = 0; j < n_rowcol; j++) {
                    Real vv = V_term[j + Jstar];
                    RJstar[i * n_rowcol + j] -= uv * vv;
                }
            }
    }
}

        } else {
            calc_cols_strain_hs(
                RJstar, Jstar, Jstar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Jstar + 1) - (Jstar);

            for (int i = team_idx; i < n_rows; i += team_size) {
                Real uv = U_term[i];
                for (int j = 0; j < n_rowcol; j++) {
                    Real vv = V_term[j + Jstar];
                    RJstar[i * n_rowcol + j] -= uv * vv;
                }
            }
    }
}

            



            Istar_entry = argmax_abs_not_in_list_rows(RJstar, n_rows, 1, prevIstar, k);
            Istar = Istar_entry.row;

            calc_rows_strain_hs(
                RIstar, Istar, Istar + 1,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Istar + 1) - (Istar);

            for (int i = 0; i < n_rowcol; i++) {
                Real uv = U_term[i + Istar];
                for (int j = team_idx; j < n_cols; j += team_size) {
                    Real vv = V_term[j];
                    RIstar[i * n_cols + j] -= uv * vv;
                }
            }
    }
}

        }
        


        // claim a block of space for the first U and first V vectors and collect
        // the corresponding Real* pointers
        if (team_idx == 0) {
            done[0] = false;

            prevIstar[k] = Istar;
            prevJstar[k] = Jstar;

            int next_buffer_u_ptr = buffer_alloc(next_buffer_ptr, n_rows + n_cols);
            int next_buffer_v_ptr = next_buffer_u_ptr + n_rows;
            GLOBAL_MEM Real* next_buffer_u = &buffer[next_buffer_u_ptr];
            GLOBAL_MEM Real* next_buffer_v = &buffer[next_buffer_v_ptr];

            // Assign our uv_ptr to point to the u,v buffer location.
            uv_ptrs[uv_ptr0 + k] = next_buffer_u_ptr;

            Real v2 = 0;
            // TODO: team_idx!!!
            for (int i = 0; i < n_cols; i++) {
                next_buffer_v[i] = RIstar[i] / RIstar[Jstar];
                v2 += next_buffer_v[i] * next_buffer_v[i];
            }

            Real u2 = 0;
            for (int j = 0; j < n_rows; j++) {
                next_buffer_u[j] = RJstar[j];
                u2 += next_buffer_u[j] * next_buffer_u[j];
            }


            Real step_size = sqrt(u2 * v2);

            frob_est += step_size;

            if (step_size < tol) {
                done[0] = true;
            }

            if (k == max_iter - 1) {
                done[0] = true;
            }
        }
        


        if (done[0]) {
            break;
        }

        if (Iref <= Istar && Istar < Iref + 6) {
            while (true) {
                Iref = (Iref + 6) % n_rows;
                Iref -= Iref % 6;
                if (!in(Iref, prevIstar, k + 1)) {
                    break; 
                }
            }
            calc_rows_strain_hs(
                RIref, Iref, Iref + 6,
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k + 1; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Iref + 6) - (Iref);

            for (int i = 0; i < n_rowcol; i++) {
                Real uv = U_term[i + Iref];
                for (int j = team_idx; j < n_cols; j += team_size) {
                    Real vv = V_term[j];
                    RIref[i * n_cols + j] -= uv * vv;
                }
            }
    }
}

        } else {
            GLOBAL_MEM Real* next_buffer_u = &buffer[uv_ptrs[uv_ptr0 + k]];
            GLOBAL_MEM Real* next_buffer_v = &buffer[uv_ptrs[uv_ptr0 + k] + n_rows];
            for (int i = 0; i < 6; i++) {
                for (int j = team_idx; j < n_cols; j += team_size) {
                    RIref[i * n_cols + j] -= next_buffer_u[i + Iref] * next_buffer_v[j];
                }
            }
        }

        if (Jref <= Jstar && Jstar < Jref + 3) {
            while (true) {
                Jref = (Jref + 3) % n_cols;
                Jref -= Jref % 3;
                if (!in(Jref, prevJstar, k + 1)) {
                    break; 
                }
            }
            calc_cols_strain_hs(
                RJref, Jref, Jref + 3, 
                obs_pts, tris, os, oe, ss, se, nu,
                team_idx, team_size
            );
            

            
{
    for (int sr_idx = 0; sr_idx < k + 1; sr_idx++) {
        int buffer_ptr = uv_ptrs[uv_ptr0 + sr_idx];

        GLOBAL_MEM Real* U_term = &buffer[buffer_ptr];
        GLOBAL_MEM Real* V_term = &buffer[buffer_ptr + n_rows];
        int n_rowcol = (Jref + 3) - (Jref);

            for (int i = team_idx; i < n_rows; i += team_size) {
                Real uv = U_term[i];
                for (int j = 0; j < n_rowcol; j++) {
                    Real vv = V_term[j + Jref];
                    RJref[i * n_rowcol + j] -= uv * vv;
                }
            }
    }
}

        } else {
            GLOBAL_MEM Real* next_buffer_u = &buffer[uv_ptrs[uv_ptr0 + k]];
            GLOBAL_MEM Real* next_buffer_v = &buffer[uv_ptrs[uv_ptr0 + k] + n_rows];
            for (int i = team_idx; i < n_rows; i += team_size) {
                for (int j = 0; j < 3; j++) {
                    RJref[i * 3 + j] -= next_buffer_u[i] * next_buffer_v[j + Jref];
                }
            }
        }
    }

    if (team_idx == 0) {
        n_terms[block_idx] = k + 1;
    }
}



template <typename T>
T conv_arg(T arg) {
    return arg;
}

template <typename T>
T* conv_arg(py::array_t<T> arg) {
    return arg.mutable_data(0);
}

template <typename T>
struct pyarg_from_cpparg {
    using PyArgType = T;
};

template <typename T>
struct pyarg_from_cpparg<T*> {
    using PyArgType = py::array_t<T>;
};

template <typename R, typename ...Args>
decltype(auto) wrapper(R(*fn)(Args...))
{
    return [=](typename pyarg_from_cpparg<Args>::PyArgType... args, 
             std::tuple<int,int,int> grid,
             std::tuple<int,int,int> block) 
    {
        gridDim = {std::get<0>(grid), std::get<1>(grid), std::get<2>(grid)};
        blockIdx = {0,0,0};

        int Ngrid = gridDim.x * gridDim.y * gridDim.z;

        auto ptr_args = std::make_tuple(conv_arg(args)...);

        #pragma omp parallel for
        for (long i = 0; i < Ngrid; i++) {
            long i_r = i;
            blockIdx.z = i_r % gridDim.z;
            i_r /= gridDim.z;
            blockIdx.y = i_r % gridDim.y;
            i_r /= gridDim.y;
            blockIdx.x = i_r % gridDim.x;
            i_r /= gridDim.x;

            std::apply(fn, ptr_args);
        }
    };
}

PYBIND11_MODULE(cpp_backend_double, m) {
                
                m.def("pairs_disp_fs", wrapper(pairs_disp_fs));
                
                m.def("pairs_disp_hs", wrapper(pairs_disp_hs));
                
                m.def("pairs_strain_fs", wrapper(pairs_strain_fs));
                
                m.def("pairs_strain_hs", wrapper(pairs_strain_hs));
                
                m.def("blocks_disp_fs", wrapper(blocks_disp_fs));
                
                m.def("blocks_disp_hs", wrapper(blocks_disp_hs));
                
                m.def("blocks_strain_fs", wrapper(blocks_strain_fs));
                
                m.def("blocks_strain_hs", wrapper(blocks_strain_hs));
                
                m.def("matrix_disp_fs", wrapper(matrix_disp_fs));
                
                m.def("matrix_disp_hs", wrapper(matrix_disp_hs));
                
                m.def("matrix_strain_fs", wrapper(matrix_strain_fs));
                
                m.def("matrix_strain_hs", wrapper(matrix_strain_hs));
                
                m.def("free_disp_fs", wrapper(free_disp_fs));
                
                m.def("free_disp_hs", wrapper(free_disp_hs));
                
                m.def("free_strain_fs", wrapper(free_strain_fs));
                
                m.def("free_strain_hs", wrapper(free_strain_hs));
                
                m.def("aca_disp_fs", wrapper(aca_disp_fs));
                
                m.def("aca_disp_hs", wrapper(aca_disp_hs));
                
                m.def("aca_strain_fs", wrapper(aca_strain_fs));
                
                m.def("aca_strain_hs", wrapper(aca_strain_hs));
}
