; Code section; the symbol `corr` is made global so other modules can link to it.
segment .text
        global  corr

; General-purpose register aliases.
; The code refers to these through size-prefixed names: qX, qY, qN, qI and
; qLeft for the 64-bit registers, dI for the 32-bit view of I.
; NOTE(review): `alias` and `fpalias` are macros defined outside this section;
; their exact expansion is not visible here.
        alias   X, rcx
        alias   Y, rdx
        alias   N, r8
        alias   I, r9
        alias   Left, r10
;
;       rcx:  x array (base address)
;       rdx:  y array (base address)
;       r8:   n, the element count
;       r9:   byte offset added to both array bases
;       r10:  countdown derived from n that controls loop exit

; Floating-point register aliases.  The number is the xmm/ymm register index;
; the code uses an `x` prefix for the 128-bit view (xSum_x) and a `y` prefix
; for the 256-bit view (ySum_x).

; First accumulator set: running sums, 4 doubles per register.
        fpalias Sum_x, 0
        fpalias Sum_y, 1
        fpalias Sum_xx, 2
        fpalias Sum_yy, 3
        fpalias Sum_xy, 4

; Working registers for the first group of 4 elements loaded per pass.
        fpalias Xvalues, 5
        fpalias Yvalues, 6
        fpalias XYvalues, 7

; Second accumulator set, used for the second group of 4 elements per pass
; and added into the first set once the loop is finished.
        fpalias Sum_x2, 8
        fpalias Sum_y2, 9
        fpalias Sum_xx2, 10
        fpalias Sum_yy2, 11
        fpalias Sum_xy2, 12

; Working registers for the second group of 4 elements loaded per pass.
        fpalias Xvalues2, 13
        fpalias Yvalues2, 14
        fpalias XYvalues2, 15

;-----------------------------------------------------------------------
; corr - correlation coefficient of two arrays of doubles
;
;   r = (n*sum_xy - sum_x*sum_y)
;       / sqrt((n*sum_xx - sum_x*sum_x) * (n*sum_yy - sum_y*sum_y))
;
; C-equivalent (presumed): double corr(const double *x, const double *y,
;                                      long long n);
; ABI:   argument registers match the Microsoft x64 convention
;        NOTE(review): confirm the target ABI; the xmm6-xmm15 save/restore
;        below is required there and harmless elsewhere.
; In:    rcx = x array, rdx = y array (no alignment required),
;        r8  = n, element count, treated as a signed 64-bit value
; Out:   xmm0 = correlation.  For n <= 0 no element is read and the
;        result is NaN (0/0).
; Clobb: r9, r10, xmm0-xmm5, upper halves of ymm0-ymm15, flags
; Stack: CORR_FRAME bytes, released before return
;        NOTE(review): the function adjusts rsp but emits no Win64 unwind
;        data (.pdata/.xdata); add it if exceptions must unwind through here.
;
; Register layout during the loops:
;       ymm0/ymm8:   4 parts of sum_x   (first / second accumulator set)
;       ymm1/ymm9:   4 parts of sum_y
;       ymm2/ymm10:  4 parts of sum_xx
;       ymm3/ymm11:  4 parts of sum_yy
;       ymm4/ymm12:  4 parts of sum_xy
;       ymm5/ymm13:  4 x values - later squared
;       ymm6/ymm14:  4 y values - later squared
;       ymm7/ymm15:  4 xy values
;-----------------------------------------------------------------------
%define CORR_FRAME (10*16 + 8)          ; 10 saved xmm registers + 8 bytes so
                                        ; rsp is 16-byte aligned after the sub
corr:
        ; xmm6-xmm15 are callee-saved in the Microsoft x64 convention and are
        ; all overwritten below (vzeroall plus the unrolled loop), so keep
        ; copies of their low 128 bits on the stack.
        sub      rsp, CORR_FRAME
        vmovups  [rsp + 0*16], xmm6
        vmovups  [rsp + 1*16], xmm7
        vmovups  [rsp + 2*16], xmm8
        vmovups  [rsp + 3*16], xmm9
        vmovups  [rsp + 4*16], xmm10
        vmovups  [rsp + 5*16], xmm11
        vmovups  [rsp + 6*16], xmm12
        vmovups  [rsp + 7*16], xmm13
        vmovups  [rsp + 8*16], xmm14
        vmovups  [rsp + 9*16], xmm15

        xor      dI, dI                           ; byte offset = 0
        vzeroall                                  ; every accumulator = 0.0
        mov      qLeft, qN
        sub      qLeft, 8                         ; Left = n - 8
        jl       .tail                            ; fewer than 8 elements: skip vector loop

        ; Main loop: 8 elements per pass, two independent accumulator sets.
        ; Invariant at .more: Left >= 0, i.e. at least 8 elements remain.
.more:
        vmovupd  yXvalues, [qX+qI]                ; x[i..i+3]
        vmovupd  yYvalues, [qY+qI]                ; y[i..i+3]
        vmulpd   yXYvalues, yXvalues, yYvalues    ; x*y
        vaddpd   ySum_x, ySum_x, yXvalues         ; sum_x  += x
        vaddpd   ySum_y, ySum_y, yYvalues         ; sum_y  += y
        vmulpd   yXvalues, yXvalues, yXvalues     ; x*x
        vmulpd   yYvalues, yYvalues, yYvalues     ; y*y
        vaddpd   ySum_xx, ySum_xx, yXvalues       ; sum_xx += x*x
        vaddpd   ySum_yy, ySum_yy, yYvalues       ; sum_yy += y*y
        vaddpd   ySum_xy, ySum_xy, yXYvalues      ; sum_xy += x*y
        vmovupd  yXvalues2, [qX+qI+32]            ; x[i+4..i+7]
        vmovupd  yYvalues2, [qY+qI+32]            ; y[i+4..i+7]
        vmulpd   yXYvalues2, yXvalues2, yYvalues2 ; x*y
        vaddpd   ySum_x2, ySum_x2, yXvalues2      ; sum_x2  += x
        vaddpd   ySum_y2, ySum_y2, yYvalues2      ; sum_y2  += y
        vmulpd   yXvalues2, yXvalues2, yXvalues2  ; x*x
        vmulpd   yYvalues2, yYvalues2, yYvalues2  ; y*y
        vaddpd   ySum_xx2, ySum_xx2, yXvalues2    ; sum_xx2 += x*x
        vaddpd   ySum_yy2, ySum_yy2, yYvalues2    ; sum_yy2 += y*y
        vaddpd   ySum_xy2, ySum_xy2, yXYvalues2   ; sum_xy2 += x*y
        add      qI, 64                           ; 8 doubles consumed
        sub      qLeft, 8
        jge      .more                            ; another full group of 8 remains

        ; Tail: the 0..7 elements left over when n is not a multiple of 8.
.tail:
        add      qLeft, 8                         ; undo the bias: Left = elements remaining
        jle      .reduce                          ; nothing left (or n <= 0)
.single:
        ; A VEX-encoded vmovsd load zeroes bits 64..255 of its destination,
        ; so the full-width ops below add the element in lane 0 and 0.0 in
        ; the other lanes, leaving the packed accumulators intact.
        vmovsd   xXvalues, [qX+qI]                ; [x, 0, 0, 0]
        vmovsd   xYvalues, [qY+qI]                ; [y, 0, 0, 0]
        vmulpd   yXYvalues, yXvalues, yYvalues    ; x*y
        vaddpd   ySum_x, ySum_x, yXvalues         ; sum_x  += x
        vaddpd   ySum_y, ySum_y, yYvalues         ; sum_y  += y
        vmulpd   yXvalues, yXvalues, yXvalues     ; x*x
        vmulpd   yYvalues, yYvalues, yYvalues     ; y*y
        vaddpd   ySum_xx, ySum_xx, yXvalues       ; sum_xx += x*x
        vaddpd   ySum_yy, ySum_yy, yYvalues       ; sum_yy += y*y
        vaddpd   ySum_xy, ySum_xy, yXYvalues      ; sum_xy += x*y
        add      qI, 8                            ; one double consumed
        sub      qLeft, 1
        jnz      .single

.reduce:
        ; Fold the second accumulator set into the first.
        vaddpd   ySum_x, ySum_x, ySum_x2
        vaddpd   ySum_y, ySum_y, ySum_y2
        vaddpd   ySum_xx, ySum_xx, ySum_xx2
        vaddpd   ySum_yy, ySum_yy, ySum_yy2
        vaddpd   ySum_xy, ySum_xy, ySum_xy2

        ; Horizontal add inside each 128-bit half: lane 0 = a0+a1, lane 2 = a2+a3.
        vhaddpd  ySum_x, ySum_x, ySum_x
        vhaddpd  ySum_y, ySum_y, ySum_y
        vhaddpd  ySum_xx, ySum_xx, ySum_xx
        vhaddpd  ySum_yy, ySum_yy, ySum_yy
        vhaddpd  ySum_xy, ySum_xy, ySum_xy

        ; Add the upper half to the lower half to finish each scalar sum.
        ; Each upper half must be extracted before any 128-bit VEX write to
        ; the same register, because such a write clears the upper half.
        vextractf128 xXvalues, ySum_x, 1
        vaddsd   xSum_x, xSum_x, xXvalues         ; sum_x
        vextractf128 xYvalues, ySum_y, 1
        vaddsd   xSum_y, xSum_y, xYvalues         ; sum_y

        fpalias  SumxSumx, 6
        vmulsd   xSumxSumx, xSum_x, xSum_x        ; sum_x*sum_x
        fpalias  SumySumy, 7
        vmulsd   xSumySumy, xSum_y, xSum_y        ; sum_y*sum_y
        vextractf128  xSum_x2, ySum_xx, 1
        vaddsd   xSum_xx, xSum_xx, xSum_x2        ; sum_xx
        vextractf128  xSum_y2, ySum_yy, 1
        vaddsd   xSum_yy, xSum_yy, xSum_y2        ; sum_yy

        fpalias  N, 8
        ; VEX form: a legacy-SSE cvtsi2sd here would run with dirty upper
        ; YMM state.
        vcvtsi2sd xN, xN, qN                      ; (double)n
        vmulsd   xSum_xx, xSum_xx, xN             ; n*sum_xx
        vmulsd   xSum_yy, xSum_yy, xN             ; n*sum_yy
        vsubsd   xSum_xx, xSum_xx, xSumxSumx      ; n*sum_xx - sum_x*sum_x
        vsubsd   xSum_yy, xSum_yy, xSumySumy      ; n*sum_yy - sum_y*sum_y
        fpalias  Denom, 2
        vmulsd   xDenom, xSum_xx, xSum_yy         ; denom*denom
        vsqrtsd  xDenom, xDenom, xDenom           ; denom

        vextractf128  xSumxSumx, ySum_xy, 1       ; register 6 reused as scratch
        vaddsd   xSum_xy, xSum_xy, xSumxSumx      ; sum_xy
        vmulsd   xSum_xy, xSum_xy, xN             ; n*sum_xy
        fpalias  SumxSumy, 0
        vmulsd   xSumxSumy, xSum_x, xSum_y        ; sum_x*sum_y
        fpalias  Num, 4
        vsubsd   xNum, xSum_xy, xSumxSumy         ; n*sum_xy - sum_x*sum_y
        vdivsd   xSumxSumy, xNum, xDenom          ; correlation -> xmm0

        ; Restore the callee-saved registers and release the frame.
        vmovups  xmm6,  [rsp + 0*16]
        vmovups  xmm7,  [rsp + 1*16]
        vmovups  xmm8,  [rsp + 2*16]
        vmovups  xmm9,  [rsp + 3*16]
        vmovups  xmm10, [rsp + 4*16]
        vmovups  xmm11, [rsp + 5*16]
        vmovups  xmm12, [rsp + 6*16]
        vmovups  xmm13, [rsp + 7*16]
        vmovups  xmm14, [rsp + 8*16]
        vmovups  xmm15, [rsp + 9*16]
        vzeroupper                                ; clean upper YMM state for legacy-SSE callers
        add      rsp, CORR_FRAME
        ret