segment .text
        global corr

;
;       rcx:  x array
;       rdx:  y array
;       r10:  loop counter
;       r8:  n
;       xmm0: 2 parts of sum_x
;       xmm1: 2 parts of sum_y
;       xmm2: 2 parts of sum_xx
;       xmm3: 2 parts of sum_yy
;       xmm4: 2 parts of sum_xy
;       xmm5: 2 x values - later squared
;       xmm6: 2 y values - later squared
;       xmm7: 2 xy values
corr:
        alias   X, rcx
        alias   Y, rdx
        alias   I, r10
        alias   N, r8

        fpalias Sum_x, 0
        fpalias Sum_y, 1
        fpalias Sum_xx, 2
        fpalias Sum_yy, 3
        fpalias Sum_xy, 4
        fpalias Xvals, 5
        fpalias Yvals, 6
        fpalias XYvals, 7
        fpalias Sum_x2, 8
        fpalias Sum_y2, 9
        fpalias Sum_xx2, 10
        fpalias Sum_yy2, 11
        fpalias Sum_xy2, 12
        fpalias Xvals2, 13
        fpalias Yvals2, 14
        fpalias XYvals2, 15

        xor     r9d, r9d
        mov     qI, qN
        subpd   xSum_x, xSum_x
        movapd   xSum_y, xSum_x
        movapd   xSum_xx, xSum_x
        movapd   xSum_yy, xSum_x
        movapd   xSum_xy, xSum_x
        movapd   xSum_x2, xSum_x
        movapd   xSum_y2, xSum_x
        movapd   xSum_xx2, xSum_x
        movapd   xSum_yy2, xSum_x
        movapd   xSum_xy2, xSum_x
.more:   
        movapd  xXvals, [qX+r9]     ; mov x
        movapd  xYvals, [qY+r9]     ; mov y
        movapd  xXYvals, xXvals     ; mov x
        mulpd   xXYvals, xYvals     ; xy
        addpd   xSum_x, xXvals      ; sum_x
        addpd   xSum_y, xYvals      ; sum_y
        mulpd   xXvals, xXvals      ; xx
        mulpd   xYvals, xYvals      ; yy
        addpd   xmm2, xXvals        ; sum_xx
        addpd   xmm3, xYvals        ; sum_yy
        addpd   xmm4, xXYvals       ; sum_xy
        movapd  xXvals2, [qX+r9+16] ; mov x
        movapd  xYvals2, [qY+r9+16] ; mov y
        movapd  xXYvals, xXvals2    ; mov x
        mulpd   xXYvals, xYvals2    ; xy
        addpd   xSum_x2, xXvals2    ; sum_x
        addpd   xSum_y2, xYvals2    ; sum_y
        mulpd   xXvals2, xXvals2    ; xx
        mulpd   xYvals2, xYvals2    ; yy
        addpd   xSum_xx2, xXvals2   ; sum_xx
        addpd   xSum_yy2, xYvals2   ; sum_yy
        addpd   xSum_xy2, xXYvals   ; sum_xy
        add     r9, 32
        sub     qI, 4
        jnz     .more
        addpd   xSum_x, xSum_x2
        addpd   xSum_y, xSum_y2
        addpd   xSum_xx, xSum_xx2
        addpd   xSum_yy, xSum_yy2
        addpd   xSum_xy, xSum_xy2
        haddpd  xSum_x, xSum_x      ; sum_x
        haddpd  xSum_y, xSum_y      ; sum_y
        haddpd  xSum_xx, xSum_xx    ; sum_xx
        haddpd  xSum_yy, xSum_yy    ; sum_yy
        haddpd  xSum_xy, xSum_xy    ; sum_xy
        movsd   xYvals, xSum_x      ; sum_x
        movsd   xXYvals, xSum_y     ; sum_y
        fpalias N, 15
        cvtsi2sd xN, qN       ; n

        mulsd   xYvals, xYvals      ; sum_x*sum_x
        mulsd   xXYvals, xXYvals    ; sum_y*sum_y
        fpalias NSum_xx, 2
        fpalias NSum_yy, 3
        mulsd   xNSum_xx, xN       ; n*sum_xx
        mulsd   xNSum_yy, xN       ; n*sum_yy
        subsd   xNSum_xx, xYvals   ; n*sum_xx-sum_x*sum_x
        subsd   xNSum_yy, xXYvals  ; n*sum_yy-sum_y*sum_y
        fpalias Denom, 2
        mulsd   xDenom, xNSum_yy   ; denom1*denom2
        sqrtsd  xDenom, xDenom     ; denom
        fpalias NSum_xy, 4
        mulsd   xNSum_xy, xN       ; n*sum_xy
        mulsd   xSum_x, xSum_y     ; sum_x*sum_y
        subsd   xNSum_xy, xSum_x   ; n*sum_xy-sum_x*sum_y
        divsd   xNSum_xy, xDenom   ; correlation
        movsd   xmm0, xNSum_xy     ; need in xmm0
        ret