segment .text
        cname  corr
        global corr

;-----------------------------------------------------------------------
; corr - Pearson correlation coefficient of two arrays of doubles
;
; C-equivalent (inferred from register use):
;       double corr(double *x, double *y, long n)
; ABI:  System V AMD64 argument order (rdi, rsi, rdx, rcx, r8, r9)
;
; In:   rdi = x array (read with unaligned loads)
;       rsi = y array (read with unaligned loads)
;       rdx = n, number of elements in each array (signed 64-bit)
; Out:  xmm0 = (n*sum_xy - sum_x*sum_y) /
;              sqrt((n*sum_xx - sum_x^2) * (n*sum_yy - sum_y^2))
;       n <= 0 reads no memory and yields 0/0 (NaN).
; Clobbers: rcx, r8, flags, ymm0-ymm15 (vzeroall)
;
; Register roles:
;       r8:          byte offset into both arrays
;       rcx:         elements still to be processed
;       ymm0/ymm8:   4 parts each of sum_x
;       ymm1/ymm9:   4 parts each of sum_y
;       ymm2/ymm10:  4 parts each of sum_xx
;       ymm3/ymm11:  4 parts each of sum_yy
;       ymm4/ymm12:  4 parts each of sum_xy
;       ymm5/ymm13:  4 x values - later squared
;       ymm6/ymm14:  4 y values - later squared
;       ymm7/ymm15:  4 xy values
;
; The main loop handles 8 elements per pass using two independent sets
; of accumulators; a scalar tail loop handles the remaining 0..7
; elements, so n does not have to be a multiple of 8.
;-----------------------------------------------------------------------
corr:
        xor      r8d, r8d                   ; byte offset = 0
        mov      rcx, rdx                   ; rcx = elements remaining
        vzeroall                            ; clear all accumulators
        cmp      rcx, 8
        jl       .reduce                    ; no full block of 8 (signed: also n <= 0)

.more:
        ; first group of 4 elements -> accumulators ymm0-ymm4
        vmovupd  ymm5, [rdi+r8]             ; 4 x values
        vmovupd  ymm6, [rsi+r8]             ; 4 y values
        vmulpd   ymm7, ymm5, ymm6           ; xy
        vaddpd   ymm0, ymm0, ymm5           ; sum_x
        vaddpd   ymm1, ymm1, ymm6           ; sum_y
        vmulpd   ymm5, ymm5, ymm5           ; xx
        vmulpd   ymm6, ymm6, ymm6           ; yy
        vaddpd   ymm2, ymm2, ymm5           ; sum_xx
        vaddpd   ymm3, ymm3, ymm6           ; sum_yy
        vaddpd   ymm4, ymm4, ymm7           ; sum_xy

        ; second group of 4 elements -> accumulators ymm8-ymm12
        vmovupd  ymm13, [rdi+r8+32]         ; 4 x values
        vmovupd  ymm14, [rsi+r8+32]         ; 4 y values
        vmulpd   ymm15, ymm13, ymm14        ; xy
        vaddpd   ymm8, ymm8, ymm13          ; sum_x
        vaddpd   ymm9, ymm9, ymm14          ; sum_y
        vmulpd   ymm13, ymm13, ymm13        ; xx
        vmulpd   ymm14, ymm14, ymm14        ; yy
        vaddpd   ymm10, ymm10, ymm13        ; sum_xx
        vaddpd   ymm11, ymm11, ymm14        ; sum_yy
        vaddpd   ymm12, ymm12, ymm15        ; sum_xy

        add      r8, 64                     ; 8 doubles consumed
        sub      rcx, 8
        cmp      rcx, 8
        jge      .more                      ; loop while a full block of 8 remains

.reduce:
        ; merge the two accumulator sets
        vaddpd   ymm0, ymm0, ymm8
        vaddpd   ymm1, ymm1, ymm9
        vaddpd   ymm2, ymm2, ymm10
        vaddpd   ymm3, ymm3, ymm11
        vaddpd   ymm4, ymm4, ymm12

        ; pairwise add inside each 128-bit lane
        vhaddpd  ymm0, ymm0, ymm0           ; sum_x
        vhaddpd  ymm1, ymm1, ymm1           ; sum_y
        vhaddpd  ymm2, ymm2, ymm2           ; sum_xx
        vhaddpd  ymm3, ymm3, ymm3           ; sum_yy
        vhaddpd  ymm4, ymm4, ymm4           ; sum_xy

        ; add the high lane to the low lane: low double = full sum
        vextractf128 xmm5, ymm0, 1
        vaddsd   xmm0, xmm0, xmm5           ; xmm0 = sum_x
        vextractf128 xmm6, ymm1, 1
        vaddsd   xmm1, xmm1, xmm6           ; xmm1 = sum_y
        vextractf128 xmm7, ymm2, 1
        vaddsd   xmm2, xmm2, xmm7           ; xmm2 = sum_xx
        vextractf128 xmm5, ymm3, 1
        vaddsd   xmm3, xmm3, xmm5           ; xmm3 = sum_yy
        vextractf128 xmm6, ymm4, 1
        vaddsd   xmm4, xmm4, xmm6           ; xmm4 = sum_xy

        test     rcx, rcx
        jle      .finish                    ; nothing left over (or n <= 0)

.tail:
        ; invariant: rcx = leftover elements, 1..7 on entry
        vmovsd   xmm5, [rdi+r8]             ; x
        vmovsd   xmm6, [rsi+r8]             ; y
        vmulsd   xmm7, xmm5, xmm6           ; xy
        vaddsd   xmm0, xmm0, xmm5           ; sum_x
        vaddsd   xmm1, xmm1, xmm6           ; sum_y
        vmulsd   xmm5, xmm5, xmm5           ; xx
        vmulsd   xmm6, xmm6, xmm6           ; yy
        vaddsd   xmm2, xmm2, xmm5           ; sum_xx
        vaddsd   xmm3, xmm3, xmm6           ; sum_yy
        vaddsd   xmm4, xmm4, xmm7           ; sum_xy
        add      r8, 8                      ; 1 double consumed
        dec      rcx
        jnz      .tail

.finish:
        vxorpd   xmm8, xmm8, xmm8           ; break dependency on old xmm8
        vcvtsi2sd xmm8, xmm8, rdx           ; n (VEX form: no SSE/AVX transition)
        vmulsd   xmm6, xmm0, xmm0           ; sum_x*sum_x
        vmulsd   xmm7, xmm1, xmm1           ; sum_y*sum_y
        vmulsd   xmm2, xmm2, xmm8           ; n*sum_xx
        vmulsd   xmm3, xmm3, xmm8           ; n*sum_yy
        vsubsd   xmm2, xmm2, xmm6           ; n*sum_xx-sum_x*sum_x
        vsubsd   xmm3, xmm3, xmm7           ; n*sum_yy-sum_y*sum_y
        vmulsd   xmm2, xmm2, xmm3           ; denom*denom
        vsqrtsd  xmm2, xmm2, xmm2           ; denom
        vmulsd   xmm4, xmm4, xmm8           ; n*sum_xy
        vmulsd   xmm0, xmm0, xmm1           ; sum_x*sum_y
        vsubsd   xmm4, xmm4, xmm0           ; n*sum_xy-sum_x*sum_y
        vdivsd   xmm0, xmm4, xmm2           ; correlation
        vzeroupper                          ; clean upper YMM state; xmm0 is preserved
        ret