segment .text cname corr global corr ; rdi, rsi, rdx, rcx, r8, r9 ; ; rdi: x array ; rsi: y array ; rcx: loop counter ; rdx: n ; ymm0: 4 parts of sum_x ; ymm1: 4 parts of sum_y ; ymm2: 4 parts of sum_xx ; ymm3: 4 parts of sum_yy ; ymm4: 4 parts of sum_xy ; ymm5: 4 x values - later squared ; ymm6: 4 y values - later squared ; ymm7: 4 xy values corr: xor r8, r8 mov rcx, rdx vzeroall .more: vmovupd ymm5, [rdi+r8] ; mov x vmovupd ymm6, [rsi+r8] ; mov y vmulpd ymm7, ymm5, ymm6 ; xy vaddpd ymm0, ymm0, ymm5 ; sum_x vaddpd ymm1, ymm1, ymm6 ; sum_y vmulpd ymm5, ymm5, ymm5 ; xx vmulpd ymm6, ymm6, ymm6 ; yy vaddpd ymm2, ymm2, ymm5 ; sum_xx vaddpd ymm3, ymm3, ymm6 ; sum_yy vaddpd ymm4, ymm4, ymm7 ; sum_xy vmovupd ymm13, [rdi+r8+32] ; mov x vmovupd ymm14, [rsi+r8+32] ; mov y vmulpd ymm15, ymm13, ymm14 ; xy vaddpd ymm8, ymm8, ymm13 ; sum_x vaddpd ymm9, ymm9, ymm14 ; sum_y vmulpd ymm13, ymm13, ymm13 ; xx vmulpd ymm14, ymm14, ymm14 ; yy vaddpd ymm10, ymm10, ymm13 ; sum_xx vaddpd ymm11, ymm11, ymm14 ; sum_yy vaddpd ymm12, ymm12, ymm15 ; sum_xy add r8, 64 sub rcx, 8 jnz .more vaddpd ymm0, ymm0, ymm8 vaddpd ymm1, ymm1, ymm9 vaddpd ymm2, ymm2, ymm10 vaddpd ymm3, ymm3, ymm11 vaddpd ymm4, ymm4, ymm12 vhaddpd ymm0, ymm0, ymm0 ; sum_x vhaddpd ymm1, ymm1, ymm1 ; sum_y vhaddpd ymm2, ymm2, ymm2 ; sum_xx vhaddpd ymm3, ymm3, ymm3 ; sum_yy vhaddpd ymm4, ymm4, ymm4 ; sum_xy vextractf128 xmm5, ymm0, 1 vaddsd xmm0, xmm0, xmm5 vextractf128 xmm6, ymm1, 1 vaddsd xmm1, xmm1, xmm6 vmulsd xmm6, xmm0, xmm0 ; sum_x*sum_x vmulsd xmm7, xmm1, xmm1 ; sum_y*sum_y vextractf128 xmm8, ymm2, 1 vaddsd xmm2, xmm2, xmm8 vextractf128 xmm9, ymm3, 1 vaddsd xmm3, xmm3, xmm9 cvtsi2sd xmm8, rdx ; n vmulsd xmm2, xmm2, xmm8 ; n*sum_xx vmulsd xmm3, xmm3, xmm8 ; n*sum_yy vsubsd xmm2, xmm2, xmm6 ; n*sum_xx-sum_x*sum_x vsubsd xmm3, xmm3, xmm7 ; n*sum_yy-sum_y*sum_y vmulsd xmm2, xmm2, xmm3 ; denom*denom vsqrtsd xmm2, xmm2, xmm2 ; denom vextractf128 xmm6, ymm4, 1 vaddsd xmm4, xmm4, xmm6 vmulsd xmm4, xmm4, xmm8 ; n*sum_xy vmulsd xmm0, xmm0, xmm1 ; sum_x*sum_y vsubsd xmm4, xmm4, xmm0 ; n*sum_xy-sum_x*sum_y vdivsd xmm0, xmm4, xmm2 ; correlation ret