segment .text global corr ; ; rcx: x array ; rdx: y array ; r10: loop counter ; r8: n ; xmm0: 2 parts of sum_x ; xmm1: 2 parts of sum_y ; xmm2: 2 parts of sum_xx ; xmm3: 2 parts of sum_yy ; xmm4: 2 parts of sum_xy ; xmm5: 2 x values - later squared ; xmm6: 2 y values - later squared ; xmm7: 2 xy values corr: alias X, rcx alias Y, rdx alias I, r10 alias N, r8 fpalias Sum_x, 0 fpalias Sum_y, 1 fpalias Sum_xx, 2 fpalias Sum_yy, 3 fpalias Sum_xy, 4 fpalias Xvals, 5 fpalias Yvals, 6 fpalias XYvals, 7 fpalias Sum_x2, 8 fpalias Sum_y2, 9 fpalias Sum_xx2, 10 fpalias Sum_yy2, 11 fpalias Sum_xy2, 12 fpalias Xvals2, 13 fpalias Yvals2, 14 fpalias XYvals2, 15 xor r9d, r9d mov qI, qN subpd xSum_x, xSum_x movapd xSum_y, xSum_x movapd xSum_xx, xSum_x movapd xSum_yy, xSum_x movapd xSum_xy, xSum_x movapd xSum_x2, xSum_x movapd xSum_y2, xSum_x movapd xSum_xx2, xSum_x movapd xSum_yy2, xSum_x movapd xSum_xy2, xSum_x .more: movapd xXvals, [qX+r9] ; mov x movapd xYvals, [qY+r9] ; mov y movapd xXYvals, xXvals ; mov x mulpd xXYvals, xYvals ; xy addpd xSum_x, xXvals ; sum_x addpd xSum_y, xYvals ; sum_y mulpd xXvals, xXvals ; xx mulpd xYvals, xYvals ; yy addpd xmm2, xXvals ; sum_xx addpd xmm3, xYvals ; sum_yy addpd xmm4, xXYvals ; sum_xy movapd xXvals2, [qX+r9+16] ; mov x movapd xYvals2, [qY+r9+16] ; mov y movapd xXYvals, xXvals2 ; mov x mulpd xXYvals, xYvals2 ; xy addpd xSum_x2, xXvals2 ; sum_x addpd xSum_y2, xYvals2 ; sum_y mulpd xXvals2, xXvals2 ; xx mulpd xYvals2, xYvals2 ; yy addpd xSum_xx2, xXvals2 ; sum_xx addpd xSum_yy2, xYvals2 ; sum_yy addpd xSum_xy2, xXYvals ; sum_xy add r9, 32 sub qI, 4 jnz .more addpd xSum_x, xSum_x2 addpd xSum_y, xSum_y2 addpd xSum_xx, xSum_xx2 addpd xSum_yy, xSum_yy2 addpd xSum_xy, xSum_xy2 haddpd xSum_x, xSum_x ; sum_x haddpd xSum_y, xSum_y ; sum_y haddpd xSum_xx, xSum_xx ; sum_xx haddpd xSum_yy, xSum_yy ; sum_yy haddpd xSum_xy, xSum_xy ; sum_xy movsd xYvals, xSum_x ; sum_x movsd xXYvals, xSum_y ; sum_y fpalias N, 15 cvtsi2sd xN, qN ; n mulsd xYvals, xYvals ; sum_x*sum_x mulsd xXYvals, xXYvals ; sum_y*sum_y fpalias NSum_xx, 2 fpalias NSum_yy, 3 mulsd xNSum_xx, xN ; n*sum_xx mulsd xNSum_yy, xN ; n*sum_yy subsd xNSum_xx, xYvals ; n*sum_xx-sum_x*sum_x subsd xNSum_yy, xXYvals ; n*sum_yy-sum_y*sum_y fpalias Denom, 2 mulsd xDenom, xNSum_yy ; denom1*denom2 sqrtsd xDenom, xDenom ; denom fpalias NSum_xy, 4 mulsd xNSum_xy, xN ; n*sum_xy mulsd xSum_x, xSum_y ; sum_x*sum_y subsd xNSum_xy, xSum_x ; n*sum_xy-sum_x*sum_y divsd xNSum_xy, xDenom ; correlation movsd xmm0, xNSum_xy ; need in xmm0 ret