segment .text global corr alias X, rcx alias Y, rdx alias N, r8 alias I, r9 alias Left, r10 ; ; rcx: x array ; rdx: y array ; r8: n ; r9: loop counter fpalias Sum_x, 0 fpalias Sum_y, 1 fpalias Sum_xx, 2 fpalias Sum_yy, 3 fpalias Sum_xy, 4 fpalias Xvalues, 5 fpalias Yvalues, 6 fpalias XYvalues, 7 fpalias Sum_x2, 8 fpalias Sum_y2, 9 fpalias Sum_xx2, 10 fpalias Sum_yy2, 11 fpalias Sum_xy2, 12 fpalias Xvalues2, 13 fpalias Yvalues2, 14 fpalias XYvalues2, 15 ; ymm0: 4 parts of sum_x ; ymm1: 4 parts of sum_y ; ymm2: 4 parts of sum_xx ; ymm3: 4 parts of sum_yy ; ymm4: 4 parts of sum_xy ; ymm5: 4 x values - later squared ; ymm6: 4 y values - later squared ; ymm7: 4 xy values corr: xor dI, dI mov qLeft, qN vzeroall .more: vmovupd yXvalues, [qX+qI] ; mov x 5 vmovupd yYvalues, [qY+qI] ; mov y 6 vmulpd yXYvalues, yXvalues, yYvalues ; xy 7 5 6 vaddpd ySum_x, ySum_x, yXvalues ; sum_x 0 0 5 vaddpd ySum_y, ySum_y, yYvalues ; sum_y 1 1 6 vmulpd yXvalues, yXvalues, yXvalues ; xx 5 5 5 vmulpd yYvalues, yYvalues, yYvalues ; yy 6 6 6 vaddpd ySum_xx, ySum_xx, yXvalues ; sum_xx 2 2 5 vaddpd ySum_yy, ySum_yy, yYvalues ; sum_yy 3 3 5 vaddpd ySum_xy, ySum_xy, yXYvalues ; sum_xy 4 4 7 vmovupd yXvalues2, [qX+qI+32] ; mov x 13 vmovupd yYvalues2, [qY+qI+32] ; mov y 14 vmulpd yXYvalues2, yXvalues2, yYvalues2 ; xy 15 13 14 vaddpd ySum_x2, ySum_x2, yXvalues2 ; sum_x 8 8 13 vaddpd ySum_y2, ySum_y2, yYvalues2 ; sum_y 9 9 14 vmulpd yXvalues2, yXvalues2, yXvalues2 ; xx 13 13 13 vmulpd yYvalues2, yYvalues2, yYvalues2 ; yy 14 14 14 vaddpd ySum_xx2, ySum_xx2, yXvalues2 ; sum_xx 10 10 13 vaddpd ySum_yy2, ySum_yy2, yYvalues2 ; sum_yy 11 11 14 vaddpd ySum_xy2, ySum_xy2, yXYvalues2 ; sum_xy 12 12 15 add qI, 64 sub qLeft, 8 jg .more vaddpd ySum_x, ySum_x, ySum_x2 ; 0 0 8 vaddpd ySum_y, ySum_y, ySum_y2 ; 1 1 9 vaddpd ySum_xx, ySum_xx, ySum_xx2 ; 2 2 10 vaddpd ySum_yy, ySum_yy, ySum_yy2 ; 3 3 11 vaddpd ySum_xy, ySum_xy, ySum_xy2 ; 4 4 12 vhaddpd ySum_x, ySum_x, ySum_x ; sum_x 0 0 0 vhaddpd ySum_y, ySum_y, ySum_y ; sum_y 1 1 1 vhaddpd ySum_xx, ySum_xx, ySum_xx ; sum_xx 2 2 2 vhaddpd ySum_yy, ySum_yy, ySum_yy ; sum_yy 3 3 3 vhaddpd ySum_xy, ySum_xy, ySum_xy ; sum_xy 4 4 4 vextractf128 xXvalues, ySum_x, 1 ; 5 0 vaddsd xSum_x, xSum_x, xXvalues ; 0 0 5 vextractf128 xYvalues, ySum_y, 1 ; 6 1 vaddsd xSum_y, xSum_y, xYvalues ; 1 1 6 fpalias SumxSumx, 6 vmulsd xSumxSumx, xSum_x, xSum_x ; sum_x*sum_x 6 0 0 fpalias SumySumy, 7 vmulsd xSumySumy, xSum_y, xSum_y ; sum_y*sum_y 7 1 1 vextractf128 xSum_x2, ySum_xx, 1 ; 8 2 vaddsd xSum_xx, xSum_xx, xSum_x2 ; 2 2 8 vextractf128 xSum_y2, ySum_yy, 1 ; 9 3 vaddsd xSum_yy, xSum_yy, xSum_y2 ; 3 3 9 fpalias N, 8 cvtsi2sd xN, qN ; n 8 vmulsd xSum_xx, xSum_xx, xN ; n*sum_xx 2 2 8 vmulsd xSum_yy, xSum_yy, xN ; n*sum_yy 3 3 8 vsubsd xSum_xx, xSum_xx, xSumxSumx ; n*sum_xx-sum_x*sum_x 2 2 6 vsubsd xSum_yy, xSum_yy, xSumySumy ; n*sum_yy-sum_y*sum_y 3 3 7 fpalias Denom, 2 vmulsd xDenom, xSum_xx, xSum_yy ; denom*denom 2 2 3 vsqrtsd xDenom, xDenom, xDenom ; denom 2 vextractf128 xSumxSumx, ySum_xy, 1 ; 6 4 vaddsd xSum_xy, xSum_xy, xSumxSumx ; 4 4 6 vmulsd xSum_xy, xSum_xy, xN ; n*sum_xy 4 4 8 fpalias SumxSumy, 0 vmulsd xSumxSumy, xSum_x, xSum_y ; sum_x*sum_y 0 0 1 fpalias Num, 4 vsubsd xNum, xSum_xy, xSumxSumy ; n*sum_xy-sum_x*sum_y 4 4 0 vdivsd xSumxSumy, xNum, xDenom ; correlation 0 3 2 ret