; Sobel gradient tallies with SSE2, plus a small driver (NASM, x86-64,
; System V AMD64 calling convention).
;
; Working notes for the SIMD approach:
;   xmm0: input data
;   move xmm0 to xmm1 ; shift by one byte
;   move xmm0 to xmm2 ; shift by two bytes
;
;   use punpcklbw / punpckhbw to convert low / high bytes to words
;   use pslldq to shift left by bytes
;   use psrldq to shift right by bytes
;   planned for the output stage (not present in this file yet):
;     use pmullw, paddw
;     use punpcklwd to convert low words to dwords
;     use punpckhwd to convert high words to dwords
;     use cvtpi2ps to convert dwords to float
;
; Calling convention reminders:
;   integer arguments: rdi, rsi, rdx, rcx, r8, r9
;   floating point arguments: xmm0-7
;   caller cleans up stack
;   stack aligned on 16 bytes so that local data for functions can be
;   properly aligned for SSE, ...
;   callee must preserve rbx, rbp, r12-r15

; multipush a, b, c, ... : push every argument, left to right.
%macro multipush 1-*
%rep %0
        push    %1
%rotate 1
%endrep
%endmacro

; multipop a, b, c, ... : pop every argument, right to left, so that it
; undoes a multipush given the same argument list.
%macro multipop 1-*
%rep %0
%rotate -1
        pop     %1
%endrep
%endmacro

        segment .data
rows    equ     20000
cols    equ     20000
dsize   equ     rows*cols               ; number of pixels in the test image

        segment .bss
data:   resb    dsize                   ; input image, one byte per pixel
result: resd    dsize                   ; output image, one dword per pixel

        segment .text
        global  sobel, main

; sobel ( input, output, rows, cols );
;     char  input[rows][cols]           rdi
;     float output[rows][cols]          rsi
;     rows                              rdx
;     cols                              rcx
; The boundary of the output array will be unfilled.
;
; For every interior row the image is walked in blocks of 14 columns.
; Each block loads 16 bytes from row-1, row and row+1 and builds four
; vectors of 16-bit tallies:
;     xmm11 / xmm12 : low 8 / high 6 lanes of   -1  0 +1
;                                               -2  0 +2
;                                               -1  0 +1
;     xmm9  / xmm10 : low 8 / high 6 lanes of   -1 -2 -1
;                                                0  0  0
;                                               +1 +2 +1
; Lane i of the low half belongs to column rbx+i.
;
; NOTE(review): in the code present here the tallies are never converted
; to float or stored through `output`; the squaring / conversion / store
; stage described in the notes at the top of the file still has to be
; written.
; NOTE(review): every block reads 16 bytes starting at column rbx-1, so
; the last block of a row can read beyond the end of that row (and, for
; the last row, beyond the end of the input array).
sobel:
.cols   equ     0                       ; offsets of the locals on the stack
.rows   equ     8
.output equ     16
.input  equ     24
.bpir   equ     32                      ; bytes per input row
.bpor   equ     40                      ; bytes per output row
        multipush rbx, rbp, r12, r13, r14, r15
        sub     rsp, 48
        cmp     rdx, 3                  ; fewer than 3 rows: no interior row
        jl      .noworktodo
        cmp     rcx, 3                  ; fewer than 3 columns: no interior column
        jl      .noworktodo
        mov     [rsp+.input], rdi
        mov     [rsp+.output], rsi
        mov     [rsp+.rows], rdx
        mov     [rsp+.cols], rcx
        mov     [rsp+.bpir], rcx
        imul    rcx, 4
        mov     [rsp+.bpor], rcx

        mov     rax, [rsp+.rows]        ; count of rows to process
        sub     rax, 2
        mov     rdx, [rsp+.cols]        ; from here on rdx = bytes per input row
        mov     r8, [rsp+.input]
        add     r8, rdx
        mov     r9, r8                  ; address of first byte of row
        mov     r10, r8
        sub     r8, rdx                 ; address of first byte of row-1
        add     r10, rdx                ; address of first byte of row+1
        pxor    xmm13, xmm13            ; zero vectors used to widen bytes to words
        pxor    xmm14, xmm14
        pxor    xmm15, xmm15
.more_rows:
        mov     rbx, 1                  ; first column to process
.more_cols:
        pxor    xmm9, xmm9              ; start a fresh tally for this block
        pxor    xmm10, xmm10
        pxor    xmm11, xmm11
        pxor    xmm12, xmm12

;       Row-1: contributes -1 0 +1 to xmm11/xmm12 and -1 -2 -1 to xmm9/xmm10.
        movdqu  xmm0, [r8+rbx-1]        ; data for 1st row of 3
        movdqu  xmm1, xmm0
        movdqu  xmm2, xmm0
        psrldq  xmm1, 1                 ; lane i now holds byte i+1
        psrldq  xmm2, 2                 ; lane i now holds byte i+2
        movdqa  xmm3, xmm0
        movdqa  xmm4, xmm1
        movdqa  xmm5, xmm2
        punpcklbw xmm3, xmm13           ; low 8 bytes -> 8 words
        punpcklbw xmm4, xmm14
        punpcklbw xmm5, xmm15           ; 8 values for 1st row
        psubw   xmm11, xmm3
        psubw   xmm9, xmm3
        paddw   xmm11, xmm5
        psubw   xmm9, xmm4
        psubw   xmm9, xmm4
        psubw   xmm9, xmm5              ; finished tally for 1st row, 1st 8
        punpckhbw xmm0, xmm13           ; high 8 bytes -> 8 words
        punpckhbw xmm1, xmm14
        punpckhbw xmm2, xmm15
        psubw   xmm12, xmm0
        psubw   xmm10, xmm0
        paddw   xmm12, xmm2
        psubw   xmm10, xmm1
        psubw   xmm10, xmm1
        psubw   xmm10, xmm2             ; finished tally for 1st row, last 6

;       Row: contributes -2 0 +2 to xmm11/xmm12 and nothing to xmm9/xmm10,
;       so the copy shifted by one byte is not needed here.
        movdqu  xmm0, [r9+rbx-1]        ; data for 2nd row of 3
        movdqu  xmm2, xmm0
        psrldq  xmm2, 2
        movdqa  xmm3, xmm0
        movdqa  xmm5, xmm2
        punpcklbw xmm3, xmm13
        punpcklbw xmm5, xmm15           ; 8 values for 2nd row
        psubw   xmm11, xmm3
        psubw   xmm11, xmm3
        paddw   xmm11, xmm5
        paddw   xmm11, xmm5             ; finished tally for 2nd row, 1st 8
        punpckhbw xmm0, xmm13
        punpckhbw xmm2, xmm15
        psubw   xmm12, xmm0
        psubw   xmm12, xmm0
        paddw   xmm12, xmm2
        paddw   xmm12, xmm2             ; finished tally for 2nd row, last 6

;       Row+1: contributes -1 0 +1 to xmm11/xmm12 and +1 +2 +1 to xmm9/xmm10.
        movdqu  xmm0, [r10+rbx-1]       ; data for 3rd row of 3
        movdqu  xmm1, xmm0
        movdqu  xmm2, xmm0
        psrldq  xmm1, 1
        psrldq  xmm2, 2
        movdqa  xmm3, xmm0
        movdqa  xmm4, xmm1
        movdqa  xmm5, xmm2
        punpcklbw xmm3, xmm13
        punpcklbw xmm4, xmm14
        punpcklbw xmm5, xmm15           ; 8 values for 3rd row
        psubw   xmm11, xmm3
        paddw   xmm9, xmm3
        paddw   xmm11, xmm5
        paddw   xmm9, xmm4
        paddw   xmm9, xmm4
        paddw   xmm9, xmm5              ; finished tally for 3rd row, 1st 8
        punpckhbw xmm0, xmm13
        punpckhbw xmm1, xmm14
        punpckhbw xmm2, xmm15
        psubw   xmm12, xmm0
        paddw   xmm10, xmm0
        paddw   xmm12, xmm2
        paddw   xmm10, xmm1
        paddw   xmm10, xmm1
        paddw   xmm10, xmm2             ; finished tally for 3rd row, last 6

        add     rbx, 14                 ; process 14 Sobel values
        cmp     rbx, rdx                ; rdx = cols
        jl      .more_cols
        add     r8, rdx                 ; advance all three row pointers by one row
        add     r9, rdx
        add     r10, rdx
        dec     rax                     ; 1 fewer row to process
        jnz     .more_rows              ; loop until the row counter reaches zero
.noworktodo:
        add     rsp, 48
        multipop rbx, rbp, r12, r13, r14, r15
        ret

; main: fill `data` with the byte pattern 0, 1, 2, ..., 255, 0, 1, ...
; and run sobel over it once. Returns 0.
main:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 32
        mov     rcx, dsize              ; number of bytes to fill
        lea     rdi, [data]
        mov     al, 255                 ; first inc wraps this to 0
.more:
        inc     al
        stosb                           ; store al at [rdi], advance rdi
        dec     rcx
        jnz     .more
        lea     rdi, [data]
        lea     rsi, [result]
        mov     rdx, rows
        mov     rcx, cols
        call    sobel
        xor     eax, eax                ; exit status 0
        leave
        ret