; ---------------------------------------------------------------------------
; Sobel edge-magnitude filter, x86-64, NASM/YASM syntax, SSE2 only.
;
; Calling convention (System V AMD64):
;   integer arguments arrive in rdi, rsi, rdx, rcx, r8, r9
;   floating point arguments arrive in xmm0-7
;   the caller cleans up the stack
;   the callee must preserve rbx, rbp, r12-r15
;
; Vector strategy for one 14-column chunk:
;   load 16 bytes starting one column to the left of the chunk
;   psrldq by 1 and by 2 bytes gives the centre and right neighbours
;   punpcklbw / punpckhbw against a zero register widen bytes to words
;   paddw / psubw accumulate the Gx and Gy kernels in 16-bit lanes
;   punpcklwd / punpckhwd interleave Gx with Gy, then pmaddwd produces
;       Gx*Gx + Gy*Gy directly as 32-bit integers (a 16-bit pmullw would
;       overflow, because |G| can reach 1020)
;   cvtdq2ps converts the dwords to float and sqrtps gives the magnitude
; ---------------------------------------------------------------------------

; multipush r1, r2, ...  : push every listed register, left to right
%macro multipush 1-*
%rep %0
        push    %1
%rotate 1
%endrep
%endmacro

; multipop r1, r2, ...   : pop the listed registers in reverse order, so the
;                          same register list undoes a matching multipush
%macro multipop 1-*
%rep %0
%rotate -1
        pop     %1
%endrep
%endmacro

;   sobel ( input, output, rows, cols );
;   char  input[rows][cols]     (treated as unsigned 8-bit pixels)
;   float output[rows][cols]
;   boundary of the output array will be unfilled
;
;   rdi = input, rsi = output, rdx = rows, rcx = cols (64-bit signed)
;
;   For every interior pixel (1 <= r <= rows-2, 1 <= c <= cols-2):
;       Gx = (TR - TL) + 2*(MR - ML) + (BR - BL)
;       Gy = (BL + 2*BC + BR) - (TL + 2*TC + TR)
;       output[r][c] = sqrt(Gx*Gx + Gy*Gy)
;   Nothing is done when rows < 3 or cols < 3.
;   No byte outside input[rows][cols] is read and no float outside the
;   interior of output[rows][cols] is written.
;
;   Register use inside the routine:
;       rax  interior rows still to process
;       rdx  cols (bytes per input row)
;       r8   address of input row-1      r9   address of input row
;       r10  address of input row+1      r11  address of output row
;       rbx  current column
;       r12  cols-15 : highest column at which a 14-wide chunk still fits
;       r13  cols-1  : one past the last interior column
;       rcx, rsi, rdi  scratch for the scalar tail (Gx, Gy, pixel)
;       xmm9/xmm10   Gy for chunk columns 0-7 / 8-15
;       xmm11/xmm12  Gx for chunk columns 0-7 / 8-15
;       xmm15        all zero, used to widen bytes to words
;
        segment .text
        global  sobel, main
sobel:
        cmp     rdx, 3
        jl      .done                   ; fewer than 3 rows: no interior
        cmp     rcx, 3
        jl      .done                   ; fewer than 3 cols: no interior
        multipush rbx, r12, r13

        mov     rax, rdx
        sub     rax, 2                  ; count of rows to process
        mov     rdx, rcx                ; rdx = cols from here on
        mov     r8, rdi                 ; address of first byte of row-1
        lea     r9, [rdi+rdx]           ; address of row
        lea     r10, [r9+rdx]           ; address of first byte of row+1
        lea     r11, [rsi+rdx*4]        ; output row 1 (4 bytes per float)
        lea     r12, [rdx-15]           ; chunk at rbx reads cols rbx-1..rbx+14
        lea     r13, [rdx-1]            ; last column is boundary
        pxor    xmm15, xmm15

.more_rows:
        mov     rbx, 1                  ; first column to process
        cmp     rbx, r12
        jg      .scalar_tail            ; row too narrow for a vector chunk

.more_cols:
        ; the tallies must start from zero for every chunk
        pxor    xmm9, xmm9
        pxor    xmm10, xmm10
        pxor    xmm11, xmm11
        pxor    xmm12, xmm12

        ; ---- row-1 : Gx weights -1 0 +1, Gy weights -1 -2 -1 ----
        movdqu  xmm0, [r8+rbx-1]        ; left neighbours
        movdqa  xmm1, xmm0
        movdqa  xmm2, xmm0
        psrldq  xmm1, 1                 ; centre pixels
        psrldq  xmm2, 2                 ; right neighbours
        movdqa  xmm3, xmm0
        movdqa  xmm4, xmm1
        movdqa  xmm5, xmm2
        punpcklbw xmm3, xmm15
        punpcklbw xmm4, xmm15
        punpcklbw xmm5, xmm15           ; low 8 values as words
        psubw   xmm11, xmm3
        paddw   xmm11, xmm5
        psubw   xmm9, xmm3
        psubw   xmm9, xmm4
        psubw   xmm9, xmm4
        psubw   xmm9, xmm5
        punpckhbw xmm0, xmm15
        punpckhbw xmm1, xmm15
        punpckhbw xmm2, xmm15           ; high 8 values (6 of them valid)
        psubw   xmm12, xmm0
        paddw   xmm12, xmm2
        psubw   xmm10, xmm0
        psubw   xmm10, xmm1
        psubw   xmm10, xmm1
        psubw   xmm10, xmm2

        ; ---- row : Gx weights -2 0 +2, no Gy contribution ----
        movdqu  xmm0, [r9+rbx-1]
        movdqa  xmm2, xmm0
        psrldq  xmm2, 2
        movdqa  xmm3, xmm0
        movdqa  xmm5, xmm2
        punpcklbw xmm3, xmm15
        punpcklbw xmm5, xmm15
        psubw   xmm11, xmm3
        psubw   xmm11, xmm3
        paddw   xmm11, xmm5
        paddw   xmm11, xmm5
        punpckhbw xmm0, xmm15
        punpckhbw xmm2, xmm15
        psubw   xmm12, xmm0
        psubw   xmm12, xmm0
        paddw   xmm12, xmm2
        paddw   xmm12, xmm2

        ; ---- row+1 : Gx weights -1 0 +1, Gy weights +1 +2 +1 ----
        movdqu  xmm0, [r10+rbx-1]
        movdqa  xmm1, xmm0
        movdqa  xmm2, xmm0
        psrldq  xmm1, 1
        psrldq  xmm2, 2
        movdqa  xmm3, xmm0
        movdqa  xmm4, xmm1
        movdqa  xmm5, xmm2
        punpcklbw xmm3, xmm15
        punpcklbw xmm4, xmm15
        punpcklbw xmm5, xmm15
        psubw   xmm11, xmm3
        paddw   xmm11, xmm5
        paddw   xmm9, xmm3
        paddw   xmm9, xmm4
        paddw   xmm9, xmm4
        paddw   xmm9, xmm5
        punpckhbw xmm0, xmm15
        punpckhbw xmm1, xmm15
        punpckhbw xmm2, xmm15
        psubw   xmm12, xmm0
        paddw   xmm12, xmm2
        paddw   xmm10, xmm0
        paddw   xmm10, xmm1
        paddw   xmm10, xmm1
        paddw   xmm10, xmm2

        ; ---- magnitude: interleave (Gx,Gy), pmaddwd gives Gx^2+Gy^2 ----
        movdqa  xmm1, xmm11
        punpcklwd xmm11, xmm9           ; columns 0-3
        punpckhwd xmm1, xmm9            ; columns 4-7
        movdqa  xmm3, xmm12
        punpcklwd xmm12, xmm10          ; columns 8-11
        punpckhwd xmm3, xmm10           ; columns 12-15 (only 12, 13 valid)
        pmaddwd xmm11, xmm11
        pmaddwd xmm1, xmm1
        pmaddwd xmm12, xmm12
        pmaddwd xmm3, xmm3
        cvtdq2ps xmm0, xmm11            ; sums are < 2^24, conversion is exact
        cvtdq2ps xmm1, xmm1
        cvtdq2ps xmm2, xmm12
        cvtdq2ps xmm3, xmm3
        sqrtps  xmm0, xmm0
        sqrtps  xmm1, xmm1
        sqrtps  xmm2, xmm2
        sqrtps  xmm3, xmm3
        movups  [r11+rbx*4], xmm0
        movups  [r11+rbx*4+16], xmm1
        movups  [r11+rbx*4+32], xmm2
        movlps  [r11+rbx*4+48], xmm3    ; only 2 floats: 14 values in total

        add     rbx, 14                 ; process 14 Sobel values
        cmp     rbx, r12
        jle     .more_cols

.scalar_tail:
        ; columns that do not fill a whole chunk are done one at a time so
        ; that no load or store goes past the end of the row
        cmp     rbx, r13
        jge     .row_done
.scalar_col:
        movzx   edi, byte [r8+rbx-1]    ; top left
        mov     ecx, edi
        neg     ecx                     ; Gx = -TL
        mov     esi, ecx                ; Gy = -TL
        movzx   edi, byte [r8+rbx]      ; top centre
        sub     esi, edi
        sub     esi, edi
        movzx   edi, byte [r8+rbx+1]    ; top right
        add     ecx, edi
        sub     esi, edi
        movzx   edi, byte [r9+rbx-1]    ; middle left
        sub     ecx, edi
        sub     ecx, edi
        movzx   edi, byte [r9+rbx+1]    ; middle right
        add     ecx, edi
        add     ecx, edi
        movzx   edi, byte [r10+rbx-1]   ; bottom left
        sub     ecx, edi
        add     esi, edi
        movzx   edi, byte [r10+rbx]     ; bottom centre
        add     esi, edi
        add     esi, edi
        movzx   edi, byte [r10+rbx+1]   ; bottom right
        add     ecx, edi
        add     esi, edi
        imul    ecx, ecx
        imul    esi, esi
        add     ecx, esi                ; Gx^2 + Gy^2
        cvtsi2ss xmm0, ecx
        sqrtss  xmm0, xmm0
        movss   [r11+rbx*4], xmm0
        add     rbx, 1
        cmp     rbx, r13
        jl      .scalar_col

.row_done:
        add     r8, rdx
        add     r9, rdx
        add     r10, rdx
        lea     r11, [r11+rdx*4]        ; next output row
        sub     rax, 1                  ; 1 fewer row to process
        jnz     .more_rows

        multipop rbx, r12, r13
.done:
        ret