x86-64 SIMD: Optimize argument collection

Expand collect_args/uncollect_args macros so that the number of
arguments can be specified.  This prevents unnecessary push and mov
instructions.

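NOTE: On Windows, the push/pop of xmm6 and xmm7 had to be moved to the
other end of the macro to ensure that rsp is aligned on a 16-byte
boundary when movaps saves and restores those registers, because the
number of 8-byte register pushes now varies with the argument count.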
This commit is contained in:
DRC
2016-05-29 10:51:16 -05:00
parent 6cb27c31d9
commit 16b121890c
17 changed files with 144 additions and 106 deletions

View File

@@ -27,11 +27,11 @@
; JDIMENSION output_row, int num_rows); ; JDIMENSION output_row, int num_rows);
; ;
; r10 = JDIMENSION img_width ; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf ; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf ; r12 = JSAMPIMAGE output_buf
; r13 = JDIMENSION output_row ; r13d = JDIMENSION output_row
; r14 = int num_rows ; r14d = int num_rows
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 8 %define WK_NUM 8
@@ -48,7 +48,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
mov [rsp], rax mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args 5
push rbx push rbx
mov ecx, r10d mov ecx, r10d
@@ -475,7 +475,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp

View File

@@ -27,11 +27,11 @@
; JDIMENSION output_row, int num_rows); ; JDIMENSION output_row, int num_rows);
; ;
; r10 = JDIMENSION img_width ; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf ; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf ; r12 = JSAMPIMAGE output_buf
; r13 = JDIMENSION output_row ; r13d = JDIMENSION output_row
; r14 = int num_rows ; r14d = int num_rows
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
@@ -48,7 +48,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
mov [rsp], rax mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args 5
push rbx push rbx
mov ecx, r10d mov ecx, r10d
@@ -354,7 +354,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp

View File

@@ -172,7 +172,7 @@ EXTN(jconst_huff_encode_one_block):
; r10 = working_state *state ; r10 = working_state *state
; r11 = JOCTET *buffer ; r11 = JOCTET *buffer
; r12 = JCOEFPTR block ; r12 = JCOEFPTR block
; r13 = int last_dc_val ; r13d = int last_dc_val
; r14 = c_derived_tbl *dctbl ; r14 = c_derived_tbl *dctbl
; r15 = c_derived_tbl *actbl ; r15 = c_derived_tbl *actbl
@@ -193,7 +193,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
mov [rsp], rax mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp mov rbp,rsp ; rbp = aligned rbp
lea rsp, [t2] lea rsp, [t2]
collect_args collect_args 6
%ifdef WIN64 %ifdef WIN64
movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8 movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9 movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
@@ -349,7 +349,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD] movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
add rsp, 4*SIZEOF_XMMWORD add rsp, 4*SIZEOF_XMMWORD
%endif %endif
uncollect_args uncollect_args 6
mov rsp, rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp

View File

@@ -32,10 +32,10 @@
; JSAMPARRAY input_data, JSAMPARRAY output_data); ; JSAMPARRAY input_data, JSAMPARRAY output_data);
; ;
; r10 = JDIMENSION image_width ; r10d = JDIMENSION image_width
; r11 = int max_v_samp_factor ; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor ; r12d = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks ; r13d = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data ; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data ; r15 = JSAMPARRAY output_data
@@ -46,7 +46,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
push rbp push rbp
mov rax, rsp mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args collect_args 6
mov ecx, r13d mov ecx, r13d
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
@@ -160,7 +160,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
jg near .rowloop jg near .rowloop
.return: .return:
uncollect_args uncollect_args 6
pop rbp pop rbp
ret ret
@@ -176,10 +176,10 @@ EXTN(jsimd_h2v1_downsample_sse2):
; JSAMPARRAY input_data, JSAMPARRAY output_data); ; JSAMPARRAY input_data, JSAMPARRAY output_data);
; ;
; r10 = JDIMENSION image_width ; r10d = JDIMENSION image_width
; r11 = int max_v_samp_factor ; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor ; r12d = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks ; r13d = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data ; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data ; r15 = JSAMPARRAY output_data
@@ -190,7 +190,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
push rbp push rbp
mov rax, rsp mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args collect_args 6
mov ecx, r13d mov ecx, r13d
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
@@ -320,7 +320,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
jg near .rowloop jg near .rowloop
.return: .return:
uncollect_args uncollect_args 6
pop rbp pop rbp
ret ret

View File

@@ -28,11 +28,11 @@
; JSAMPARRAY output_buf, int num_rows) ; JSAMPARRAY output_buf, int num_rows)
; ;
; r10 = JDIMENSION out_width ; r10d = JDIMENSION out_width
; r11 = JSAMPIMAGE input_buf ; r11 = JSAMPIMAGE input_buf
; r12 = JDIMENSION input_row ; r12d = JDIMENSION input_row
; r13 = JSAMPARRAY output_buf ; r13 = JSAMPARRAY output_buf
; r14 = int num_rows ; r14d = int num_rows
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
@@ -48,7 +48,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
mov [rsp], rax mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args 5
push rbx push rbx
mov ecx, r10d ; num_cols mov ecx, r10d ; num_cols
@@ -429,7 +429,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp

View File

@@ -29,9 +29,9 @@
; JSAMPARRAY output_buf); ; JSAMPARRAY output_buf);
; ;
; r10 = JDIMENSION output_width ; r10d = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf ; r11 = JSAMPIMAGE input_buf
; r12 = JDIMENSION in_row_group_ctr ; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf ; r13 = JSAMPARRAY output_buf
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
@@ -48,7 +48,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
mov [rsp], rax mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args 4
push rbx push rbx
mov ecx, r10d ; col mov ecx, r10d ; col
@@ -422,7 +422,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
@@ -439,9 +439,9 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
; JSAMPARRAY output_buf); ; JSAMPARRAY output_buf);
; ;
; r10 = JDIMENSION output_width ; r10d = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf ; r11 = JSAMPIMAGE input_buf
; r12 = JDIMENSION in_row_group_ctr ; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf ; r13 = JSAMPARRAY output_buf
align 16 align 16
@@ -451,7 +451,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
push rbp push rbp
mov rax, rsp mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args collect_args 4
push rbx push rbx
mov eax, r10d mov eax, r10d
@@ -528,7 +528,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
pop rdx pop rdx
pop rbx pop rbx
uncollect_args uncollect_args 4
pop rbp pop rbp
ret ret

View File

@@ -53,7 +53,7 @@ PW_EIGHT times 8 dw 8
; ;
; r10 = int max_v_samp_factor ; r10 = int max_v_samp_factor
; r11 = JDIMENSION downsampled_width ; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data ; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr ; r13 = JSAMPARRAY *output_data_ptr
@@ -64,7 +64,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
push rbp push rbp
mov rax, rsp mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args collect_args 4
mov eax, r11d ; colctr mov eax, r11d ; colctr
test rax, rax test rax, rax
@@ -175,7 +175,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
jg near .rowloop jg near .rowloop
.return: .return:
uncollect_args uncollect_args 4
pop rbp pop rbp
ret ret
@@ -192,7 +192,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
; ;
; r10 = int max_v_samp_factor ; r10 = int max_v_samp_factor
; r11 = JDIMENSION downsampled_width ; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data ; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr ; r13 = JSAMPARRAY *output_data_ptr
@@ -210,7 +210,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
mov [rsp], rax mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args 4
push rbx push rbx
mov eax, r11d ; colctr mov eax, r11d ; colctr
@@ -473,7 +473,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
@@ -492,7 +492,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
; ;
; r10 = int max_v_samp_factor ; r10 = int max_v_samp_factor
; r11 = JDIMENSION output_width ; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data ; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr ; r13 = JSAMPARRAY *output_data_ptr
@@ -503,7 +503,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
push rbp push rbp
mov rax, rsp mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args collect_args 4
mov edx, r11d mov edx, r11d
add rdx, byte (2*SIZEOF_XMMWORD)-1 add rdx, byte (2*SIZEOF_XMMWORD)-1
@@ -564,7 +564,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
jg short .rowloop jg short .rowloop
.return: .return:
uncollect_args uncollect_args 4
pop rbp pop rbp
ret ret
@@ -581,7 +581,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
; ;
; r10 = int max_v_samp_factor ; r10 = int max_v_samp_factor
; r11 = JDIMENSION output_width ; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data ; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr ; r13 = JSAMPARRAY *output_data_ptr
@@ -592,7 +592,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
push rbp push rbp
mov rax, rsp mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args collect_args 4
push rbx push rbx
mov edx, r11d mov edx, r11d
@@ -661,7 +661,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args uncollect_args 4
pop rbp pop rbp
ret ret

View File

@@ -74,7 +74,7 @@ EXTN(jsimd_fdct_float_sse):
mov [rsp], rax mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args 1
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
@@ -346,7 +346,7 @@ EXTN(jsimd_fdct_float_sse):
dec rcx dec rcx
jnz near .columnloop jnz near .columnloop
uncollect_args uncollect_args 1
mov rsp, rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp

View File

@@ -89,7 +89,7 @@ EXTN(jsimd_fdct_ifast_sse2):
mov [rsp], rax mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args 1
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
@@ -380,7 +380,7 @@ EXTN(jsimd_fdct_ifast_sse2):
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6 movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2 movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
uncollect_args uncollect_args 1
mov rsp, rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp

View File

@@ -110,7 +110,7 @@ EXTN(jsimd_fdct_islow_sse2):
mov [rsp], rax mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args 1
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
@@ -610,7 +610,7 @@ EXTN(jsimd_fdct_islow_sse2):
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
uncollect_args uncollect_args 1
mov rsp, rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp

View File

@@ -64,7 +64,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r10 = void *dct_table ; r10 = void *dct_table
; r11 = JCOEFPTR coef_block ; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf ; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col ; r13d = JDIMENSION output_col
%define original_rbp rbp+0 %define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
@@ -83,7 +83,7 @@ EXTN(jsimd_idct_float_sse2):
mov [rsp], rax mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [workspace] lea rsp, [workspace]
collect_args collect_args 4
push rbx push rbx
; ---- Pass 1: process columns from input, store into work array. ; ---- Pass 1: process columns from input, store into work array.
@@ -471,7 +471,7 @@ EXTN(jsimd_idct_float_sse2):
jnz near .rowloop jnz near .rowloop
pop rbx pop rbx
uncollect_args uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp

View File

@@ -85,7 +85,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r10 = jpeg_component_info *compptr ; r10 = jpeg_component_info *compptr
; r11 = JCOEFPTR coef_block ; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf ; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col ; r13d = JDIMENSION output_col
%define original_rbp rbp+0 %define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
@@ -102,7 +102,7 @@ EXTN(jsimd_idct_ifast_sse2):
mov [rsp], rax mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args 4
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
@@ -479,7 +479,7 @@ EXTN(jsimd_idct_ifast_sse2):
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
uncollect_args uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp

View File

@@ -98,7 +98,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r10 = jpeg_component_info *compptr ; r10 = jpeg_component_info *compptr
; r11 = JCOEFPTR coef_block ; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf ; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col ; r13d = JDIMENSION output_col
%define original_rbp rbp+0 %define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
@@ -115,7 +115,7 @@ EXTN(jsimd_idct_islow_sse2):
mov [rsp], rax mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args 4
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
@@ -836,7 +836,7 @@ EXTN(jsimd_idct_islow_sse2):
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
uncollect_args uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp

View File

@@ -106,7 +106,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r10 = void *dct_table ; r10 = void *dct_table
; r11 = JCOEFPTR coef_block ; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf ; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col ; r13d = JDIMENSION output_col
%define original_rbp rbp+0 %define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
@@ -123,7 +123,7 @@ EXTN(jsimd_idct_4x4_sse2):
mov [rsp], rax mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)] lea rsp, [wk(0)]
collect_args collect_args 4
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
@@ -389,7 +389,7 @@ EXTN(jsimd_idct_4x4_sse2):
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
uncollect_args uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp pop rsp ; rsp <- original rbp
pop rbp pop rbp
@@ -409,7 +409,7 @@ EXTN(jsimd_idct_4x4_sse2):
; r10 = void *dct_table ; r10 = void *dct_table
; r11 = JCOEFPTR coef_block ; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf ; r12 = JSAMPARRAY output_buf
; r13 = JDIMENSION output_col ; r13d = JDIMENSION output_col
align 16 align 16
global EXTN(jsimd_idct_2x2_sse2) global EXTN(jsimd_idct_2x2_sse2)
@@ -418,7 +418,7 @@ EXTN(jsimd_idct_2x2_sse2):
push rbp push rbp
mov rax, rsp mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args collect_args 4
push rbx push rbx
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
@@ -566,7 +566,7 @@ EXTN(jsimd_idct_2x2_sse2):
mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx
pop rbx pop rbx
uncollect_args uncollect_args 4
pop rbp pop rbp
ret ret

View File

@@ -31,7 +31,7 @@
; ;
; r10 = JSAMPARRAY sample_data ; r10 = JSAMPARRAY sample_data
; r11 = JDIMENSION start_col ; r11d = JDIMENSION start_col
; r12 = FAST_FLOAT *workspace ; r12 = FAST_FLOAT *workspace
align 16 align 16
@@ -41,7 +41,7 @@ EXTN(jsimd_convsamp_float_sse2):
push rbp push rbp
mov rax, rsp mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args collect_args 3
push rbx push rbx
pcmpeqw xmm7, xmm7 pcmpeqw xmm7, xmm7
@@ -90,7 +90,7 @@ EXTN(jsimd_convsamp_float_sse2):
jnz short .convloop jnz short .convloop
pop rbx pop rbx
uncollect_args uncollect_args 3
pop rbp pop rbp
ret ret
@@ -115,7 +115,7 @@ EXTN(jsimd_quantize_float_sse2):
push rbp push rbp
mov rax, rsp mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args collect_args 3
mov rsi, r12 mov rsi, r12
mov rdx, r11 mov rdx, r11
@@ -148,7 +148,7 @@ EXTN(jsimd_quantize_float_sse2):
dec rax dec rax
jnz short .quantloop jnz short .quantloop
uncollect_args uncollect_args 3
pop rbp pop rbp
ret ret

View File

@@ -31,7 +31,7 @@
; ;
; r10 = JSAMPARRAY sample_data ; r10 = JSAMPARRAY sample_data
; r11 = JDIMENSION start_col ; r11d = JDIMENSION start_col
; r12 = DCTELEM *workspace ; r12 = DCTELEM *workspace
align 16 align 16
@@ -41,7 +41,7 @@ EXTN(jsimd_convsamp_sse2):
push rbp push rbp
mov rax, rsp mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args collect_args 3
push rbx push rbx
pxor xmm6, xmm6 ; xmm6=(all 0's) pxor xmm6, xmm6 ; xmm6=(all 0's)
@@ -85,7 +85,7 @@ EXTN(jsimd_convsamp_sse2):
jnz short .convloop jnz short .convloop
pop rbx pop rbx
uncollect_args uncollect_args 3
pop rbp pop rbp
ret ret
@@ -117,7 +117,7 @@ EXTN(jsimd_quantize_sse2):
push rbp push rbp
mov rax, rsp mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args collect_args 3
mov rsi, r12 mov rsi, r12
mov rdx, r11 mov rdx, r11
@@ -177,7 +177,7 @@ EXTN(jsimd_quantize_sse2):
dec rax dec rax
jnz near .quantloop jnz near .quantloop
uncollect_args uncollect_args 3
pop rbp pop rbp
ret ret

View File

@@ -307,61 +307,99 @@ const_base:
%ifdef WIN64 %ifdef WIN64
%imacro collect_args 0 %imacro collect_args 1
push r12
push r13
push r14
push r15
mov r10, rcx
mov r11, rdx
mov r12, r8
mov r13, r9
mov r14, [rax+48]
mov r15, [rax+56]
push rsi
push rdi
sub rsp, SIZEOF_XMMWORD sub rsp, SIZEOF_XMMWORD
movaps XMMWORD [rsp], xmm6 movaps XMMWORD [rsp], xmm6
sub rsp, SIZEOF_XMMWORD sub rsp, SIZEOF_XMMWORD
movaps XMMWORD [rsp], xmm7 movaps XMMWORD [rsp], xmm7
mov r10, rcx
%if %1 > 1
mov r11, rdx
%endif
%if %1 > 2
push r12
mov r12, r8
%endif
%if %1 > 3
push r13
mov r13, r9
%endif
%if %1 > 4
push r14
mov r14, [rax+48]
%endif
%if %1 > 5
push r15
mov r15, [rax+56]
%endif
push rsi
push rdi
%endmacro %endmacro
%imacro uncollect_args 0 %imacro uncollect_args 1
pop rdi
pop rsi
%if %1 > 5
pop r15
%endif
%if %1 > 4
pop r14
%endif
%if %1 > 3
pop r13
%endif
%if %1 > 2
pop r12
%endif
movaps xmm7, XMMWORD [rsp] movaps xmm7, XMMWORD [rsp]
add rsp, SIZEOF_XMMWORD add rsp, SIZEOF_XMMWORD
movaps xmm6, XMMWORD [rsp] movaps xmm6, XMMWORD [rsp]
add rsp, SIZEOF_XMMWORD add rsp, SIZEOF_XMMWORD
pop rdi
pop rsi
pop r15
pop r14
pop r13
pop r12
%endmacro %endmacro
%else %else
%imacro collect_args 0 %imacro collect_args 1
push r10 push r10
push r11
push r12
push r13
push r14
push r15
mov r10, rdi mov r10, rdi
%if %1 > 1
push r11
mov r11, rsi mov r11, rsi
%endif
%if %1 > 2
push r12
mov r12, rdx mov r12, rdx
%endif
%if %1 > 3
push r13
mov r13, rcx mov r13, rcx
%endif
%if %1 > 4
push r14
mov r14, r8 mov r14, r8
%endif
%if %1 > 5
push r15
mov r15, r9 mov r15, r9
%endif
%endmacro %endmacro
%imacro uncollect_args 0 %imacro uncollect_args 1
%if %1 > 5
pop r15 pop r15
%endif
%if %1 > 4
pop r14 pop r14
%endif
%if %1 > 3
pop r13 pop r13
%endif
%if %1 > 2
pop r12 pop r12
%endif
%if %1 > 1
pop r11 pop r11
%endif
pop r10 pop r10
%endmacro %endmacro