x86-64 SIMD: Optimize argument collection
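Extend the collect_args/uncollect_args macros to take the number of arguments as a parameter, so that each function saves and loads only the argument registers it actually uses. This avoids unnecessary push and mov instructions. NOTE: On Windows, the save/restore of xmm6 and xmm7 had to be moved to the opposite end of each macro (it is now the first thing collect_args does and the last thing uncollect_args does), so that rsp is still aligned on a 16-byte boundary when movaps accesses the stack, now that the number of general-purpose registers pushed varies.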
This commit is contained in:
@@ -27,11 +27,11 @@
|
||||
; JDIMENSION output_row, int num_rows);
|
||||
;
|
||||
|
||||
; r10 = JDIMENSION img_width
|
||||
; r10d = JDIMENSION img_width
|
||||
; r11 = JSAMPARRAY input_buf
|
||||
; r12 = JSAMPIMAGE output_buf
|
||||
; r13 = JDIMENSION output_row
|
||||
; r14 = int num_rows
|
||||
; r13d = JDIMENSION output_row
|
||||
; r14d = int num_rows
|
||||
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 8
|
||||
@@ -48,7 +48,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args
|
||||
collect_args 5
|
||||
push rbx
|
||||
|
||||
mov ecx, r10d
|
||||
@@ -475,7 +475,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
||||
|
||||
.return:
|
||||
pop rbx
|
||||
uncollect_args
|
||||
uncollect_args 5
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
|
||||
@@ -27,11 +27,11 @@
|
||||
; JDIMENSION output_row, int num_rows);
|
||||
;
|
||||
|
||||
; r10 = JDIMENSION img_width
|
||||
; r10d = JDIMENSION img_width
|
||||
; r11 = JSAMPARRAY input_buf
|
||||
; r12 = JSAMPIMAGE output_buf
|
||||
; r13 = JDIMENSION output_row
|
||||
; r14 = int num_rows
|
||||
; r13d = JDIMENSION output_row
|
||||
; r14d = int num_rows
|
||||
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
@@ -48,7 +48,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args
|
||||
collect_args 5
|
||||
push rbx
|
||||
|
||||
mov ecx, r10d
|
||||
@@ -354,7 +354,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
||||
|
||||
.return:
|
||||
pop rbx
|
||||
uncollect_args
|
||||
uncollect_args 5
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
|
||||
@@ -172,7 +172,7 @@ EXTN(jconst_huff_encode_one_block):
|
||||
; r10 = working_state *state
|
||||
; r11 = JOCTET *buffer
|
||||
; r12 = JCOEFPTR block
|
||||
; r13 = int last_dc_val
|
||||
; r13d = int last_dc_val
|
||||
; r14 = c_derived_tbl *dctbl
|
||||
; r15 = c_derived_tbl *actbl
|
||||
|
||||
@@ -193,7 +193,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
||||
mov [rsp], rax
|
||||
mov rbp,rsp ; rbp = aligned rbp
|
||||
lea rsp, [t2]
|
||||
collect_args
|
||||
collect_args 6
|
||||
%ifdef WIN64
|
||||
movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
|
||||
movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
|
||||
@@ -349,7 +349,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
||||
movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
|
||||
add rsp, 4*SIZEOF_XMMWORD
|
||||
%endif
|
||||
uncollect_args
|
||||
uncollect_args 6
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
|
||||
@@ -32,10 +32,10 @@
|
||||
; JSAMPARRAY input_data, JSAMPARRAY output_data);
|
||||
;
|
||||
|
||||
; r10 = JDIMENSION image_width
|
||||
; r10d = JDIMENSION image_width
|
||||
; r11 = int max_v_samp_factor
|
||||
; r12 = JDIMENSION v_samp_factor
|
||||
; r13 = JDIMENSION width_blocks
|
||||
; r12d = JDIMENSION v_samp_factor
|
||||
; r13d = JDIMENSION width_blocks
|
||||
; r14 = JSAMPARRAY input_data
|
||||
; r15 = JSAMPARRAY output_data
|
||||
|
||||
@@ -46,7 +46,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args
|
||||
collect_args 6
|
||||
|
||||
mov ecx, r13d
|
||||
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
|
||||
@@ -160,7 +160,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
uncollect_args
|
||||
uncollect_args 6
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
@@ -176,10 +176,10 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
||||
; JSAMPARRAY input_data, JSAMPARRAY output_data);
|
||||
;
|
||||
|
||||
; r10 = JDIMENSION image_width
|
||||
; r10d = JDIMENSION image_width
|
||||
; r11 = int max_v_samp_factor
|
||||
; r12 = JDIMENSION v_samp_factor
|
||||
; r13 = JDIMENSION width_blocks
|
||||
; r12d = JDIMENSION v_samp_factor
|
||||
; r13d = JDIMENSION width_blocks
|
||||
; r14 = JSAMPARRAY input_data
|
||||
; r15 = JSAMPARRAY output_data
|
||||
|
||||
@@ -190,7 +190,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args
|
||||
collect_args 6
|
||||
|
||||
mov ecx, r13d
|
||||
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
|
||||
@@ -320,7 +320,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
uncollect_args
|
||||
uncollect_args 6
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -28,11 +28,11 @@
|
||||
; JSAMPARRAY output_buf, int num_rows)
|
||||
;
|
||||
|
||||
; r10 = JDIMENSION out_width
|
||||
; r10d = JDIMENSION out_width
|
||||
; r11 = JSAMPIMAGE input_buf
|
||||
; r12 = JDIMENSION input_row
|
||||
; r12d = JDIMENSION input_row
|
||||
; r13 = JSAMPARRAY output_buf
|
||||
; r14 = int num_rows
|
||||
; r14d = int num_rows
|
||||
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
@@ -48,7 +48,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args
|
||||
collect_args 5
|
||||
push rbx
|
||||
|
||||
mov ecx, r10d ; num_cols
|
||||
@@ -429,7 +429,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
|
||||
.return:
|
||||
pop rbx
|
||||
uncollect_args
|
||||
uncollect_args 5
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
|
||||
@@ -29,9 +29,9 @@
|
||||
; JSAMPARRAY output_buf);
|
||||
;
|
||||
|
||||
; r10 = JDIMENSION output_width
|
||||
; r10d = JDIMENSION output_width
|
||||
; r11 = JSAMPIMAGE input_buf
|
||||
; r12 = JDIMENSION in_row_group_ctr
|
||||
; r12d = JDIMENSION in_row_group_ctr
|
||||
; r13 = JSAMPARRAY output_buf
|
||||
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
@@ -48,7 +48,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
mov ecx, r10d ; col
|
||||
@@ -422,7 +422,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
|
||||
.return:
|
||||
pop rbx
|
||||
uncollect_args
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
@@ -439,9 +439,9 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
; JSAMPARRAY output_buf);
|
||||
;
|
||||
|
||||
; r10 = JDIMENSION output_width
|
||||
; r10d = JDIMENSION output_width
|
||||
; r11 = JSAMPIMAGE input_buf
|
||||
; r12 = JDIMENSION in_row_group_ctr
|
||||
; r12d = JDIMENSION in_row_group_ctr
|
||||
; r13 = JSAMPARRAY output_buf
|
||||
|
||||
align 16
|
||||
@@ -451,7 +451,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
mov eax, r10d
|
||||
@@ -528,7 +528,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
|
||||
pop rdx
|
||||
|
||||
pop rbx
|
||||
uncollect_args
|
||||
uncollect_args 4
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -53,7 +53,7 @@ PW_EIGHT times 8 dw 8
|
||||
;
|
||||
|
||||
; r10 = int max_v_samp_factor
|
||||
; r11 = JDIMENSION downsampled_width
|
||||
; r11d = JDIMENSION downsampled_width
|
||||
; r12 = JSAMPARRAY input_data
|
||||
; r13 = JSAMPARRAY *output_data_ptr
|
||||
|
||||
@@ -64,7 +64,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args
|
||||
collect_args 4
|
||||
|
||||
mov eax, r11d ; colctr
|
||||
test rax, rax
|
||||
@@ -175,7 +175,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
||||
jg near .rowloop
|
||||
|
||||
.return:
|
||||
uncollect_args
|
||||
uncollect_args 4
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
@@ -192,7 +192,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
||||
;
|
||||
|
||||
; r10 = int max_v_samp_factor
|
||||
; r11 = JDIMENSION downsampled_width
|
||||
; r11d = JDIMENSION downsampled_width
|
||||
; r12 = JSAMPARRAY input_data
|
||||
; r13 = JSAMPARRAY *output_data_ptr
|
||||
|
||||
@@ -210,7 +210,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
mov eax, r11d ; colctr
|
||||
@@ -473,7 +473,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
||||
|
||||
.return:
|
||||
pop rbx
|
||||
uncollect_args
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
@@ -492,7 +492,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
||||
;
|
||||
|
||||
; r10 = int max_v_samp_factor
|
||||
; r11 = JDIMENSION output_width
|
||||
; r11d = JDIMENSION output_width
|
||||
; r12 = JSAMPARRAY input_data
|
||||
; r13 = JSAMPARRAY *output_data_ptr
|
||||
|
||||
@@ -503,7 +503,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args
|
||||
collect_args 4
|
||||
|
||||
mov edx, r11d
|
||||
add rdx, byte (2*SIZEOF_XMMWORD)-1
|
||||
@@ -564,7 +564,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
||||
jg short .rowloop
|
||||
|
||||
.return:
|
||||
uncollect_args
|
||||
uncollect_args 4
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
@@ -581,7 +581,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
||||
;
|
||||
|
||||
; r10 = int max_v_samp_factor
|
||||
; r11 = JDIMENSION output_width
|
||||
; r11d = JDIMENSION output_width
|
||||
; r12 = JSAMPARRAY input_data
|
||||
; r13 = JSAMPARRAY *output_data_ptr
|
||||
|
||||
@@ -592,7 +592,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
mov edx, r11d
|
||||
@@ -661,7 +661,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
|
||||
|
||||
.return:
|
||||
pop rbx
|
||||
uncollect_args
|
||||
uncollect_args 4
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -74,7 +74,7 @@ EXTN(jsimd_fdct_float_sse):
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args
|
||||
collect_args 1
|
||||
|
||||
; ---- Pass 1: process rows.
|
||||
|
||||
@@ -346,7 +346,7 @@ EXTN(jsimd_fdct_float_sse):
|
||||
dec rcx
|
||||
jnz near .columnloop
|
||||
|
||||
uncollect_args
|
||||
uncollect_args 1
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
|
||||
@@ -89,7 +89,7 @@ EXTN(jsimd_fdct_ifast_sse2):
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args
|
||||
collect_args 1
|
||||
|
||||
; ---- Pass 1: process rows.
|
||||
|
||||
@@ -380,7 +380,7 @@ EXTN(jsimd_fdct_ifast_sse2):
|
||||
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
|
||||
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
|
||||
|
||||
uncollect_args
|
||||
uncollect_args 1
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
|
||||
@@ -110,7 +110,7 @@ EXTN(jsimd_fdct_islow_sse2):
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args
|
||||
collect_args 1
|
||||
|
||||
; ---- Pass 1: process rows.
|
||||
|
||||
@@ -610,7 +610,7 @@ EXTN(jsimd_fdct_islow_sse2):
|
||||
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
|
||||
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
|
||||
|
||||
uncollect_args
|
||||
uncollect_args 1
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
|
||||
@@ -64,7 +64,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
; r10 = void *dct_table
|
||||
; r11 = JCOEFPTR coef_block
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13 = JDIMENSION output_col
|
||||
; r13d = JDIMENSION output_col
|
||||
|
||||
%define original_rbp rbp+0
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
@@ -83,7 +83,7 @@ EXTN(jsimd_idct_float_sse2):
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [workspace]
|
||||
collect_args
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
; ---- Pass 1: process columns from input, store into work array.
|
||||
@@ -471,7 +471,7 @@ EXTN(jsimd_idct_float_sse2):
|
||||
jnz near .rowloop
|
||||
|
||||
pop rbx
|
||||
uncollect_args
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
|
||||
@@ -85,7 +85,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
; r10 = jpeg_component_info *compptr
|
||||
; r11 = JCOEFPTR coef_block
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13 = JDIMENSION output_col
|
||||
; r13d = JDIMENSION output_col
|
||||
|
||||
%define original_rbp rbp+0
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
@@ -102,7 +102,7 @@ EXTN(jsimd_idct_ifast_sse2):
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args
|
||||
collect_args 4
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
|
||||
@@ -479,7 +479,7 @@ EXTN(jsimd_idct_ifast_sse2):
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
|
||||
|
||||
uncollect_args
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
|
||||
@@ -98,7 +98,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
; r10 = jpeg_component_info *compptr
|
||||
; r11 = JCOEFPTR coef_block
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13 = JDIMENSION output_col
|
||||
; r13d = JDIMENSION output_col
|
||||
|
||||
%define original_rbp rbp+0
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
@@ -115,7 +115,7 @@ EXTN(jsimd_idct_islow_sse2):
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args
|
||||
collect_args 4
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
|
||||
@@ -836,7 +836,7 @@ EXTN(jsimd_idct_islow_sse2):
|
||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
|
||||
|
||||
uncollect_args
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
|
||||
@@ -106,7 +106,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
; r10 = void *dct_table
|
||||
; r11 = JCOEFPTR coef_block
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13 = JDIMENSION output_col
|
||||
; r13d = JDIMENSION output_col
|
||||
|
||||
%define original_rbp rbp+0
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
@@ -123,7 +123,7 @@ EXTN(jsimd_idct_4x4_sse2):
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args
|
||||
collect_args 4
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
|
||||
@@ -389,7 +389,7 @@ EXTN(jsimd_idct_4x4_sse2):
|
||||
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
|
||||
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
|
||||
|
||||
uncollect_args
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
@@ -409,7 +409,7 @@ EXTN(jsimd_idct_4x4_sse2):
|
||||
; r10 = void *dct_table
|
||||
; r11 = JCOEFPTR coef_block
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13 = JDIMENSION output_col
|
||||
; r13d = JDIMENSION output_col
|
||||
|
||||
align 16
|
||||
global EXTN(jsimd_idct_2x2_sse2)
|
||||
@@ -418,7 +418,7 @@ EXTN(jsimd_idct_2x2_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
@@ -566,7 +566,7 @@ EXTN(jsimd_idct_2x2_sse2):
|
||||
mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx
|
||||
|
||||
pop rbx
|
||||
uncollect_args
|
||||
uncollect_args 4
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@
|
||||
;
|
||||
|
||||
; r10 = JSAMPARRAY sample_data
|
||||
; r11 = JDIMENSION start_col
|
||||
; r11d = JDIMENSION start_col
|
||||
; r12 = FAST_FLOAT *workspace
|
||||
|
||||
align 16
|
||||
@@ -41,7 +41,7 @@ EXTN(jsimd_convsamp_float_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args
|
||||
collect_args 3
|
||||
push rbx
|
||||
|
||||
pcmpeqw xmm7, xmm7
|
||||
@@ -90,7 +90,7 @@ EXTN(jsimd_convsamp_float_sse2):
|
||||
jnz short .convloop
|
||||
|
||||
pop rbx
|
||||
uncollect_args
|
||||
uncollect_args 3
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
@@ -115,7 +115,7 @@ EXTN(jsimd_quantize_float_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args
|
||||
collect_args 3
|
||||
|
||||
mov rsi, r12
|
||||
mov rdx, r11
|
||||
@@ -148,7 +148,7 @@ EXTN(jsimd_quantize_float_sse2):
|
||||
dec rax
|
||||
jnz short .quantloop
|
||||
|
||||
uncollect_args
|
||||
uncollect_args 3
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@
|
||||
;
|
||||
|
||||
; r10 = JSAMPARRAY sample_data
|
||||
; r11 = JDIMENSION start_col
|
||||
; r11d = JDIMENSION start_col
|
||||
; r12 = DCTELEM *workspace
|
||||
|
||||
align 16
|
||||
@@ -41,7 +41,7 @@ EXTN(jsimd_convsamp_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args
|
||||
collect_args 3
|
||||
push rbx
|
||||
|
||||
pxor xmm6, xmm6 ; xmm6=(all 0's)
|
||||
@@ -85,7 +85,7 @@ EXTN(jsimd_convsamp_sse2):
|
||||
jnz short .convloop
|
||||
|
||||
pop rbx
|
||||
uncollect_args
|
||||
uncollect_args 3
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
@@ -117,7 +117,7 @@ EXTN(jsimd_quantize_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args
|
||||
collect_args 3
|
||||
|
||||
mov rsi, r12
|
||||
mov rdx, r11
|
||||
@@ -177,7 +177,7 @@ EXTN(jsimd_quantize_sse2):
|
||||
dec rax
|
||||
jnz near .quantloop
|
||||
|
||||
uncollect_args
|
||||
uncollect_args 3
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -307,61 +307,99 @@ const_base:
|
||||
|
||||
%ifdef WIN64
|
||||
|
||||
%imacro collect_args 0
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
mov r10, rcx
|
||||
mov r11, rdx
|
||||
mov r12, r8
|
||||
mov r13, r9
|
||||
mov r14, [rax+48]
|
||||
mov r15, [rax+56]
|
||||
push rsi
|
||||
push rdi
|
||||
%imacro collect_args 1
|
||||
sub rsp, SIZEOF_XMMWORD
|
||||
movaps XMMWORD [rsp], xmm6
|
||||
sub rsp, SIZEOF_XMMWORD
|
||||
movaps XMMWORD [rsp], xmm7
|
||||
mov r10, rcx
|
||||
%if %1 > 1
|
||||
mov r11, rdx
|
||||
%endif
|
||||
%if %1 > 2
|
||||
push r12
|
||||
mov r12, r8
|
||||
%endif
|
||||
%if %1 > 3
|
||||
push r13
|
||||
mov r13, r9
|
||||
%endif
|
||||
%if %1 > 4
|
||||
push r14
|
||||
mov r14, [rax+48]
|
||||
%endif
|
||||
%if %1 > 5
|
||||
push r15
|
||||
mov r15, [rax+56]
|
||||
%endif
|
||||
push rsi
|
||||
push rdi
|
||||
%endmacro
|
||||
|
||||
%imacro uncollect_args 0
|
||||
%imacro uncollect_args 1
|
||||
pop rdi
|
||||
pop rsi
|
||||
%if %1 > 5
|
||||
pop r15
|
||||
%endif
|
||||
%if %1 > 4
|
||||
pop r14
|
||||
%endif
|
||||
%if %1 > 3
|
||||
pop r13
|
||||
%endif
|
||||
%if %1 > 2
|
||||
pop r12
|
||||
%endif
|
||||
movaps xmm7, XMMWORD [rsp]
|
||||
add rsp, SIZEOF_XMMWORD
|
||||
movaps xmm6, XMMWORD [rsp]
|
||||
add rsp, SIZEOF_XMMWORD
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
%endmacro
|
||||
|
||||
%else
|
||||
|
||||
%imacro collect_args 0
|
||||
%imacro collect_args 1
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
mov r10, rdi
|
||||
%if %1 > 1
|
||||
push r11
|
||||
mov r11, rsi
|
||||
%endif
|
||||
%if %1 > 2
|
||||
push r12
|
||||
mov r12, rdx
|
||||
%endif
|
||||
%if %1 > 3
|
||||
push r13
|
||||
mov r13, rcx
|
||||
%endif
|
||||
%if %1 > 4
|
||||
push r14
|
||||
mov r14, r8
|
||||
%endif
|
||||
%if %1 > 5
|
||||
push r15
|
||||
mov r15, r9
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%imacro uncollect_args 0
|
||||
%imacro uncollect_args 1
|
||||
%if %1 > 5
|
||||
pop r15
|
||||
%endif
|
||||
%if %1 > 4
|
||||
pop r14
|
||||
%endif
|
||||
%if %1 > 3
|
||||
pop r13
|
||||
%endif
|
||||
%if %1 > 2
|
||||
pop r12
|
||||
%endif
|
||||
%if %1 > 1
|
||||
pop r11
|
||||
%endif
|
||||
pop r10
|
||||
%endmacro
|
||||
|
||||
|
||||
Reference in New Issue
Block a user