x86-64 SIMD: Optimize argument collection
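Extend the collect_args/uncollect_args macros to take the number of arguments as a parameter, so that each function saves and loads only the argument registers it actually uses. This prevents unnecessary push and mov instructions. NOTE: On Windows, the save/restore of xmm6 and xmm7 had to be moved to the other end of each macro (xmm6/xmm7 are now saved first in collect_args, before the variable number of pushes, and restored last in uncollect_args) to ensure that rsp is aligned on a 16-byte boundary when the movaps instructions access the stack.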
This commit is contained in:
@@ -27,11 +27,11 @@
|
|||||||
; JDIMENSION output_row, int num_rows);
|
; JDIMENSION output_row, int num_rows);
|
||||||
;
|
;
|
||||||
|
|
||||||
; r10 = JDIMENSION img_width
|
; r10d = JDIMENSION img_width
|
||||||
; r11 = JSAMPARRAY input_buf
|
; r11 = JSAMPARRAY input_buf
|
||||||
; r12 = JSAMPIMAGE output_buf
|
; r12 = JSAMPIMAGE output_buf
|
||||||
; r13 = JDIMENSION output_row
|
; r13d = JDIMENSION output_row
|
||||||
; r14 = int num_rows
|
; r14d = int num_rows
|
||||||
|
|
||||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||||
%define WK_NUM 8
|
%define WK_NUM 8
|
||||||
@@ -48,7 +48,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
mov [rsp], rax
|
mov [rsp], rax
|
||||||
mov rbp, rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args 5
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
mov ecx, r10d
|
mov ecx, r10d
|
||||||
@@ -475,7 +475,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
|||||||
|
|
||||||
.return:
|
.return:
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args 5
|
||||||
mov rsp, rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
|
|||||||
@@ -27,11 +27,11 @@
|
|||||||
; JDIMENSION output_row, int num_rows);
|
; JDIMENSION output_row, int num_rows);
|
||||||
;
|
;
|
||||||
|
|
||||||
; r10 = JDIMENSION img_width
|
; r10d = JDIMENSION img_width
|
||||||
; r11 = JSAMPARRAY input_buf
|
; r11 = JSAMPARRAY input_buf
|
||||||
; r12 = JSAMPIMAGE output_buf
|
; r12 = JSAMPIMAGE output_buf
|
||||||
; r13 = JDIMENSION output_row
|
; r13d = JDIMENSION output_row
|
||||||
; r14 = int num_rows
|
; r14d = int num_rows
|
||||||
|
|
||||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||||
%define WK_NUM 2
|
%define WK_NUM 2
|
||||||
@@ -48,7 +48,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
mov [rsp], rax
|
mov [rsp], rax
|
||||||
mov rbp, rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args 5
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
mov ecx, r10d
|
mov ecx, r10d
|
||||||
@@ -354,7 +354,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
|||||||
|
|
||||||
.return:
|
.return:
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args 5
|
||||||
mov rsp, rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
|
|||||||
@@ -172,7 +172,7 @@ EXTN(jconst_huff_encode_one_block):
|
|||||||
; r10 = working_state *state
|
; r10 = working_state *state
|
||||||
; r11 = JOCTET *buffer
|
; r11 = JOCTET *buffer
|
||||||
; r12 = JCOEFPTR block
|
; r12 = JCOEFPTR block
|
||||||
; r13 = int last_dc_val
|
; r13d = int last_dc_val
|
||||||
; r14 = c_derived_tbl *dctbl
|
; r14 = c_derived_tbl *dctbl
|
||||||
; r15 = c_derived_tbl *actbl
|
; r15 = c_derived_tbl *actbl
|
||||||
|
|
||||||
@@ -193,7 +193,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
|||||||
mov [rsp], rax
|
mov [rsp], rax
|
||||||
mov rbp,rsp ; rbp = aligned rbp
|
mov rbp,rsp ; rbp = aligned rbp
|
||||||
lea rsp, [t2]
|
lea rsp, [t2]
|
||||||
collect_args
|
collect_args 6
|
||||||
%ifdef WIN64
|
%ifdef WIN64
|
||||||
movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
|
movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
|
||||||
movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
|
movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
|
||||||
@@ -349,7 +349,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
|||||||
movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
|
movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
|
||||||
add rsp, 4*SIZEOF_XMMWORD
|
add rsp, 4*SIZEOF_XMMWORD
|
||||||
%endif
|
%endif
|
||||||
uncollect_args
|
uncollect_args 6
|
||||||
mov rsp, rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
|
|||||||
@@ -32,10 +32,10 @@
|
|||||||
; JSAMPARRAY input_data, JSAMPARRAY output_data);
|
; JSAMPARRAY input_data, JSAMPARRAY output_data);
|
||||||
;
|
;
|
||||||
|
|
||||||
; r10 = JDIMENSION image_width
|
; r10d = JDIMENSION image_width
|
||||||
; r11 = int max_v_samp_factor
|
; r11 = int max_v_samp_factor
|
||||||
; r12 = JDIMENSION v_samp_factor
|
; r12d = JDIMENSION v_samp_factor
|
||||||
; r13 = JDIMENSION width_blocks
|
; r13d = JDIMENSION width_blocks
|
||||||
; r14 = JSAMPARRAY input_data
|
; r14 = JSAMPARRAY input_data
|
||||||
; r15 = JSAMPARRAY output_data
|
; r15 = JSAMPARRAY output_data
|
||||||
|
|
||||||
@@ -46,7 +46,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
push rbp
|
push rbp
|
||||||
mov rax, rsp
|
mov rax, rsp
|
||||||
mov rbp, rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args 6
|
||||||
|
|
||||||
mov ecx, r13d
|
mov ecx, r13d
|
||||||
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
|
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
|
||||||
@@ -160,7 +160,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
jg near .rowloop
|
jg near .rowloop
|
||||||
|
|
||||||
.return:
|
.return:
|
||||||
uncollect_args
|
uncollect_args 6
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
@@ -176,10 +176,10 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
|||||||
; JSAMPARRAY input_data, JSAMPARRAY output_data);
|
; JSAMPARRAY input_data, JSAMPARRAY output_data);
|
||||||
;
|
;
|
||||||
|
|
||||||
; r10 = JDIMENSION image_width
|
; r10d = JDIMENSION image_width
|
||||||
; r11 = int max_v_samp_factor
|
; r11 = int max_v_samp_factor
|
||||||
; r12 = JDIMENSION v_samp_factor
|
; r12d = JDIMENSION v_samp_factor
|
||||||
; r13 = JDIMENSION width_blocks
|
; r13d = JDIMENSION width_blocks
|
||||||
; r14 = JSAMPARRAY input_data
|
; r14 = JSAMPARRAY input_data
|
||||||
; r15 = JSAMPARRAY output_data
|
; r15 = JSAMPARRAY output_data
|
||||||
|
|
||||||
@@ -190,7 +190,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
|||||||
push rbp
|
push rbp
|
||||||
mov rax, rsp
|
mov rax, rsp
|
||||||
mov rbp, rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args 6
|
||||||
|
|
||||||
mov ecx, r13d
|
mov ecx, r13d
|
||||||
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
|
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
|
||||||
@@ -320,7 +320,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
|
|||||||
jg near .rowloop
|
jg near .rowloop
|
||||||
|
|
||||||
.return:
|
.return:
|
||||||
uncollect_args
|
uncollect_args 6
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
|||||||
@@ -28,11 +28,11 @@
|
|||||||
; JSAMPARRAY output_buf, int num_rows)
|
; JSAMPARRAY output_buf, int num_rows)
|
||||||
;
|
;
|
||||||
|
|
||||||
; r10 = JDIMENSION out_width
|
; r10d = JDIMENSION out_width
|
||||||
; r11 = JSAMPIMAGE input_buf
|
; r11 = JSAMPIMAGE input_buf
|
||||||
; r12 = JDIMENSION input_row
|
; r12d = JDIMENSION input_row
|
||||||
; r13 = JSAMPARRAY output_buf
|
; r13 = JSAMPARRAY output_buf
|
||||||
; r14 = int num_rows
|
; r14d = int num_rows
|
||||||
|
|
||||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||||
%define WK_NUM 2
|
%define WK_NUM 2
|
||||||
@@ -48,7 +48,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
mov [rsp], rax
|
mov [rsp], rax
|
||||||
mov rbp, rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args 5
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
mov ecx, r10d ; num_cols
|
mov ecx, r10d ; num_cols
|
||||||
@@ -429,7 +429,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
|||||||
|
|
||||||
.return:
|
.return:
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args 5
|
||||||
mov rsp, rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
|
|||||||
@@ -29,9 +29,9 @@
|
|||||||
; JSAMPARRAY output_buf);
|
; JSAMPARRAY output_buf);
|
||||||
;
|
;
|
||||||
|
|
||||||
; r10 = JDIMENSION output_width
|
; r10d = JDIMENSION output_width
|
||||||
; r11 = JSAMPIMAGE input_buf
|
; r11 = JSAMPIMAGE input_buf
|
||||||
; r12 = JDIMENSION in_row_group_ctr
|
; r12d = JDIMENSION in_row_group_ctr
|
||||||
; r13 = JSAMPARRAY output_buf
|
; r13 = JSAMPARRAY output_buf
|
||||||
|
|
||||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||||
@@ -48,7 +48,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
mov [rsp], rax
|
mov [rsp], rax
|
||||||
mov rbp, rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args 4
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
mov ecx, r10d ; col
|
mov ecx, r10d ; col
|
||||||
@@ -422,7 +422,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
|
|
||||||
.return:
|
.return:
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args 4
|
||||||
mov rsp, rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
@@ -439,9 +439,9 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
|||||||
; JSAMPARRAY output_buf);
|
; JSAMPARRAY output_buf);
|
||||||
;
|
;
|
||||||
|
|
||||||
; r10 = JDIMENSION output_width
|
; r10d = JDIMENSION output_width
|
||||||
; r11 = JSAMPIMAGE input_buf
|
; r11 = JSAMPIMAGE input_buf
|
||||||
; r12 = JDIMENSION in_row_group_ctr
|
; r12d = JDIMENSION in_row_group_ctr
|
||||||
; r13 = JSAMPARRAY output_buf
|
; r13 = JSAMPARRAY output_buf
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
@@ -451,7 +451,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
|
|||||||
push rbp
|
push rbp
|
||||||
mov rax, rsp
|
mov rax, rsp
|
||||||
mov rbp, rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args 4
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
mov eax, r10d
|
mov eax, r10d
|
||||||
@@ -528,7 +528,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
|
|||||||
pop rdx
|
pop rdx
|
||||||
|
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args 4
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ PW_EIGHT times 8 dw 8
|
|||||||
;
|
;
|
||||||
|
|
||||||
; r10 = int max_v_samp_factor
|
; r10 = int max_v_samp_factor
|
||||||
; r11 = JDIMENSION downsampled_width
|
; r11d = JDIMENSION downsampled_width
|
||||||
; r12 = JSAMPARRAY input_data
|
; r12 = JSAMPARRAY input_data
|
||||||
; r13 = JSAMPARRAY *output_data_ptr
|
; r13 = JSAMPARRAY *output_data_ptr
|
||||||
|
|
||||||
@@ -64,7 +64,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
|||||||
push rbp
|
push rbp
|
||||||
mov rax, rsp
|
mov rax, rsp
|
||||||
mov rbp, rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args 4
|
||||||
|
|
||||||
mov eax, r11d ; colctr
|
mov eax, r11d ; colctr
|
||||||
test rax, rax
|
test rax, rax
|
||||||
@@ -175,7 +175,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
|||||||
jg near .rowloop
|
jg near .rowloop
|
||||||
|
|
||||||
.return:
|
.return:
|
||||||
uncollect_args
|
uncollect_args 4
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
@@ -192,7 +192,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
|||||||
;
|
;
|
||||||
|
|
||||||
; r10 = int max_v_samp_factor
|
; r10 = int max_v_samp_factor
|
||||||
; r11 = JDIMENSION downsampled_width
|
; r11d = JDIMENSION downsampled_width
|
||||||
; r12 = JSAMPARRAY input_data
|
; r12 = JSAMPARRAY input_data
|
||||||
; r13 = JSAMPARRAY *output_data_ptr
|
; r13 = JSAMPARRAY *output_data_ptr
|
||||||
|
|
||||||
@@ -210,7 +210,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
mov [rsp], rax
|
mov [rsp], rax
|
||||||
mov rbp, rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args 4
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
mov eax, r11d ; colctr
|
mov eax, r11d ; colctr
|
||||||
@@ -473,7 +473,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
|
|
||||||
.return:
|
.return:
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args 4
|
||||||
mov rsp, rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
@@ -492,7 +492,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
|||||||
;
|
;
|
||||||
|
|
||||||
; r10 = int max_v_samp_factor
|
; r10 = int max_v_samp_factor
|
||||||
; r11 = JDIMENSION output_width
|
; r11d = JDIMENSION output_width
|
||||||
; r12 = JSAMPARRAY input_data
|
; r12 = JSAMPARRAY input_data
|
||||||
; r13 = JSAMPARRAY *output_data_ptr
|
; r13 = JSAMPARRAY *output_data_ptr
|
||||||
|
|
||||||
@@ -503,7 +503,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
|||||||
push rbp
|
push rbp
|
||||||
mov rax, rsp
|
mov rax, rsp
|
||||||
mov rbp, rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args 4
|
||||||
|
|
||||||
mov edx, r11d
|
mov edx, r11d
|
||||||
add rdx, byte (2*SIZEOF_XMMWORD)-1
|
add rdx, byte (2*SIZEOF_XMMWORD)-1
|
||||||
@@ -564,7 +564,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
|||||||
jg short .rowloop
|
jg short .rowloop
|
||||||
|
|
||||||
.return:
|
.return:
|
||||||
uncollect_args
|
uncollect_args 4
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
@@ -581,7 +581,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
|||||||
;
|
;
|
||||||
|
|
||||||
; r10 = int max_v_samp_factor
|
; r10 = int max_v_samp_factor
|
||||||
; r11 = JDIMENSION output_width
|
; r11d = JDIMENSION output_width
|
||||||
; r12 = JSAMPARRAY input_data
|
; r12 = JSAMPARRAY input_data
|
||||||
; r13 = JSAMPARRAY *output_data_ptr
|
; r13 = JSAMPARRAY *output_data_ptr
|
||||||
|
|
||||||
@@ -592,7 +592,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
|
|||||||
push rbp
|
push rbp
|
||||||
mov rax, rsp
|
mov rax, rsp
|
||||||
mov rbp, rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args 4
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
mov edx, r11d
|
mov edx, r11d
|
||||||
@@ -661,7 +661,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
|
|||||||
|
|
||||||
.return:
|
.return:
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args 4
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
mov [rsp], rax
|
mov [rsp], rax
|
||||||
mov rbp, rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args 1
|
||||||
|
|
||||||
; ---- Pass 1: process rows.
|
; ---- Pass 1: process rows.
|
||||||
|
|
||||||
@@ -346,7 +346,7 @@ EXTN(jsimd_fdct_float_sse):
|
|||||||
dec rcx
|
dec rcx
|
||||||
jnz near .columnloop
|
jnz near .columnloop
|
||||||
|
|
||||||
uncollect_args
|
uncollect_args 1
|
||||||
mov rsp, rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
|
|||||||
@@ -89,7 +89,7 @@ EXTN(jsimd_fdct_ifast_sse2):
|
|||||||
mov [rsp], rax
|
mov [rsp], rax
|
||||||
mov rbp, rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args 1
|
||||||
|
|
||||||
; ---- Pass 1: process rows.
|
; ---- Pass 1: process rows.
|
||||||
|
|
||||||
@@ -380,7 +380,7 @@ EXTN(jsimd_fdct_ifast_sse2):
|
|||||||
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
|
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
|
||||||
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
|
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
|
||||||
|
|
||||||
uncollect_args
|
uncollect_args 1
|
||||||
mov rsp, rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
|
|||||||
@@ -110,7 +110,7 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
mov [rsp], rax
|
mov [rsp], rax
|
||||||
mov rbp, rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args 1
|
||||||
|
|
||||||
; ---- Pass 1: process rows.
|
; ---- Pass 1: process rows.
|
||||||
|
|
||||||
@@ -610,7 +610,7 @@ EXTN(jsimd_fdct_islow_sse2):
|
|||||||
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
|
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
|
||||||
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
|
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
|
||||||
|
|
||||||
uncollect_args
|
uncollect_args 1
|
||||||
mov rsp, rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
|||||||
; r10 = void *dct_table
|
; r10 = void *dct_table
|
||||||
; r11 = JCOEFPTR coef_block
|
; r11 = JCOEFPTR coef_block
|
||||||
; r12 = JSAMPARRAY output_buf
|
; r12 = JSAMPARRAY output_buf
|
||||||
; r13 = JDIMENSION output_col
|
; r13d = JDIMENSION output_col
|
||||||
|
|
||||||
%define original_rbp rbp+0
|
%define original_rbp rbp+0
|
||||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||||
@@ -83,7 +83,7 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
mov [rsp], rax
|
mov [rsp], rax
|
||||||
mov rbp, rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [workspace]
|
lea rsp, [workspace]
|
||||||
collect_args
|
collect_args 4
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
; ---- Pass 1: process columns from input, store into work array.
|
; ---- Pass 1: process columns from input, store into work array.
|
||||||
@@ -471,7 +471,7 @@ EXTN(jsimd_idct_float_sse2):
|
|||||||
jnz near .rowloop
|
jnz near .rowloop
|
||||||
|
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args 4
|
||||||
mov rsp, rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
|
|||||||
@@ -85,7 +85,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
|||||||
; r10 = jpeg_component_info *compptr
|
; r10 = jpeg_component_info *compptr
|
||||||
; r11 = JCOEFPTR coef_block
|
; r11 = JCOEFPTR coef_block
|
||||||
; r12 = JSAMPARRAY output_buf
|
; r12 = JSAMPARRAY output_buf
|
||||||
; r13 = JDIMENSION output_col
|
; r13d = JDIMENSION output_col
|
||||||
|
|
||||||
%define original_rbp rbp+0
|
%define original_rbp rbp+0
|
||||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||||
@@ -102,7 +102,7 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
mov [rsp], rax
|
mov [rsp], rax
|
||||||
mov rbp, rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args 4
|
||||||
|
|
||||||
; ---- Pass 1: process columns from input.
|
; ---- Pass 1: process columns from input.
|
||||||
|
|
||||||
@@ -479,7 +479,7 @@ EXTN(jsimd_idct_ifast_sse2):
|
|||||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
|
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
|
||||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
|
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
|
||||||
|
|
||||||
uncollect_args
|
uncollect_args 4
|
||||||
mov rsp, rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
|||||||
; r10 = jpeg_component_info *compptr
|
; r10 = jpeg_component_info *compptr
|
||||||
; r11 = JCOEFPTR coef_block
|
; r11 = JCOEFPTR coef_block
|
||||||
; r12 = JSAMPARRAY output_buf
|
; r12 = JSAMPARRAY output_buf
|
||||||
; r13 = JDIMENSION output_col
|
; r13d = JDIMENSION output_col
|
||||||
|
|
||||||
%define original_rbp rbp+0
|
%define original_rbp rbp+0
|
||||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||||
@@ -115,7 +115,7 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
mov [rsp], rax
|
mov [rsp], rax
|
||||||
mov rbp, rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args 4
|
||||||
|
|
||||||
; ---- Pass 1: process columns from input.
|
; ---- Pass 1: process columns from input.
|
||||||
|
|
||||||
@@ -836,7 +836,7 @@ EXTN(jsimd_idct_islow_sse2):
|
|||||||
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
|
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
|
||||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
|
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
|
||||||
|
|
||||||
uncollect_args
|
uncollect_args 4
|
||||||
mov rsp, rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
|
|||||||
@@ -106,7 +106,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
|||||||
; r10 = void *dct_table
|
; r10 = void *dct_table
|
||||||
; r11 = JCOEFPTR coef_block
|
; r11 = JCOEFPTR coef_block
|
||||||
; r12 = JSAMPARRAY output_buf
|
; r12 = JSAMPARRAY output_buf
|
||||||
; r13 = JDIMENSION output_col
|
; r13d = JDIMENSION output_col
|
||||||
|
|
||||||
%define original_rbp rbp+0
|
%define original_rbp rbp+0
|
||||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||||
@@ -123,7 +123,7 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
mov [rsp], rax
|
mov [rsp], rax
|
||||||
mov rbp, rsp ; rbp = aligned rbp
|
mov rbp, rsp ; rbp = aligned rbp
|
||||||
lea rsp, [wk(0)]
|
lea rsp, [wk(0)]
|
||||||
collect_args
|
collect_args 4
|
||||||
|
|
||||||
; ---- Pass 1: process columns from input.
|
; ---- Pass 1: process columns from input.
|
||||||
|
|
||||||
@@ -389,7 +389,7 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
|
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
|
||||||
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
|
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
|
||||||
|
|
||||||
uncollect_args
|
uncollect_args 4
|
||||||
mov rsp, rbp ; rsp <- aligned rbp
|
mov rsp, rbp ; rsp <- aligned rbp
|
||||||
pop rsp ; rsp <- original rbp
|
pop rsp ; rsp <- original rbp
|
||||||
pop rbp
|
pop rbp
|
||||||
@@ -409,7 +409,7 @@ EXTN(jsimd_idct_4x4_sse2):
|
|||||||
; r10 = void *dct_table
|
; r10 = void *dct_table
|
||||||
; r11 = JCOEFPTR coef_block
|
; r11 = JCOEFPTR coef_block
|
||||||
; r12 = JSAMPARRAY output_buf
|
; r12 = JSAMPARRAY output_buf
|
||||||
; r13 = JDIMENSION output_col
|
; r13d = JDIMENSION output_col
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
global EXTN(jsimd_idct_2x2_sse2)
|
global EXTN(jsimd_idct_2x2_sse2)
|
||||||
@@ -418,7 +418,7 @@ EXTN(jsimd_idct_2x2_sse2):
|
|||||||
push rbp
|
push rbp
|
||||||
mov rax, rsp
|
mov rax, rsp
|
||||||
mov rbp, rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args 4
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
; ---- Pass 1: process columns from input.
|
; ---- Pass 1: process columns from input.
|
||||||
@@ -566,7 +566,7 @@ EXTN(jsimd_idct_2x2_sse2):
|
|||||||
mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx
|
mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx
|
||||||
|
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args 4
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
|||||||
@@ -31,7 +31,7 @@
|
|||||||
;
|
;
|
||||||
|
|
||||||
; r10 = JSAMPARRAY sample_data
|
; r10 = JSAMPARRAY sample_data
|
||||||
; r11 = JDIMENSION start_col
|
; r11d = JDIMENSION start_col
|
||||||
; r12 = FAST_FLOAT *workspace
|
; r12 = FAST_FLOAT *workspace
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
@@ -41,7 +41,7 @@ EXTN(jsimd_convsamp_float_sse2):
|
|||||||
push rbp
|
push rbp
|
||||||
mov rax, rsp
|
mov rax, rsp
|
||||||
mov rbp, rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args 3
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
pcmpeqw xmm7, xmm7
|
pcmpeqw xmm7, xmm7
|
||||||
@@ -90,7 +90,7 @@ EXTN(jsimd_convsamp_float_sse2):
|
|||||||
jnz short .convloop
|
jnz short .convloop
|
||||||
|
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args 3
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
@@ -115,7 +115,7 @@ EXTN(jsimd_quantize_float_sse2):
|
|||||||
push rbp
|
push rbp
|
||||||
mov rax, rsp
|
mov rax, rsp
|
||||||
mov rbp, rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args 3
|
||||||
|
|
||||||
mov rsi, r12
|
mov rsi, r12
|
||||||
mov rdx, r11
|
mov rdx, r11
|
||||||
@@ -148,7 +148,7 @@ EXTN(jsimd_quantize_float_sse2):
|
|||||||
dec rax
|
dec rax
|
||||||
jnz short .quantloop
|
jnz short .quantloop
|
||||||
|
|
||||||
uncollect_args
|
uncollect_args 3
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
|||||||
@@ -31,7 +31,7 @@
|
|||||||
;
|
;
|
||||||
|
|
||||||
; r10 = JSAMPARRAY sample_data
|
; r10 = JSAMPARRAY sample_data
|
||||||
; r11 = JDIMENSION start_col
|
; r11d = JDIMENSION start_col
|
||||||
; r12 = DCTELEM *workspace
|
; r12 = DCTELEM *workspace
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
@@ -41,7 +41,7 @@ EXTN(jsimd_convsamp_sse2):
|
|||||||
push rbp
|
push rbp
|
||||||
mov rax, rsp
|
mov rax, rsp
|
||||||
mov rbp, rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args 3
|
||||||
push rbx
|
push rbx
|
||||||
|
|
||||||
pxor xmm6, xmm6 ; xmm6=(all 0's)
|
pxor xmm6, xmm6 ; xmm6=(all 0's)
|
||||||
@@ -85,7 +85,7 @@ EXTN(jsimd_convsamp_sse2):
|
|||||||
jnz short .convloop
|
jnz short .convloop
|
||||||
|
|
||||||
pop rbx
|
pop rbx
|
||||||
uncollect_args
|
uncollect_args 3
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
@@ -117,7 +117,7 @@ EXTN(jsimd_quantize_sse2):
|
|||||||
push rbp
|
push rbp
|
||||||
mov rax, rsp
|
mov rax, rsp
|
||||||
mov rbp, rsp
|
mov rbp, rsp
|
||||||
collect_args
|
collect_args 3
|
||||||
|
|
||||||
mov rsi, r12
|
mov rsi, r12
|
||||||
mov rdx, r11
|
mov rdx, r11
|
||||||
@@ -177,7 +177,7 @@ EXTN(jsimd_quantize_sse2):
|
|||||||
dec rax
|
dec rax
|
||||||
jnz near .quantloop
|
jnz near .quantloop
|
||||||
|
|
||||||
uncollect_args
|
uncollect_args 3
|
||||||
pop rbp
|
pop rbp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
|||||||
@@ -307,61 +307,99 @@ const_base:
|
|||||||
|
|
||||||
%ifdef WIN64
|
%ifdef WIN64
|
||||||
|
|
||||||
%imacro collect_args 0
|
%imacro collect_args 1
|
||||||
push r12
|
|
||||||
push r13
|
|
||||||
push r14
|
|
||||||
push r15
|
|
||||||
mov r10, rcx
|
|
||||||
mov r11, rdx
|
|
||||||
mov r12, r8
|
|
||||||
mov r13, r9
|
|
||||||
mov r14, [rax+48]
|
|
||||||
mov r15, [rax+56]
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
sub rsp, SIZEOF_XMMWORD
|
sub rsp, SIZEOF_XMMWORD
|
||||||
movaps XMMWORD [rsp], xmm6
|
movaps XMMWORD [rsp], xmm6
|
||||||
sub rsp, SIZEOF_XMMWORD
|
sub rsp, SIZEOF_XMMWORD
|
||||||
movaps XMMWORD [rsp], xmm7
|
movaps XMMWORD [rsp], xmm7
|
||||||
|
mov r10, rcx
|
||||||
|
%if %1 > 1
|
||||||
|
mov r11, rdx
|
||||||
|
%endif
|
||||||
|
%if %1 > 2
|
||||||
|
push r12
|
||||||
|
mov r12, r8
|
||||||
|
%endif
|
||||||
|
%if %1 > 3
|
||||||
|
push r13
|
||||||
|
mov r13, r9
|
||||||
|
%endif
|
||||||
|
%if %1 > 4
|
||||||
|
push r14
|
||||||
|
mov r14, [rax+48]
|
||||||
|
%endif
|
||||||
|
%if %1 > 5
|
||||||
|
push r15
|
||||||
|
mov r15, [rax+56]
|
||||||
|
%endif
|
||||||
|
push rsi
|
||||||
|
push rdi
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%imacro uncollect_args 0
|
%imacro uncollect_args 1
|
||||||
|
pop rdi
|
||||||
|
pop rsi
|
||||||
|
%if %1 > 5
|
||||||
|
pop r15
|
||||||
|
%endif
|
||||||
|
%if %1 > 4
|
||||||
|
pop r14
|
||||||
|
%endif
|
||||||
|
%if %1 > 3
|
||||||
|
pop r13
|
||||||
|
%endif
|
||||||
|
%if %1 > 2
|
||||||
|
pop r12
|
||||||
|
%endif
|
||||||
movaps xmm7, XMMWORD [rsp]
|
movaps xmm7, XMMWORD [rsp]
|
||||||
add rsp, SIZEOF_XMMWORD
|
add rsp, SIZEOF_XMMWORD
|
||||||
movaps xmm6, XMMWORD [rsp]
|
movaps xmm6, XMMWORD [rsp]
|
||||||
add rsp, SIZEOF_XMMWORD
|
add rsp, SIZEOF_XMMWORD
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
pop r15
|
|
||||||
pop r14
|
|
||||||
pop r13
|
|
||||||
pop r12
|
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%else
|
%else
|
||||||
|
|
||||||
%imacro collect_args 0
|
%imacro collect_args 1
|
||||||
push r10
|
push r10
|
||||||
push r11
|
|
||||||
push r12
|
|
||||||
push r13
|
|
||||||
push r14
|
|
||||||
push r15
|
|
||||||
mov r10, rdi
|
mov r10, rdi
|
||||||
|
%if %1 > 1
|
||||||
|
push r11
|
||||||
mov r11, rsi
|
mov r11, rsi
|
||||||
|
%endif
|
||||||
|
%if %1 > 2
|
||||||
|
push r12
|
||||||
mov r12, rdx
|
mov r12, rdx
|
||||||
|
%endif
|
||||||
|
%if %1 > 3
|
||||||
|
push r13
|
||||||
mov r13, rcx
|
mov r13, rcx
|
||||||
|
%endif
|
||||||
|
%if %1 > 4
|
||||||
|
push r14
|
||||||
mov r14, r8
|
mov r14, r8
|
||||||
|
%endif
|
||||||
|
%if %1 > 5
|
||||||
|
push r15
|
||||||
mov r15, r9
|
mov r15, r9
|
||||||
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%imacro uncollect_args 0
|
%imacro uncollect_args 1
|
||||||
|
%if %1 > 5
|
||||||
pop r15
|
pop r15
|
||||||
|
%endif
|
||||||
|
%if %1 > 4
|
||||||
pop r14
|
pop r14
|
||||||
|
%endif
|
||||||
|
%if %1 > 3
|
||||||
pop r13
|
pop r13
|
||||||
|
%endif
|
||||||
|
%if %1 > 2
|
||||||
pop r12
|
pop r12
|
||||||
|
%endif
|
||||||
|
%if %1 > 1
|
||||||
pop r11
|
pop r11
|
||||||
|
%endif
|
||||||
pop r10
|
pop r10
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user