x86-64 SIMD: Use std stack frame/prologue/epilogue

This allows debuggers and profilers to reliably capture backtraces from
within the x86-64 SIMD functions.

In places where rbp was previously used to access temporary variables
(after stack alignment), we now use r15 and save/restore it accordingly.
The total amount of work is approximately the same, because the previous
code pushed the pre-alignment stack pointer to the aligned stack.  The
new prologue and epilogue actually have fewer instructions.

Also note that the {un}collect_args macros now use rbp instead of rax to
access arguments passed on the stack, so we save a few instructions
there as well.

Based on:
debcc7c3b4

Closes #707
Closes #708
This commit is contained in:
DRC
2023-07-28 11:46:10 -04:00
parent e17fa3a271
commit 7b844bfda6
29 changed files with 209 additions and 215 deletions

View File

@@ -1,3 +1,13 @@
3.0.1
=====
### Significant changes relative to 3.0.0:
1. The x86-64 SIMD functions now use a standard stack frame, prologue, and
epilogue so that debuggers and profilers can reliably capture backtraces from
within the functions.
3.0.0 3.0.0
===== =====

View File

@@ -5,6 +5,7 @@
; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander. ; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
; Copyright (C) 2018, Matthieu Darbois. ; Copyright (C) 2018, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library - version 1.02 ; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
; ;
@@ -397,11 +398,11 @@ const_base:
%endif %endif
%if %1 > 4 %if %1 > 4
push r14 push r14
mov r14, [rax+48] mov r14, [rbp+48]
%endif %endif
%if %1 > 5 %if %1 > 5
push r15 push r15
mov r15, [rax+56] mov r15, [rbp+56]
%endif %endif
push rsi push rsi
push rdi push rdi

View File

@@ -4,6 +4,7 @@
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -33,7 +34,7 @@
; r13d = JDIMENSION output_row ; r13d = JDIMENSION output_row
; r14d = int num_rows ; r14d = int num_rows
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define WK_NUM 8 %define WK_NUM 8
align 32 align 32
@@ -41,12 +42,12 @@
EXTN(jsimd_rgb_ycc_convert_avx2): EXTN(jsimd_rgb_ycc_convert_avx2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, (SIZEOF_YMMWORD * WK_NUM)
collect_args 5 collect_args 5
push rbx push rbx
@@ -549,8 +550,8 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
pop rbx pop rbx
vzeroupper vzeroupper
uncollect_args 5 uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret

View File

@@ -3,6 +3,7 @@
; ;
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -32,7 +33,7 @@
; r13d = JDIMENSION output_row ; r13d = JDIMENSION output_row
; r14d = int num_rows ; r14d = int num_rows
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 8 %define WK_NUM 8
align 32 align 32
@@ -40,12 +41,12 @@
EXTN(jsimd_rgb_ycc_convert_sse2): EXTN(jsimd_rgb_ycc_convert_sse2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, (SIZEOF_XMMWORD * WK_NUM)
collect_args 5 collect_args 5
push rbx push rbx
@@ -474,8 +475,8 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args 5 uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret

View File

@@ -4,6 +4,7 @@
; Copyright (C) 2011, 2016, D. R. Commander. ; Copyright (C) 2011, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -33,7 +34,7 @@
; r13d = JDIMENSION output_row ; r13d = JDIMENSION output_row
; r14d = int num_rows ; r14d = int num_rows
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 32 align 32
@@ -41,12 +42,12 @@
EXTN(jsimd_rgb_gray_convert_avx2): EXTN(jsimd_rgb_gray_convert_avx2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, byte (SIZEOF_YMMWORD * WK_NUM)
collect_args 5 collect_args 5
push rbx push rbx
@@ -428,8 +429,8 @@ EXTN(jsimd_rgb_gray_convert_avx2):
pop rbx pop rbx
vzeroupper vzeroupper
uncollect_args 5 uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret

View File

@@ -3,6 +3,7 @@
; ;
; Copyright (C) 2011, 2016, D. R. Commander. ; Copyright (C) 2011, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -32,7 +33,7 @@
; r13d = JDIMENSION output_row ; r13d = JDIMENSION output_row
; r14d = int num_rows ; r14d = int num_rows
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 32 align 32
@@ -40,12 +41,12 @@
EXTN(jsimd_rgb_gray_convert_sse2): EXTN(jsimd_rgb_gray_convert_sse2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 5 collect_args 5
push rbx push rbx
@@ -353,8 +354,8 @@ EXTN(jsimd_rgb_gray_convert_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args 5 uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret

View File

@@ -1,9 +1,10 @@
; ;
; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2) ; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
; ;
; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander. ; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, 2023, D. R. Commander.
; Copyright (C) 2015, Matthieu Darbois. ; Copyright (C) 2015, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -208,15 +209,15 @@ times 1 << 15 db 16
; rax - buffer ; rax - buffer
; rbx - temp ; rbx - temp
; rcx - nbits ; rcx - nbits
; rdx - block --> free_bits ; rdx - code
; rsi - nbits_base ; rsi - nbits_base
; rdi - t ; rdi - t
; rbp - code
; r8 - dctbl --> code_temp ; r8 - dctbl --> code_temp
; r9 - actbl ; r9 - actbl
; r10 - state ; r10 - state
; r11 - index ; r11 - index
; r12 - put_buffer ; r12 - put_buffer
; r15 - block --> free_bits
%define buffer rax %define buffer rax
%ifdef WIN64 %ifdef WIN64
@@ -231,12 +232,11 @@ times 1 << 15 db 16
%define nbitsq rcx %define nbitsq rcx
%define nbits ecx %define nbits ecx
%define nbitsb cl %define nbitsb cl
%define block rdx %define codeq rdx
%define code edx
%define nbits_base rsi %define nbits_base rsi
%define t rdi %define t rdi
%define td edi %define td edi
%define codeq rbp
%define code ebp
%define dctbl r8 %define dctbl r8
%define actbl r9 %define actbl r9
%define state r10 %define state r10
@@ -244,6 +244,7 @@ times 1 << 15 db 16
%define indexd r11d %define indexd r11d
%define put_buffer r12 %define put_buffer r12
%define put_bufferd r12d %define put_bufferd r12d
%define block r15
; Step 1: Re-arrange input data according to jpeg_natural_order ; Step 1: Re-arrange input data according to jpeg_natural_order
; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10 ; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
@@ -259,6 +260,8 @@ times 1 << 15 db 16
GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2) GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
EXTN(jsimd_huff_encode_one_block_sse2): EXTN(jsimd_huff_encode_one_block_sse2):
push rbp
mov rbp, rsp
%ifdef WIN64 %ifdef WIN64
@@ -266,15 +269,15 @@ EXTN(jsimd_huff_encode_one_block_sse2):
; rdx = JOCTET *buffer ; rdx = JOCTET *buffer
; r8 = JCOEFPTR block ; r8 = JCOEFPTR block
; r9 = int last_dc_val ; r9 = int last_dc_val
; [rax+48] = c_derived_tbl *dctbl ; [rbp+48] = c_derived_tbl *dctbl
; [rax+56] = c_derived_tbl *actbl ; [rbp+56] = c_derived_tbl *actbl
;X: X = code stream ;X: X = code stream
mov buffer, rdx mov buffer, rdx
push r15
mov block, r8 mov block, r8
movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07 movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
push rbx push rbx
push rbp
movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07 movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
push rsi push rsi
push rdi push rdi
@@ -284,12 +287,10 @@ EXTN(jsimd_huff_encode_one_block_sse2):
movsx code, word [block] ;Z: code = block[0]; movsx code, word [block] ;Z: code = block[0];
pxor xmm4, xmm4 ;A: w4[i] = 0; pxor xmm4, xmm4 ;A: w4[i] = 0;
sub code, r9d ;Z: code -= last_dc_val; sub code, r9d ;Z: code -= last_dc_val;
mov dctbl, POINTER [rsp+6*8+4*8] mov dctbl, POINTER [rbp+48]
mov actbl, POINTER [rsp+6*8+5*8] mov actbl, POINTER [rbp+56]
punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11 punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
lea nbits_base, [rel jpeg_nbits_table] lea nbits_base, [rel jpeg_nbits_table]
add rsp, -DCTSIZE2 * SIZEOF_WORD
mov t, rsp
%else %else
@@ -301,9 +302,10 @@ EXTN(jsimd_huff_encode_one_block_sse2):
; r9 = c_derived_tbl *actbl ; r9 = c_derived_tbl *actbl
;X: X = code stream ;X: X = code stream
push r15
mov block, rdx
movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07 movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
push rbx push rbx
push rbp
movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07 movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
push r12 push r12
mov state, rdi mov state, rdi
@@ -314,10 +316,13 @@ EXTN(jsimd_huff_encode_one_block_sse2):
pxor xmm4, xmm4 ;A: w4[i] = 0; pxor xmm4, xmm4 ;A: w4[i] = 0;
sub codeq, rcx ;Z: code -= last_dc_val; sub codeq, rcx ;Z: code -= last_dc_val;
punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11 punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
lea t, [rsp - DCTSIZE2 * SIZEOF_WORD] ; use red zone for t_
%endif %endif
; Allocate stack space for t array, and realign stack.
add rsp, -DCTSIZE2 * SIZEOF_WORD - 8
mov t, rsp
pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11 pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11 pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15 punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
@@ -443,9 +448,9 @@ EXTN(jsimd_huff_encode_one_block_sse2):
pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29 pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
; (Row 4, offset 1) ; (Row 4, offset 1)
%undef block %undef block
%define free_bitsq rdx %define free_bitsq r15
%define free_bitsd edx %define free_bitsd r15d
%define free_bitsb dl %define free_bitsb r15b
pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0); pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
shl tempq, 48 ;Z: temp <<= 48; shl tempq, 48 ;Z: temp <<= 48;
pxor xmm2, xmm2 ;E: w2[i] = 0; pxor xmm2, xmm2 ;E: w2[i] = 0;
@@ -534,12 +539,8 @@ EXTN(jsimd_huff_encode_one_block_sse2):
test index, index test index, index
jnz .BLOOP ; } while (index != 0); jnz .BLOOP ; } while (index != 0);
.ELOOP: ; } /* index != 0 */ .ELOOP: ; } /* index != 0 */
sub td, esp ; t -= (WIN64: &t_[0], UNIX: &t_[64]); sub td, esp ; t -= &t_[0];
%ifdef WIN64
cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62) cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62)
%else
cmp td, -2 * SIZEOF_WORD ; if (t != -2)
%endif
je .EFN ; { je .EFN ; {
movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0] movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
; nbits = actbl->ehufsi[0]; ; nbits = actbl->ehufsi[0];
@@ -556,18 +557,17 @@ EXTN(jsimd_huff_encode_one_block_sse2):
; state->cur.put_buffer.simd = put_buffer; ; state->cur.put_buffer.simd = put_buffer;
mov byte [state + working_state.cur.free_bits], free_bitsb mov byte [state + working_state.cur.free_bits], free_bitsb
; state->cur.free_bits = free_bits; ; state->cur.free_bits = free_bits;
%ifdef WIN64 sub rsp, -DCTSIZE2 * SIZEOF_WORD - 8
sub rsp, -DCTSIZE2 * SIZEOF_WORD
pop r12 pop r12
%ifdef WIN64
pop rdi pop rdi
pop rsi pop rsi
pop rbp
pop rbx pop rbx
%else %else
pop r12
pop rbp
pop rbx pop rbx
%endif %endif
pop r15
pop rbp
ret ret
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -3,6 +3,7 @@
; (64-bit SSE2) ; (64-bit SSE2)
; ;
; Copyright (C) 2016, 2018, Matthieu Darbois ; Copyright (C) 2016, 2018, Matthieu Darbois
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -282,16 +283,12 @@
EXTN(jsimd_encode_mcu_AC_first_prepare_sse2): EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax sub rsp, SIZEOF_XMMWORD
mov rbp, rsp ; rbp = aligned rbp movdqa XMMWORD [rsp], ZERO
lea rsp, [rbp - 16]
collect_args 6 collect_args 6
movdqa XMMWORD [rbp - 16], ZERO
movd AL, r13d movd AL, r13d
pxor ZERO, ZERO pxor ZERO, ZERO
mov K, LEN mov K, LEN
@@ -384,10 +381,9 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
REDUCE0 REDUCE0
movdqa ZERO, XMMWORD [rbp - 16]
uncollect_args 6 uncollect_args 6
mov rsp, rbp ; rsp <- aligned rbp movdqa ZERO, XMMWORD [rsp]
pop rsp ; rsp <- original rbp mov rsp, rbp
pop rbp pop rbp
ret ret
@@ -450,16 +446,12 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2): EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax sub rsp, SIZEOF_XMMWORD
mov rbp, rsp ; rbp = aligned rbp movdqa XMMWORD [rsp], ZERO
lea rsp, [rbp - 16]
collect_args 6 collect_args 6
movdqa XMMWORD [rbp - 16], ZERO
xor SIGN, SIGN xor SIGN, SIGN
xor EOB, EOB xor EOB, EOB
xor KK, KK xor KK, KK
@@ -606,10 +598,9 @@ EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
REDUCE0 REDUCE0
mov eax, EOB mov eax, EOB
movdqa ZERO, XMMWORD [rbp - 16]
uncollect_args 6 uncollect_args 6
mov rsp, rbp ; rsp <- aligned rbp movdqa ZERO, XMMWORD [rsp]
pop rsp ; rsp <- original rbp mov rsp, rbp
pop rbp pop rbp
ret ret

View File

@@ -45,7 +45,6 @@
EXTN(jsimd_h2v1_downsample_avx2): EXTN(jsimd_h2v1_downsample_avx2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 6 collect_args 6
@@ -207,7 +206,6 @@ EXTN(jsimd_h2v1_downsample_avx2):
EXTN(jsimd_h2v2_downsample_avx2): EXTN(jsimd_h2v2_downsample_avx2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 6 collect_args 6

View File

@@ -44,7 +44,6 @@
EXTN(jsimd_h2v1_downsample_sse2): EXTN(jsimd_h2v1_downsample_sse2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 6 collect_args 6
@@ -189,7 +188,6 @@ EXTN(jsimd_h2v1_downsample_sse2):
EXTN(jsimd_h2v2_downsample_sse2): EXTN(jsimd_h2v2_downsample_sse2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 6 collect_args 6

View File

@@ -5,6 +5,7 @@
; Copyright (C) 2009, 2012, 2016, D. R. Commander. ; Copyright (C) 2009, 2012, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -34,7 +35,7 @@
; r13 = JSAMPARRAY output_buf ; r13 = JSAMPARRAY output_buf
; r14d = int num_rows ; r14d = int num_rows
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 32 align 32
@@ -42,12 +43,12 @@
EXTN(jsimd_ycc_rgb_convert_avx2): EXTN(jsimd_ycc_rgb_convert_avx2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, byte (WK_NUM * SIZEOF_YMMWORD)
collect_args 5 collect_args 5
push rbx push rbx
@@ -486,8 +487,8 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
pop rbx pop rbx
vzeroupper vzeroupper
uncollect_args 5 uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret

View File

@@ -4,6 +4,7 @@
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, D. R. Commander. ; Copyright (C) 2009, 2012, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -33,7 +34,7 @@
; r13 = JSAMPARRAY output_buf ; r13 = JSAMPARRAY output_buf
; r14d = int num_rows ; r14d = int num_rows
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 32 align 32
@@ -41,12 +42,12 @@
EXTN(jsimd_ycc_rgb_convert_sse2): EXTN(jsimd_ycc_rgb_convert_sse2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 5 collect_args 5
push rbx push rbx
@@ -429,8 +430,8 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args 5 uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret

View File

@@ -5,6 +5,7 @@
; Copyright (C) 2009, 2012, 2016, D. R. Commander. ; Copyright (C) 2009, 2012, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -34,7 +35,7 @@
; r12d = JDIMENSION in_row_group_ctr ; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf ; r13 = JSAMPARRAY output_buf
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define WK_NUM 3 %define WK_NUM 3
align 32 align 32
@@ -42,12 +43,12 @@
EXTN(jsimd_h2v1_merged_upsample_avx2): EXTN(jsimd_h2v1_merged_upsample_avx2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, SIZEOF_YMMWORD * WK_NUM
collect_args 4 collect_args 4
push rbx push rbx
@@ -480,8 +481,8 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
pop rbx pop rbx
vzeroupper vzeroupper
uncollect_args 4 uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret
@@ -506,7 +507,6 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
EXTN(jsimd_h2v2_merged_upsample_avx2): EXTN(jsimd_h2v2_merged_upsample_avx2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 4 collect_args 4
push rbx push rbx

View File

@@ -4,6 +4,7 @@
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, D. R. Commander. ; Copyright (C) 2009, 2012, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -33,7 +34,7 @@
; r12d = JDIMENSION in_row_group_ctr ; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf ; r13 = JSAMPARRAY output_buf
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 3 %define WK_NUM 3
align 32 align 32
@@ -41,12 +42,12 @@
EXTN(jsimd_h2v1_merged_upsample_sse2): EXTN(jsimd_h2v1_merged_upsample_sse2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 4 collect_args 4
push rbx push rbx
@@ -422,8 +423,8 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args 4 uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret
@@ -448,7 +449,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
EXTN(jsimd_h2v2_merged_upsample_sse2): EXTN(jsimd_h2v2_merged_upsample_sse2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 4 collect_args 4
push rbx push rbx

View File

@@ -5,6 +5,7 @@
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -62,7 +63,6 @@ PW_EIGHT times 16 dw 8
EXTN(jsimd_h2v1_fancy_upsample_avx2): EXTN(jsimd_h2v1_fancy_upsample_avx2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
push_xmm 3 push_xmm 3
collect_args 4 collect_args 4
@@ -208,7 +208,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
; r12 = JSAMPARRAY input_data ; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr ; r13 = JSAMPARRAY *output_data_ptr
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM] %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define WK_NUM 4 %define WK_NUM 4
align 32 align 32
@@ -216,12 +216,12 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
EXTN(jsimd_h2v2_fancy_upsample_avx2): EXTN(jsimd_h2v2_fancy_upsample_avx2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, (SIZEOF_YMMWORD * WK_NUM)
push_xmm 3 push_xmm 3
collect_args 4 collect_args 4
push rbx push rbx
@@ -500,8 +500,8 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
vzeroupper vzeroupper
uncollect_args 4 uncollect_args 4
pop_xmm 3 pop_xmm 3
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret
@@ -525,7 +525,6 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
EXTN(jsimd_h2v1_upsample_avx2): EXTN(jsimd_h2v1_upsample_avx2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 4 collect_args 4
@@ -614,7 +613,6 @@ EXTN(jsimd_h2v1_upsample_avx2):
EXTN(jsimd_h2v2_upsample_avx2): EXTN(jsimd_h2v2_upsample_avx2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 4 collect_args 4
push rbx push rbx

View File

@@ -4,6 +4,7 @@
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -61,7 +62,6 @@ PW_EIGHT times 8 dw 8
EXTN(jsimd_h2v1_fancy_upsample_sse2): EXTN(jsimd_h2v1_fancy_upsample_sse2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 4 collect_args 4
@@ -195,7 +195,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
; r12 = JSAMPARRAY input_data ; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr ; r13 = JSAMPARRAY *output_data_ptr
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 4 %define WK_NUM 4
align 32 align 32
@@ -203,12 +203,12 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
EXTN(jsimd_h2v2_fancy_upsample_sse2): EXTN(jsimd_h2v2_fancy_upsample_sse2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 4 collect_args 4
push rbx push rbx
@@ -473,8 +473,8 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args 4 uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret
@@ -498,7 +498,6 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
EXTN(jsimd_h2v1_upsample_sse2): EXTN(jsimd_h2v1_upsample_sse2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 4 collect_args 4
@@ -585,7 +584,6 @@ EXTN(jsimd_h2v1_upsample_sse2):
EXTN(jsimd_h2v2_upsample_sse2): EXTN(jsimd_h2v2_upsample_sse2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 4 collect_args 4
push rbx push rbx

View File

@@ -3,6 +3,7 @@
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -58,7 +59,7 @@ PD_1_306 times 4 dd 1.306562964876376527856643
; r10 = FAST_FLOAT *data ; r10 = FAST_FLOAT *data
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 32 align 32
@@ -66,12 +67,12 @@ PD_1_306 times 4 dd 1.306562964876376527856643
EXTN(jsimd_fdct_float_sse): EXTN(jsimd_fdct_float_sse):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 1 collect_args 1
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
@@ -345,8 +346,8 @@ EXTN(jsimd_fdct_float_sse):
jnz near .columnloop jnz near .columnloop
uncollect_args 1 uncollect_args 1
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret

View File

@@ -3,6 +3,7 @@
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -73,7 +74,7 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
; r10 = DCTELEM *data ; r10 = DCTELEM *data
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
align 32 align 32
@@ -81,12 +82,12 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
EXTN(jsimd_fdct_ifast_sse2): EXTN(jsimd_fdct_ifast_sse2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 1 collect_args 1
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
@@ -379,8 +380,8 @@ EXTN(jsimd_fdct_ifast_sse2):
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2 movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
uncollect_args 1 uncollect_args 1
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret

View File

@@ -261,7 +261,6 @@ PW_1_NEG1 times 8 dw 1
EXTN(jsimd_fdct_islow_avx2): EXTN(jsimd_fdct_islow_avx2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 1 collect_args 1

View File

@@ -3,6 +3,7 @@
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2020, D. R. Commander. ; Copyright (C) 2009, 2016, 2020, D. R. Commander.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -94,7 +95,7 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1)
; r10 = DCTELEM *data ; r10 = DCTELEM *data
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 6 %define WK_NUM 6
align 32 align 32
@@ -102,12 +103,12 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1)
EXTN(jsimd_fdct_islow_sse2): EXTN(jsimd_fdct_islow_sse2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 1 collect_args 1
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
@@ -609,8 +610,8 @@ EXTN(jsimd_fdct_islow_sse2):
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
uncollect_args 1 uncollect_args 1
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret

View File

@@ -4,6 +4,7 @@
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -65,8 +66,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r12 = JSAMPARRAY output_buf ; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col ; r13d = JDIMENSION output_col
%define original_rbp rbp + 0 %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
; xmmword wk[WK_NUM] ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT %define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
@@ -77,11 +77,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_float_sse2): EXTN(jsimd_idct_float_sse2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [workspace] lea rsp, [workspace]
collect_args 4 collect_args 4
push rbx push rbx
@@ -322,7 +322,6 @@ EXTN(jsimd_idct_float_sse2):
; ---- Pass 2: process rows from work array, store into output array. ; ---- Pass 2: process rows from work array, store into output array.
mov rax, [original_rbp]
lea rsi, [workspace] ; FAST_FLOAT *wsptr lea rsi, [workspace] ; FAST_FLOAT *wsptr
mov rdi, r12 ; (JSAMPROW *) mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d mov eax, r13d
@@ -472,8 +471,8 @@ EXTN(jsimd_idct_float_sse2):
pop rbx pop rbx
uncollect_args 4 uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret

View File

@@ -4,6 +4,7 @@
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -86,8 +87,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r12 = JSAMPARRAY output_buf ; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col ; r13d = JDIMENSION output_col
%define original_rbp rbp + 0 %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
; xmmword wk[WK_NUM] ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
@@ -96,12 +96,12 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_ifast_sse2): EXTN(jsimd_idct_ifast_sse2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 4 collect_args 4
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
@@ -320,7 +320,6 @@ EXTN(jsimd_idct_ifast_sse2):
; ---- Pass 2: process rows from work array, store into output array. ; ---- Pass 2: process rows from work array, store into output array.
mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *) mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d mov eax, r13d
@@ -480,8 +479,8 @@ EXTN(jsimd_idct_ifast_sse2):
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
uncollect_args 4 uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret
ret ret

View File

@@ -283,7 +283,6 @@ PW_1_NEG1 times 8 dw 1
EXTN(jsimd_idct_islow_avx2): EXTN(jsimd_idct_islow_avx2):
push rbp push rbp
mov rax, rsp ; rax = original rbp
mov rbp, rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
push_xmm 4 push_xmm 4
collect_args 4 collect_args 4

View File

@@ -4,6 +4,7 @@
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2020, D. R. Commander. ; Copyright (C) 2009, 2016, 2020, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -99,8 +100,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r12 = JSAMPARRAY output_buf ; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col ; r13d = JDIMENSION output_col
%define original_rbp rbp + 0 %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
; xmmword wk[WK_NUM] ; xmmword wk[WK_NUM]
%define WK_NUM 12 %define WK_NUM 12
@@ -109,12 +109,12 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_islow_sse2): EXTN(jsimd_idct_islow_sse2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, (SIZEOF_XMMWORD * WK_NUM)
collect_args 4 collect_args 4
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
@@ -512,7 +512,6 @@ EXTN(jsimd_idct_islow_sse2):
; ---- Pass 2: process rows from work array, store into output array. ; ---- Pass 2: process rows from work array, store into output array.
mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *) mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d mov eax, r13d
@@ -837,8 +836,8 @@ EXTN(jsimd_idct_islow_sse2):
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
uncollect_args 4 uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret

View File

@@ -4,6 +4,7 @@
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -107,8 +108,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r12 = JSAMPARRAY output_buf ; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col ; r13d = JDIMENSION output_col
%define original_rbp rbp + 0 %define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
; xmmword wk[WK_NUM] ; xmmword wk[WK_NUM]
%define WK_NUM 2 %define WK_NUM 2
@@ -117,12 +117,12 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_4x4_sse2): EXTN(jsimd_idct_4x4_sse2):
push rbp push rbp
mov rax, rsp ; rax = original rbp mov rbp, rsp
sub rsp, byte 4 push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax ; Allocate stack space for wk array. r15 is used to access it.
mov rbp, rsp ; rbp = aligned rbp mov r15, rsp
lea rsp, [wk(0)] sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 4 collect_args 4
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
@@ -309,7 +309,6 @@ EXTN(jsimd_idct_4x4_sse2):
; ---- Pass 2: process rows, store into output array. ; ---- Pass 2: process rows, store into output array.
mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *) mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d mov eax, r13d
@@ -390,8 +389,8 @@ EXTN(jsimd_idct_4x4_sse2):
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
uncollect_args 4 uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp lea rsp, [rbp-8]
pop rsp ; rsp <- original rbp pop r15
pop rbp pop rbp
ret ret
@@ -415,7 +414,6 @@ EXTN(jsimd_idct_4x4_sse2):
EXTN(jsimd_idct_2x2_sse2): EXTN(jsimd_idct_2x2_sse2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 4 collect_args 4
push rbx push rbx

View File

@@ -38,7 +38,6 @@
EXTN(jsimd_convsamp_float_sse2): EXTN(jsimd_convsamp_float_sse2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 3 collect_args 3
push rbx push rbx
@@ -111,7 +110,6 @@ EXTN(jsimd_convsamp_float_sse2):
EXTN(jsimd_quantize_float_sse2): EXTN(jsimd_quantize_float_sse2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 3 collect_args 3

View File

@@ -39,7 +39,6 @@
EXTN(jsimd_convsamp_avx2): EXTN(jsimd_convsamp_avx2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 3 collect_args 3
@@ -117,7 +116,6 @@ EXTN(jsimd_convsamp_avx2):
EXTN(jsimd_quantize_avx2): EXTN(jsimd_quantize_avx2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 3 collect_args 3

View File

@@ -38,7 +38,6 @@
EXTN(jsimd_convsamp_sse2): EXTN(jsimd_convsamp_sse2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 3 collect_args 3
push rbx push rbx
@@ -117,7 +116,6 @@ EXTN(jsimd_convsamp_sse2):
EXTN(jsimd_quantize_sse2): EXTN(jsimd_quantize_sse2):
push rbp push rbp
mov rax, rsp
mov rbp, rsp mov rbp, rsp
collect_args 3 collect_args 3

View File

@@ -3,6 +3,7 @@
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, D. R. Commander.
; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on ; Based on
; x86 SIMD extension for IJG JPEG library ; x86 SIMD extension for IJG JPEG library
@@ -31,6 +32,8 @@
GLOBAL_FUNCTION(jpeg_simd_cpu_support) GLOBAL_FUNCTION(jpeg_simd_cpu_support)
EXTN(jpeg_simd_cpu_support): EXTN(jpeg_simd_cpu_support):
push rbp
mov rbp, rsp
push rbx push rbx
push rdi push rdi
@@ -79,6 +82,7 @@ EXTN(jpeg_simd_cpu_support):
pop rdi pop rdi
pop rbx pop rbx
pop rbp
ret ret
; For some reason, the OS X linker does not honor the request to align the ; For some reason, the OS X linker does not honor the request to align the