x86-64 SIMD: Use standard stack frame/prologue/epilogue

This allows debuggers and profilers to reliably capture backtraces from
within the x86-64 SIMD functions.

In places where rbp was previously used to access temporary variables
(after stack alignment), we now use r15 and save/restore it accordingly.
The total amount of work is approximately the same, because the previous
code pushed the pre-alignment stack pointer to the aligned stack.  The
new prologue and epilogue actually have fewer instructions.

Also note that the {un}collect_args macros now use rbp instead of rax to
access arguments passed on the stack, so we save a few instructions
there as well.

Based on:
debcc7c3b4

Closes #707
Closes #708
This commit is contained in:
DRC
2023-07-28 11:46:10 -04:00
parent e17fa3a271
commit 7b844bfda6
29 changed files with 209 additions and 215 deletions

View File

@@ -1,3 +1,13 @@
3.0.1
=====
### Significant changes relative to 3.0.0:
1. The x86-64 SIMD functions now use a standard stack frame, prologue, and
epilogue so that debuggers and profilers can reliably capture backtraces from
within the functions.
3.0.0
=====

View File

@@ -5,6 +5,7 @@
; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
; Copyright (C) 2018, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
;
@@ -397,11 +398,11 @@ const_base:
%endif
%if %1 > 4
push r14
mov r14, [rax+48]
mov r14, [rbp+48]
%endif
%if %1 > 5
push r15
mov r15, [rax+56]
mov r15, [rbp+56]
%endif
push rsi
push rdi

View File

@@ -4,6 +4,7 @@
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -33,7 +34,7 @@
; r13d = JDIMENSION output_row
; r14d = int num_rows
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define WK_NUM 8
align 32
@@ -41,12 +42,12 @@
EXTN(jsimd_rgb_ycc_convert_avx2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, (SIZEOF_YMMWORD * WK_NUM)
collect_args 5
push rbx
@@ -549,8 +550,8 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
pop rbx
vzeroupper
uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret

View File

@@ -3,6 +3,7 @@
;
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -32,7 +33,7 @@
; r13d = JDIMENSION output_row
; r14d = int num_rows
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 8
align 32
@@ -40,12 +41,12 @@
EXTN(jsimd_rgb_ycc_convert_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, (SIZEOF_XMMWORD * WK_NUM)
collect_args 5
push rbx
@@ -474,8 +475,8 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
.return:
pop rbx
uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret

View File

@@ -4,6 +4,7 @@
; Copyright (C) 2011, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -33,7 +34,7 @@
; r13d = JDIMENSION output_row
; r14d = int num_rows
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define WK_NUM 2
align 32
@@ -41,12 +42,12 @@
EXTN(jsimd_rgb_gray_convert_avx2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, byte (SIZEOF_YMMWORD * WK_NUM)
collect_args 5
push rbx
@@ -428,8 +429,8 @@ EXTN(jsimd_rgb_gray_convert_avx2):
pop rbx
vzeroupper
uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret

View File

@@ -3,6 +3,7 @@
;
; Copyright (C) 2011, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -32,7 +33,7 @@
; r13d = JDIMENSION output_row
; r14d = int num_rows
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
align 32
@@ -40,12 +41,12 @@
EXTN(jsimd_rgb_gray_convert_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 5
push rbx
@@ -353,8 +354,8 @@ EXTN(jsimd_rgb_gray_convert_sse2):
.return:
pop rbx
uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret

View File

@@ -1,9 +1,10 @@
;
; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
;
; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander.
; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, 2023, D. R. Commander.
; Copyright (C) 2015, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -208,15 +209,15 @@ times 1 << 15 db 16
; rax - buffer
; rbx - temp
; rcx - nbits
; rdx - block --> free_bits
; rdx - code
; rsi - nbits_base
; rdi - t
; rbp - code
; r8 - dctbl --> code_temp
; r9 - actbl
; r10 - state
; r11 - index
; r12 - put_buffer
; r15 - block --> free_bits
%define buffer rax
%ifdef WIN64
@@ -231,12 +232,11 @@ times 1 << 15 db 16
%define nbitsq rcx
%define nbits ecx
%define nbitsb cl
%define block rdx
%define codeq rdx
%define code edx
%define nbits_base rsi
%define t rdi
%define td edi
%define codeq rbp
%define code ebp
%define dctbl r8
%define actbl r9
%define state r10
@@ -244,6 +244,7 @@ times 1 << 15 db 16
%define indexd r11d
%define put_buffer r12
%define put_bufferd r12d
%define block r15
; Step 1: Re-arrange input data according to jpeg_natural_order
; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
@@ -259,6 +260,8 @@ times 1 << 15 db 16
GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
EXTN(jsimd_huff_encode_one_block_sse2):
push rbp
mov rbp, rsp
%ifdef WIN64
@@ -266,15 +269,15 @@ EXTN(jsimd_huff_encode_one_block_sse2):
; rdx = JOCTET *buffer
; r8 = JCOEFPTR block
; r9 = int last_dc_val
; [rax+48] = c_derived_tbl *dctbl
; [rax+56] = c_derived_tbl *actbl
; [rbp+48] = c_derived_tbl *dctbl
; [rbp+56] = c_derived_tbl *actbl
;X: X = code stream
mov buffer, rdx
push r15
mov block, r8
movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
push rbx
push rbp
movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
push rsi
push rdi
@@ -284,12 +287,10 @@ EXTN(jsimd_huff_encode_one_block_sse2):
movsx code, word [block] ;Z: code = block[0];
pxor xmm4, xmm4 ;A: w4[i] = 0;
sub code, r9d ;Z: code -= last_dc_val;
mov dctbl, POINTER [rsp+6*8+4*8]
mov actbl, POINTER [rsp+6*8+5*8]
mov dctbl, POINTER [rbp+48]
mov actbl, POINTER [rbp+56]
punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
lea nbits_base, [rel jpeg_nbits_table]
add rsp, -DCTSIZE2 * SIZEOF_WORD
mov t, rsp
%else
@@ -301,9 +302,10 @@ EXTN(jsimd_huff_encode_one_block_sse2):
; r9 = c_derived_tbl *actbl
;X: X = code stream
push r15
mov block, rdx
movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
push rbx
push rbp
movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
push r12
mov state, rdi
@@ -314,10 +316,13 @@ EXTN(jsimd_huff_encode_one_block_sse2):
pxor xmm4, xmm4 ;A: w4[i] = 0;
sub codeq, rcx ;Z: code -= last_dc_val;
punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
lea t, [rsp - DCTSIZE2 * SIZEOF_WORD] ; use red zone for t_
%endif
; Allocate stack space for t array, and realign stack.
add rsp, -DCTSIZE2 * SIZEOF_WORD - 8
mov t, rsp
pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
@@ -443,9 +448,9 @@ EXTN(jsimd_huff_encode_one_block_sse2):
pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
; (Row 4, offset 1)
%undef block
%define free_bitsq rdx
%define free_bitsd edx
%define free_bitsb dl
%define free_bitsq r15
%define free_bitsd r15d
%define free_bitsb r15b
pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
shl tempq, 48 ;Z: temp <<= 48;
pxor xmm2, xmm2 ;E: w2[i] = 0;
@@ -534,12 +539,8 @@ EXTN(jsimd_huff_encode_one_block_sse2):
test index, index
jnz .BLOOP ; } while (index != 0);
.ELOOP: ; } /* index != 0 */
sub td, esp ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
%ifdef WIN64
sub td, esp ; t -= &t_[0];
cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62)
%else
cmp td, -2 * SIZEOF_WORD ; if (t != -2)
%endif
je .EFN ; {
movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
; nbits = actbl->ehufsi[0];
@@ -556,18 +557,17 @@ EXTN(jsimd_huff_encode_one_block_sse2):
; state->cur.put_buffer.simd = put_buffer;
mov byte [state + working_state.cur.free_bits], free_bitsb
; state->cur.free_bits = free_bits;
%ifdef WIN64
sub rsp, -DCTSIZE2 * SIZEOF_WORD
sub rsp, -DCTSIZE2 * SIZEOF_WORD - 8
pop r12
%ifdef WIN64
pop rdi
pop rsi
pop rbp
pop rbx
%else
pop r12
pop rbp
pop rbx
%endif
pop r15
pop rbp
ret
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -3,6 +3,7 @@
; (64-bit SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -282,16 +283,12 @@
EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [rbp - 16]
sub rsp, SIZEOF_XMMWORD
movdqa XMMWORD [rsp], ZERO
collect_args 6
movdqa XMMWORD [rbp - 16], ZERO
movd AL, r13d
pxor ZERO, ZERO
mov K, LEN
@@ -384,10 +381,9 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
REDUCE0
movdqa ZERO, XMMWORD [rbp - 16]
uncollect_args 6
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
movdqa ZERO, XMMWORD [rsp]
mov rsp, rbp
pop rbp
ret
@@ -450,16 +446,12 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [rbp - 16]
sub rsp, SIZEOF_XMMWORD
movdqa XMMWORD [rsp], ZERO
collect_args 6
movdqa XMMWORD [rbp - 16], ZERO
xor SIGN, SIGN
xor EOB, EOB
xor KK, KK
@@ -606,10 +598,9 @@ EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
REDUCE0
mov eax, EOB
movdqa ZERO, XMMWORD [rbp - 16]
uncollect_args 6
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
movdqa ZERO, XMMWORD [rsp]
mov rsp, rbp
pop rbp
ret

View File

@@ -45,7 +45,6 @@
EXTN(jsimd_h2v1_downsample_avx2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 6
@@ -207,7 +206,6 @@ EXTN(jsimd_h2v1_downsample_avx2):
EXTN(jsimd_h2v2_downsample_avx2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 6

View File

@@ -44,7 +44,6 @@
EXTN(jsimd_h2v1_downsample_sse2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 6
@@ -189,7 +188,6 @@ EXTN(jsimd_h2v1_downsample_sse2):
EXTN(jsimd_h2v2_downsample_sse2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 6

View File

@@ -5,6 +5,7 @@
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -34,7 +35,7 @@
; r13 = JSAMPARRAY output_buf
; r14d = int num_rows
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define WK_NUM 2
align 32
@@ -42,12 +43,12 @@
EXTN(jsimd_ycc_rgb_convert_avx2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, byte (WK_NUM * SIZEOF_YMMWORD)
collect_args 5
push rbx
@@ -486,8 +487,8 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
pop rbx
vzeroupper
uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret

View File

@@ -4,6 +4,7 @@
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -33,7 +34,7 @@
; r13 = JSAMPARRAY output_buf
; r14d = int num_rows
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
align 32
@@ -41,12 +42,12 @@
EXTN(jsimd_ycc_rgb_convert_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 5
push rbx
@@ -429,8 +430,8 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
.return:
pop rbx
uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret

View File

@@ -5,6 +5,7 @@
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -34,7 +35,7 @@
; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define WK_NUM 3
align 32
@@ -42,12 +43,12 @@
EXTN(jsimd_h2v1_merged_upsample_avx2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, SIZEOF_YMMWORD * WK_NUM
collect_args 4
push rbx
@@ -480,8 +481,8 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
pop rbx
vzeroupper
uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret
@@ -506,7 +507,6 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
EXTN(jsimd_h2v2_merged_upsample_avx2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 4
push rbx

View File

@@ -4,6 +4,7 @@
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -33,7 +34,7 @@
; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 3
align 32
@@ -41,12 +42,12 @@
EXTN(jsimd_h2v1_merged_upsample_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 4
push rbx
@@ -422,8 +423,8 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
.return:
pop rbx
uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret
@@ -448,7 +449,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
EXTN(jsimd_h2v2_merged_upsample_sse2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 4
push rbx

View File

@@ -5,6 +5,7 @@
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -62,7 +63,6 @@ PW_EIGHT times 16 dw 8
EXTN(jsimd_h2v1_fancy_upsample_avx2):
push rbp
mov rax, rsp
mov rbp, rsp
push_xmm 3
collect_args 4
@@ -208,7 +208,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define WK_NUM 4
align 32
@@ -216,12 +216,12 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
EXTN(jsimd_h2v2_fancy_upsample_avx2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, (SIZEOF_YMMWORD * WK_NUM)
push_xmm 3
collect_args 4
push rbx
@@ -500,8 +500,8 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
vzeroupper
uncollect_args 4
pop_xmm 3
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret
@@ -525,7 +525,6 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
EXTN(jsimd_h2v1_upsample_avx2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 4
@@ -614,7 +613,6 @@ EXTN(jsimd_h2v1_upsample_avx2):
EXTN(jsimd_h2v2_upsample_avx2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 4
push rbx

View File

@@ -4,6 +4,7 @@
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -61,7 +62,6 @@ PW_EIGHT times 8 dw 8
EXTN(jsimd_h2v1_fancy_upsample_sse2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 4
@@ -195,7 +195,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 4
align 32
@@ -203,12 +203,12 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
EXTN(jsimd_h2v2_fancy_upsample_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 4
push rbx
@@ -473,8 +473,8 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
.return:
pop rbx
uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret
@@ -498,7 +498,6 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
EXTN(jsimd_h2v1_upsample_sse2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 4
@@ -585,7 +584,6 @@ EXTN(jsimd_h2v1_upsample_sse2):
EXTN(jsimd_h2v2_upsample_sse2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 4
push rbx

View File

@@ -3,6 +3,7 @@
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -58,7 +59,7 @@ PD_1_306 times 4 dd 1.306562964876376527856643
; r10 = FAST_FLOAT *data
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
align 32
@@ -66,12 +67,12 @@ PD_1_306 times 4 dd 1.306562964876376527856643
EXTN(jsimd_fdct_float_sse):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 1
; ---- Pass 1: process rows.
@@ -345,8 +346,8 @@ EXTN(jsimd_fdct_float_sse):
jnz near .columnloop
uncollect_args 1
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret

View File

@@ -3,6 +3,7 @@
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -73,7 +74,7 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
; r10 = DCTELEM *data
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
align 32
@@ -81,12 +82,12 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
EXTN(jsimd_fdct_ifast_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 1
; ---- Pass 1: process rows.
@@ -379,8 +380,8 @@ EXTN(jsimd_fdct_ifast_sse2):
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
uncollect_args 1
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret

View File

@@ -261,7 +261,6 @@ PW_1_NEG1 times 8 dw 1
EXTN(jsimd_fdct_islow_avx2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 1

View File

@@ -3,6 +3,7 @@
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2020, D. R. Commander.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -94,7 +95,7 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1)
; r10 = DCTELEM *data
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 6
align 32
@@ -102,12 +103,12 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1)
EXTN(jsimd_fdct_islow_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 1
; ---- Pass 1: process rows.
@@ -609,8 +610,8 @@ EXTN(jsimd_fdct_islow_sse2):
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
uncollect_args 1
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret

View File

@@ -4,6 +4,7 @@
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -65,8 +66,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col
%define original_rbp rbp + 0
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
; xmmword wk[WK_NUM]
%define WK_NUM 2
%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
@@ -77,11 +77,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_float_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
lea rsp, [workspace]
collect_args 4
push rbx
@@ -322,7 +322,6 @@ EXTN(jsimd_idct_float_sse2):
; ---- Pass 2: process rows from work array, store into output array.
mov rax, [original_rbp]
lea rsi, [workspace] ; FAST_FLOAT *wsptr
mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d
@@ -472,8 +471,8 @@ EXTN(jsimd_idct_float_sse2):
pop rbx
uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret

View File

@@ -4,6 +4,7 @@
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -86,8 +87,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col
%define original_rbp rbp + 0
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
; xmmword wk[WK_NUM]
%define WK_NUM 2
@@ -96,12 +96,12 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_ifast_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 4
; ---- Pass 1: process columns from input.
@@ -320,7 +320,6 @@ EXTN(jsimd_idct_ifast_sse2):
; ---- Pass 2: process rows from work array, store into output array.
mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d
@@ -480,8 +479,8 @@ EXTN(jsimd_idct_ifast_sse2):
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret
ret

View File

@@ -283,7 +283,6 @@ PW_1_NEG1 times 8 dw 1
EXTN(jsimd_idct_islow_avx2):
push rbp
mov rax, rsp ; rax = original rbp
mov rbp, rsp ; rbp = frame pointer (no stack realignment in this function)
push_xmm 4
collect_args 4

View File

@@ -4,6 +4,7 @@
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2020, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -99,8 +100,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col
%define original_rbp rbp + 0
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
; xmmword wk[WK_NUM]
%define WK_NUM 12
@@ -109,12 +109,12 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_islow_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, (SIZEOF_XMMWORD * WK_NUM)
collect_args 4
; ---- Pass 1: process columns from input.
@@ -512,7 +512,6 @@ EXTN(jsimd_idct_islow_sse2):
; ---- Pass 2: process rows from work array, store into output array.
mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d
@@ -837,8 +836,8 @@ EXTN(jsimd_idct_islow_sse2):
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret

View File

@@ -4,6 +4,7 @@
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -107,8 +108,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col
%define original_rbp rbp + 0
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
; xmmword wk[WK_NUM]
%define WK_NUM 2
@@ -117,12 +117,12 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
EXTN(jsimd_idct_4x4_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
mov rbp, rsp
push r15
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 4
; ---- Pass 1: process columns from input.
@@ -309,7 +309,6 @@ EXTN(jsimd_idct_4x4_sse2):
; ---- Pass 2: process rows, store into output array.
mov rax, [original_rbp]
mov rdi, r12 ; (JSAMPROW *)
mov eax, r13d
@@ -390,8 +389,8 @@ EXTN(jsimd_idct_4x4_sse2):
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
lea rsp, [rbp-8]
pop r15
pop rbp
ret
@@ -415,7 +414,6 @@ EXTN(jsimd_idct_4x4_sse2):
EXTN(jsimd_idct_2x2_sse2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 4
push rbx

View File

@@ -38,7 +38,6 @@
EXTN(jsimd_convsamp_float_sse2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 3
push rbx
@@ -111,7 +110,6 @@ EXTN(jsimd_convsamp_float_sse2):
EXTN(jsimd_quantize_float_sse2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 3

View File

@@ -39,7 +39,6 @@
EXTN(jsimd_convsamp_avx2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 3
@@ -117,7 +116,6 @@ EXTN(jsimd_convsamp_avx2):
EXTN(jsimd_quantize_avx2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 3

View File

@@ -38,7 +38,6 @@
EXTN(jsimd_convsamp_sse2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 3
push rbx
@@ -117,7 +116,6 @@ EXTN(jsimd_convsamp_sse2):
EXTN(jsimd_quantize_sse2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 3

View File

@@ -3,6 +3,7 @@
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on
; x86 SIMD extension for IJG JPEG library
@@ -31,6 +32,8 @@
GLOBAL_FUNCTION(jpeg_simd_cpu_support)
EXTN(jpeg_simd_cpu_support):
push rbp
mov rbp, rsp
push rbx
push rdi
@@ -79,6 +82,7 @@ EXTN(jpeg_simd_cpu_support):
pop rdi
pop rbx
pop rbp
ret
; For some reason, the OS X linker does not honor the request to align the