x86-64 SIMD: Use std stack frame/prologue/epilogue
This allows debuggers and profilers to reliably capture backtraces from
within the x86-64 SIMD functions.
In places where rbp was previously used to access temporary variables
(after stack alignment), we now use r15 and save/restore it accordingly.
The total amount of work is approximately the same, because the previous
code pushed the pre-alignment stack pointer to the aligned stack. The
new prologue and epilogue actually have fewer instructions.
Also note that the {un}collect_args macros now use rbp instead of rax to
access arguments passed on the stack, so we save a few instructions
there as well.
Based on:
debcc7c3b4
Closes #707
Closes #708
This commit is contained in:
10
ChangeLog.md
10
ChangeLog.md
@@ -1,3 +1,13 @@
|
||||
3.0.1
|
||||
=====
|
||||
|
||||
### Significant changes relative to 3.0.0:
|
||||
|
||||
1. The x86-64 SIMD functions now use a standard stack frame, prologue, and
|
||||
epilogue so that debuggers and profilers can reliably capture backtraces from
|
||||
within the functions.
|
||||
|
||||
|
||||
3.0.0
|
||||
=====
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
|
||||
; Copyright (C) 2018, Matthieu Darbois.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
|
||||
;
|
||||
@@ -397,11 +398,11 @@ const_base:
|
||||
%endif
|
||||
%if %1 > 4
|
||||
push r14
|
||||
mov r14, [rax+48]
|
||||
mov r14, [rbp+48]
|
||||
%endif
|
||||
%if %1 > 5
|
||||
push r15
|
||||
mov r15, [rax+56]
|
||||
mov r15, [rbp+56]
|
||||
%endif
|
||||
push rsi
|
||||
push rdi
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -33,7 +34,7 @@
|
||||
; r13d = JDIMENSION output_row
|
||||
; r14d = int num_rows
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
|
||||
%define WK_NUM 8
|
||||
|
||||
align 32
|
||||
@@ -41,12 +42,12 @@
|
||||
|
||||
EXTN(jsimd_rgb_ycc_convert_avx2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, (SIZEOF_YMMWORD * WK_NUM)
|
||||
collect_args 5
|
||||
push rbx
|
||||
|
||||
@@ -549,8 +550,8 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
|
||||
pop rbx
|
||||
vzeroupper
|
||||
uncollect_args 5
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
;
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -32,7 +33,7 @@
|
||||
; r13d = JDIMENSION output_row
|
||||
; r14d = int num_rows
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 8
|
||||
|
||||
align 32
|
||||
@@ -40,12 +41,12 @@
|
||||
|
||||
EXTN(jsimd_rgb_ycc_convert_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, (SIZEOF_XMMWORD * WK_NUM)
|
||||
collect_args 5
|
||||
push rbx
|
||||
|
||||
@@ -474,8 +475,8 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
|
||||
.return:
|
||||
pop rbx
|
||||
uncollect_args 5
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
; Copyright (C) 2011, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -33,7 +34,7 @@
|
||||
; r13d = JDIMENSION output_row
|
||||
; r14d = int num_rows
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
@@ -41,12 +42,12 @@
|
||||
|
||||
EXTN(jsimd_rgb_gray_convert_avx2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, byte (SIZEOF_YMMWORD * WK_NUM)
|
||||
collect_args 5
|
||||
push rbx
|
||||
|
||||
@@ -428,8 +429,8 @@ EXTN(jsimd_rgb_gray_convert_avx2):
|
||||
pop rbx
|
||||
vzeroupper
|
||||
uncollect_args 5
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
;
|
||||
; Copyright (C) 2011, 2016, D. R. Commander.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -32,7 +33,7 @@
|
||||
; r13d = JDIMENSION output_row
|
||||
; r14d = int num_rows
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
@@ -40,12 +41,12 @@
|
||||
|
||||
EXTN(jsimd_rgb_gray_convert_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
|
||||
collect_args 5
|
||||
push rbx
|
||||
|
||||
@@ -353,8 +354,8 @@ EXTN(jsimd_rgb_gray_convert_sse2):
|
||||
.return:
|
||||
pop rbx
|
||||
uncollect_args 5
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
;
|
||||
; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
|
||||
;
|
||||
; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander.
|
||||
; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, 2023, D. R. Commander.
|
||||
; Copyright (C) 2015, Matthieu Darbois.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -208,15 +209,15 @@ times 1 << 15 db 16
|
||||
; rax - buffer
|
||||
; rbx - temp
|
||||
; rcx - nbits
|
||||
; rdx - block --> free_bits
|
||||
; rdx - code
|
||||
; rsi - nbits_base
|
||||
; rdi - t
|
||||
; rbp - code
|
||||
; r8 - dctbl --> code_temp
|
||||
; r9 - actbl
|
||||
; r10 - state
|
||||
; r11 - index
|
||||
; r12 - put_buffer
|
||||
; r15 - block --> free_bits
|
||||
|
||||
%define buffer rax
|
||||
%ifdef WIN64
|
||||
@@ -231,12 +232,11 @@ times 1 << 15 db 16
|
||||
%define nbitsq rcx
|
||||
%define nbits ecx
|
||||
%define nbitsb cl
|
||||
%define block rdx
|
||||
%define codeq rdx
|
||||
%define code edx
|
||||
%define nbits_base rsi
|
||||
%define t rdi
|
||||
%define td edi
|
||||
%define codeq rbp
|
||||
%define code ebp
|
||||
%define dctbl r8
|
||||
%define actbl r9
|
||||
%define state r10
|
||||
@@ -244,6 +244,7 @@ times 1 << 15 db 16
|
||||
%define indexd r11d
|
||||
%define put_buffer r12
|
||||
%define put_bufferd r12d
|
||||
%define block r15
|
||||
|
||||
; Step 1: Re-arrange input data according to jpeg_natural_order
|
||||
; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
|
||||
@@ -259,6 +260,8 @@ times 1 << 15 db 16
|
||||
GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
|
||||
|
||||
EXTN(jsimd_huff_encode_one_block_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
|
||||
%ifdef WIN64
|
||||
|
||||
@@ -266,15 +269,15 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
||||
; rdx = JOCTET *buffer
|
||||
; r8 = JCOEFPTR block
|
||||
; r9 = int last_dc_val
|
||||
; [rax+48] = c_derived_tbl *dctbl
|
||||
; [rax+56] = c_derived_tbl *actbl
|
||||
; [rbp+48] = c_derived_tbl *dctbl
|
||||
; [rbp+56] = c_derived_tbl *actbl
|
||||
|
||||
;X: X = code stream
|
||||
mov buffer, rdx
|
||||
push r15
|
||||
mov block, r8
|
||||
movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
|
||||
push rbx
|
||||
push rbp
|
||||
movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
|
||||
push rsi
|
||||
push rdi
|
||||
@@ -284,12 +287,10 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
||||
movsx code, word [block] ;Z: code = block[0];
|
||||
pxor xmm4, xmm4 ;A: w4[i] = 0;
|
||||
sub code, r9d ;Z: code -= last_dc_val;
|
||||
mov dctbl, POINTER [rsp+6*8+4*8]
|
||||
mov actbl, POINTER [rsp+6*8+5*8]
|
||||
mov dctbl, POINTER [rbp+48]
|
||||
mov actbl, POINTER [rbp+56]
|
||||
punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
|
||||
lea nbits_base, [rel jpeg_nbits_table]
|
||||
add rsp, -DCTSIZE2 * SIZEOF_WORD
|
||||
mov t, rsp
|
||||
|
||||
%else
|
||||
|
||||
@@ -301,9 +302,10 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
||||
; r9 = c_derived_tbl *actbl
|
||||
|
||||
;X: X = code stream
|
||||
push r15
|
||||
mov block, rdx
|
||||
movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
|
||||
push rbx
|
||||
push rbp
|
||||
movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
|
||||
push r12
|
||||
mov state, rdi
|
||||
@@ -314,10 +316,13 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
||||
pxor xmm4, xmm4 ;A: w4[i] = 0;
|
||||
sub codeq, rcx ;Z: code -= last_dc_val;
|
||||
punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
|
||||
lea t, [rsp - DCTSIZE2 * SIZEOF_WORD] ; use red zone for t_
|
||||
|
||||
%endif
|
||||
|
||||
; Allocate stack space for t array, and realign stack.
|
||||
add rsp, -DCTSIZE2 * SIZEOF_WORD - 8
|
||||
mov t, rsp
|
||||
|
||||
pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
|
||||
pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
|
||||
punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
|
||||
@@ -443,9 +448,9 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
||||
pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
|
||||
; (Row 4, offset 1)
|
||||
%undef block
|
||||
%define free_bitsq rdx
|
||||
%define free_bitsd edx
|
||||
%define free_bitsb dl
|
||||
%define free_bitsq r15
|
||||
%define free_bitsd r15d
|
||||
%define free_bitsb r15b
|
||||
pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
|
||||
shl tempq, 48 ;Z: temp <<= 48;
|
||||
pxor xmm2, xmm2 ;E: w2[i] = 0;
|
||||
@@ -534,12 +539,8 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
||||
test index, index
|
||||
jnz .BLOOP ; } while (index != 0);
|
||||
.ELOOP: ; } /* index != 0 */
|
||||
sub td, esp ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
|
||||
%ifdef WIN64
|
||||
sub td, esp ; t -= &t_[0];
|
||||
cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62)
|
||||
%else
|
||||
cmp td, -2 * SIZEOF_WORD ; if (t != -2)
|
||||
%endif
|
||||
je .EFN ; {
|
||||
movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
|
||||
; nbits = actbl->ehufsi[0];
|
||||
@@ -556,18 +557,17 @@ EXTN(jsimd_huff_encode_one_block_sse2):
|
||||
; state->cur.put_buffer.simd = put_buffer;
|
||||
mov byte [state + working_state.cur.free_bits], free_bitsb
|
||||
; state->cur.free_bits = free_bits;
|
||||
%ifdef WIN64
|
||||
sub rsp, -DCTSIZE2 * SIZEOF_WORD
|
||||
sub rsp, -DCTSIZE2 * SIZEOF_WORD - 8
|
||||
pop r12
|
||||
%ifdef WIN64
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
%else
|
||||
pop r12
|
||||
pop rbp
|
||||
pop rbx
|
||||
%endif
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
; (64-bit SSE2)
|
||||
;
|
||||
; Copyright (C) 2016, 2018, Matthieu Darbois
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -282,16 +283,12 @@
|
||||
|
||||
EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [rbp - 16]
|
||||
sub rsp, SIZEOF_XMMWORD
|
||||
movdqa XMMWORD [rsp], ZERO
|
||||
collect_args 6
|
||||
|
||||
movdqa XMMWORD [rbp - 16], ZERO
|
||||
|
||||
movd AL, r13d
|
||||
pxor ZERO, ZERO
|
||||
mov K, LEN
|
||||
@@ -384,10 +381,9 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
|
||||
|
||||
REDUCE0
|
||||
|
||||
movdqa ZERO, XMMWORD [rbp - 16]
|
||||
uncollect_args 6
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
movdqa ZERO, XMMWORD [rsp]
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
@@ -450,16 +446,12 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
|
||||
|
||||
EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [rbp - 16]
|
||||
sub rsp, SIZEOF_XMMWORD
|
||||
movdqa XMMWORD [rsp], ZERO
|
||||
collect_args 6
|
||||
|
||||
movdqa XMMWORD [rbp - 16], ZERO
|
||||
|
||||
xor SIGN, SIGN
|
||||
xor EOB, EOB
|
||||
xor KK, KK
|
||||
@@ -606,10 +598,9 @@ EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
|
||||
REDUCE0
|
||||
|
||||
mov eax, EOB
|
||||
movdqa ZERO, XMMWORD [rbp - 16]
|
||||
uncollect_args 6
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
movdqa ZERO, XMMWORD [rsp]
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -45,7 +45,6 @@
|
||||
|
||||
EXTN(jsimd_h2v1_downsample_avx2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 6
|
||||
|
||||
@@ -207,7 +206,6 @@ EXTN(jsimd_h2v1_downsample_avx2):
|
||||
|
||||
EXTN(jsimd_h2v2_downsample_avx2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 6
|
||||
|
||||
|
||||
@@ -44,7 +44,6 @@
|
||||
|
||||
EXTN(jsimd_h2v1_downsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 6
|
||||
|
||||
@@ -189,7 +188,6 @@ EXTN(jsimd_h2v1_downsample_sse2):
|
||||
|
||||
EXTN(jsimd_h2v2_downsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 6
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -34,7 +35,7 @@
|
||||
; r13 = JSAMPARRAY output_buf
|
||||
; r14d = int num_rows
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
@@ -42,12 +43,12 @@
|
||||
|
||||
EXTN(jsimd_ycc_rgb_convert_avx2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, byte (WK_NUM * SIZEOF_YMMWORD)
|
||||
collect_args 5
|
||||
push rbx
|
||||
|
||||
@@ -486,8 +487,8 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
|
||||
pop rbx
|
||||
vzeroupper
|
||||
uncollect_args 5
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -33,7 +34,7 @@
|
||||
; r13 = JSAMPARRAY output_buf
|
||||
; r14d = int num_rows
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
@@ -41,12 +42,12 @@
|
||||
|
||||
EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
|
||||
collect_args 5
|
||||
push rbx
|
||||
|
||||
@@ -429,8 +430,8 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
|
||||
.return:
|
||||
pop rbx
|
||||
uncollect_args 5
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -34,7 +35,7 @@
|
||||
; r12d = JDIMENSION in_row_group_ctr
|
||||
; r13 = JSAMPARRAY output_buf
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
|
||||
%define WK_NUM 3
|
||||
|
||||
align 32
|
||||
@@ -42,12 +43,12 @@
|
||||
|
||||
EXTN(jsimd_h2v1_merged_upsample_avx2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, SIZEOF_YMMWORD * WK_NUM
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
@@ -480,8 +481,8 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
|
||||
pop rbx
|
||||
vzeroupper
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
@@ -506,7 +507,6 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
|
||||
|
||||
EXTN(jsimd_h2v2_merged_upsample_avx2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -33,7 +34,7 @@
|
||||
; r12d = JDIMENSION in_row_group_ctr
|
||||
; r13 = JSAMPARRAY output_buf
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 3
|
||||
|
||||
align 32
|
||||
@@ -41,12 +42,12 @@
|
||||
|
||||
EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
@@ -422,8 +423,8 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
.return:
|
||||
pop rbx
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
@@ -448,7 +449,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
|
||||
|
||||
EXTN(jsimd_h2v2_merged_upsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -62,7 +63,6 @@ PW_EIGHT times 16 dw 8
|
||||
|
||||
EXTN(jsimd_h2v1_fancy_upsample_avx2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
push_xmm 3
|
||||
collect_args 4
|
||||
@@ -208,7 +208,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
|
||||
; r12 = JSAMPARRAY input_data
|
||||
; r13 = JSAMPARRAY *output_data_ptr
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
|
||||
%define WK_NUM 4
|
||||
|
||||
align 32
|
||||
@@ -216,12 +216,12 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
|
||||
|
||||
EXTN(jsimd_h2v2_fancy_upsample_avx2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, (SIZEOF_YMMWORD * WK_NUM)
|
||||
push_xmm 3
|
||||
collect_args 4
|
||||
push rbx
|
||||
@@ -500,8 +500,8 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
|
||||
vzeroupper
|
||||
uncollect_args 4
|
||||
pop_xmm 3
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
@@ -525,7 +525,6 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
|
||||
|
||||
EXTN(jsimd_h2v1_upsample_avx2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 4
|
||||
|
||||
@@ -614,7 +613,6 @@ EXTN(jsimd_h2v1_upsample_avx2):
|
||||
|
||||
EXTN(jsimd_h2v2_upsample_avx2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -61,7 +62,6 @@ PW_EIGHT times 8 dw 8
|
||||
|
||||
EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 4
|
||||
|
||||
@@ -195,7 +195,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
||||
; r12 = JSAMPARRAY input_data
|
||||
; r13 = JSAMPARRAY *output_data_ptr
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 4
|
||||
|
||||
align 32
|
||||
@@ -203,12 +203,12 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
|
||||
|
||||
EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
@@ -473,8 +473,8 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
||||
.return:
|
||||
pop rbx
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
@@ -498,7 +498,6 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
|
||||
|
||||
EXTN(jsimd_h2v1_upsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 4
|
||||
|
||||
@@ -585,7 +584,6 @@ EXTN(jsimd_h2v1_upsample_sse2):
|
||||
|
||||
EXTN(jsimd_h2v2_upsample_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -58,7 +59,7 @@ PD_1_306 times 4 dd 1.306562964876376527856643
|
||||
|
||||
; r10 = FAST_FLOAT *data
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
@@ -66,12 +67,12 @@ PD_1_306 times 4 dd 1.306562964876376527856643
|
||||
|
||||
EXTN(jsimd_fdct_float_sse):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
|
||||
collect_args 1
|
||||
|
||||
; ---- Pass 1: process rows.
|
||||
@@ -345,8 +346,8 @@ EXTN(jsimd_fdct_float_sse):
|
||||
jnz near .columnloop
|
||||
|
||||
uncollect_args 1
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -73,7 +74,7 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
|
||||
|
||||
; r10 = DCTELEM *data
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
@@ -81,12 +82,12 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
|
||||
|
||||
EXTN(jsimd_fdct_ifast_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
|
||||
collect_args 1
|
||||
|
||||
; ---- Pass 1: process rows.
|
||||
@@ -379,8 +380,8 @@ EXTN(jsimd_fdct_ifast_sse2):
|
||||
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
|
||||
|
||||
uncollect_args 1
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -261,7 +261,6 @@ PW_1_NEG1 times 8 dw 1
|
||||
|
||||
EXTN(jsimd_fdct_islow_avx2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 1
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, 2020, D. R. Commander.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -94,7 +95,7 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1)
|
||||
|
||||
; r10 = DCTELEM *data
|
||||
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 6
|
||||
|
||||
align 32
|
||||
@@ -102,12 +103,12 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1)
|
||||
|
||||
EXTN(jsimd_fdct_islow_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
|
||||
collect_args 1
|
||||
|
||||
; ---- Pass 1: process rows.
|
||||
@@ -609,8 +610,8 @@ EXTN(jsimd_fdct_islow_sse2):
|
||||
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
|
||||
|
||||
uncollect_args 1
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -65,8 +66,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13d = JDIMENSION output_col
|
||||
|
||||
%define original_rbp rbp + 0
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
|
||||
@@ -77,11 +77,11 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
|
||||
EXTN(jsimd_idct_float_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
lea rsp, [workspace]
|
||||
collect_args 4
|
||||
push rbx
|
||||
@@ -322,7 +322,6 @@ EXTN(jsimd_idct_float_sse2):
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
mov rax, [original_rbp]
|
||||
lea rsi, [workspace] ; FAST_FLOAT *wsptr
|
||||
mov rdi, r12 ; (JSAMPROW *)
|
||||
mov eax, r13d
|
||||
@@ -472,8 +471,8 @@ EXTN(jsimd_idct_float_sse2):
|
||||
|
||||
pop rbx
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -86,8 +87,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13d = JDIMENSION output_col
|
||||
|
||||
%define original_rbp rbp + 0
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
@@ -96,12 +96,12 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
|
||||
EXTN(jsimd_idct_ifast_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
|
||||
collect_args 4
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
@@ -320,7 +320,6 @@ EXTN(jsimd_idct_ifast_sse2):
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
mov rax, [original_rbp]
|
||||
mov rdi, r12 ; (JSAMPROW *)
|
||||
mov eax, r13d
|
||||
|
||||
@@ -480,8 +479,8 @@ EXTN(jsimd_idct_ifast_sse2):
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
|
||||
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
ret
|
||||
|
||||
@@ -283,7 +283,6 @@ PW_1_NEG1 times 8 dw 1
|
||||
|
||||
EXTN(jsimd_idct_islow_avx2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
push_xmm 4
|
||||
collect_args 4
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, 2020, D. R. Commander.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -99,8 +100,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13d = JDIMENSION output_col
|
||||
|
||||
%define original_rbp rbp + 0
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 12
|
||||
|
||||
@@ -109,12 +109,12 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
|
||||
EXTN(jsimd_idct_islow_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, (SIZEOF_XMMWORD * WK_NUM)
|
||||
collect_args 4
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
@@ -512,7 +512,6 @@ EXTN(jsimd_idct_islow_sse2):
|
||||
|
||||
; ---- Pass 2: process rows from work array, store into output array.
|
||||
|
||||
mov rax, [original_rbp]
|
||||
mov rdi, r12 ; (JSAMPROW *)
|
||||
mov eax, r13d
|
||||
|
||||
@@ -837,8 +836,8 @@ EXTN(jsimd_idct_islow_sse2):
|
||||
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
|
||||
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2018, Matthias Räncker.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
@@ -107,8 +108,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
; r12 = JSAMPARRAY output_buf
|
||||
; r13d = JDIMENSION output_col
|
||||
|
||||
%define original_rbp rbp + 0
|
||||
%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
%define wk(i) r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
|
||||
; xmmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
@@ -117,12 +117,12 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE
|
||||
|
||||
EXTN(jsimd_idct_4x4_sse2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
mov rbp, rsp
|
||||
push r15
|
||||
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
; Allocate stack space for wk array. r15 is used to access it.
|
||||
mov r15, rsp
|
||||
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
|
||||
collect_args 4
|
||||
|
||||
; ---- Pass 1: process columns from input.
|
||||
@@ -309,7 +309,6 @@ EXTN(jsimd_idct_4x4_sse2):
|
||||
|
||||
; ---- Pass 2: process rows, store into output array.
|
||||
|
||||
mov rax, [original_rbp]
|
||||
mov rdi, r12 ; (JSAMPROW *)
|
||||
mov eax, r13d
|
||||
|
||||
@@ -390,8 +389,8 @@ EXTN(jsimd_idct_4x4_sse2):
|
||||
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
|
||||
|
||||
uncollect_args 4
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
lea rsp, [rbp-8]
|
||||
pop r15
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
@@ -415,7 +414,6 @@ EXTN(jsimd_idct_4x4_sse2):
|
||||
|
||||
EXTN(jsimd_idct_2x2_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 4
|
||||
push rbx
|
||||
|
||||
@@ -38,7 +38,6 @@
|
||||
|
||||
EXTN(jsimd_convsamp_float_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 3
|
||||
push rbx
|
||||
@@ -111,7 +110,6 @@ EXTN(jsimd_convsamp_float_sse2):
|
||||
|
||||
EXTN(jsimd_quantize_float_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 3
|
||||
|
||||
|
||||
@@ -39,7 +39,6 @@
|
||||
|
||||
EXTN(jsimd_convsamp_avx2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 3
|
||||
|
||||
@@ -117,7 +116,6 @@ EXTN(jsimd_convsamp_avx2):
|
||||
|
||||
EXTN(jsimd_quantize_avx2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 3
|
||||
|
||||
|
||||
@@ -38,7 +38,6 @@
|
||||
|
||||
EXTN(jsimd_convsamp_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 3
|
||||
push rbx
|
||||
@@ -117,7 +116,6 @@ EXTN(jsimd_convsamp_sse2):
|
||||
|
||||
EXTN(jsimd_quantize_sse2):
|
||||
push rbp
|
||||
mov rax, rsp
|
||||
mov rbp, rsp
|
||||
collect_args 3
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2016, D. R. Commander.
|
||||
; Copyright (C) 2023, Aliaksiej Kandracienka.
|
||||
;
|
||||
; Based on
|
||||
; x86 SIMD extension for IJG JPEG library
|
||||
@@ -31,6 +32,8 @@
|
||||
GLOBAL_FUNCTION(jpeg_simd_cpu_support)
|
||||
|
||||
EXTN(jpeg_simd_cpu_support):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
push rbx
|
||||
push rdi
|
||||
|
||||
@@ -79,6 +82,7 @@ EXTN(jpeg_simd_cpu_support):
|
||||
|
||||
pop rdi
|
||||
pop rbx
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
|
||||
Reference in New Issue
Block a user