x86 SIMD: Capitalize all instruction-like macros

(to improve code readability)
This commit is contained in:
DRC
2024-02-29 12:18:49 -05:00
parent 26fc07c8d1
commit 1335547558
88 changed files with 662 additions and 661 deletions

View File

@@ -2,7 +2,7 @@
; jccolext.asm - colorspace conversion (AVX2) ; jccolext.asm - colorspace conversion (AVX2)
; ;
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,15 +49,15 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make room for the GOT address PUSHPIC eax ; make room for the GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address MOVPIC POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)] mov ecx, JDIMENSION [img_width(eax)]
test ecx, ecx test ecx, ecx
@@ -80,9 +80,9 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
mov eax, INT [num_rows(eax)] mov eax, INT [num_rows(eax)]
test eax, eax test eax, eax
jle near .return jle near .return
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
pushpic eax PUSHPIC eax
push edx push edx
push ebx push ebx
push edi push edi
@@ -93,11 +93,11 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
mov edi, JSAMPROW [edi] ; outptr0 mov edi, JSAMPROW [edi] ; outptr0
mov ebx, JSAMPROW [ebx] ; outptr1 mov ebx, JSAMPROW [ebx] ; outptr1
mov edx, JSAMPROW [edx] ; outptr2 mov edx, JSAMPROW [edx] ; outptr2
movpic eax, POINTER [gotptr] ; load GOT address (eax) MOVPIC eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_YMMWORD cmp ecx, byte SIZEOF_YMMWORD
jae near .columnloop jae near .columnloop
alignx 16, 7 ALIGNX 16, 7
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
@@ -154,7 +154,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
@@ -278,7 +278,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
@@ -552,7 +552,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
pop edi pop edi
pop ebx pop ebx
pop edx pop edx
poppic eax POPPIC eax
add esi, byte SIZEOF_JSAMPROW ; input_buf add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW add edi, byte SIZEOF_JSAMPROW

View File

@@ -2,7 +2,7 @@
; jccolext.asm - colorspace conversion (MMX) ; jccolext.asm - colorspace conversion (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,15 +49,15 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make room for the GOT address PUSHPIC eax ; make room for the GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address MOVPIC POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)] ; num_cols mov ecx, JDIMENSION [img_width(eax)] ; num_cols
test ecx, ecx test ecx, ecx
@@ -80,9 +80,9 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
mov eax, INT [num_rows(eax)] mov eax, INT [num_rows(eax)]
test eax, eax test eax, eax
jle near .return jle near .return
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
pushpic eax PUSHPIC eax
push edx push edx
push ebx push ebx
push edi push edi
@@ -93,11 +93,11 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
mov edi, JSAMPROW [edi] ; outptr0 mov edi, JSAMPROW [edi] ; outptr0
mov ebx, JSAMPROW [ebx] ; outptr1 mov ebx, JSAMPROW [ebx] ; outptr1
mov edx, JSAMPROW [edx] ; outptr2 mov edx, JSAMPROW [edx] ; outptr2
movpic eax, POINTER [gotptr] ; load GOT address (eax) MOVPIC eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_MMWORD cmp ecx, byte SIZEOF_MMWORD
jae short .columnloop jae short .columnloop
alignx 16, 7 ALIGNX 16, 7
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
@@ -143,7 +143,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
@@ -211,7 +211,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
@@ -449,7 +449,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
pop edi pop edi
pop ebx pop ebx
pop edx pop edx
poppic eax POPPIC eax
add esi, byte SIZEOF_JSAMPROW ; input_buf add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW add edi, byte SIZEOF_JSAMPROW

View File

@@ -1,7 +1,7 @@
; ;
; jccolext.asm - colorspace conversion (SSE2) ; jccolext.asm - colorspace conversion (SSE2)
; ;
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -48,15 +48,15 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make room for the GOT address PUSHPIC eax ; make room for the GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address MOVPIC POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)] mov ecx, JDIMENSION [img_width(eax)]
test ecx, ecx test ecx, ecx
@@ -79,9 +79,9 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
mov eax, INT [num_rows(eax)] mov eax, INT [num_rows(eax)]
test eax, eax test eax, eax
jle near .return jle near .return
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
pushpic eax PUSHPIC eax
push edx push edx
push ebx push ebx
push edi push edi
@@ -92,11 +92,11 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
mov edi, JSAMPROW [edi] ; outptr0 mov edi, JSAMPROW [edi] ; outptr0
mov ebx, JSAMPROW [ebx] ; outptr1 mov ebx, JSAMPROW [ebx] ; outptr1
mov edx, JSAMPROW [edx] ; outptr2 mov edx, JSAMPROW [edx] ; outptr2
movpic eax, POINTER [gotptr] ; load GOT address (eax) MOVPIC eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
alignx 16, 7 ALIGNX 16, 7
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
@@ -147,7 +147,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -232,7 +232,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_ycc_cnv jmp short .rgb_ycc_cnv
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -478,7 +478,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
pop edi pop edi
pop ebx pop ebx
pop edx pop edx
poppic eax POPPIC eax
add esi, byte SIZEOF_JSAMPROW ; input_buf add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW add edi, byte SIZEOF_JSAMPROW

View File

@@ -1,7 +1,7 @@
; ;
; jccolor.asm - colorspace conversion (AVX2) ; jccolor.asm - colorspace conversion (AVX2)
; ;
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -33,7 +33,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_rgb_ycc_convert_avx2) GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
EXTN(jconst_rgb_ycc_convert_avx2): EXTN(jconst_rgb_ycc_convert_avx2):
@@ -46,7 +46,7 @@ PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \
(CENTERJSAMPLE << SCALEBITS) (CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -2,7 +2,7 @@
; jccolor.asm - colorspace conversion (MMX) ; jccolor.asm - colorspace conversion (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -33,7 +33,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_rgb_ycc_convert_mmx) GLOBAL_DATA(jconst_rgb_ycc_convert_mmx)
EXTN(jconst_rgb_ycc_convert_mmx): EXTN(jconst_rgb_ycc_convert_mmx):
@@ -46,7 +46,7 @@ PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS - 1)) - 1 + \
(CENTERJSAMPLE << SCALEBITS) (CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1)) PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1))
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -1,7 +1,7 @@
; ;
; jccolor.asm - colorspace conversion (SSE2) ; jccolor.asm - colorspace conversion (SSE2)
; ;
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -32,7 +32,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_rgb_ycc_convert_sse2) GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
EXTN(jconst_rgb_ycc_convert_sse2): EXTN(jconst_rgb_ycc_convert_sse2):
@@ -45,7 +45,7 @@ PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \
(CENTERJSAMPLE << SCALEBITS) (CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -1,7 +1,7 @@
; ;
; jcgray.asm - grayscale colorspace conversion (AVX2) ; jcgray.asm - grayscale colorspace conversion (AVX2)
; ;
; Copyright (C) 2011, 2016, D. R. Commander. ; Copyright (C) 2011, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -29,7 +29,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_rgb_gray_convert_avx2) GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
EXTN(jconst_rgb_gray_convert_avx2): EXTN(jconst_rgb_gray_convert_avx2):
@@ -38,7 +38,7 @@ PW_F0299_F0337 times 8 dw F_0_299, F_0_337
PW_F0114_F0250 times 8 dw F_0_114, F_0_250 PW_F0114_F0250 times 8 dw F_0_114, F_0_250
PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -2,7 +2,7 @@
; jcgray.asm - grayscale colorspace conversion (MMX) ; jcgray.asm - grayscale colorspace conversion (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2011, 2016, D. R. Commander. ; Copyright (C) 2011, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -29,7 +29,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_rgb_gray_convert_mmx) GLOBAL_DATA(jconst_rgb_gray_convert_mmx)
EXTN(jconst_rgb_gray_convert_mmx): EXTN(jconst_rgb_gray_convert_mmx):
@@ -38,7 +38,7 @@ PW_F0299_F0337 times 2 dw F_0_299, F_0_337
PW_F0114_F0250 times 2 dw F_0_114, F_0_250 PW_F0114_F0250 times 2 dw F_0_114, F_0_250
PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1)) PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1))
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -1,7 +1,7 @@
; ;
; jcgray.asm - grayscale colorspace conversion (SSE2) ; jcgray.asm - grayscale colorspace conversion (SSE2)
; ;
; Copyright (C) 2011, 2016, D. R. Commander. ; Copyright (C) 2011, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -28,7 +28,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_rgb_gray_convert_sse2) GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
EXTN(jconst_rgb_gray_convert_sse2): EXTN(jconst_rgb_gray_convert_sse2):
@@ -37,7 +37,7 @@ PW_F0299_F0337 times 4 dw F_0_299, F_0_337
PW_F0114_F0250 times 4 dw F_0_114, F_0_250 PW_F0114_F0250 times 4 dw F_0_114, F_0_250
PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -1,7 +1,7 @@
; ;
; jcgryext.asm - grayscale colorspace conversion (AVX2) ; jcgryext.asm - grayscale colorspace conversion (AVX2)
; ;
; Copyright (C) 2011, 2016, D. R. Commander. ; Copyright (C) 2011, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -49,15 +49,15 @@ EXTN(jsimd_rgb_gray_convert_avx2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make room for the GOT address PUSHPIC eax ; make room for the GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address MOVPIC POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)] mov ecx, JDIMENSION [img_width(eax)]
test ecx, ecx test ecx, ecx
@@ -76,20 +76,20 @@ EXTN(jsimd_rgb_gray_convert_avx2):
mov eax, INT [num_rows(eax)] mov eax, INT [num_rows(eax)]
test eax, eax test eax, eax
jle near .return jle near .return
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
pushpic eax PUSHPIC eax
push edi push edi
push esi push esi
push ecx ; col push ecx ; col
mov esi, JSAMPROW [esi] ; inptr mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0 mov edi, JSAMPROW [edi] ; outptr0
movpic eax, POINTER [gotptr] ; load GOT address (eax) MOVPIC eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_YMMWORD cmp ecx, byte SIZEOF_YMMWORD
jae near .columnloop jae near .columnloop
alignx 16, 7 ALIGNX 16, 7
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
@@ -146,7 +146,7 @@ EXTN(jsimd_rgb_gray_convert_avx2):
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
@@ -270,7 +270,7 @@ EXTN(jsimd_rgb_gray_convert_avx2):
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD] vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD] vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
@@ -433,7 +433,7 @@ EXTN(jsimd_rgb_gray_convert_avx2):
pop ecx ; col pop ecx ; col
pop esi pop esi
pop edi pop edi
poppic eax POPPIC eax
add esi, byte SIZEOF_JSAMPROW ; input_buf add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW add edi, byte SIZEOF_JSAMPROW

View File

@@ -2,7 +2,7 @@
; jcgryext.asm - grayscale colorspace conversion (MMX) ; jcgryext.asm - grayscale colorspace conversion (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2011, 2016, D. R. Commander. ; Copyright (C) 2011, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,15 +49,15 @@ EXTN(jsimd_rgb_gray_convert_mmx):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make room for the GOT address PUSHPIC eax ; make room for the GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address MOVPIC POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)] ; num_cols mov ecx, JDIMENSION [img_width(eax)] ; num_cols
test ecx, ecx test ecx, ecx
@@ -76,20 +76,20 @@ EXTN(jsimd_rgb_gray_convert_mmx):
mov eax, INT [num_rows(eax)] mov eax, INT [num_rows(eax)]
test eax, eax test eax, eax
jle near .return jle near .return
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
pushpic eax PUSHPIC eax
push edi push edi
push esi push esi
push ecx ; col push ecx ; col
mov esi, JSAMPROW [esi] ; inptr mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0 mov edi, JSAMPROW [edi] ; outptr0
movpic eax, POINTER [gotptr] ; load GOT address (eax) MOVPIC eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_MMWORD cmp ecx, byte SIZEOF_MMWORD
jae short .columnloop jae short .columnloop
alignx 16, 7 ALIGNX 16, 7
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
@@ -135,7 +135,7 @@ EXTN(jsimd_rgb_gray_convert_mmx):
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
@@ -203,7 +203,7 @@ EXTN(jsimd_rgb_gray_convert_mmx):
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
@@ -330,7 +330,7 @@ EXTN(jsimd_rgb_gray_convert_mmx):
pop ecx ; col pop ecx ; col
pop esi pop esi
pop edi pop edi
poppic eax POPPIC eax
add esi, byte SIZEOF_JSAMPROW ; input_buf add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW add edi, byte SIZEOF_JSAMPROW

View File

@@ -1,7 +1,7 @@
; ;
; jcgryext.asm - grayscale colorspace conversion (SSE2) ; jcgryext.asm - grayscale colorspace conversion (SSE2)
; ;
; Copyright (C) 2011, 2016, D. R. Commander. ; Copyright (C) 2011, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -48,15 +48,15 @@ EXTN(jsimd_rgb_gray_convert_sse2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make room for the GOT address PUSHPIC eax ; make room for the GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address MOVPIC POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [img_width(eax)] mov ecx, JDIMENSION [img_width(eax)]
test ecx, ecx test ecx, ecx
@@ -75,20 +75,20 @@ EXTN(jsimd_rgb_gray_convert_sse2):
mov eax, INT [num_rows(eax)] mov eax, INT [num_rows(eax)]
test eax, eax test eax, eax
jle near .return jle near .return
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
pushpic eax PUSHPIC eax
push edi push edi
push esi push esi
push ecx ; col push ecx ; col
mov esi, JSAMPROW [esi] ; inptr mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr0 mov edi, JSAMPROW [edi] ; outptr0
movpic eax, POINTER [gotptr] ; load GOT address (eax) MOVPIC eax, POINTER [gotptr] ; load GOT address (eax)
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae near .columnloop jae near .columnloop
alignx 16, 7 ALIGNX 16, 7
%if RGB_PIXELSIZE == 3 ; --------------- %if RGB_PIXELSIZE == 3 ; ---------------
@@ -139,7 +139,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -224,7 +224,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
jmp short .rgb_gray_cnv jmp short .rgb_gray_cnv
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -359,7 +359,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
pop ecx ; col pop ecx ; col
pop esi pop esi
pop edi pop edi
poppic eax POPPIC eax
add esi, byte SIZEOF_JSAMPROW ; input_buf add esi, byte SIZEOF_JSAMPROW ; input_buf
add edi, byte SIZEOF_JSAMPROW add edi, byte SIZEOF_JSAMPROW

View File

@@ -42,7 +42,7 @@ endstruc
EXTN(jconst_huff_encode_one_block): EXTN(jconst_huff_encode_one_block):
alignz 32 ALIGNZ 32
jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007 jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
dq 0x000f, 0x001f, 0x003f, 0x007f dq 0x000f, 0x001f, 0x003f, 0x007f
@@ -84,7 +84,7 @@ times 1 << 12 db 13
times 1 << 13 db 14 times 1 << 13 db 14
times 1 << 14 db 15 times 1 << 14 db 15
alignz 32 ALIGNZ 32
%ifdef PIC %ifdef PIC
%define NBITS(x) nbits_base + x %define NBITS(x) nbits_base + x
@@ -236,7 +236,7 @@ times 1 << 14 db 15
; If PIC is defined, load the address of a symbol defined in this file into a ; If PIC is defined, load the address of a symbol defined in this file into a
; register. Equivalent to ; register. Equivalent to
; get_GOT %1 ; GET_GOT %1
; lea %1, [GOTOFF(%1, %2)] ; lea %1, [GOTOFF(%1, %2)]
; without using the GOT. ; without using the GOT.
; ;

View File

@@ -3,7 +3,7 @@
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -70,7 +70,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
cld cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16, 7 ALIGNX 16, 7
.expandloop: .expandloop:
push eax push eax
push ecx push ecx
@@ -106,7 +106,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push ecx push ecx
push edi push edi
@@ -117,7 +117,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
cmp ecx, byte SIZEOF_YMMWORD cmp ecx, byte SIZEOF_YMMWORD
jae short .columnloop jae short .columnloop
alignx 16, 7 ALIGNX 16, 7
.columnloop_r24: .columnloop_r24:
; ecx can possibly be 8, 16, 24 ; ecx can possibly be 8, 16, 24
@@ -141,7 +141,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
vpxor ymm1, ymm1, ymm1 vpxor ymm1, ymm1, ymm1
mov ecx, SIZEOF_YMMWORD mov ecx, SIZEOF_YMMWORD
jmp short .downsample jmp short .downsample
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD] vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
@@ -243,7 +243,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
cld cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16, 7 ALIGNX 16, 7
.expandloop: .expandloop:
push eax push eax
push ecx push ecx
@@ -279,7 +279,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push ecx push ecx
push edi push edi
@@ -291,7 +291,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
cmp ecx, byte SIZEOF_YMMWORD cmp ecx, byte SIZEOF_YMMWORD
jae short .columnloop jae short .columnloop
alignx 16, 7 ALIGNX 16, 7
.columnloop_r24: .columnloop_r24:
cmp ecx, 24 cmp ecx, 24
@@ -320,7 +320,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
vpxor ymm3, ymm3, ymm3 vpxor ymm3, ymm3, ymm3
mov ecx, SIZEOF_YMMWORD mov ecx, SIZEOF_YMMWORD
jmp short .downsample jmp short .downsample
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD] vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]

View File

@@ -2,7 +2,7 @@
; jcsample.asm - downsampling (MMX) ; jcsample.asm - downsampling (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -69,7 +69,7 @@ EXTN(jsimd_h2v1_downsample_mmx):
cld cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16, 7 ALIGNX 16, 7
.expandloop: .expandloop:
push eax push eax
push ecx push ecx
@@ -104,7 +104,7 @@ EXTN(jsimd_h2v1_downsample_mmx):
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push ecx push ecx
push edi push edi
@@ -112,7 +112,7 @@ EXTN(jsimd_h2v1_downsample_mmx):
mov esi, JSAMPROW [esi] ; inptr mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr mov edi, JSAMPROW [edi] ; outptr
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
@@ -212,7 +212,7 @@ EXTN(jsimd_h2v2_downsample_mmx):
cld cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16, 7 ALIGNX 16, 7
.expandloop: .expandloop:
push eax push eax
push ecx push ecx
@@ -247,7 +247,7 @@ EXTN(jsimd_h2v2_downsample_mmx):
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push ecx push ecx
push edi push edi
@@ -256,7 +256,7 @@ EXTN(jsimd_h2v2_downsample_mmx):
mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
mov edi, JSAMPROW [edi] ; outptr mov edi, JSAMPROW [edi] ; outptr
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movq mm0, MMWORD [edx+0*SIZEOF_MMWORD] movq mm0, MMWORD [edx+0*SIZEOF_MMWORD]

View File

@@ -2,7 +2,7 @@
; jcsample.asm - downsampling (SSE2) ; jcsample.asm - downsampling (SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -69,7 +69,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
cld cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16, 7 ALIGNX 16, 7
.expandloop: .expandloop:
push eax push eax
push ecx push ecx
@@ -104,7 +104,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push ecx push ecx
push edi push edi
@@ -115,14 +115,14 @@ EXTN(jsimd_h2v1_downsample_sse2):
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae short .columnloop jae short .columnloop
alignx 16, 7 ALIGNX 16, 7
.columnloop_r8: .columnloop_r8:
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
pxor xmm1, xmm1 pxor xmm1, xmm1
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jmp short .downsample jmp short .downsample
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -225,7 +225,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
cld cld
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
alignx 16, 7 ALIGNX 16, 7
.expandloop: .expandloop:
push eax push eax
push ecx push ecx
@@ -260,7 +260,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, JSAMPARRAY [output_data(ebp)] ; output_data mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push ecx push ecx
push edi push edi
@@ -272,7 +272,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
cmp ecx, byte SIZEOF_XMMWORD cmp ecx, byte SIZEOF_XMMWORD
jae short .columnloop jae short .columnloop
alignx 16, 7 ALIGNX 16, 7
.columnloop_r8: .columnloop_r8:
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
@@ -281,7 +281,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
pxor xmm3, xmm3 pxor xmm3, xmm3
mov ecx, SIZEOF_XMMWORD mov ecx, SIZEOF_XMMWORD
jmp short .downsample jmp short .downsample
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]

View File

@@ -2,7 +2,7 @@
; jdcolext.asm - colorspace conversion (AVX2) ; jdcolext.asm - colorspace conversion (AVX2)
; ;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2012, 2016, D. R. Commander. ; Copyright (C) 2012, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -50,15 +50,15 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address PUSHPIC eax ; make a room for GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address MOVPIC POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [out_width(eax)] ; num_cols mov ecx, JDIMENSION [out_width(eax)] ; num_cols
test ecx, ecx test ecx, ecx
@@ -81,7 +81,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
mov eax, INT [num_rows(eax)] mov eax, INT [num_rows(eax)]
test eax, eax test eax, eax
jle near .return jle near .return
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push eax push eax
push edi push edi
@@ -94,8 +94,8 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
mov ebx, JSAMPROW [ebx] ; inptr1 mov ebx, JSAMPROW [ebx] ; inptr1
mov edx, JSAMPROW [edx] ; inptr2 mov edx, JSAMPROW [edx] ; inptr2
mov edi, JSAMPROW [edi] ; outptr mov edi, JSAMPROW [edi] ; outptr
movpic eax, POINTER [gotptr] ; load GOT address (eax) MOVPIC eax, POINTER [gotptr] ; load GOT address (eax)
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
vmovdqu ymm5, YMMWORD [ebx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV) vmovdqu ymm5, YMMWORD [ebx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
@@ -295,7 +295,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
add ebx, byte SIZEOF_YMMWORD ; inptr1 add ebx, byte SIZEOF_YMMWORD ; inptr1
add edx, byte SIZEOF_YMMWORD ; inptr2 add edx, byte SIZEOF_YMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16, 7 ALIGNX 16, 7
.column_st64: .column_st64:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
@@ -436,7 +436,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
add ebx, byte SIZEOF_YMMWORD ; inptr1 add ebx, byte SIZEOF_YMMWORD ; inptr1
add edx, byte SIZEOF_YMMWORD ; inptr2 add edx, byte SIZEOF_YMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16, 7 ALIGNX 16, 7
.column_st64: .column_st64:
cmp ecx, byte SIZEOF_YMMWORD/2 cmp ecx, byte SIZEOF_YMMWORD/2
@@ -479,7 +479,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
alignx 16, 7 ALIGNX 16, 7
.nextrow: .nextrow:
pop ecx pop ecx

View File

@@ -2,7 +2,7 @@
; jdcolext.asm - colorspace conversion (MMX) ; jdcolext.asm - colorspace conversion (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,15 +49,15 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address PUSHPIC eax ; make a room for GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address MOVPIC POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [out_width(eax)] ; num_cols mov ecx, JDIMENSION [out_width(eax)] ; num_cols
test ecx, ecx test ecx, ecx
@@ -80,7 +80,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
mov eax, INT [num_rows(eax)] mov eax, INT [num_rows(eax)]
test eax, eax test eax, eax
jle near .return jle near .return
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push eax push eax
push edi push edi
@@ -93,8 +93,8 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
mov ebx, JSAMPROW [ebx] ; inptr1 mov ebx, JSAMPROW [ebx] ; inptr1
mov edx, JSAMPROW [edx] ; inptr2 mov edx, JSAMPROW [edx] ; inptr2
mov edi, JSAMPROW [edi] ; outptr mov edi, JSAMPROW [edi] ; outptr
movpic eax, POINTER [gotptr] ; load GOT address (eax) MOVPIC eax, POINTER [gotptr] ; load GOT address (eax)
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movq mm5, MMWORD [ebx] ; mm5=Cb(01234567) movq mm5, MMWORD [ebx] ; mm5=Cb(01234567)
@@ -255,7 +255,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
add edx, byte SIZEOF_MMWORD ; inptr2 add edx, byte SIZEOF_MMWORD ; inptr2
add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
jmp near .columnloop jmp near .columnloop
alignx 16, 7 ALIGNX 16, 7
.column_st16: .column_st16:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
@@ -344,7 +344,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
add edx, byte SIZEOF_MMWORD ; inptr2 add edx, byte SIZEOF_MMWORD ; inptr2
add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
jmp near .columnloop jmp near .columnloop
alignx 16, 7 ALIGNX 16, 7
.column_st16: .column_st16:
cmp ecx, byte SIZEOF_MMWORD/2 cmp ecx, byte SIZEOF_MMWORD/2
@@ -369,7 +369,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
alignx 16, 7 ALIGNX 16, 7
.nextrow: .nextrow:
pop ecx pop ecx

View File

@@ -2,7 +2,7 @@
; jdcolext.asm - colorspace conversion (SSE2) ; jdcolext.asm - colorspace conversion (SSE2)
; ;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2012, 2016, D. R. Commander. ; Copyright (C) 2012, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,15 +49,15 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address PUSHPIC eax ; make a room for GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address MOVPIC POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [out_width(eax)] ; num_cols mov ecx, JDIMENSION [out_width(eax)] ; num_cols
test ecx, ecx test ecx, ecx
@@ -80,7 +80,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
mov eax, INT [num_rows(eax)] mov eax, INT [num_rows(eax)]
test eax, eax test eax, eax
jle near .return jle near .return
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push eax push eax
push edi push edi
@@ -93,8 +93,8 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
mov ebx, JSAMPROW [ebx] ; inptr1 mov ebx, JSAMPROW [ebx] ; inptr1
mov edx, JSAMPROW [edx] ; inptr2 mov edx, JSAMPROW [edx] ; inptr2
mov edi, JSAMPROW [edi] ; outptr mov edi, JSAMPROW [edi] ; outptr
movpic eax, POINTER [gotptr] ; load GOT address (eax) MOVPIC eax, POINTER [gotptr] ; load GOT address (eax)
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF) movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
@@ -275,7 +275,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
add ebx, byte SIZEOF_XMMWORD ; inptr1 add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2 add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16, 7 ALIGNX 16, 7
.column_st32: .column_st32:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
@@ -387,7 +387,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
add ebx, byte SIZEOF_XMMWORD ; inptr1 add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2 add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16, 7 ALIGNX 16, 7
.column_st32: .column_st32:
cmp ecx, byte SIZEOF_XMMWORD/2 cmp ecx, byte SIZEOF_XMMWORD/2
@@ -423,7 +423,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
%endif ; RGB_PIXELSIZE ; --------------- %endif ; RGB_PIXELSIZE ; ---------------
alignx 16, 7 ALIGNX 16, 7
.nextrow: .nextrow:
pop ecx pop ecx

View File

@@ -3,7 +3,7 @@
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -32,7 +32,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_ycc_rgb_convert_avx2) GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
EXTN(jconst_ycc_rgb_convert_avx2): EXTN(jconst_ycc_rgb_convert_avx2):
@@ -43,7 +43,7 @@ PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
PW_ONE times 16 dw 1 PW_ONE times 16 dw 1
PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -2,7 +2,7 @@
; jdcolor.asm - colorspace conversion (MMX) ; jdcolor.asm - colorspace conversion (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -31,7 +31,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_ycc_rgb_convert_mmx) GLOBAL_DATA(jconst_ycc_rgb_convert_mmx)
EXTN(jconst_ycc_rgb_convert_mmx): EXTN(jconst_ycc_rgb_convert_mmx):
@@ -42,7 +42,7 @@ PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
PW_ONE times 4 dw 1 PW_ONE times 4 dw 1
PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1) PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1)
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -2,7 +2,7 @@
; jdcolor.asm - colorspace conversion (SSE2) ; jdcolor.asm - colorspace conversion (SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -31,7 +31,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_ycc_rgb_convert_sse2) GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
EXTN(jconst_ycc_rgb_convert_sse2): EXTN(jconst_ycc_rgb_convert_sse2):
@@ -42,7 +42,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
PW_ONE times 8 dw 1 PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -2,7 +2,7 @@
; jdmerge.asm - merged upsampling/color conversion (AVX2) ; jdmerge.asm - merged upsampling/color conversion (AVX2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -32,7 +32,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_merged_upsample_avx2) GLOBAL_DATA(jconst_merged_upsample_avx2)
EXTN(jconst_merged_upsample_avx2): EXTN(jconst_merged_upsample_avx2):
@@ -43,7 +43,7 @@ PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
PW_ONE times 16 dw 1 PW_ONE times 16 dw 1
PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -2,7 +2,7 @@
; jdmerge.asm - merged upsampling/color conversion (MMX) ; jdmerge.asm - merged upsampling/color conversion (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -31,7 +31,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_merged_upsample_mmx) GLOBAL_DATA(jconst_merged_upsample_mmx)
EXTN(jconst_merged_upsample_mmx): EXTN(jconst_merged_upsample_mmx):
@@ -42,7 +42,7 @@ PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
PW_ONE times 4 dw 1 PW_ONE times 4 dw 1
PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1) PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1)
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -2,7 +2,7 @@
; jdmerge.asm - merged upsampling/color conversion (SSE2) ; jdmerge.asm - merged upsampling/color conversion (SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -31,7 +31,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_merged_upsample_sse2) GLOBAL_DATA(jconst_merged_upsample_sse2)
EXTN(jconst_merged_upsample_sse2): EXTN(jconst_merged_upsample_sse2):
@@ -42,7 +42,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
PW_ONE times 8 dw 1 PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -2,7 +2,7 @@
; jdmrgext.asm - merged upsampling/color conversion (AVX2) ; jdmrgext.asm - merged upsampling/color conversion (AVX2)
; ;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2012, 2016, D. R. Commander. ; Copyright (C) 2012, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -50,15 +50,15 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address PUSHPIC eax ; make a room for GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address MOVPIC POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [output_width(eax)] ; col mov ecx, JDIMENSION [output_width(eax)] ; col
test ecx, ecx test ecx, ecx
@@ -79,9 +79,9 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
pop ecx ; col pop ecx ; col
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movpic eax, POINTER [gotptr] ; load GOT address (eax) MOVPIC eax, POINTER [gotptr] ; load GOT address (eax)
vmovdqu ymm6, YMMWORD [ebx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV) vmovdqu ymm6, YMMWORD [ebx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
vmovdqu ymm7, YMMWORD [edx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV) vmovdqu ymm7, YMMWORD [edx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
@@ -168,13 +168,13 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
mov al, 2 ; Yctr mov al, 2 ; Yctr
jmp short .Yloop_1st jmp short .Yloop_1st
alignx 16, 7 ALIGNX 16, 7
.Yloop_2nd: .Yloop_2nd:
vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H
vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H
vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H
alignx 16, 7 ALIGNX 16, 7
.Yloop_1st: .Yloop_1st:
vmovdqu ymm7, YMMWORD [esi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV) vmovdqu ymm7, YMMWORD [esi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
@@ -301,7 +301,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
add ebx, byte SIZEOF_YMMWORD ; inptr1 add ebx, byte SIZEOF_YMMWORD ; inptr1
add edx, byte SIZEOF_YMMWORD ; inptr2 add edx, byte SIZEOF_YMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16, 7 ALIGNX 16, 7
.column_st64: .column_st64:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
@@ -445,7 +445,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
add ebx, byte SIZEOF_YMMWORD ; inptr1 add ebx, byte SIZEOF_YMMWORD ; inptr1
add edx, byte SIZEOF_YMMWORD ; inptr2 add edx, byte SIZEOF_YMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16, 7 ALIGNX 16, 7
.column_st64: .column_st64:
cmp ecx, byte SIZEOF_YMMWORD/2 cmp ecx, byte SIZEOF_YMMWORD/2

View File

@@ -2,7 +2,7 @@
; jdmrgext.asm - merged upsampling/color conversion (MMX) ; jdmrgext.asm - merged upsampling/color conversion (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -47,15 +47,15 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address PUSHPIC eax ; make a room for GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address MOVPIC POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [output_width(eax)] ; col mov ecx, JDIMENSION [output_width(eax)] ; col
test ecx, ecx test ecx, ecx
@@ -76,9 +76,9 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
pop ecx ; col pop ecx ; col
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movpic eax, POINTER [gotptr] ; load GOT address (eax) MOVPIC eax, POINTER [gotptr] ; load GOT address (eax)
movq mm6, MMWORD [ebx] ; mm6=Cb(01234567) movq mm6, MMWORD [ebx] ; mm6=Cb(01234567)
movq mm7, MMWORD [edx] ; mm7=Cr(01234567) movq mm7, MMWORD [edx] ; mm7=Cr(01234567)
@@ -171,13 +171,13 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
mov al, 2 ; Yctr mov al, 2 ; Yctr
jmp short .Yloop_1st jmp short .Yloop_1st
alignx 16, 7 ALIGNX 16, 7
.Yloop_2nd: .Yloop_2nd:
movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H
movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H
movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H
alignx 16, 7 ALIGNX 16, 7
.Yloop_1st: .Yloop_1st:
movq mm7, MMWORD [esi] ; mm7=Y(01234567) movq mm7, MMWORD [esi] ; mm7=Y(01234567)
@@ -258,7 +258,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
add ebx, byte SIZEOF_MMWORD ; inptr1 add ebx, byte SIZEOF_MMWORD ; inptr1
add edx, byte SIZEOF_MMWORD ; inptr2 add edx, byte SIZEOF_MMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16, 7 ALIGNX 16, 7
.column_st16: .column_st16:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
@@ -350,7 +350,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
add ebx, byte SIZEOF_MMWORD ; inptr1 add ebx, byte SIZEOF_MMWORD ; inptr1
add edx, byte SIZEOF_MMWORD ; inptr2 add edx, byte SIZEOF_MMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16, 7 ALIGNX 16, 7
.column_st16: .column_st16:
cmp ecx, byte SIZEOF_MMWORD/2 cmp ecx, byte SIZEOF_MMWORD/2

View File

@@ -2,7 +2,7 @@
; jdmrgext.asm - merged upsampling/color conversion (SSE2) ; jdmrgext.asm - merged upsampling/color conversion (SSE2)
; ;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2012, 2016, D. R. Commander. ; Copyright (C) 2012, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,15 +49,15 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address PUSHPIC eax ; make a room for GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address MOVPIC POINTER [gotptr], ebx ; save GOT address
mov ecx, JDIMENSION [output_width(eax)] ; col mov ecx, JDIMENSION [output_width(eax)] ; col
test ecx, ecx test ecx, ecx
@@ -78,9 +78,9 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
pop ecx ; col pop ecx ; col
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movpic eax, POINTER [gotptr] ; load GOT address (eax) MOVPIC eax, POINTER [gotptr] ; load GOT address (eax)
movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF) movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF) movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
@@ -173,13 +173,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
mov al, 2 ; Yctr mov al, 2 ; Yctr
jmp short .Yloop_1st jmp short .Yloop_1st
alignx 16, 7 ALIGNX 16, 7
.Yloop_2nd: .Yloop_2nd:
movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
alignx 16, 7 ALIGNX 16, 7
.Yloop_1st: .Yloop_1st:
movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF) movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
@@ -280,7 +280,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
add ebx, byte SIZEOF_XMMWORD ; inptr1 add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2 add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16, 7 ALIGNX 16, 7
.column_st32: .column_st32:
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
@@ -395,7 +395,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
add ebx, byte SIZEOF_XMMWORD ; inptr1 add ebx, byte SIZEOF_XMMWORD ; inptr1
add edx, byte SIZEOF_XMMWORD ; inptr2 add edx, byte SIZEOF_XMMWORD ; inptr2
jmp near .columnloop jmp near .columnloop
alignx 16, 7 ALIGNX 16, 7
.column_st32: .column_st32:
cmp ecx, byte SIZEOF_XMMWORD/2 cmp ecx, byte SIZEOF_XMMWORD/2

View File

@@ -3,7 +3,7 @@
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -20,7 +20,7 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fancy_upsample_avx2) GLOBAL_DATA(jconst_fancy_upsample_avx2)
EXTN(jconst_fancy_upsample_avx2): EXTN(jconst_fancy_upsample_avx2):
@@ -31,7 +31,7 @@ PW_THREE times 16 dw 3
PW_SEVEN times 16 dw 7 PW_SEVEN times 16 dw 7
PW_EIGHT times 16 dw 8 PW_EIGHT times 16 dw 8
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -62,13 +62,13 @@ PW_EIGHT times 16 dw 8
EXTN(jsimd_h2v1_fancy_upsample_avx2): EXTN(jsimd_h2v1_fancy_upsample_avx2):
push ebp push ebp
mov ebp, esp mov ebp, esp
pushpic ebx PUSHPIC ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
test eax, eax test eax, eax
@@ -81,7 +81,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)] mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push eax ; colctr push eax ; colctr
push edi push edi
@@ -104,7 +104,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
and eax, byte -SIZEOF_YMMWORD and eax, byte -SIZEOF_YMMWORD
cmp eax, byte SIZEOF_YMMWORD cmp eax, byte SIZEOF_YMMWORD
ja short .columnloop ja short .columnloop
alignx 16, 7 ALIGNX 16, 7
.columnloop_last: .columnloop_last:
vpcmpeqb xmm6, xmm6, xmm6 vpcmpeqb xmm6, xmm6, xmm6
@@ -112,7 +112,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
vperm2i128 ymm6, ymm6, ymm6, 1 ; (---- ---- ... ---- ---- ff) MSB is ff vperm2i128 ymm6, ymm6, ymm6, 1 ; (---- ---- ... ---- ---- ff) MSB is ff
vpand ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD] vpand ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD]
jmp short .upsample jmp short .upsample
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
vmovdqu ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD] vmovdqu ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD]
@@ -196,7 +196,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
poppic ebx POPPIC ebx
pop ebp pop ebp
ret ret
@@ -234,15 +234,15 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address PUSHPIC eax ; make a room for GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address MOVPIC POINTER [gotptr], ebx ; save GOT address
mov edx, eax ; edx = original ebp mov edx, eax ; edx = original ebp
mov eax, JDIMENSION [downsamp_width(edx)] ; colctr mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
@@ -256,7 +256,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
mov esi, JSAMPARRAY [input_data(edx)] ; input_data mov esi, JSAMPARRAY [input_data(edx)] ; input_data
mov edi, POINTER [output_data_ptr(edx)] mov edi, POINTER [output_data_ptr(edx)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push eax ; colctr push eax ; colctr
push ecx push ecx
@@ -286,8 +286,8 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
vmovdqu ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0] vmovdqu ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0]
vmovdqu ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0] vmovdqu ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0]
pushpic ebx PUSHPIC ebx
movpic ebx, POINTER [gotptr] ; load GOT address MOVPIC ebx, POINTER [gotptr] ; load GOT address
vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's) vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's)
@@ -328,19 +328,19 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
vmovdqa YMMWORD [wk(0)], ymm1 vmovdqa YMMWORD [wk(0)], ymm1
vmovdqa YMMWORD [wk(1)], ymm2 vmovdqa YMMWORD [wk(1)], ymm2
poppic ebx POPPIC ebx
add eax, byte SIZEOF_YMMWORD-1 add eax, byte SIZEOF_YMMWORD-1
and eax, byte -SIZEOF_YMMWORD and eax, byte -SIZEOF_YMMWORD
cmp eax, byte SIZEOF_YMMWORD cmp eax, byte SIZEOF_YMMWORD
ja short .columnloop ja short .columnloop
alignx 16, 7 ALIGNX 16, 7
.columnloop_last: .columnloop_last:
; -- process the last column block ; -- process the last column block
pushpic ebx PUSHPIC ebx
movpic ebx, POINTER [gotptr] ; load GOT address MOVPIC ebx, POINTER [gotptr] ; load GOT address
vpcmpeqb xmm1, xmm1, xmm1 vpcmpeqb xmm1, xmm1, xmm1
vpslldq xmm1, xmm1, (SIZEOF_XMMWORD-2) vpslldq xmm1, xmm1, (SIZEOF_XMMWORD-2)
@@ -353,7 +353,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31) vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
jmp near .upsample jmp near .upsample
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
; -- process the next column block ; -- process the next column block
@@ -362,8 +362,8 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
vmovdqu ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1] vmovdqu ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1]
vmovdqu ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1] vmovdqu ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1]
pushpic ebx PUSHPIC ebx
movpic ebx, POINTER [gotptr] ; load GOT address MOVPIC ebx, POINTER [gotptr] ; load GOT address
vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's) vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's)
@@ -516,7 +516,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1 vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0 vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0
poppic ebx POPPIC ebx
sub eax, byte SIZEOF_YMMWORD sub eax, byte SIZEOF_YMMWORD
add ecx, byte 1*SIZEOF_YMMWORD ; inptr1(above) add ecx, byte 1*SIZEOF_YMMWORD ; inptr1(above)
@@ -590,7 +590,7 @@ EXTN(jsimd_h2v1_upsample_avx2):
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)] mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push edi push edi
push esi push esi
@@ -598,7 +598,7 @@ EXTN(jsimd_h2v1_upsample_avx2):
mov esi, JSAMPROW [esi] ; inptr mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr mov edi, JSAMPROW [edi] ; outptr
mov eax, edx ; colctr mov eax, edx ; colctr
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
cmp eax, byte SIZEOF_YMMWORD cmp eax, byte SIZEOF_YMMWORD
@@ -629,7 +629,7 @@ EXTN(jsimd_h2v1_upsample_avx2):
add esi, byte SIZEOF_YMMWORD ; inptr add esi, byte SIZEOF_YMMWORD ; inptr
add edi, byte 2*SIZEOF_YMMWORD ; outptr add edi, byte 2*SIZEOF_YMMWORD ; outptr
jmp short .columnloop jmp short .columnloop
alignx 16, 7 ALIGNX 16, 7
.nextrow: .nextrow:
pop esi pop esi
@@ -689,7 +689,7 @@ EXTN(jsimd_h2v2_upsample_avx2):
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)] mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push edi push edi
push esi push esi
@@ -698,7 +698,7 @@ EXTN(jsimd_h2v2_upsample_avx2):
mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
mov eax, edx ; colctr mov eax, edx ; colctr
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
cmp eax, byte SIZEOF_YMMWORD cmp eax, byte SIZEOF_YMMWORD
@@ -734,7 +734,7 @@ EXTN(jsimd_h2v2_upsample_avx2):
add ebx, 2*SIZEOF_YMMWORD ; outptr0 add ebx, 2*SIZEOF_YMMWORD ; outptr0
add edi, 2*SIZEOF_YMMWORD ; outptr1 add edi, 2*SIZEOF_YMMWORD ; outptr1
jmp short .columnloop jmp short .columnloop
alignx 16, 7 ALIGNX 16, 7
.nextrow: .nextrow:
pop esi pop esi

View File

@@ -2,7 +2,7 @@
; jdsample.asm - upsampling (MMX) ; jdsample.asm - upsampling (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -19,7 +19,7 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fancy_upsample_mmx) GLOBAL_DATA(jconst_fancy_upsample_mmx)
EXTN(jconst_fancy_upsample_mmx): EXTN(jconst_fancy_upsample_mmx):
@@ -30,7 +30,7 @@ PW_THREE times 4 dw 3
PW_SEVEN times 4 dw 7 PW_SEVEN times 4 dw 7
PW_EIGHT times 4 dw 8 PW_EIGHT times 4 dw 8
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -61,13 +61,13 @@ PW_EIGHT times 4 dw 8
EXTN(jsimd_h2v1_fancy_upsample_mmx): EXTN(jsimd_h2v1_fancy_upsample_mmx):
push ebp push ebp
mov ebp, esp mov ebp, esp
pushpic ebx PUSHPIC ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
test eax, eax test eax, eax
@@ -80,7 +80,7 @@ EXTN(jsimd_h2v1_fancy_upsample_mmx):
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)] mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push eax ; colctr push eax ; colctr
push edi push edi
@@ -103,14 +103,14 @@ EXTN(jsimd_h2v1_fancy_upsample_mmx):
and eax, byte -SIZEOF_MMWORD and eax, byte -SIZEOF_MMWORD
cmp eax, byte SIZEOF_MMWORD cmp eax, byte SIZEOF_MMWORD
ja short .columnloop ja short .columnloop
alignx 16, 7 ALIGNX 16, 7
.columnloop_last: .columnloop_last:
pcmpeqb mm6, mm6 pcmpeqb mm6, mm6
psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] pand mm6, MMWORD [esi+0*SIZEOF_MMWORD]
jmp short .upsample jmp short .upsample
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movq mm6, MMWORD [esi+1*SIZEOF_MMWORD] movq mm6, MMWORD [esi+1*SIZEOF_MMWORD]
@@ -187,7 +187,7 @@ EXTN(jsimd_h2v1_fancy_upsample_mmx):
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
poppic ebx POPPIC ebx
pop ebp pop ebp
ret ret
@@ -224,15 +224,15 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address PUSHPIC eax ; make a room for GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address MOVPIC POINTER [gotptr], ebx ; save GOT address
mov edx, eax ; edx = original ebp mov edx, eax ; edx = original ebp
mov eax, JDIMENSION [downsamp_width(edx)] ; colctr mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
@@ -246,7 +246,7 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx):
mov esi, JSAMPARRAY [input_data(edx)] ; input_data mov esi, JSAMPARRAY [input_data(edx)] ; input_data
mov edi, POINTER [output_data_ptr(edx)] mov edi, POINTER [output_data_ptr(edx)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push eax ; colctr push eax ; colctr
push ecx push ecx
@@ -276,8 +276,8 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx):
movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0] movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0]
movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0] movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0]
pushpic ebx PUSHPIC ebx
movpic ebx, POINTER [gotptr] ; load GOT address MOVPIC ebx, POINTER [gotptr] ; load GOT address
pxor mm3, mm3 ; mm3=(all 0's) pxor mm3, mm3 ; mm3=(all 0's)
movq mm4, mm0 movq mm4, mm0
@@ -312,19 +312,19 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx):
movq MMWORD [wk(0)], mm1 movq MMWORD [wk(0)], mm1
movq MMWORD [wk(1)], mm2 movq MMWORD [wk(1)], mm2
poppic ebx POPPIC ebx
add eax, byte SIZEOF_MMWORD-1 add eax, byte SIZEOF_MMWORD-1
and eax, byte -SIZEOF_MMWORD and eax, byte -SIZEOF_MMWORD
cmp eax, byte SIZEOF_MMWORD cmp eax, byte SIZEOF_MMWORD
ja short .columnloop ja short .columnloop
alignx 16, 7 ALIGNX 16, 7
.columnloop_last: .columnloop_last:
; -- process the last column block ; -- process the last column block
pushpic ebx PUSHPIC ebx
movpic ebx, POINTER [gotptr] ; load GOT address MOVPIC ebx, POINTER [gotptr] ; load GOT address
pcmpeqb mm1, mm1 pcmpeqb mm1, mm1
psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT
@@ -337,7 +337,7 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx):
movq MMWORD [wk(3)], mm2 movq MMWORD [wk(3)], mm2
jmp short .upsample jmp short .upsample
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
; -- process the next column block ; -- process the next column block
@@ -346,8 +346,8 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx):
movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1] movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1]
movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1] movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1]
pushpic ebx PUSHPIC ebx
movpic ebx, POINTER [gotptr] ; load GOT address MOVPIC ebx, POINTER [gotptr] ; load GOT address
pxor mm3, mm3 ; mm3=(all 0's) pxor mm3, mm3 ; mm3=(all 0's)
movq mm4, mm0 movq mm4, mm0
@@ -486,7 +486,7 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx):
movq MMWORD [edi+0*SIZEOF_MMWORD], mm1 movq MMWORD [edi+0*SIZEOF_MMWORD], mm1
movq MMWORD [edi+1*SIZEOF_MMWORD], mm0 movq MMWORD [edi+1*SIZEOF_MMWORD], mm0
poppic ebx POPPIC ebx
sub eax, byte SIZEOF_MMWORD sub eax, byte SIZEOF_MMWORD
add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above) add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above)
@@ -561,7 +561,7 @@ EXTN(jsimd_h2v1_upsample_mmx):
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)] mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push edi push edi
push esi push esi
@@ -569,7 +569,7 @@ EXTN(jsimd_h2v1_upsample_mmx):
mov esi, JSAMPROW [esi] ; inptr mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr mov edi, JSAMPROW [edi] ; outptr
mov eax, edx ; colctr mov eax, edx ; colctr
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
@@ -599,7 +599,7 @@ EXTN(jsimd_h2v1_upsample_mmx):
add esi, byte 2*SIZEOF_MMWORD ; inptr add esi, byte 2*SIZEOF_MMWORD ; inptr
add edi, byte 4*SIZEOF_MMWORD ; outptr add edi, byte 4*SIZEOF_MMWORD ; outptr
jmp short .columnloop jmp short .columnloop
alignx 16, 7 ALIGNX 16, 7
.nextrow: .nextrow:
pop esi pop esi
@@ -660,7 +660,7 @@ EXTN(jsimd_h2v2_upsample_mmx):
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)] mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push edi push edi
push esi push esi
@@ -669,7 +669,7 @@ EXTN(jsimd_h2v2_upsample_mmx):
mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
mov eax, edx ; colctr mov eax, edx ; colctr
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
@@ -704,7 +704,7 @@ EXTN(jsimd_h2v2_upsample_mmx):
add ebx, byte 4*SIZEOF_MMWORD ; outptr0 add ebx, byte 4*SIZEOF_MMWORD ; outptr0
add edi, byte 4*SIZEOF_MMWORD ; outptr1 add edi, byte 4*SIZEOF_MMWORD ; outptr1
jmp short .columnloop jmp short .columnloop
alignx 16, 7 ALIGNX 16, 7
.nextrow: .nextrow:
pop esi pop esi

View File

@@ -2,7 +2,7 @@
; jdsample.asm - upsampling (SSE2) ; jdsample.asm - upsampling (SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -19,7 +19,7 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fancy_upsample_sse2) GLOBAL_DATA(jconst_fancy_upsample_sse2)
EXTN(jconst_fancy_upsample_sse2): EXTN(jconst_fancy_upsample_sse2):
@@ -30,7 +30,7 @@ PW_THREE times 8 dw 3
PW_SEVEN times 8 dw 7 PW_SEVEN times 8 dw 7
PW_EIGHT times 8 dw 8 PW_EIGHT times 8 dw 8
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -61,13 +61,13 @@ PW_EIGHT times 8 dw 8
EXTN(jsimd_h2v1_fancy_upsample_sse2): EXTN(jsimd_h2v1_fancy_upsample_sse2):
push ebp push ebp
mov ebp, esp mov ebp, esp
pushpic ebx PUSHPIC ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
test eax, eax test eax, eax
@@ -80,7 +80,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)] mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push eax ; colctr push eax ; colctr
push edi push edi
@@ -103,14 +103,14 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
and eax, byte -SIZEOF_XMMWORD and eax, byte -SIZEOF_XMMWORD
cmp eax, byte SIZEOF_XMMWORD cmp eax, byte SIZEOF_XMMWORD
ja short .columnloop ja short .columnloop
alignx 16, 7 ALIGNX 16, 7
.columnloop_last: .columnloop_last:
pcmpeqb xmm6, xmm6 pcmpeqb xmm6, xmm6
pslldq xmm6, (SIZEOF_XMMWORD-1) pslldq xmm6, (SIZEOF_XMMWORD-1)
pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
jmp short .upsample jmp short .upsample
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
@@ -185,7 +185,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
poppic ebx POPPIC ebx
pop ebp pop ebp
ret ret
@@ -223,15 +223,15 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic eax ; make a room for GOT address PUSHPIC eax ; make a room for GOT address
push ebx push ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
movpic POINTER [gotptr], ebx ; save GOT address MOVPIC POINTER [gotptr], ebx ; save GOT address
mov edx, eax ; edx = original ebp mov edx, eax ; edx = original ebp
mov eax, JDIMENSION [downsamp_width(edx)] ; colctr mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
@@ -245,7 +245,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
mov esi, JSAMPARRAY [input_data(edx)] ; input_data mov esi, JSAMPARRAY [input_data(edx)] ; input_data
mov edi, POINTER [output_data_ptr(edx)] mov edi, POINTER [output_data_ptr(edx)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push eax ; colctr push eax ; colctr
push ecx push ecx
@@ -275,8 +275,8 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
pushpic ebx PUSHPIC ebx
movpic ebx, POINTER [gotptr] ; load GOT address MOVPIC ebx, POINTER [gotptr] ; load GOT address
pxor xmm3, xmm3 ; xmm3=(all 0's) pxor xmm3, xmm3 ; xmm3=(all 0's)
movdqa xmm4, xmm0 movdqa xmm4, xmm0
@@ -311,19 +311,19 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
movdqa XMMWORD [wk(0)], xmm1 movdqa XMMWORD [wk(0)], xmm1
movdqa XMMWORD [wk(1)], xmm2 movdqa XMMWORD [wk(1)], xmm2
poppic ebx POPPIC ebx
add eax, byte SIZEOF_XMMWORD-1 add eax, byte SIZEOF_XMMWORD-1
and eax, byte -SIZEOF_XMMWORD and eax, byte -SIZEOF_XMMWORD
cmp eax, byte SIZEOF_XMMWORD cmp eax, byte SIZEOF_XMMWORD
ja short .columnloop ja short .columnloop
alignx 16, 7 ALIGNX 16, 7
.columnloop_last: .columnloop_last:
; -- process the last column block ; -- process the last column block
pushpic ebx PUSHPIC ebx
movpic ebx, POINTER [gotptr] ; load GOT address MOVPIC ebx, POINTER [gotptr] ; load GOT address
pcmpeqb xmm1, xmm1 pcmpeqb xmm1, xmm1
pslldq xmm1, (SIZEOF_XMMWORD-2) pslldq xmm1, (SIZEOF_XMMWORD-2)
@@ -336,7 +336,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
jmp near .upsample jmp near .upsample
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
; -- process the next column block ; -- process the next column block
@@ -345,8 +345,8 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
pushpic ebx PUSHPIC ebx
movpic ebx, POINTER [gotptr] ; load GOT address MOVPIC ebx, POINTER [gotptr] ; load GOT address
pxor xmm3, xmm3 ; xmm3=(all 0's) pxor xmm3, xmm3 ; xmm3=(all 0's)
movdqa xmm4, xmm0 movdqa xmm4, xmm0
@@ -485,7 +485,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
poppic ebx POPPIC ebx
sub eax, byte SIZEOF_XMMWORD sub eax, byte SIZEOF_XMMWORD
add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above) add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
@@ -558,7 +558,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)] mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push edi push edi
push esi push esi
@@ -566,7 +566,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
mov esi, JSAMPROW [esi] ; inptr mov esi, JSAMPROW [esi] ; inptr
mov edi, JSAMPROW [edi] ; outptr mov edi, JSAMPROW [edi] ; outptr
mov eax, edx ; colctr mov eax, edx ; colctr
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -596,7 +596,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
add esi, byte 2*SIZEOF_XMMWORD ; inptr add esi, byte 2*SIZEOF_XMMWORD ; inptr
add edi, byte 4*SIZEOF_XMMWORD ; outptr add edi, byte 4*SIZEOF_XMMWORD ; outptr
jmp short .columnloop jmp short .columnloop
alignx 16, 7 ALIGNX 16, 7
.nextrow: .nextrow:
pop esi pop esi
@@ -655,7 +655,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
mov esi, JSAMPARRAY [input_data(ebp)] ; input_data mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
mov edi, POINTER [output_data_ptr(ebp)] mov edi, POINTER [output_data_ptr(ebp)]
mov edi, JSAMPARRAY [edi] ; output_data mov edi, JSAMPARRAY [edi] ; output_data
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
push edi push edi
push esi push esi
@@ -664,7 +664,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
mov eax, edx ; colctr mov eax, edx ; colctr
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -699,7 +699,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
add ebx, byte 4*SIZEOF_XMMWORD ; outptr0 add ebx, byte 4*SIZEOF_XMMWORD ; outptr0
add edi, byte 4*SIZEOF_XMMWORD ; outptr1 add edi, byte 4*SIZEOF_XMMWORD ; outptr1
jmp short .columnloop jmp short .columnloop
alignx 16, 7 ALIGNX 16, 7
.nextrow: .nextrow:
pop esi pop esi

View File

@@ -2,7 +2,7 @@
; jfdctflt.asm - floating-point FDCT (3DNow!) ; jfdctflt.asm - floating-point FDCT (3DNow!)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -24,7 +24,7 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fdct_float_3dnow) GLOBAL_DATA(jconst_fdct_float_3dnow)
EXTN(jconst_fdct_float_3dnow): EXTN(jconst_fdct_float_3dnow):
@@ -34,7 +34,7 @@ PD_0_707 times 2 dd 0.707106781186547524400844
PD_0_541 times 2 dd 0.541196100146196984399723 PD_0_541 times 2 dd 0.541196100146196984399723
PD_1_306 times 2 dd 1.306562964876376527856643 PD_1_306 times 2 dd 1.306562964876376527856643
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -63,19 +63,19 @@ EXTN(jsimd_fdct_float_3dnow):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx PUSHPIC ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
; push esi ; unused ; push esi ; unused
; push edi ; unused ; push edi ; unused
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/2 mov ecx, DCTSIZE/2
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
@@ -190,7 +190,7 @@ EXTN(jsimd_fdct_float_3dnow):
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/2 mov ecx, DCTSIZE/2
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
@@ -307,7 +307,7 @@ EXTN(jsimd_fdct_float_3dnow):
; pop esi ; unused ; pop esi ; unused
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
poppic ebx POPPIC ebx
mov esp, ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp

View File

@@ -2,7 +2,7 @@
; jfdctflt.asm - floating-point FDCT (SSE) ; jfdctflt.asm - floating-point FDCT (SSE)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -34,7 +34,7 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fdct_float_sse) GLOBAL_DATA(jconst_fdct_float_sse)
EXTN(jconst_fdct_float_sse): EXTN(jconst_fdct_float_sse):
@@ -44,7 +44,7 @@ PD_0_707 times 4 dd 0.707106781186547524400844
PD_0_541 times 4 dd 0.541196100146196984399723 PD_0_541 times 4 dd 0.541196100146196984399723
PD_1_306 times 4 dd 1.306562964876376527856643 PD_1_306 times 4 dd 1.306562964876376527856643
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -74,19 +74,19 @@ EXTN(jsimd_fdct_float_sse):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx PUSHPIC ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
; push esi ; unused ; push esi ; unused
; push edi ; unused ; push edi ; unused
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/4 mov ecx, DCTSIZE/4
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
@@ -222,7 +222,7 @@ EXTN(jsimd_fdct_float_sse):
mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE/4 mov ecx, DCTSIZE/4
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
@@ -358,7 +358,7 @@ EXTN(jsimd_fdct_float_sse):
; pop esi ; unused ; pop esi ; unused
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
poppic ebx POPPIC ebx
mov esp, ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp

View File

@@ -2,7 +2,7 @@
; jfdctfst.asm - fast integer FDCT (MMX) ; jfdctfst.asm - fast integer FDCT (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,7 +49,7 @@ F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
%define PRE_MULTIPLY_SCALE_BITS 2 %define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fdct_ifast_mmx) GLOBAL_DATA(jconst_fdct_ifast_mmx)
EXTN(jconst_fdct_ifast_mmx): EXTN(jconst_fdct_ifast_mmx):
@@ -59,7 +59,7 @@ PW_F0382 times 4 dw F_0_382 << CONST_SHIFT
PW_F0541 times 4 dw F_0_541 << CONST_SHIFT PW_F0541 times 4 dw F_0_541 << CONST_SHIFT
PW_F1306 times 4 dw F_1_306 << CONST_SHIFT PW_F1306 times 4 dw F_1_306 << CONST_SHIFT
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -88,19 +88,19 @@ EXTN(jsimd_fdct_ifast_mmx):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx PUSHPIC ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
; push esi ; unused ; push esi ; unused
; push edi ; unused ; push edi ; unused
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
mov edx, POINTER [data(eax)] ; (DCTELEM *) mov edx, POINTER [data(eax)] ; (DCTELEM *)
mov ecx, DCTSIZE/4 mov ecx, DCTSIZE/4
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
@@ -241,7 +241,7 @@ EXTN(jsimd_fdct_ifast_mmx):
mov edx, POINTER [data(eax)] ; (DCTELEM *) mov edx, POINTER [data(eax)] ; (DCTELEM *)
mov ecx, DCTSIZE/4 mov ecx, DCTSIZE/4
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
@@ -384,7 +384,7 @@ EXTN(jsimd_fdct_ifast_mmx):
; pop esi ; unused ; pop esi ; unused
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
poppic ebx POPPIC ebx
mov esp, ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp

View File

@@ -2,7 +2,7 @@
; jfdctfst.asm - fast integer FDCT (SSE2) ; jfdctfst.asm - fast integer FDCT (SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,7 +49,7 @@ F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
%define PRE_MULTIPLY_SCALE_BITS 2 %define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fdct_ifast_sse2) GLOBAL_DATA(jconst_fdct_ifast_sse2)
EXTN(jconst_fdct_ifast_sse2): EXTN(jconst_fdct_ifast_sse2):
@@ -59,7 +59,7 @@ PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
PW_F0541 times 8 dw F_0_541 << CONST_SHIFT PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
PW_F1306 times 8 dw F_1_306 << CONST_SHIFT PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -89,13 +89,13 @@ EXTN(jsimd_fdct_ifast_sse2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx PUSHPIC ebx
; push ecx ; unused ; push ecx ; unused
; push edx ; need not be preserved ; push edx ; need not be preserved
; push esi ; unused ; push esi ; unused
; push edi ; unused ; push edi ; unused
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
@@ -392,7 +392,7 @@ EXTN(jsimd_fdct_ifast_sse2):
; pop esi ; unused ; pop esi ; unused
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
poppic ebx POPPIC ebx
mov esp, ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp

View File

@@ -2,7 +2,7 @@
; jfdctint.asm - accurate integer FDCT (AVX2) ; jfdctint.asm - accurate integer FDCT (AVX2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. ; Copyright (C) 2009, 2016, 2018, 2020, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -65,7 +65,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; %1-%4: Input/output registers ; %1-%4: Input/output registers
; %5-%8: Temp registers ; %5-%8: Temp registers
%macro dotranspose 8 %macro DOTRANSPOSE 8
; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
@@ -108,7 +108,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; %5-%8: Temp registers ; %5-%8: Temp registers
; %9: Pass (1 or 2) ; %9: Pass (1 or 2)
%macro dodct 9 %macro DODCT 9
vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7 vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7
vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0 vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0
vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2 vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2
@@ -223,7 +223,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fdct_islow_avx2) GLOBAL_DATA(jconst_fdct_islow_avx2)
EXTN(jconst_fdct_islow_avx2): EXTN(jconst_fdct_islow_avx2):
@@ -242,7 +242,7 @@ PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1)
PW_1_NEG1 times 8 dw 1 PW_1_NEG1 times 8 dw 1
times 8 dw -1 times 8 dw -1
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -262,13 +262,13 @@ PW_1_NEG1 times 8 dw 1
EXTN(jsimd_fdct_islow_avx2): EXTN(jsimd_fdct_islow_avx2):
push ebp push ebp
mov ebp, esp mov ebp, esp
pushpic ebx PUSHPIC ebx
; push ecx ; unused ; push ecx ; unused
; push edx ; need not be preserved ; push edx ; need not be preserved
; push esi ; unused ; push esi ; unused
; push edi ; unused ; push edi ; unused
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
@@ -292,9 +292,9 @@ EXTN(jsimd_fdct_islow_avx2):
; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) ; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) ; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 DOTRANSPOSE ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1 DODCT ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5 ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
; ---- Pass 2: process columns. ; ---- Pass 2: process columns.
@@ -302,9 +302,9 @@ EXTN(jsimd_fdct_islow_avx2):
vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7 vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7
vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5 vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5
dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 DOTRANSPOSE ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2 DODCT ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5 ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1 vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1
@@ -322,7 +322,7 @@ EXTN(jsimd_fdct_islow_avx2):
; pop esi ; unused ; pop esi ; unused
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
poppic ebx POPPIC ebx
pop ebp pop ebp
ret ret

View File

@@ -2,7 +2,7 @@
; jfdctint.asm - accurate integer FDCT (MMX) ; jfdctint.asm - accurate integer FDCT (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2020, D. R. Commander. ; Copyright (C) 2016, 2020, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -63,7 +63,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fdct_islow_mmx) GLOBAL_DATA(jconst_fdct_islow_mmx)
EXTN(jconst_fdct_islow_mmx): EXTN(jconst_fdct_islow_mmx):
@@ -80,7 +80,7 @@ PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1)
PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1) PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1)
PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS - 1) PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS - 1)
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -109,19 +109,19 @@ EXTN(jsimd_fdct_islow_mmx):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx PUSHPIC ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
; push esi ; unused ; push esi ; unused
; push edi ; unused ; push edi ; unused
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
mov edx, POINTER [data(eax)] ; (DCTELEM *) mov edx, POINTER [data(eax)] ; (DCTELEM *)
mov ecx, DCTSIZE/4 mov ecx, DCTSIZE/4
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
@@ -363,7 +363,7 @@ EXTN(jsimd_fdct_islow_mmx):
mov edx, POINTER [data(eax)] ; (DCTELEM *) mov edx, POINTER [data(eax)] ; (DCTELEM *)
mov ecx, DCTSIZE/4 mov ecx, DCTSIZE/4
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
@@ -609,7 +609,7 @@ EXTN(jsimd_fdct_islow_mmx):
; pop esi ; unused ; pop esi ; unused
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
poppic ebx POPPIC ebx
mov esp, ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp

View File

@@ -2,7 +2,7 @@
; jfdctint.asm - accurate integer FDCT (SSE2) ; jfdctint.asm - accurate integer FDCT (SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2020, D. R. Commander. ; Copyright (C) 2016, 2020, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -63,7 +63,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fdct_islow_sse2) GLOBAL_DATA(jconst_fdct_islow_sse2)
EXTN(jconst_fdct_islow_sse2): EXTN(jconst_fdct_islow_sse2):
@@ -80,7 +80,7 @@ PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1) PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1)
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -110,13 +110,13 @@ EXTN(jsimd_fdct_islow_sse2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx PUSHPIC ebx
; push ecx ; unused ; push ecx ; unused
; push edx ; need not be preserved ; push edx ; need not be preserved
; push esi ; unused ; push esi ; unused
; push edi ; unused ; push edi ; unused
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
@@ -622,7 +622,7 @@ EXTN(jsimd_fdct_islow_sse2):
; pop esi ; unused ; pop esi ; unused
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
poppic ebx POPPIC ebx
mov esp, ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp

View File

@@ -2,7 +2,7 @@
; jidctflt.asm - floating-point IDCT (3DNow! & MMX) ; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -24,7 +24,7 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_idct_float_3dnow) GLOBAL_DATA(jconst_idct_float_3dnow)
EXTN(jconst_idct_float_3dnow): EXTN(jconst_idct_float_3dnow):
@@ -36,7 +36,7 @@ PD_2_613 times 2 dd 2.613125929752753055713286
PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3) PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP times 8 db CENTERJSAMPLE PB_CENTERJSAMP times 8 db CENTERJSAMPLE
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -78,7 +78,7 @@ EXTN(jsimd_idct_float_3dnow):
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input, store into work array. ; ---- Pass 1: process columns from input, store into work array.
@@ -87,21 +87,21 @@ EXTN(jsimd_idct_float_3dnow):
mov esi, JCOEFPTR [coef_block(eax)] ; inptr mov esi, JCOEFPTR [coef_block(eax)] ; inptr
lea edi, [workspace] ; FAST_FLOAT *wsptr lea edi, [workspace] ; FAST_FLOAT *wsptr
mov ecx, DCTSIZE/2 ; ctr mov ecx, DCTSIZE/2 ; ctr
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW %ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz short .columnDCT jnz short .columnDCT
pushpic ebx ; save GOT address PUSHPIC ebx ; save GOT address
mov ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] mov ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
mov eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] mov eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
or ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] or ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
or eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] or eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
or ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] or ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
or eax, ebx or eax, ebx
poppic ebx ; restore GOT address POPPIC ebx ; restore GOT address
jnz short .columnDCT jnz short .columnDCT
; -- AC terms all zero ; -- AC terms all zero
@@ -127,7 +127,7 @@ EXTN(jsimd_idct_float_3dnow):
movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1 movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
jmp near .nextcolumn jmp near .nextcolumn
alignx 16, 7 ALIGNX 16, 7
%endif %endif
.columnDCT: .columnDCT:
@@ -293,7 +293,7 @@ EXTN(jsimd_idct_float_3dnow):
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(eax)] mov eax, JDIMENSION [output_col(eax)]
mov ecx, DCTSIZE/2 ; ctr mov ecx, DCTSIZE/2 ; ctr
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
; -- Even part ; -- Even part
@@ -420,14 +420,14 @@ EXTN(jsimd_idct_float_3dnow):
punpckldq mm6, mm4 ; mm6=(00 01 02 03 04 05 06 07) punpckldq mm6, mm4 ; mm6=(00 01 02 03 04 05 06 07)
punpckhdq mm7, mm4 ; mm7=(10 11 12 13 14 15 16 17) punpckhdq mm7, mm4 ; mm7=(10 11 12 13 14 15 16 17)
pushpic ebx ; save GOT address PUSHPIC ebx ; save GOT address
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
poppic ebx ; restore GOT address POPPIC ebx ; restore GOT address
add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr
add edi, byte 2*SIZEOF_JSAMPROW add edi, byte 2*SIZEOF_JSAMPROW

View File

@@ -2,7 +2,7 @@
; jidctflt.asm - floating-point IDCT (SSE & MMX) ; jidctflt.asm - floating-point IDCT (SSE & MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -23,18 +23,18 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) %macro UNPCKLPS2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1, %2, 0x44 shufps %1, %2, 0x44
%endmacro %endmacro
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) %macro UNPCKHPS2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1, %2, 0xEE shufps %1, %2, 0xEE
%endmacro %endmacro
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_idct_float_sse) GLOBAL_DATA(jconst_idct_float_sse)
EXTN(jconst_idct_float_sse): EXTN(jconst_idct_float_sse):
@@ -46,7 +46,7 @@ PD_M2_613 times 4 dd -2.613125929752753055713286
PD_0_125 times 4 dd 0.125 ; 1/8 PD_0_125 times 4 dd 0.125 ; 1/8
PB_CENTERJSAMP times 8 db CENTERJSAMPLE PB_CENTERJSAMP times 8 db CENTERJSAMPLE
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -88,7 +88,7 @@ EXTN(jsimd_idct_float_sse):
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input, store into work array. ; ---- Pass 1: process columns from input, store into work array.
@@ -97,7 +97,7 @@ EXTN(jsimd_idct_float_sse):
mov esi, JCOEFPTR [coef_block(eax)] ; inptr mov esi, JCOEFPTR [coef_block(eax)] ; inptr
lea edi, [workspace] ; FAST_FLOAT *wsptr lea edi, [workspace] ; FAST_FLOAT *wsptr
mov ecx, DCTSIZE/4 ; ctr mov ecx, DCTSIZE/4 ; ctr
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -149,7 +149,7 @@ EXTN(jsimd_idct_float_sse):
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
jmp near .nextcolumn jmp near .nextcolumn
alignx 16, 7 ALIGNX 16, 7
%endif %endif
.columnDCT: .columnDCT:
@@ -325,11 +325,11 @@ EXTN(jsimd_idct_float_sse):
unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53) unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
movaps xmm3, xmm6 ; transpose coefficients(phase 2) movaps xmm3, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) UNPCKLPS2 xmm6, xmm7 ; xmm6=(00 10 20 30)
unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) UNPCKHPS2 xmm3, xmm7 ; xmm3=(01 11 21 31)
movaps xmm0, xmm1 ; transpose coefficients(phase 2) movaps xmm0, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) UNPCKLPS2 xmm1, xmm2 ; xmm1=(02 12 22 32)
unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) UNPCKHPS2 xmm0, xmm2 ; xmm0=(03 13 23 33)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
@@ -340,11 +340,11 @@ EXTN(jsimd_idct_float_sse):
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps xmm6, xmm5 ; transpose coefficients(phase 2) movaps xmm6, xmm5 ; transpose coefficients(phase 2)
unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) UNPCKLPS2 xmm5, xmm7 ; xmm5=(40 50 60 70)
unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) UNPCKHPS2 xmm6, xmm7 ; xmm6=(41 51 61 71)
movaps xmm3, xmm4 ; transpose coefficients(phase 2) movaps xmm3, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) UNPCKLPS2 xmm4, xmm2 ; xmm4=(42 52 62 72)
unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) UNPCKHPS2 xmm3, xmm2 ; xmm3=(43 53 63 73)
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
@@ -372,7 +372,7 @@ EXTN(jsimd_idct_float_sse):
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(eax)] mov eax, JDIMENSION [output_col(eax)]
mov ecx, DCTSIZE/4 ; ctr mov ecx, DCTSIZE/4 ; ctr
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
; -- Even part ; -- Even part
@@ -536,7 +536,7 @@ EXTN(jsimd_idct_float_sse):
punpckldq mm5, mm6 ; mm5=(20 21 22 23 24 25 26 27) punpckldq mm5, mm6 ; mm5=(20 21 22 23 24 25 26 27)
punpckhdq mm4, mm6 ; mm4=(30 31 32 33 34 35 36 37) punpckhdq mm4, mm6 ; mm4=(30 31 32 33 34 35 36 37)
pushpic ebx ; save GOT address PUSHPIC ebx ; save GOT address
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
@@ -547,7 +547,7 @@ EXTN(jsimd_idct_float_sse):
movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
poppic ebx ; restore GOT address POPPIC ebx ; restore GOT address
add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
add edi, byte 4*SIZEOF_JSAMPROW add edi, byte 4*SIZEOF_JSAMPROW

View File

@@ -2,7 +2,7 @@
; jidctflt.asm - floating-point IDCT (SSE & SSE2) ; jidctflt.asm - floating-point IDCT (SSE & SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -23,18 +23,18 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) %macro UNPCKLPS2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1, %2, 0x44 shufps %1, %2, 0x44
%endmacro %endmacro
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) %macro UNPCKHPS2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1, %2, 0xEE shufps %1, %2, 0xEE
%endmacro %endmacro
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_idct_float_sse2) GLOBAL_DATA(jconst_idct_float_sse2)
EXTN(jconst_idct_float_sse2): EXTN(jconst_idct_float_sse2):
@@ -46,7 +46,7 @@ PD_M2_613 times 4 dd -2.613125929752753055713286
PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -88,7 +88,7 @@ EXTN(jsimd_idct_float_sse2):
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input, store into work array. ; ---- Pass 1: process columns from input, store into work array.
@@ -97,7 +97,7 @@ EXTN(jsimd_idct_float_sse2):
mov esi, JCOEFPTR [coef_block(eax)] ; inptr mov esi, JCOEFPTR [coef_block(eax)] ; inptr
lea edi, [workspace] ; FAST_FLOAT *wsptr lea edi, [workspace] ; FAST_FLOAT *wsptr
mov ecx, DCTSIZE/4 ; ctr mov ecx, DCTSIZE/4 ; ctr
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -150,7 +150,7 @@ EXTN(jsimd_idct_float_sse2):
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
jmp near .nextcolumn jmp near .nextcolumn
alignx 16, 7 ALIGNX 16, 7
%endif %endif
.columnDCT: .columnDCT:
@@ -287,11 +287,11 @@ EXTN(jsimd_idct_float_sse2):
unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53) unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
movaps xmm3, xmm6 ; transpose coefficients(phase 2) movaps xmm3, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) UNPCKLPS2 xmm6, xmm7 ; xmm6=(00 10 20 30)
unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) UNPCKHPS2 xmm3, xmm7 ; xmm3=(01 11 21 31)
movaps xmm0, xmm1 ; transpose coefficients(phase 2) movaps xmm0, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) UNPCKLPS2 xmm1, xmm2 ; xmm1=(02 12 22 32)
unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) UNPCKHPS2 xmm0, xmm2 ; xmm0=(03 13 23 33)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
@@ -302,11 +302,11 @@ EXTN(jsimd_idct_float_sse2):
movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
movaps xmm6, xmm5 ; transpose coefficients(phase 2) movaps xmm6, xmm5 ; transpose coefficients(phase 2)
unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) UNPCKLPS2 xmm5, xmm7 ; xmm5=(40 50 60 70)
unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) UNPCKHPS2 xmm6, xmm7 ; xmm6=(41 51 61 71)
movaps xmm3, xmm4 ; transpose coefficients(phase 2) movaps xmm3, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) UNPCKLPS2 xmm4, xmm2 ; xmm4=(42 52 62 72)
unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) UNPCKHPS2 xmm3, xmm2 ; xmm3=(43 53 63 73)
movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
@@ -334,7 +334,7 @@ EXTN(jsimd_idct_float_sse2):
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(eax)] mov eax, JDIMENSION [output_col(eax)]
mov ecx, DCTSIZE/4 ; ctr mov ecx, DCTSIZE/4 ; ctr
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
; -- Even part ; -- Even part
@@ -464,7 +464,7 @@ EXTN(jsimd_idct_float_sse2):
pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
pushpic ebx ; save GOT address PUSHPIC ebx ; save GOT address
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
@@ -475,7 +475,7 @@ EXTN(jsimd_idct_float_sse2):
movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
poppic ebx ; restore GOT address POPPIC ebx ; restore GOT address
add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
add edi, byte 4*SIZEOF_JSAMPROW add edi, byte 4*SIZEOF_JSAMPROW

View File

@@ -2,7 +2,7 @@
; jidctfst.asm - fast integer IDCT (MMX) ; jidctfst.asm - fast integer IDCT (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -56,7 +56,7 @@ F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
%define PRE_MULTIPLY_SCALE_BITS 2 %define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_idct_ifast_mmx) GLOBAL_DATA(jconst_idct_ifast_mmx)
EXTN(jconst_idct_ifast_mmx): EXTN(jconst_idct_ifast_mmx):
@@ -67,7 +67,7 @@ PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT
PW_F1082 times 4 dw F_1_082 << CONST_SHIFT PW_F1082 times 4 dw F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 8 db CENTERJSAMPLE PB_CENTERJSAMP times 8 db CENTERJSAMPLE
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -109,7 +109,7 @@ EXTN(jsimd_idct_ifast_mmx):
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input, store into work array. ; ---- Pass 1: process columns from input, store into work array.
@@ -118,7 +118,7 @@ EXTN(jsimd_idct_ifast_mmx):
mov esi, JCOEFPTR [coef_block(eax)] ; inptr mov esi, JCOEFPTR [coef_block(eax)] ; inptr
lea edi, [workspace] ; JCOEF *wsptr lea edi, [workspace] ; JCOEF *wsptr
mov ecx, DCTSIZE/4 ; ctr mov ecx, DCTSIZE/4 ; ctr
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX %ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -163,7 +163,7 @@ EXTN(jsimd_idct_ifast_mmx):
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
jmp near .nextcolumn jmp near .nextcolumn
alignx 16, 7 ALIGNX 16, 7
%endif %endif
.columnDCT: .columnDCT:
@@ -326,7 +326,7 @@ EXTN(jsimd_idct_ifast_mmx):
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(eax)] mov eax, JDIMENSION [output_col(eax)]
mov ecx, DCTSIZE/4 ; ctr mov ecx, DCTSIZE/4 ; ctr
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
; -- Even part ; -- Even part
@@ -464,7 +464,7 @@ EXTN(jsimd_idct_ifast_mmx):
punpckldq mm5, mm4 ; mm5=(20 21 22 23 24 25 26 27) punpckldq mm5, mm4 ; mm5=(20 21 22 23 24 25 26 27)
punpckhdq mm1, mm4 ; mm1=(30 31 32 33 34 35 36 37) punpckhdq mm1, mm4 ; mm1=(30 31 32 33 34 35 36 37)
pushpic ebx ; save GOT address PUSHPIC ebx ; save GOT address
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
@@ -475,7 +475,7 @@ EXTN(jsimd_idct_ifast_mmx):
movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
poppic ebx ; restore GOT address POPPIC ebx ; restore GOT address
add esi, byte 4*SIZEOF_JCOEF ; wsptr add esi, byte 4*SIZEOF_JCOEF ; wsptr
add edi, byte 4*SIZEOF_JSAMPROW add edi, byte 4*SIZEOF_JSAMPROW

View File

@@ -2,7 +2,7 @@
; jidctfst.asm - fast integer IDCT (SSE2) ; jidctfst.asm - fast integer IDCT (SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -56,7 +56,7 @@ F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
%define PRE_MULTIPLY_SCALE_BITS 2 %define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_idct_ifast_sse2) GLOBAL_DATA(jconst_idct_ifast_sse2)
EXTN(jconst_idct_ifast_sse2): EXTN(jconst_idct_ifast_sse2):
@@ -67,7 +67,7 @@ PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
PW_F1082 times 8 dw F_1_082 << CONST_SHIFT PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 16 db CENTERJSAMPLE PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -101,13 +101,13 @@ EXTN(jsimd_idct_ifast_sse2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx PUSHPIC ebx
; push ecx ; unused ; push ecx ; unused
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
@@ -155,7 +155,7 @@ EXTN(jsimd_idct_ifast_sse2):
movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
jmp near .column_end jmp near .column_end
alignx 16, 7 ALIGNX 16, 7
%endif %endif
.columnDCT: .columnDCT:
@@ -490,7 +490,7 @@ EXTN(jsimd_idct_ifast_sse2):
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
poppic ebx POPPIC ebx
mov esp, ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp

View File

@@ -2,7 +2,7 @@
; jidctint.asm - accurate integer IDCT (AVX2) ; jidctint.asm - accurate integer IDCT (AVX2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. ; Copyright (C) 2009, 2016, 2018, 2020, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -65,7 +65,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; %1-%4: Input/output registers ; %1-%4: Input/output registers
; %5-%8: Temp registers ; %5-%8: Temp registers
%macro dotranspose 8 %macro DOTRANSPOSE 8
; %5=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71) ; %5=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71)
; %6=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72) ; %6=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
; %7=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75) ; %7=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
@@ -118,7 +118,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; %5-%12: Temp registers ; %5-%12: Temp registers
; %13: Pass (1 or 2) ; %13: Pass (1 or 2)
%macro dodct 13 %macro DODCT 13
; -- Even part ; -- Even part
; (Original) ; (Original)
@@ -250,7 +250,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_idct_islow_avx2) GLOBAL_DATA(jconst_idct_islow_avx2)
EXTN(jconst_idct_islow_avx2): EXTN(jconst_idct_islow_avx2):
@@ -269,7 +269,7 @@ PB_CENTERJSAMP times 32 db CENTERJSAMPLE
PW_1_NEG1 times 8 dw 1 PW_1_NEG1 times 8 dw 1
times 8 dw -1 times 8 dw -1
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -303,13 +303,13 @@ EXTN(jsimd_idct_islow_avx2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx PUSHPIC ebx
; push ecx ; unused ; push ecx ; unused
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process columns. ; ---- Pass 1: process columns.
@@ -353,7 +353,7 @@ EXTN(jsimd_idct_islow_avx2):
vpshufd ymm3, ymm4, 0xFF ; ymm3=col3_7=(03 03 03 03 03 03 03 03 07 07 07 07 07 07 07 07) vpshufd ymm3, ymm4, 0xFF ; ymm3=col3_7=(03 03 03 03 03 03 03 03 07 07 07 07 07 07 07 07)
jmp near .column_end jmp near .column_end
alignx 16, 7 ALIGNX 16, 7
%endif %endif
.columnDCT: .columnDCT:
@@ -371,10 +371,10 @@ EXTN(jsimd_idct_islow_avx2):
vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6 vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6
vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5 vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5
dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 1 DODCT ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 1
; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6 ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 DOTRANSPOSE ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7 ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
.column_end: .column_end:
@@ -395,10 +395,10 @@ EXTN(jsimd_idct_islow_avx2):
vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm4=in7_5 vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm4=in7_5
vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1 vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1
dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 2 DODCT ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 2
; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6 ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 DOTRANSPOSE ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7 ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45 vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45
@@ -442,7 +442,7 @@ EXTN(jsimd_idct_islow_avx2):
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
poppic ebx POPPIC ebx
mov esp, ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp

View File

@@ -2,7 +2,7 @@
; jidctint.asm - accurate integer IDCT (MMX) ; jidctint.asm - accurate integer IDCT (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2020, D. R. Commander. ; Copyright (C) 2016, 2020, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -63,7 +63,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_idct_islow_mmx) GLOBAL_DATA(jconst_idct_islow_mmx)
EXTN(jconst_idct_islow_mmx): EXTN(jconst_idct_islow_mmx):
@@ -80,7 +80,7 @@ PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1)
PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1) PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1)
PB_CENTERJSAMP times 8 db CENTERJSAMPLE PB_CENTERJSAMP times 8 db CENTERJSAMPLE
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -122,7 +122,7 @@ EXTN(jsimd_idct_islow_mmx):
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input, store into work array. ; ---- Pass 1: process columns from input, store into work array.
@@ -131,7 +131,7 @@ EXTN(jsimd_idct_islow_mmx):
mov esi, JCOEFPTR [coef_block(eax)] ; inptr mov esi, JCOEFPTR [coef_block(eax)] ; inptr
lea edi, [workspace] ; JCOEF *wsptr lea edi, [workspace] ; JCOEF *wsptr
mov ecx, DCTSIZE/4 ; ctr mov ecx, DCTSIZE/4 ; ctr
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX %ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -178,7 +178,7 @@ EXTN(jsimd_idct_islow_mmx):
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
jmp near .nextcolumn jmp near .nextcolumn
alignx 16, 7 ALIGNX 16, 7
%endif %endif
.columnDCT: .columnDCT:
@@ -513,7 +513,7 @@ EXTN(jsimd_idct_islow_mmx):
mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
mov eax, JDIMENSION [output_col(eax)] mov eax, JDIMENSION [output_col(eax)]
mov ecx, DCTSIZE/4 ; ctr mov ecx, DCTSIZE/4 ; ctr
alignx 16, 7 ALIGNX 16, 7
.rowloop: .rowloop:
; -- Even part ; -- Even part
@@ -816,7 +816,7 @@ EXTN(jsimd_idct_islow_mmx):
punpckldq mm7, mm5 ; mm7=(20 21 22 23 24 25 26 27) punpckldq mm7, mm5 ; mm7=(20 21 22 23 24 25 26 27)
punpckhdq mm4, mm5 ; mm4=(30 31 32 33 34 35 36 37) punpckhdq mm4, mm5 ; mm4=(30 31 32 33 34 35 36 37)
pushpic ebx ; save GOT address PUSHPIC ebx ; save GOT address
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
@@ -827,7 +827,7 @@ EXTN(jsimd_idct_islow_mmx):
movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7 movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
poppic ebx ; restore GOT address POPPIC ebx ; restore GOT address
add esi, byte 4*SIZEOF_JCOEF ; wsptr add esi, byte 4*SIZEOF_JCOEF ; wsptr
add edi, byte 4*SIZEOF_JSAMPROW add edi, byte 4*SIZEOF_JSAMPROW

View File

@@ -2,7 +2,7 @@
; jidctint.asm - accurate integer IDCT (SSE2) ; jidctint.asm - accurate integer IDCT (SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2020, D. R. Commander. ; Copyright (C) 2016, 2020, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -63,7 +63,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_idct_islow_sse2) GLOBAL_DATA(jconst_idct_islow_sse2)
EXTN(jconst_idct_islow_sse2): EXTN(jconst_idct_islow_sse2):
@@ -80,7 +80,7 @@ PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -114,13 +114,13 @@ EXTN(jsimd_idct_islow_sse2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx PUSHPIC ebx
; push ecx ; unused ; push ecx ; unused
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
@@ -172,7 +172,7 @@ EXTN(jsimd_idct_islow_sse2):
movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
jmp near .column_end jmp near .column_end
alignx 16, 7 ALIGNX 16, 7
%endif %endif
.columnDCT: .columnDCT:
@@ -847,7 +847,7 @@ EXTN(jsimd_idct_islow_sse2):
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
poppic ebx POPPIC ebx
mov esp, ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp

View File

@@ -2,7 +2,7 @@
; jidctred.asm - reduced-size IDCT (MMX) ; jidctred.asm - reduced-size IDCT (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -69,7 +69,7 @@ F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_idct_red_mmx) GLOBAL_DATA(jconst_idct_red_mmx)
EXTN(jconst_idct_red_mmx): EXTN(jconst_idct_red_mmx):
@@ -87,7 +87,7 @@ PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2 - 1)
PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2 - 1) PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2 - 1)
PB_CENTERJSAMP times 8 db CENTERJSAMPLE PB_CENTERJSAMP times 8 db CENTERJSAMPLE
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -124,13 +124,13 @@ EXTN(jsimd_idct_4x4_mmx):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [workspace] lea esp, [workspace]
pushpic ebx PUSHPIC ebx
; push ecx ; need not be preserved ; push ecx ; need not be preserved
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input, store into work array. ; ---- Pass 1: process columns from input, store into work array.
@@ -139,7 +139,7 @@ EXTN(jsimd_idct_4x4_mmx):
mov esi, JCOEFPTR [coef_block(eax)] ; inptr mov esi, JCOEFPTR [coef_block(eax)] ; inptr
lea edi, [workspace] ; JCOEF *wsptr lea edi, [workspace] ; JCOEF *wsptr
mov ecx, DCTSIZE/4 ; ctr mov ecx, DCTSIZE/4 ; ctr
alignx 16, 7 ALIGNX 16, 7
.columnloop: .columnloop:
%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX %ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -181,7 +181,7 @@ EXTN(jsimd_idct_4x4_mmx):
movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
jmp near .nextcolumn jmp near .nextcolumn
alignx 16, 7 ALIGNX 16, 7
%endif %endif
.columnDCT: .columnDCT:
@@ -479,7 +479,7 @@ EXTN(jsimd_idct_4x4_mmx):
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; need not be preserved ; pop ecx ; need not be preserved
poppic ebx POPPIC ebx
mov esp, ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
@@ -512,7 +512,7 @@ EXTN(jsimd_idct_2x2_mmx):
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.

View File

@@ -2,7 +2,7 @@
; jidctred.asm - reduced-size IDCT (SSE2) ; jidctred.asm - reduced-size IDCT (SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -69,7 +69,7 @@ F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_idct_red_sse2) GLOBAL_DATA(jconst_idct_red_sse2)
EXTN(jconst_idct_red_sse2): EXTN(jconst_idct_red_sse2):
@@ -87,7 +87,7 @@ PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1)
PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1) PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -122,13 +122,13 @@ EXTN(jsimd_idct_4x4_sse2):
mov [esp], eax mov [esp], eax
mov ebp, esp ; ebp = aligned ebp mov ebp, esp ; ebp = aligned ebp
lea esp, [wk(0)] lea esp, [wk(0)]
pushpic ebx PUSHPIC ebx
; push ecx ; unused ; push ecx ; unused
; push edx ; need not be preserved ; push edx ; need not be preserved
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
@@ -171,7 +171,7 @@ EXTN(jsimd_idct_4x4_sse2):
pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
jmp near .column_end jmp near .column_end
alignx 16, 7 ALIGNX 16, 7
%endif %endif
.columnDCT: .columnDCT:
@@ -400,7 +400,7 @@ EXTN(jsimd_idct_4x4_sse2):
pop esi pop esi
; pop edx ; need not be preserved ; pop edx ; need not be preserved
; pop ecx ; unused ; pop ecx ; unused
poppic ebx POPPIC ebx
mov esp, ebp ; esp <- aligned ebp mov esp, ebp ; esp <- aligned ebp
pop esp ; esp <- original ebp pop esp ; esp <- original ebp
pop ebp pop ebp
@@ -433,7 +433,7 @@ EXTN(jsimd_idct_2x2_sse2):
push esi push esi
push edi push edi
get_GOT ebx ; get GOT address GET_GOT ebx ; get GOT address
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.

View File

@@ -2,7 +2,7 @@
; jquant.asm - sample data conversion and quantization (3DNow! & MMX) ; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -52,7 +52,7 @@ EXTN(jsimd_convsamp_float_3dnow):
mov eax, JDIMENSION [start_col] mov eax, JDIMENSION [start_col]
mov edi, POINTER [workspace] ; (DCTELEM *) mov edi, POINTER [workspace] ; (DCTELEM *)
mov ecx, DCTSIZE/2 mov ecx, DCTSIZE/2
alignx 16, 7 ALIGNX 16, 7
.convloop: .convloop:
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
@@ -154,7 +154,7 @@ EXTN(jsimd_quantize_float_3dnow):
mov edx, POINTER [divisors] mov edx, POINTER [divisors]
mov edi, JCOEFPTR [coef_block] mov edi, JCOEFPTR [coef_block]
mov eax, DCTSIZE2/16 mov eax, DCTSIZE2/16
alignx 16, 7 ALIGNX 16, 7
.quantloop: .quantloop:
movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]

View File

@@ -2,7 +2,7 @@
; jquant.asm - sample data conversion and quantization (MMX) ; jquant.asm - sample data conversion and quantization (MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -52,7 +52,7 @@ EXTN(jsimd_convsamp_mmx):
mov eax, JDIMENSION [start_col] mov eax, JDIMENSION [start_col]
mov edi, POINTER [workspace] ; (DCTELEM *) mov edi, POINTER [workspace] ; (DCTELEM *)
mov ecx, DCTSIZE/4 mov ecx, DCTSIZE/4
alignx 16, 7 ALIGNX 16, 7
.convloop: .convloop:
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
@@ -157,10 +157,10 @@ EXTN(jsimd_quantize_mmx):
mov edx, POINTER [divisors] mov edx, POINTER [divisors]
mov edi, JCOEFPTR [coef_block] mov edi, JCOEFPTR [coef_block]
mov ah, 2 mov ah, 2
alignx 16, 7 ALIGNX 16, 7
.quantloop1: .quantloop1:
mov al, DCTSIZE2/8/2 mov al, DCTSIZE2/8/2
alignx 16, 7 ALIGNX 16, 7
.quantloop2: .quantloop2:
movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)] movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)] movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]

View File

@@ -2,7 +2,7 @@
; jquant.asm - sample data conversion and quantization (SSE & MMX) ; jquant.asm - sample data conversion and quantization (SSE & MMX)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -52,7 +52,7 @@ EXTN(jsimd_convsamp_float_sse):
mov eax, JDIMENSION [start_col] mov eax, JDIMENSION [start_col]
mov edi, POINTER [workspace] ; (DCTELEM *) mov edi, POINTER [workspace] ; (DCTELEM *)
mov ecx, DCTSIZE/2 mov ecx, DCTSIZE/2
alignx 16, 7 ALIGNX 16, 7
.convloop: .convloop:
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
@@ -150,7 +150,7 @@ EXTN(jsimd_quantize_float_sse):
mov edx, POINTER [divisors] mov edx, POINTER [divisors]
mov edi, JCOEFPTR [coef_block] mov edi, JCOEFPTR [coef_block]
mov eax, DCTSIZE2/16 mov eax, DCTSIZE2/16
alignx 16, 7 ALIGNX 16, 7
.quantloop: .quantloop:
movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]

View File

@@ -2,7 +2,7 @@
; jquantf.asm - sample data conversion and quantization (SSE & SSE2) ; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -52,7 +52,7 @@ EXTN(jsimd_convsamp_float_sse2):
mov eax, JDIMENSION [start_col] mov eax, JDIMENSION [start_col]
mov edi, POINTER [workspace] ; (DCTELEM *) mov edi, POINTER [workspace] ; (DCTELEM *)
mov ecx, DCTSIZE/2 mov ecx, DCTSIZE/2
alignx 16, 7 ALIGNX 16, 7
.convloop: .convloop:
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
@@ -127,7 +127,7 @@ EXTN(jsimd_quantize_float_sse2):
mov edx, POINTER [divisors] mov edx, POINTER [divisors]
mov edi, JCOEFPTR [coef_block] mov edi, JCOEFPTR [coef_block]
mov eax, DCTSIZE2/16 mov eax, DCTSIZE2/16
alignx 16, 7 ALIGNX 16, 7
.quantloop: .quantloop:
movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]

View File

@@ -2,7 +2,7 @@
; jquanti.asm - sample data conversion and quantization (SSE2) ; jquanti.asm - sample data conversion and quantization (SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander. ; Copyright (C) 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -52,7 +52,7 @@ EXTN(jsimd_convsamp_sse2):
mov eax, JDIMENSION [start_col] mov eax, JDIMENSION [start_col]
mov edi, POINTER [workspace] ; (DCTELEM *) mov edi, POINTER [workspace] ; (DCTELEM *)
mov ecx, DCTSIZE/4 mov ecx, DCTSIZE/4
alignx 16, 7 ALIGNX 16, 7
.convloop: .convloop:
mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
@@ -133,7 +133,7 @@ EXTN(jsimd_quantize_sse2):
mov edx, POINTER [divisors] mov edx, POINTER [divisors]
mov edi, JCOEFPTR [coef_block] mov edi, JCOEFPTR [coef_block]
mov eax, DCTSIZE2/32 mov eax, DCTSIZE2/32
alignx 16, 7 ALIGNX 16, 7
.quantloop: .quantloop:
movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)] movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]

View File

@@ -2,7 +2,7 @@
; jsimdext.inc - common declarations ; jsimdext.inc - common declarations
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander. ; Copyright (C) 2010, 2016, 2018-2019, 2024, D. R. Commander.
; Copyright (C) 2018, Matthieu Darbois. ; Copyright (C) 2018, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
@@ -272,7 +272,7 @@ const_base:
%define GOTOFF(got, sym) (got) + (sym) - const_base %define GOTOFF(got, sym) (got) + (sym) - const_base
%imacro get_GOT 1 %imacro GET_GOT 1
; NOTE: this macro destroys ecx register. ; NOTE: this macro destroys ecx register.
call %%geteip call %%geteip
add ecx, byte (%%ref - $) add ecx, byte (%%ref - $)
@@ -304,7 +304,7 @@ const_base:
%define GOTOFF(got, sym) (got) + (sym) wrt ..gotoff %define GOTOFF(got, sym) (got) + (sym) wrt ..gotoff
%imacro get_GOT 1 %imacro GET_GOT 1
extern GOT_SYMBOL extern GOT_SYMBOL
call %%geteip call %%geteip
add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
@@ -317,13 +317,13 @@ const_base:
%endif ; GOT_SYMBOL == _MACHO_PIC_ ---------------- %endif ; GOT_SYMBOL == _MACHO_PIC_ ----------------
%imacro pushpic 1.nolist %imacro PUSHPIC 1.nolist
push %1 push %1
%endmacro %endmacro
%imacro poppic 1.nolist %imacro POPPIC 1.nolist
pop %1 pop %1
%endmacro %endmacro
%imacro movpic 2.nolist %imacro MOVPIC 2.nolist
mov %1, %2 mov %1, %2
%endmacro %endmacro
@@ -331,13 +331,13 @@ const_base:
%define GOTOFF(got, sym) (sym) %define GOTOFF(got, sym) (sym)
%imacro get_GOT 1.nolist %imacro GET_GOT 1.nolist
%endmacro %endmacro
%imacro pushpic 1.nolist %imacro PUSHPIC 1.nolist
%endmacro %endmacro
%imacro poppic 1.nolist %imacro POPPIC 1.nolist
%endmacro %endmacro
%imacro movpic 2.nolist %imacro MOVPIC 2.nolist
%endmacro %endmacro
%endif ; PIC ----------------------------------------- %endif ; PIC -----------------------------------------
@@ -349,7 +349,7 @@ const_base:
%define MSKLE(x, y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16) %define MSKLE(x, y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b, n) (($$-(b)) & ((n)-1)) %define FILLB(b, n) (($$-(b)) & ((n)-1))
%imacro alignx 1-2.nolist 0xFFFF %imacro ALIGNX 1-2.nolist 0xFFFF
%%bs: \ %%bs: \
times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \ times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
db 0x90 ; nop db 0x90 ; nop
@@ -371,7 +371,7 @@ const_base:
; Align the next data on {2,4,8,16,..}-byte boundary. ; Align the next data on {2,4,8,16,..}-byte boundary.
; ;
%imacro alignz 1.nolist %imacro ALIGNZ 1.nolist
align %1, db 0 ; filling zeros align %1, db 0 ; filling zeros
%endmacro %endmacro
@@ -379,7 +379,7 @@ const_base:
%ifdef WIN64 %ifdef WIN64
%imacro collect_args 1 %imacro COLLECT_ARGS 1
sub rsp, SIZEOF_XMMWORD sub rsp, SIZEOF_XMMWORD
movaps XMMWORD [rsp], xmm6 movaps XMMWORD [rsp], xmm6
sub rsp, SIZEOF_XMMWORD sub rsp, SIZEOF_XMMWORD
@@ -408,7 +408,7 @@ const_base:
push rdi push rdi
%endmacro %endmacro
%imacro uncollect_args 1 %imacro UNCOLLECT_ARGS 1
pop rdi pop rdi
pop rsi pop rsi
%if %1 > 5 %if %1 > 5
@@ -429,7 +429,7 @@ const_base:
add rsp, SIZEOF_XMMWORD add rsp, SIZEOF_XMMWORD
%endmacro %endmacro
%imacro push_xmm 1 %imacro PUSH_XMM 1
sub rsp, %1 * SIZEOF_XMMWORD sub rsp, %1 * SIZEOF_XMMWORD
movaps XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8 movaps XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
%if %1 > 1 %if %1 > 1
@@ -443,7 +443,7 @@ const_base:
%endif %endif
%endmacro %endmacro
%imacro pop_xmm 1 %imacro POP_XMM 1
movaps xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD] movaps xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
%if %1 > 1 %if %1 > 1
movaps xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD] movaps xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
@@ -459,7 +459,7 @@ const_base:
%else %else
%imacro collect_args 1 %imacro COLLECT_ARGS 1
push r10 push r10
mov r10, rdi mov r10, rdi
%if %1 > 1 %if %1 > 1
@@ -484,7 +484,7 @@ const_base:
%endif %endif
%endmacro %endmacro
%imacro uncollect_args 1 %imacro UNCOLLECT_ARGS 1
%if %1 > 5 %if %1 > 5
pop r15 pop r15
%endif %endif
@@ -503,10 +503,10 @@ const_base:
pop r10 pop r10
%endmacro %endmacro
%imacro push_xmm 1 %imacro PUSH_XMM 1
%endmacro %endmacro
%imacro pop_xmm 1 %imacro POP_XMM 1
%endmacro %endmacro
%endif %endif

View File

@@ -1,7 +1,7 @@
; ;
; jccolext.asm - colorspace conversion (64-bit AVX2) ; jccolext.asm - colorspace conversion (64-bit AVX2)
; ;
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
@@ -48,7 +48,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, (SIZEOF_YMMWORD * WK_NUM) sub rsp, (SIZEOF_YMMWORD * WK_NUM)
collect_args 5 COLLECT_ARGS 5
push rbx push rbx
mov ecx, r10d mov ecx, r10d
@@ -549,7 +549,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
.return: .return:
pop rbx pop rbx
vzeroupper vzeroupper
uncollect_args 5 UNCOLLECT_ARGS 5
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp

View File

@@ -1,7 +1,7 @@
; ;
; jccolext.asm - colorspace conversion (64-bit SSE2) ; jccolext.asm - colorspace conversion (64-bit SSE2)
; ;
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
@@ -47,7 +47,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, (SIZEOF_XMMWORD * WK_NUM) sub rsp, (SIZEOF_XMMWORD * WK_NUM)
collect_args 5 COLLECT_ARGS 5
push rbx push rbx
mov ecx, r10d mov ecx, r10d
@@ -474,7 +474,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args 5 UNCOLLECT_ARGS 5
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp

View File

@@ -1,7 +1,7 @@
; ;
; jccolor.asm - colorspace conversion (64-bit AVX2) ; jccolor.asm - colorspace conversion (64-bit AVX2)
; ;
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -33,7 +33,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_rgb_ycc_convert_avx2) GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
EXTN(jconst_rgb_ycc_convert_avx2): EXTN(jconst_rgb_ycc_convert_avx2):
@@ -46,7 +46,7 @@ PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \
(CENTERJSAMPLE << SCALEBITS) (CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -1,7 +1,7 @@
; ;
; jccolor.asm - colorspace conversion (64-bit SSE2) ; jccolor.asm - colorspace conversion (64-bit SSE2)
; ;
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -32,7 +32,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_rgb_ycc_convert_sse2) GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
EXTN(jconst_rgb_ycc_convert_sse2): EXTN(jconst_rgb_ycc_convert_sse2):
@@ -45,7 +45,7 @@ PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \
(CENTERJSAMPLE << SCALEBITS) (CENTERJSAMPLE << SCALEBITS)
PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -1,7 +1,7 @@
; ;
; jcgray.asm - grayscale colorspace conversion (64-bit AVX2) ; jcgray.asm - grayscale colorspace conversion (64-bit AVX2)
; ;
; Copyright (C) 2011, 2016, D. R. Commander. ; Copyright (C) 2011, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -29,7 +29,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_rgb_gray_convert_avx2) GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
EXTN(jconst_rgb_gray_convert_avx2): EXTN(jconst_rgb_gray_convert_avx2):
@@ -38,7 +38,7 @@ PW_F0299_F0337 times 8 dw F_0_299, F_0_337
PW_F0114_F0250 times 8 dw F_0_114, F_0_250 PW_F0114_F0250 times 8 dw F_0_114, F_0_250
PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1)) PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -1,7 +1,7 @@
; ;
; jcgray.asm - grayscale colorspace conversion (64-bit SSE2) ; jcgray.asm - grayscale colorspace conversion (64-bit SSE2)
; ;
; Copyright (C) 2011, 2016, D. R. Commander. ; Copyright (C) 2011, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -28,7 +28,7 @@ F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_rgb_gray_convert_sse2) GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
EXTN(jconst_rgb_gray_convert_sse2): EXTN(jconst_rgb_gray_convert_sse2):
@@ -37,7 +37,7 @@ PW_F0299_F0337 times 4 dw F_0_299, F_0_337
PW_F0114_F0250 times 4 dw F_0_114, F_0_250 PW_F0114_F0250 times 4 dw F_0_114, F_0_250
PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1)) PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -1,7 +1,7 @@
; ;
; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2) ; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2)
; ;
; Copyright (C) 2011, 2016, D. R. Commander. ; Copyright (C) 2011, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
@@ -48,7 +48,7 @@ EXTN(jsimd_rgb_gray_convert_avx2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, byte (SIZEOF_YMMWORD * WK_NUM) sub rsp, byte (SIZEOF_YMMWORD * WK_NUM)
collect_args 5 COLLECT_ARGS 5
push rbx push rbx
mov ecx, r10d mov ecx, r10d
@@ -428,7 +428,7 @@ EXTN(jsimd_rgb_gray_convert_avx2):
.return: .return:
pop rbx pop rbx
vzeroupper vzeroupper
uncollect_args 5 UNCOLLECT_ARGS 5
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp

View File

@@ -1,7 +1,7 @@
; ;
; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2) ; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
; ;
; Copyright (C) 2011, 2016, D. R. Commander. ; Copyright (C) 2011, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
@@ -47,7 +47,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 5 COLLECT_ARGS 5
push rbx push rbx
mov ecx, r10d mov ecx, r10d
@@ -353,7 +353,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args 5 UNCOLLECT_ARGS 5
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp

View File

@@ -39,7 +39,7 @@ endstruc
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_huff_encode_one_block) GLOBAL_DATA(jconst_huff_encode_one_block)
EXTN(jconst_huff_encode_one_block): EXTN(jconst_huff_encode_one_block):
@@ -49,7 +49,7 @@ jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
dd 0x00ff, 0x01ff, 0x03ff, 0x07ff dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
dd 0x0fff, 0x1fff, 0x3fff, 0x7fff dd 0x0fff, 0x1fff, 0x3fff, 0x7fff
alignz 32 ALIGNZ 32
times 1 << 14 db 15 times 1 << 14 db 15
times 1 << 13 db 14 times 1 << 13 db 14
@@ -87,7 +87,7 @@ times 1 << 13 db 14
times 1 << 14 db 15 times 1 << 14 db 15
times 1 << 15 db 16 times 1 << 15 db 16
alignz 32 ALIGNZ 32
%define NBITS(x) nbits_base + x %define NBITS(x) nbits_base + x
%define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - EXTN(jpeg_nbits_table)) %define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - EXTN(jpeg_nbits_table))

View File

@@ -4,6 +4,7 @@
; ;
; Copyright (C) 2016, 2018, Matthieu Darbois ; Copyright (C) 2016, 2018, Matthieu Darbois
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
; Copyright (C) 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -287,7 +288,7 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
sub rsp, SIZEOF_XMMWORD sub rsp, SIZEOF_XMMWORD
movdqa XMMWORD [rsp], ZERO movdqa XMMWORD [rsp], ZERO
collect_args 6 COLLECT_ARGS 6
movd AL, r13d movd AL, r13d
pxor ZERO, ZERO pxor ZERO, ZERO
@@ -381,7 +382,7 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
REDUCE0 REDUCE0
uncollect_args 6 UNCOLLECT_ARGS 6
movdqa ZERO, XMMWORD [rsp] movdqa ZERO, XMMWORD [rsp]
mov rsp, rbp mov rsp, rbp
pop rbp pop rbp
@@ -450,7 +451,7 @@ EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
sub rsp, SIZEOF_XMMWORD sub rsp, SIZEOF_XMMWORD
movdqa XMMWORD [rsp], ZERO movdqa XMMWORD [rsp], ZERO
collect_args 6 COLLECT_ARGS 6
xor SIGN, SIGN xor SIGN, SIGN
xor EOB, EOB xor EOB, EOB
@@ -598,7 +599,7 @@ EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
REDUCE0 REDUCE0
mov eax, EOB mov eax, EOB
uncollect_args 6 UNCOLLECT_ARGS 6
movdqa ZERO, XMMWORD [rsp] movdqa ZERO, XMMWORD [rsp]
mov rsp, rbp mov rsp, rbp
pop rbp pop rbp

View File

@@ -2,7 +2,7 @@
; jcsample.asm - downsampling (64-bit AVX2) ; jcsample.asm - downsampling (64-bit AVX2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; ;
@@ -46,7 +46,7 @@
EXTN(jsimd_h2v1_downsample_avx2): EXTN(jsimd_h2v1_downsample_avx2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 6 COLLECT_ARGS 6
mov ecx, r13d mov ecx, r13d
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
@@ -177,7 +177,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
.return: .return:
vzeroupper vzeroupper
uncollect_args 6 UNCOLLECT_ARGS 6
pop rbp pop rbp
ret ret
@@ -207,7 +207,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
EXTN(jsimd_h2v2_downsample_avx2): EXTN(jsimd_h2v2_downsample_avx2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 6 COLLECT_ARGS 6
mov ecx, r13d mov ecx, r13d
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
@@ -356,7 +356,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
.return: .return:
vzeroupper vzeroupper
uncollect_args 6 UNCOLLECT_ARGS 6
pop rbp pop rbp
ret ret

View File

@@ -2,7 +2,7 @@
; jcsample.asm - downsampling (64-bit SSE2) ; jcsample.asm - downsampling (64-bit SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -45,7 +45,7 @@
EXTN(jsimd_h2v1_downsample_sse2): EXTN(jsimd_h2v1_downsample_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 6 COLLECT_ARGS 6
mov ecx, r13d mov ecx, r13d
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
@@ -159,7 +159,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
jg near .rowloop jg near .rowloop
.return: .return:
uncollect_args 6 UNCOLLECT_ARGS 6
pop rbp pop rbp
ret ret
@@ -189,7 +189,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
EXTN(jsimd_h2v2_downsample_sse2): EXTN(jsimd_h2v2_downsample_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 6 COLLECT_ARGS 6
mov ecx, r13d mov ecx, r13d
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
@@ -319,7 +319,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
jg near .rowloop jg near .rowloop
.return: .return:
uncollect_args 6 UNCOLLECT_ARGS 6
pop rbp pop rbp
ret ret

View File

@@ -2,7 +2,7 @@
; jdcolext.asm - colorspace conversion (64-bit AVX2) ; jdcolext.asm - colorspace conversion (64-bit AVX2)
; ;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, D. R. Commander. ; Copyright (C) 2009, 2012, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
@@ -49,7 +49,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, byte (WK_NUM * SIZEOF_YMMWORD) sub rsp, byte (WK_NUM * SIZEOF_YMMWORD)
collect_args 5 COLLECT_ARGS 5
push rbx push rbx
mov ecx, r10d ; num_cols mov ecx, r10d ; num_cols
@@ -486,7 +486,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
.return: .return:
pop rbx pop rbx
vzeroupper vzeroupper
uncollect_args 5 UNCOLLECT_ARGS 5
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp

View File

@@ -2,7 +2,7 @@
; jdcolext.asm - colorspace conversion (64-bit SSE2) ; jdcolext.asm - colorspace conversion (64-bit SSE2)
; ;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, D. R. Commander. ; Copyright (C) 2009, 2012, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
@@ -48,7 +48,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 5 COLLECT_ARGS 5
push rbx push rbx
mov ecx, r10d ; num_cols mov ecx, r10d ; num_cols
@@ -429,7 +429,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args 5 UNCOLLECT_ARGS 5
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp

View File

@@ -2,7 +2,7 @@
; jdcolor.asm - colorspace conversion (64-bit AVX2) ; jdcolor.asm - colorspace conversion (64-bit AVX2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -32,7 +32,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_ycc_rgb_convert_avx2) GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
EXTN(jconst_ycc_rgb_convert_avx2): EXTN(jconst_ycc_rgb_convert_avx2):
@@ -43,7 +43,7 @@ PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
PW_ONE times 16 dw 1 PW_ONE times 16 dw 1
PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -2,7 +2,7 @@
; jdcolor.asm - colorspace conversion (64-bit SSE2) ; jdcolor.asm - colorspace conversion (64-bit SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -31,7 +31,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_ycc_rgb_convert_sse2) GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
EXTN(jconst_ycc_rgb_convert_sse2): EXTN(jconst_ycc_rgb_convert_sse2):
@@ -42,7 +42,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
PW_ONE times 8 dw 1 PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -2,7 +2,7 @@
; jdmerge.asm - merged upsampling/color conversion (64-bit AVX2) ; jdmerge.asm - merged upsampling/color conversion (64-bit AVX2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -32,7 +32,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_merged_upsample_avx2) GLOBAL_DATA(jconst_merged_upsample_avx2)
EXTN(jconst_merged_upsample_avx2): EXTN(jconst_merged_upsample_avx2):
@@ -43,7 +43,7 @@ PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
PW_ONE times 16 dw 1 PW_ONE times 16 dw 1
PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1) PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -2,7 +2,7 @@
; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2) ; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -31,7 +31,7 @@ F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_merged_upsample_sse2) GLOBAL_DATA(jconst_merged_upsample_sse2)
EXTN(jconst_merged_upsample_sse2): EXTN(jconst_merged_upsample_sse2):
@@ -42,7 +42,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
PW_ONE times 8 dw 1 PW_ONE times 8 dw 1
PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1) PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT

View File

@@ -2,7 +2,7 @@
; jdmrgext.asm - merged upsampling/color conversion (64-bit AVX2) ; jdmrgext.asm - merged upsampling/color conversion (64-bit AVX2)
; ;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, D. R. Commander. ; Copyright (C) 2009, 2012, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
@@ -49,7 +49,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, SIZEOF_YMMWORD * WK_NUM sub rsp, SIZEOF_YMMWORD * WK_NUM
collect_args 4 COLLECT_ARGS 4
push rbx push rbx
mov ecx, r10d ; col mov ecx, r10d ; col
@@ -480,7 +480,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
.return: .return:
pop rbx pop rbx
vzeroupper vzeroupper
uncollect_args 4 UNCOLLECT_ARGS 4
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp
@@ -508,7 +508,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
EXTN(jsimd_h2v2_merged_upsample_avx2): EXTN(jsimd_h2v2_merged_upsample_avx2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 4 COLLECT_ARGS 4
push rbx push rbx
mov eax, r10d mov eax, r10d
@@ -587,7 +587,7 @@ EXTN(jsimd_h2v2_merged_upsample_avx2):
add rsp, SIZEOF_JSAMPARRAY*4 add rsp, SIZEOF_JSAMPARRAY*4
pop rbx pop rbx
uncollect_args 4 UNCOLLECT_ARGS 4
pop rbp pop rbp
ret ret

View File

@@ -2,7 +2,7 @@
; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2) ; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
; ;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, D. R. Commander. ; Copyright (C) 2009, 2012, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
@@ -48,7 +48,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 4 COLLECT_ARGS 4
push rbx push rbx
mov ecx, r10d ; col mov ecx, r10d ; col
@@ -422,7 +422,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args 4 UNCOLLECT_ARGS 4
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp
@@ -450,7 +450,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
EXTN(jsimd_h2v2_merged_upsample_sse2): EXTN(jsimd_h2v2_merged_upsample_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 4 COLLECT_ARGS 4
push rbx push rbx
mov eax, r10d mov eax, r10d
@@ -529,7 +529,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
add rsp, SIZEOF_JSAMPARRAY*4 add rsp, SIZEOF_JSAMPARRAY*4
pop rbx pop rbx
uncollect_args 4 UNCOLLECT_ARGS 4
pop rbp pop rbp
ret ret

View File

@@ -2,7 +2,7 @@
; jdsample.asm - upsampling (64-bit AVX2) ; jdsample.asm - upsampling (64-bit AVX2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation. ; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
@@ -22,7 +22,7 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fancy_upsample_avx2) GLOBAL_DATA(jconst_fancy_upsample_avx2)
EXTN(jconst_fancy_upsample_avx2): EXTN(jconst_fancy_upsample_avx2):
@@ -33,7 +33,7 @@ PW_THREE times 16 dw 3
PW_SEVEN times 16 dw 7 PW_SEVEN times 16 dw 7
PW_EIGHT times 16 dw 8 PW_EIGHT times 16 dw 8
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -64,8 +64,8 @@ PW_EIGHT times 16 dw 8
EXTN(jsimd_h2v1_fancy_upsample_avx2): EXTN(jsimd_h2v1_fancy_upsample_avx2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
push_xmm 3 PUSH_XMM 3
collect_args 4 COLLECT_ARGS 4
mov eax, r11d ; colctr mov eax, r11d ; colctr
test rax, rax test rax, rax
@@ -186,8 +186,8 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
.return: .return:
vzeroupper vzeroupper
uncollect_args 4 UNCOLLECT_ARGS 4
pop_xmm 3 POP_XMM 3
pop rbp pop rbp
ret ret
@@ -222,8 +222,8 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, (SIZEOF_YMMWORD * WK_NUM) sub rsp, (SIZEOF_YMMWORD * WK_NUM)
push_xmm 3 PUSH_XMM 3
collect_args 4 COLLECT_ARGS 4
push rbx push rbx
mov eax, r11d ; colctr mov eax, r11d ; colctr
@@ -498,8 +498,8 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
.return: .return:
pop rbx pop rbx
vzeroupper vzeroupper
uncollect_args 4 UNCOLLECT_ARGS 4
pop_xmm 3 POP_XMM 3
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp
@@ -526,7 +526,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
EXTN(jsimd_h2v1_upsample_avx2): EXTN(jsimd_h2v1_upsample_avx2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 4 COLLECT_ARGS 4
mov edx, r11d mov edx, r11d
add rdx, byte (SIZEOF_YMMWORD-1) add rdx, byte (SIZEOF_YMMWORD-1)
@@ -589,7 +589,7 @@ EXTN(jsimd_h2v1_upsample_avx2):
.return: .return:
vzeroupper vzeroupper
uncollect_args 4 UNCOLLECT_ARGS 4
pop rbp pop rbp
ret ret
@@ -614,7 +614,7 @@ EXTN(jsimd_h2v1_upsample_avx2):
EXTN(jsimd_h2v2_upsample_avx2): EXTN(jsimd_h2v2_upsample_avx2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 4 COLLECT_ARGS 4
push rbx push rbx
mov edx, r11d mov edx, r11d
@@ -685,7 +685,7 @@ EXTN(jsimd_h2v2_upsample_avx2):
.return: .return:
pop rbx pop rbx
vzeroupper vzeroupper
uncollect_args 4 UNCOLLECT_ARGS 4
pop rbp pop rbp
ret ret

View File

@@ -2,7 +2,7 @@
; jdsample.asm - upsampling (64-bit SSE2) ; jdsample.asm - upsampling (64-bit SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
@@ -21,7 +21,7 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fancy_upsample_sse2) GLOBAL_DATA(jconst_fancy_upsample_sse2)
EXTN(jconst_fancy_upsample_sse2): EXTN(jconst_fancy_upsample_sse2):
@@ -32,7 +32,7 @@ PW_THREE times 8 dw 3
PW_SEVEN times 8 dw 7 PW_SEVEN times 8 dw 7
PW_EIGHT times 8 dw 8 PW_EIGHT times 8 dw 8
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -63,7 +63,7 @@ PW_EIGHT times 8 dw 8
EXTN(jsimd_h2v1_fancy_upsample_sse2): EXTN(jsimd_h2v1_fancy_upsample_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 4 COLLECT_ARGS 4
mov eax, r11d ; colctr mov eax, r11d ; colctr
test rax, rax test rax, rax
@@ -174,7 +174,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
jg near .rowloop jg near .rowloop
.return: .return:
uncollect_args 4 UNCOLLECT_ARGS 4
pop rbp pop rbp
ret ret
@@ -209,7 +209,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 4 COLLECT_ARGS 4
push rbx push rbx
mov eax, r11d ; colctr mov eax, r11d ; colctr
@@ -472,7 +472,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args 4 UNCOLLECT_ARGS 4
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp
@@ -499,7 +499,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
EXTN(jsimd_h2v1_upsample_sse2): EXTN(jsimd_h2v1_upsample_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 4 COLLECT_ARGS 4
mov edx, r11d mov edx, r11d
add rdx, byte (2*SIZEOF_XMMWORD)-1 add rdx, byte (2*SIZEOF_XMMWORD)-1
@@ -560,7 +560,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
jg short .rowloop jg short .rowloop
.return: .return:
uncollect_args 4 UNCOLLECT_ARGS 4
pop rbp pop rbp
ret ret
@@ -585,7 +585,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
EXTN(jsimd_h2v2_upsample_sse2): EXTN(jsimd_h2v2_upsample_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 4 COLLECT_ARGS 4
push rbx push rbx
mov edx, r11d mov edx, r11d
@@ -654,7 +654,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
.return: .return:
pop rbx pop rbx
uncollect_args 4 UNCOLLECT_ARGS 4
pop rbp pop rbp
ret ret

View File

@@ -2,7 +2,7 @@
; jfdctflt.asm - floating-point FDCT (64-bit SSE) ; jfdctflt.asm - floating-point FDCT (64-bit SSE)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -35,7 +35,7 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fdct_float_sse) GLOBAL_DATA(jconst_fdct_float_sse)
EXTN(jconst_fdct_float_sse): EXTN(jconst_fdct_float_sse):
@@ -45,7 +45,7 @@ PD_0_707 times 4 dd 0.707106781186547524400844
PD_0_541 times 4 dd 0.541196100146196984399723 PD_0_541 times 4 dd 0.541196100146196984399723
PD_1_306 times 4 dd 1.306562964876376527856643 PD_1_306 times 4 dd 1.306562964876376527856643
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -73,7 +73,7 @@ EXTN(jsimd_fdct_float_sse):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 1 COLLECT_ARGS 1
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
@@ -345,7 +345,7 @@ EXTN(jsimd_fdct_float_sse):
dec rcx dec rcx
jnz near .columnloop jnz near .columnloop
uncollect_args 1 UNCOLLECT_ARGS 1
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp

View File

@@ -2,7 +2,7 @@
; jfdctfst.asm - fast integer FDCT (64-bit SSE2) ; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -50,7 +50,7 @@ F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
%define PRE_MULTIPLY_SCALE_BITS 2 %define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fdct_ifast_sse2) GLOBAL_DATA(jconst_fdct_ifast_sse2)
EXTN(jconst_fdct_ifast_sse2): EXTN(jconst_fdct_ifast_sse2):
@@ -60,7 +60,7 @@ PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
PW_F0541 times 8 dw F_0_541 << CONST_SHIFT PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
PW_F1306 times 8 dw F_1_306 << CONST_SHIFT PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -88,7 +88,7 @@ EXTN(jsimd_fdct_ifast_sse2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 1 COLLECT_ARGS 1
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
@@ -379,7 +379,7 @@ EXTN(jsimd_fdct_ifast_sse2):
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6 movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2 movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
uncollect_args 1 UNCOLLECT_ARGS 1
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp

View File

@@ -2,7 +2,7 @@
; jfdctint.asm - accurate integer FDCT (64-bit AVX2) ; jfdctint.asm - accurate integer FDCT (64-bit AVX2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. ; Copyright (C) 2009, 2016, 2018, 2020, 2024, D. R. Commander.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -65,7 +65,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; %1-%4: Input/output registers ; %1-%4: Input/output registers
; %5-%8: Temp registers ; %5-%8: Temp registers
%macro dotranspose 8 %macro DOTRANSPOSE 8
; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47) ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57) ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
@@ -108,7 +108,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; %5-%8: Temp registers ; %5-%8: Temp registers
; %9: Pass (1 or 2) ; %9: Pass (1 or 2)
%macro dodct 9 %macro DODCT 9
vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7 vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7
vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0 vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0
vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2 vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2
@@ -223,7 +223,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fdct_islow_avx2) GLOBAL_DATA(jconst_fdct_islow_avx2)
EXTN(jconst_fdct_islow_avx2): EXTN(jconst_fdct_islow_avx2):
@@ -242,7 +242,7 @@ PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1)
PW_1_NEG1 times 8 dw 1 PW_1_NEG1 times 8 dw 1
times 8 dw -1 times 8 dw -1
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -262,7 +262,7 @@ PW_1_NEG1 times 8 dw 1
EXTN(jsimd_fdct_islow_avx2): EXTN(jsimd_fdct_islow_avx2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 1 COLLECT_ARGS 1
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
@@ -284,9 +284,9 @@ EXTN(jsimd_fdct_islow_avx2):
; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67) ; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77) ; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 DOTRANSPOSE ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1 DODCT ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5 ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
; ---- Pass 2: process columns. ; ---- Pass 2: process columns.
@@ -294,9 +294,9 @@ EXTN(jsimd_fdct_islow_avx2):
vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7 vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7
vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5 vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5
dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 DOTRANSPOSE ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2 DODCT ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5 ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1 vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1
@@ -310,7 +310,7 @@ EXTN(jsimd_fdct_islow_avx2):
vmovdqu YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm7 vmovdqu YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm7
vzeroupper vzeroupper
uncollect_args 1 UNCOLLECT_ARGS 1
pop rbp pop rbp
ret ret

View File

@@ -2,7 +2,7 @@
; jfdctint.asm - accurate integer FDCT (64-bit SSE2) ; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2020, D. R. Commander. ; Copyright (C) 2009, 2016, 2020, 2024, D. R. Commander.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -64,7 +64,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_fdct_islow_sse2) GLOBAL_DATA(jconst_fdct_islow_sse2)
EXTN(jconst_fdct_islow_sse2): EXTN(jconst_fdct_islow_sse2):
@@ -81,7 +81,7 @@ PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1) PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1)
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -109,7 +109,7 @@ EXTN(jsimd_fdct_islow_sse2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 1 COLLECT_ARGS 1
; ---- Pass 1: process rows. ; ---- Pass 1: process rows.
@@ -609,7 +609,7 @@ EXTN(jsimd_fdct_islow_sse2):
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3 movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
uncollect_args 1 UNCOLLECT_ARGS 1
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp

View File

@@ -2,7 +2,7 @@
; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2) ; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
@@ -25,18 +25,18 @@
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) %macro UNPCKLPS2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
shufps %1, %2, 0x44 shufps %1, %2, 0x44
%endmacro %endmacro
%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) %macro UNPCKHPS2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
shufps %1, %2, 0xEE shufps %1, %2, 0xEE
%endmacro %endmacro
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_idct_float_sse2) GLOBAL_DATA(jconst_idct_float_sse2)
EXTN(jconst_idct_float_sse2): EXTN(jconst_idct_float_sse2):
@@ -48,7 +48,7 @@ PD_M2_613 times 4 dd -2.613125929752753055713286
PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -83,7 +83,7 @@ EXTN(jsimd_idct_float_sse2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
lea rsp, [workspace] lea rsp, [workspace]
collect_args 4 COLLECT_ARGS 4
push rbx push rbx
; ---- Pass 1: process columns from input, store into work array. ; ---- Pass 1: process columns from input, store into work array.
@@ -280,11 +280,11 @@ EXTN(jsimd_idct_float_sse2):
unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53) unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
movaps xmm3, xmm6 ; transpose coefficients(phase 2) movaps xmm3, xmm6 ; transpose coefficients(phase 2)
unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30) UNPCKLPS2 xmm6, xmm7 ; xmm6=(00 10 20 30)
unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31) UNPCKHPS2 xmm3, xmm7 ; xmm3=(01 11 21 31)
movaps xmm0, xmm1 ; transpose coefficients(phase 2) movaps xmm0, xmm1 ; transpose coefficients(phase 2)
unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32) UNPCKLPS2 xmm1, xmm2 ; xmm1=(02 12 22 32)
unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33) UNPCKHPS2 xmm0, xmm2 ; xmm0=(03 13 23 33)
movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
@@ -295,11 +295,11 @@ EXTN(jsimd_idct_float_sse2):
movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
movaps xmm6, xmm5 ; transpose coefficients(phase 2) movaps xmm6, xmm5 ; transpose coefficients(phase 2)
unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70) UNPCKLPS2 xmm5, xmm7 ; xmm5=(40 50 60 70)
unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71) UNPCKHPS2 xmm6, xmm7 ; xmm6=(41 51 61 71)
movaps xmm3, xmm4 ; transpose coefficients(phase 2) movaps xmm3, xmm4 ; transpose coefficients(phase 2)
unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72) UNPCKLPS2 xmm4, xmm2 ; xmm4=(42 52 62 72)
unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73) UNPCKHPS2 xmm3, xmm2 ; xmm3=(43 53 63 73)
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5 movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6 movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
@@ -470,7 +470,7 @@ EXTN(jsimd_idct_float_sse2):
jnz near .rowloop jnz near .rowloop
pop rbx pop rbx
uncollect_args 4 UNCOLLECT_ARGS 4
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp

View File

@@ -2,7 +2,7 @@
; jidctfst.asm - fast integer IDCT (64-bit SSE2) ; jidctfst.asm - fast integer IDCT (64-bit SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
@@ -58,7 +58,7 @@ F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
%define PRE_MULTIPLY_SCALE_BITS 2 %define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_idct_ifast_sse2) GLOBAL_DATA(jconst_idct_ifast_sse2)
EXTN(jconst_idct_ifast_sse2): EXTN(jconst_idct_ifast_sse2):
@@ -69,7 +69,7 @@ PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
PW_F1082 times 8 dw F_1_082 << CONST_SHIFT PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 16 db CENTERJSAMPLE PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -102,7 +102,7 @@ EXTN(jsimd_idct_ifast_sse2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 4 COLLECT_ARGS 4
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
@@ -478,7 +478,7 @@ EXTN(jsimd_idct_ifast_sse2):
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
uncollect_args 4 UNCOLLECT_ARGS 4
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp

View File

@@ -2,7 +2,7 @@
; jidctint.asm - accurate integer IDCT (64-bit AVX2) ; jidctint.asm - accurate integer IDCT (64-bit AVX2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander. ; Copyright (C) 2009, 2016, 2018, 2020, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -66,7 +66,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; %1-%4: Input/output registers ; %1-%4: Input/output registers
; %5-%8: Temp registers ; %5-%8: Temp registers
%macro dotranspose 8 %macro DOTRANSPOSE 8
; %5=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71) ; %5=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71)
; %6=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72) ; %6=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
; %7=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75) ; %7=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
@@ -119,7 +119,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; %5-%12: Temp registers ; %5-%12: Temp registers
; %13: Pass (1 or 2) ; %13: Pass (1 or 2)
%macro dodct 13 %macro DODCT 13
; -- Even part ; -- Even part
; (Original) ; (Original)
@@ -241,7 +241,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_idct_islow_avx2) GLOBAL_DATA(jconst_idct_islow_avx2)
EXTN(jconst_idct_islow_avx2): EXTN(jconst_idct_islow_avx2):
@@ -260,7 +260,7 @@ PB_CENTERJSAMP times 32 db CENTERJSAMPLE
PW_1_NEG1 times 8 dw 1 PW_1_NEG1 times 8 dw 1
times 8 dw -1 times 8 dw -1
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -284,8 +284,8 @@ PW_1_NEG1 times 8 dw 1
EXTN(jsimd_idct_islow_avx2): EXTN(jsimd_idct_islow_avx2):
push rbp push rbp
mov rbp, rsp ; rbp = aligned rbp mov rbp, rsp ; rbp = aligned rbp
push_xmm 4 PUSH_XMM 4
collect_args 4 COLLECT_ARGS 4
; ---- Pass 1: process columns. ; ---- Pass 1: process columns.
@@ -342,10 +342,10 @@ EXTN(jsimd_idct_islow_avx2):
vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6 vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6
vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5 vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5
dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1 DODCT ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1
; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6 ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7 DOTRANSPOSE ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7 ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
.column_end: .column_end:
@@ -362,10 +362,10 @@ EXTN(jsimd_idct_islow_avx2):
vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm4=in7_5 vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm4=in7_5
vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1 vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1
dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2 DODCT ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2
; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6 ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7 DOTRANSPOSE ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7 ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45 vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45
@@ -407,8 +407,8 @@ EXTN(jsimd_idct_islow_avx2):
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
uncollect_args 4 UNCOLLECT_ARGS 4
pop_xmm 4 POP_XMM 4
pop rbp pop rbp
ret ret

View File

@@ -2,7 +2,7 @@
; jidctint.asm - accurate integer IDCT (64-bit SSE2) ; jidctint.asm - accurate integer IDCT (64-bit SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2020, D. R. Commander. ; Copyright (C) 2009, 2016, 2020, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
@@ -65,7 +65,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_idct_islow_sse2) GLOBAL_DATA(jconst_idct_islow_sse2)
EXTN(jconst_idct_islow_sse2): EXTN(jconst_idct_islow_sse2):
@@ -82,7 +82,7 @@ PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -115,7 +115,7 @@ EXTN(jsimd_idct_islow_sse2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, (SIZEOF_XMMWORD * WK_NUM) sub rsp, (SIZEOF_XMMWORD * WK_NUM)
collect_args 4 COLLECT_ARGS 4
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
@@ -835,7 +835,7 @@ EXTN(jsimd_idct_islow_sse2):
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
uncollect_args 4 UNCOLLECT_ARGS 4
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp

View File

@@ -2,7 +2,7 @@
; jidctred.asm - reduced-size IDCT (64-bit SSE2) ; jidctred.asm - reduced-size IDCT (64-bit SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka. ; Copyright (C) 2023, Aliaksiej Kandracienka.
; ;
@@ -71,7 +71,7 @@ F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785)
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_CONST SECTION SEG_CONST
alignz 32 ALIGNZ 32
GLOBAL_DATA(jconst_idct_red_sse2) GLOBAL_DATA(jconst_idct_red_sse2)
EXTN(jconst_idct_red_sse2): EXTN(jconst_idct_red_sse2):
@@ -89,7 +89,7 @@ PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1)
PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1) PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1)
PB_CENTERJSAMP times 16 db CENTERJSAMPLE PB_CENTERJSAMP times 16 db CENTERJSAMPLE
alignz 32 ALIGNZ 32
; -------------------------------------------------------------------------- ; --------------------------------------------------------------------------
SECTION SEG_TEXT SECTION SEG_TEXT
@@ -123,7 +123,7 @@ EXTN(jsimd_idct_4x4_sse2):
; Allocate stack space for wk array. r15 is used to access it. ; Allocate stack space for wk array. r15 is used to access it.
mov r15, rsp mov r15, rsp
sub rsp, byte (SIZEOF_XMMWORD * WK_NUM) sub rsp, byte (SIZEOF_XMMWORD * WK_NUM)
collect_args 4 COLLECT_ARGS 4
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
@@ -388,7 +388,7 @@ EXTN(jsimd_idct_4x4_sse2):
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
uncollect_args 4 UNCOLLECT_ARGS 4
lea rsp, [rbp-8] lea rsp, [rbp-8]
pop r15 pop r15
pop rbp pop rbp
@@ -415,7 +415,7 @@ EXTN(jsimd_idct_4x4_sse2):
EXTN(jsimd_idct_2x2_sse2): EXTN(jsimd_idct_2x2_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 4 COLLECT_ARGS 4
push rbx push rbx
; ---- Pass 1: process columns from input. ; ---- Pass 1: process columns from input.
@@ -563,7 +563,7 @@ EXTN(jsimd_idct_2x2_sse2):
mov word [rsi+rax*SIZEOF_JSAMPLE], cx mov word [rsi+rax*SIZEOF_JSAMPLE], cx
pop rbx pop rbx
uncollect_args 4 UNCOLLECT_ARGS 4
pop rbp pop rbp
ret ret

View File

@@ -2,7 +2,7 @@
; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2) ; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -39,7 +39,7 @@
EXTN(jsimd_convsamp_float_sse2): EXTN(jsimd_convsamp_float_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 3 COLLECT_ARGS 3
push rbx push rbx
pcmpeqw xmm7, xmm7 pcmpeqw xmm7, xmm7
@@ -88,7 +88,7 @@ EXTN(jsimd_convsamp_float_sse2):
jnz short .convloop jnz short .convloop
pop rbx pop rbx
uncollect_args 3 UNCOLLECT_ARGS 3
pop rbp pop rbp
ret ret
@@ -111,7 +111,7 @@ EXTN(jsimd_convsamp_float_sse2):
EXTN(jsimd_quantize_float_sse2): EXTN(jsimd_quantize_float_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 3 COLLECT_ARGS 3
mov rsi, r12 mov rsi, r12
mov rdx, r11 mov rdx, r11
@@ -144,7 +144,7 @@ EXTN(jsimd_quantize_float_sse2):
dec rax dec rax
jnz short .quantloop jnz short .quantloop
uncollect_args 3 UNCOLLECT_ARGS 3
pop rbp pop rbp
ret ret

View File

@@ -2,7 +2,7 @@
; jquanti.asm - sample data conversion and quantization (64-bit AVX2) ; jquanti.asm - sample data conversion and quantization (64-bit AVX2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2018, D. R. Commander. ; Copyright (C) 2009, 2016, 2018, 2024, D. R. Commander.
; Copyright (C) 2016, Matthieu Darbois. ; Copyright (C) 2016, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; ;
@@ -40,7 +40,7 @@
EXTN(jsimd_convsamp_avx2): EXTN(jsimd_convsamp_avx2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 3 COLLECT_ARGS 3
mov eax, r11d mov eax, r11d
@@ -83,7 +83,7 @@ EXTN(jsimd_convsamp_avx2):
vmovdqu YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3 vmovdqu YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3
vzeroupper vzeroupper
uncollect_args 3 UNCOLLECT_ARGS 3
pop rbp pop rbp
ret ret
@@ -117,7 +117,7 @@ EXTN(jsimd_convsamp_avx2):
EXTN(jsimd_quantize_avx2): EXTN(jsimd_quantize_avx2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 3 COLLECT_ARGS 3
vmovdqu ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)] vmovdqu ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)]
vmovdqu ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)] vmovdqu ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]
@@ -152,7 +152,7 @@ EXTN(jsimd_quantize_avx2):
vmovdqu [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3 vmovdqu [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3
vzeroupper vzeroupper
uncollect_args 3 UNCOLLECT_ARGS 3
pop rbp pop rbp
ret ret

View File

@@ -2,7 +2,7 @@
; jquanti.asm - sample data conversion and quantization (64-bit SSE2) ; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
; ;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander. ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker. ; Copyright (C) 2018, Matthias Räncker.
; ;
; Based on the x86 SIMD extension for IJG JPEG library ; Based on the x86 SIMD extension for IJG JPEG library
@@ -39,7 +39,7 @@
EXTN(jsimd_convsamp_sse2): EXTN(jsimd_convsamp_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 3 COLLECT_ARGS 3
push rbx push rbx
pxor xmm6, xmm6 ; xmm6=(all 0's) pxor xmm6, xmm6 ; xmm6=(all 0's)
@@ -83,7 +83,7 @@ EXTN(jsimd_convsamp_sse2):
jnz short .convloop jnz short .convloop
pop rbx pop rbx
uncollect_args 3 UNCOLLECT_ARGS 3
pop rbp pop rbp
ret ret
@@ -117,7 +117,7 @@ EXTN(jsimd_convsamp_sse2):
EXTN(jsimd_quantize_sse2): EXTN(jsimd_quantize_sse2):
push rbp push rbp
mov rbp, rsp mov rbp, rsp
collect_args 3 COLLECT_ARGS 3
mov rsi, r12 mov rsi, r12
mov rdx, r11 mov rdx, r11
@@ -177,7 +177,7 @@ EXTN(jsimd_quantize_sse2):
dec rax dec rax
jnz near .quantloop jnz near .quantloop
uncollect_args 3 UNCOLLECT_ARGS 3
pop rbp pop rbp
ret ret