64-bit AVX2 implementation of integer sample conversion.
This commit is contained in:
@@ -816,6 +816,9 @@ EXTERN(void) jsimd_convsamp_mmx
|
||||
/* SSE2 sample-conversion routine; takes the same three arguments as the
 * other jsimd_convsamp_* variants declared alongside it. */
EXTERN(void) jsimd_convsamp_sse2(JSAMPARRAY sample_data,
                                 JDIMENSION start_col,
                                 DCTELEM *workspace);
|
||||
|
||||
/* AVX2 sample-conversion routine: loads sample data into the workspace,
 * applying the unsigned->signed conversion. */
EXTERN(void) jsimd_convsamp_avx2(JSAMPARRAY sample_data,
                                 JDIMENSION start_col,
                                 DCTELEM *workspace);
|
||||
|
||||
/* NEON sample-conversion routine (implementation not shown here); takes the
 * same three arguments as the other jsimd_convsamp_* variants. */
EXTERN(void) jsimd_convsamp_neon(JSAMPARRAY sample_data,
                                 JDIMENSION start_col,
                                 DCTELEM *workspace);
|
||||
|
||||
|
||||
@@ -23,6 +23,71 @@
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
;
|
||||
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
; Arguments, as referenced after collect_args:
;   r10  = JSAMPARRAY sample_data
;   r11d = JDIMENSION start_col
;   r12  = DCTELEM *workspace
;
; Reads eight row pointers from sample_data, loads 64 bits from each row at
; element offset start_col, zero-extends every byte to a 16-bit word, adds
; 0xFF80 (-128) to each word, and writes the 64 resulting words to workspace
; with four unaligned 256-bit stores.
;
; Registers written here: rax, rsi, rdi, ymm0-ymm3, ymm7.
; NOTE(review): collect_args / uncollect_args are defined outside this view;
; presumably they map the ABI argument registers onto r10-r12 and save/restore
; anything callee-saved that this routine touches -- confirm against their
; definitions.
;
; All vector instructions use VEX encodings so that no legacy-SSE instruction
; is mixed into the AVX2 code path; vzeroupper is issued before returning.

    align       32
    global      EXTN(jsimd_convsamp_avx2)

EXTN(jsimd_convsamp_avx2):
    push        rbp
    mov         rax, rsp                ; not read by any instruction visible here;
                                        ; NOTE(review): presumably part of the
                                        ; collect_args calling convention -- confirm
    mov         rbp, rsp
    collect_args 3

    mov         eax, r11d               ; rax = start_col (32-bit write zero-extends)

    ; Rows 0 and 1 -> low / high qword of xmm0
    mov         rsi, JSAMPROW [r10+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdi, JSAMPROW [r10+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    vmovq       xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
    vpinsrq     xmm0, xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1

    ; Rows 2 and 3 -> low / high qword of xmm1
    mov         rsi, JSAMPROW [r10+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdi, JSAMPROW [r10+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    vmovq       xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
    vpinsrq     xmm1, xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1

    ; Rows 4 and 5 -> low / high qword of xmm2
    mov         rsi, JSAMPROW [r10+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdi, JSAMPROW [r10+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    vmovq       xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
    vpinsrq     xmm2, xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1

    ; Rows 6 and 7 -> low / high qword of xmm3
    mov         rsi, JSAMPROW [r10+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdi, JSAMPROW [r10+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    vmovq       xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
    vpinsrq     xmm3, xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1

    ; Zero-extend each byte to a word; only the low 128 bits of the source
    ; are read, so each ymm ends up holding two full rows.
    vpmovzxbw   ymm0, xmm0              ; ymm0=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    vpmovzxbw   ymm1, xmm1              ; ymm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    vpmovzxbw   ymm2, xmm2              ; ymm2=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
    vpmovzxbw   ymm3, xmm3              ; ymm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)

    ; Build the bias without a memory constant: all-ones words shifted left
    ; by 7 give 0xFF80, i.e. -128 as a signed 16-bit value.
    vpcmpeqw    ymm7, ymm7, ymm7
    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    ; Unsigned -> signed: subtract 128 from every sample word.
    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm1, ymm1, ymm7
    vpaddw      ymm2, ymm2, ymm7
    vpaddw      ymm3, ymm3, ymm7

    ; Unaligned stores, so no alignment requirement is placed on workspace.
    ; NOTE(review): YMMBLOCK is defined elsewhere; presumably indices 0/2/4/6
    ; address consecutive two-row slices of the block -- confirm.
    vmovdqu     YMMWORD [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)], ymm0
    vmovdqu     YMMWORD [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)], ymm1
    vmovdqu     YMMWORD [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)], ymm2
    vmovdqu     YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3

    vzeroupper                          ; clear upper ymm halves before returning to the caller
    uncollect_args 3
    pop         rbp
    ret
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
|
||||
@@ -660,6 +660,8 @@ jsimd_can_convsamp (void)
|
||||
if (sizeof(DCTELEM) != 2)
|
||||
return 0;
|
||||
|
||||
if (simd_support & JSIMD_AVX2)
|
||||
return 1;
|
||||
if (simd_support & JSIMD_SSE2)
|
||||
return 1;
|
||||
|
||||
@@ -691,7 +693,10 @@ GLOBAL(void)
|
||||
jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
DCTELEM *workspace)
|
||||
{
|
||||
jsimd_convsamp_sse2(sample_data, start_col, workspace);
|
||||
if (simd_support & JSIMD_AVX2)
|
||||
jsimd_convsamp_avx2(sample_data, start_col, workspace);
|
||||
else
|
||||
jsimd_convsamp_sse2(sample_data, start_col, workspace);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
|
||||
Reference in New Issue
Block a user