64-bit AVX2 impl. of h2v2 & h2v1 upsampling

(Fancy & Plain)
This commit is contained in:
DRC
2016-05-29 08:09:27 -05:00
parent 72c837da24
commit f1cbc32876
7 changed files with 809 additions and 26 deletions

View File

@@ -26,7 +26,7 @@ if(SIMD_X86_64)
jdsample-sse2-64 jfdctfst-sse2-64 jfdctint-sse2-64 jidctflt-sse2-64
jidctfst-sse2-64 jidctint-sse2-64 jidctred-sse2-64 jquantf-sse2-64
jquanti-sse2-64 jccolor-avx2-64 jcgray-avx2-64 jcsample-avx2-64
jdcolor-avx2-64)
jdcolor-avx2-64 jdsample-avx2-64)
message(STATUS "Building x86_64 SIMD extensions")
else()
set(SIMD_BASENAMES jsimdcpu jfdctflt-3dn jidctflt-3dn jquant-3dn jccolor-mmx

View File

@@ -20,7 +20,7 @@ libsimd_la_SOURCES = jsimd_x86_64.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
jidctflt-sse2-64.asm jidctfst-sse2-64.asm jidctint-sse2-64.asm \
jidctred-sse2-64.asm jquantf-sse2-64.asm jquanti-sse2-64.asm \
jccolor-avx2-64.asm jcgray-avx2-64.asm jcsample-avx2-64.asm \
jdcolor-avx2-64.asm
jdcolor-avx2-64.asm jdsample-avx2-64.asm
jccolor-sse2-64.lo: jccolext-sse2-64.asm
jcgray-sse2-64.lo: jcgryext-sse2-64.asm

View File

@@ -193,14 +193,8 @@ EXTN(jsimd_huff_encode_one_block_sse2):
mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp
lea rsp, [t2]
push_xmm 4
collect_args 6
%ifdef WIN64
movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10
movaps XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11
sub rsp, 4*SIZEOF_XMMWORD
%endif
push rbx
mov buffer, r11 ; r11 is now sratch
@@ -342,14 +336,8 @@ EXTN(jsimd_huff_encode_one_block_sse2):
mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits;
pop rbx
%ifdef WIN64
movaps xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD]
movaps xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD]
movaps xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD]
movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
add rsp, 4*SIZEOF_XMMWORD
%endif
uncollect_args 6
pop_xmm 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp

718
simd/jdsample-avx2-64.asm Normal file
View File

@@ -0,0 +1,718 @@
;
; jdsample.asm - upsampling (64-bit AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
; Shared SIMD helper macros and ABI glue (EXTN, collect_args, push_xmm, ...).
%include "jsimdext.inc"
; --------------------------------------------------------------------------
SECTION SEG_CONST
; 16-bit word constants used by the fancy (triangle-filter) upsamplers.
; Each constant fills one full YMM register (16 x dw).  The surrounding
; alignz keeps the table 32-byte aligned; the dispatcher's
; IS_ALIGNED_AVX(jconst_fancy_upsample_avx2) check depends on this.
alignz 32
global EXTN(jconst_fancy_upsample_avx2)
EXTN(jconst_fancy_upsample_avx2):
PW_ONE times 16 dw 1
PW_TWO times 16 dw 2
PW_THREE times 16 dw 3
PW_SEVEN times 16 dw 7
PW_EIGHT times 16 dw 8
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter". This is a good compromise between
; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_avx2 (int max_v_samp_factor,
; JDIMENSION downsampled_width,
; JSAMPARRAY input_data,
; JSAMPARRAY *output_data_ptr);
;
; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
align 32
global EXTN(jsimd_h2v1_fancy_upsample_avx2)
EXTN(jsimd_h2v1_fancy_upsample_avx2):
; Prologue: xmm8-xmm11 are saved because the loop below uses
; ymm8/ymm10/ymm11 (callee-saved XMM registers on Win64).
push rbp
mov rax, rsp
mov rbp, rsp
push_xmm 4
collect_args 4
; Bail out if either dimension is zero.
mov eax, r11d ; colctr
test rax, rax
jz near .return
mov rcx, r10 ; rowctr
test rcx, rcx
jz near .return
mov rsi, r12 ; input_data
mov rdi, r13
mov rdi, JSAMPARRAY [rdi] ; output_data
; Edge masks: ymm11 keeps only the first byte of a YMM word,
; ymm10 keeps only the last byte.
vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
vpcmpeqb xmm10, xmm10, xmm10
vpsrldq xmm11, xmm10, (SIZEOF_XMMWORD-1) ; (ff -- -- -- ... -- --) LSB is ff
vpslldq xmm10, xmm10, (SIZEOF_XMMWORD-1)
vperm2i128 ymm10, ymm10, ymm10, 1 ; (---- ---- ... ---- ---- ff) MSB is ff
.rowloop:
push rax ; colctr
push rdi
push rsi
mov rsi, JSAMPROW [rsi] ; inptr
mov rdi, JSAMPROW [rdi] ; outptr
; If the width is not a multiple of SIZEOF_YMMWORD, replicate the last
; valid sample one slot past the end so the final vector load is
; well defined (assumes the row buffer has that much padding --
; TODO confirm against the buffer allocator).
test rax, SIZEOF_YMMWORD-1
jz short .skip
mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
.skip:
; For the first block, column 0 acts as its own left neighbor:
; keep only the row's first byte (in the LSB position) in ymm7.
vpand ymm7, ymm11, YMMWORD [rsi+0*SIZEOF_YMMWORD]
; Round colctr up to a whole number of YMM words.
add rax, byte SIZEOF_YMMWORD-1
and rax, byte -SIZEOF_YMMWORD
cmp rax, byte SIZEOF_YMMWORD
ja short .columnloop
.columnloop_last:
; Last block: past the end, the last sample acts as its own right
; neighbor; keep only the MSB byte in ymm6.
vpand ymm6, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD]
jmp short .upsample
.columnloop:
; Interior block: take the first byte of the NEXT block and move it
; into the MSB position of ymm6 to act as the right-edge neighbor.
vmovdqu ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD]
vperm2i128 ymm8, ymm0, ymm6, 0x20
vpslldq ymm6, ymm8, 15
.upsample:
; ymm1 = current 32 samples.  Build ymm2 = samples shifted toward the
; MSB by one byte (each position holds its LEFT neighbor) and
; ymm3 = samples shifted toward the LSB by one byte (RIGHT neighbor).
; The vperm2i128/vpalignr pairs emulate a full 256-bit byte shift
; across the 128-bit lane boundary.
vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
vmovdqa ymm2, ymm1
vmovdqa ymm3, ymm1
vperm2i128 ymm8, ymm0, ymm2, 0x20
vpalignr ymm2, ymm2, ymm8, 15
vperm2i128 ymm8, ymm0, ymm3, 0x03
vpalignr ymm3, ymm8, ymm3, 1
vpor ymm2, ymm2, ymm7 ; merge left-edge carry byte
vpor ymm3, ymm3, ymm6 ; merge right-edge carry byte
; Carry this block's last byte into ymm7 (LSB) for the next block.
vpsrldq ymm7, ymm8, (SIZEOF_XMMWORD-1)
; Widen bytes to words; the vperm2i128 pairs restore linear word order
; across the two 128-bit lanes after vpunpck{l,h}bw.
vpunpckhbw ymm4, ymm1, ymm0
vpunpcklbw ymm8, ymm1, ymm0
vperm2i128 ymm1, ymm8, ymm4, 0x20
vperm2i128 ymm4, ymm8, ymm4, 0x31
vpunpckhbw ymm5, ymm2, ymm0
vpunpcklbw ymm8, ymm2, ymm0
vperm2i128 ymm2, ymm8, ymm5, 0x20
vperm2i128 ymm5, ymm8, ymm5, 0x31
vpunpckhbw ymm6, ymm3, ymm0
vpunpcklbw ymm8, ymm3, ymm0
vperm2i128 ymm3, ymm8, ymm6, 0x20
vperm2i128 ymm6, ymm8, ymm6, 0x31
; Triangle filter:
;   out_even = (3*s + left  + 1) >> 2
;   out_odd  = (3*s + right + 2) >> 2
vpmullw ymm1, ymm1, [rel PW_THREE]
vpmullw ymm4, ymm4, [rel PW_THREE]
vpaddw ymm2, ymm2, [rel PW_ONE]
vpaddw ymm5, ymm5, [rel PW_ONE]
vpaddw ymm3, ymm3, [rel PW_TWO]
vpaddw ymm6, ymm6, [rel PW_TWO]
vpaddw ymm2, ymm2, ymm1
vpaddw ymm5, ymm5, ymm4
vpsrlw ymm2, ymm2, 2
vpsrlw ymm5, ymm5, 2
vpaddw ymm3, ymm3, ymm1
vpaddw ymm6, ymm6, ymm4
vpsrlw ymm3, ymm3, 2
vpsrlw ymm6, ymm6, 2
; Re-interleave: even results in the low bytes, odd results shifted
; into the high bytes, giving 64 output samples per 32 inputs.
vpsllw ymm3, ymm3, BYTE_BIT
vpsllw ymm6, ymm6, BYTE_BIT
vpor ymm2, ymm2, ymm3
vpor ymm5, ymm5, ymm6
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5
; Advance: 1 YMM word consumed, 2 YMM words produced.
sub rax, byte SIZEOF_YMMWORD
add rsi, byte 1*SIZEOF_YMMWORD ; inptr
add rdi, byte 2*SIZEOF_YMMWORD ; outptr
cmp rax, byte SIZEOF_YMMWORD
ja near .columnloop
test eax, eax
jnz near .columnloop_last
pop rsi
pop rdi
pop rax
add rsi, byte SIZEOF_JSAMPROW ; input_data
add rdi, byte SIZEOF_JSAMPROW ; output_data
dec rcx ; rowctr
jg near .rowloop
.return:
uncollect_args 4
pop_xmm 4
pop rbp
ret
; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_avx2 (int max_v_samp_factor,
; JDIMENSION downsampled_width,
; JSAMPARRAY input_data,
; JSAMPARRAY *output_data_ptr);
;
; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
; wk(i): four 32-byte-aligned YMMWORD scratch slots just below the
; aligned rbp; they carry edge words between column blocks.
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
%define WK_NUM 4
align 32
global EXTN(jsimd_h2v2_fancy_upsample_avx2)
EXTN(jsimd_h2v2_fancy_upsample_avx2):
; Build a 256-bit-aligned frame (wk() is accessed with vmovdqa); the
; original rsp is saved at [rbp] and restored via "pop rsp" on exit.
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
push_xmm 4 ; xmm8-xmm11 are callee-saved on Win64 and used below
collect_args 4
push rbx
; Bail out if either dimension is zero.
mov eax, r11d ; colctr
test rax, rax
jz near .return
mov rcx, r10 ; rowctr
test rcx, rcx
jz near .return
mov rsi, r12 ; input_data
mov rdi, r13
mov rdi, JSAMPARRAY [rdi] ; output_data
.rowloop:
push rax ; colctr
push rcx
push rdi
push rsi
; Each iteration reads three input rows (above / current / below) and
; writes two output rows.
mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
; ymm9 = all zeroes.  Edge masks: ymm11 keeps only the first WORD of a
; YMM word, ymm10 keeps only the last word (word-sized here because the
; intermediate values are 16-bit).
vpxor ymm9, ymm9, ymm9
vpcmpeqb xmm10, xmm10, xmm10
vpsrldq xmm11, xmm10, (SIZEOF_XMMWORD-2) ; (ffff ---- ---- ... ---- ----) LSB is ffff
vpslldq xmm10, xmm10, (SIZEOF_XMMWORD-2)
vperm2i128 ymm10, ymm10, ymm10, 1 ; (---- ---- ... ---- ---- ffff) MSB is ffff
; If the width is not a multiple of SIZEOF_YMMWORD, replicate the last
; sample of each of the three rows one slot past the end so the final
; vector loads are well defined.
test rax, SIZEOF_YMMWORD-1
jz short .skip
push rdx
mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
pop rdx
.skip:
; -- process the first column block
; Vertical pass: widen bytes to words (the vperm2i128 pairs keep the
; word order linear across the 128-bit lanes), then compute
; tmp = 3*inptr0 + inptr1 for the upper (ymm1/ymm5) and lower
; (ymm2/ymm6) intermediate rows.  The word-sized intermediates are
; staged in the output rows and reread by the horizontal pass below.
vmovdqu ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD]
vmovdqu ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD]
vmovdqu ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD]
vpunpckhbw ymm4, ymm0, ymm9
vpunpcklbw ymm8, ymm0, ymm9
vperm2i128 ymm0, ymm8, ymm4, 0x20
vperm2i128 ymm4, ymm8, ymm4, 0x31
vpunpckhbw ymm5, ymm1, ymm9
vpunpcklbw ymm8, ymm1, ymm9
vperm2i128 ymm1, ymm8, ymm5, 0x20
vperm2i128 ymm5, ymm8, ymm5, 0x31
vpunpckhbw ymm6, ymm2, ymm9
vpunpcklbw ymm8, ymm2, ymm9
vperm2i128 ymm2, ymm8, ymm6, 0x20
vperm2i128 ymm6, ymm8, ymm6, 0x31
vpmullw ymm0, ymm0, [rel PW_THREE]
vpmullw ymm4, ymm4, [rel PW_THREE]
vpaddw ymm1, ymm1, ymm0
vpaddw ymm5, ymm5, ymm4
vpaddw ymm2, ymm2, ymm0
vpaddw ymm6, ymm6, ymm4
vmovdqu YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
vmovdqu YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6
; Left-edge carries (wk(0)/wk(1)): for the first block, the first word
; of each intermediate row acts as its own left neighbor.
vpand ymm1, ymm1, ymm11
vpand ymm2, ymm2, ymm11
vmovdqa YMMWORD [wk(0)], ymm1
vmovdqa YMMWORD [wk(1)], ymm2
; Round colctr up to a whole number of YMM words.
add rax, byte SIZEOF_YMMWORD-1
and rax, byte -SIZEOF_YMMWORD
cmp rax, byte SIZEOF_YMMWORD
ja short .columnloop
.columnloop_last:
; -- process the last column block
; Right-edge carries (wk(2)/wk(3)): past the end, the last word of each
; intermediate row acts as its own right neighbor.
vpand ymm1, ymm10, YMMWORD [rdx+1*SIZEOF_YMMWORD]
vpand ymm2, ymm10, YMMWORD [rdi+1*SIZEOF_YMMWORD]
vmovdqa YMMWORD [wk(2)], ymm1
vmovdqa YMMWORD [wk(3)], ymm2
jmp near .upsample
.columnloop:
; -- process the next column block
; Same vertical pass for the next 32 samples; the new block's first
; words are then shifted into the MSW position to serve as the current
; block's right-edge carries (wk(2)/wk(3)).
vmovdqu ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD]
vmovdqu ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD]
vmovdqu ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD]
vpunpckhbw ymm4, ymm0, ymm9
vpunpcklbw ymm8, ymm0, ymm9
vperm2i128 ymm0, ymm8, ymm4, 0x20
vperm2i128 ymm4, ymm8, ymm4, 0x31
vpunpckhbw ymm5, ymm1, ymm9
vpunpcklbw ymm8, ymm1, ymm9
vperm2i128 ymm1, ymm8, ymm5, 0x20
vperm2i128 ymm5, ymm8, ymm5, 0x31
vpunpckhbw ymm6, ymm2, ymm9
vpunpcklbw ymm8, ymm2, ymm9
vperm2i128 ymm2, ymm8, ymm6, 0x20
vperm2i128 ymm6, ymm8, ymm6, 0x31
vpmullw ymm0, ymm0, [rel PW_THREE]
vpmullw ymm4, ymm4, [rel PW_THREE]
vpaddw ymm1, ymm1, ymm0
vpaddw ymm5, ymm5, ymm4
vpaddw ymm2, ymm2, ymm0
vpaddw ymm6, ymm6, ymm4
vmovdqu YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1
vmovdqu YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5
vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6
vperm2i128 ymm1, ymm9, ymm1, 0x20
vpslldq ymm1, ymm1, 14
vperm2i128 ymm2, ymm9, ymm2, 0x20
vpslldq ymm2, ymm2, 14
vmovdqa YMMWORD [wk(2)], ymm1
vmovdqa YMMWORD [wk(3)], ymm2
.upsample:
; Horizontal pass: for each intermediate word s,
;   out_even = (3*s + left  + 8) >> 4
;   out_odd  = (3*s + right + 7) >> 4
; The vperm2i128/vpalignr pairs emulate 256-bit word shifts across the
; 128-bit lane boundary; wk() supplies the edge words.
; -- process the upper row
vmovdqu ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD]
vmovdqu ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD]
vmovdqa ymm0, ymm7
vmovdqa ymm4, ymm3
vperm2i128 ymm8, ymm9, ymm0, 0x03
vpalignr ymm0, ymm8, ymm0, 2
vperm2i128 ymm4, ymm9, ymm4, 0x20
vpslldq ymm4, ymm4, 14
vmovdqa ymm5, ymm7
vmovdqa ymm6, ymm3
vperm2i128 ymm5, ymm9, ymm5, 0x03
vpsrldq ymm5, ymm5, 14
vperm2i128 ymm8, ymm9, ymm6, 0x20
vpalignr ymm6, ymm6, ymm8, 14
vpor ymm0, ymm0, ymm4 ; ymm0 = right neighbors of low half (ymm7)
vpor ymm5, ymm5, ymm6 ; ymm5 = left neighbors of high half (ymm3)
vmovdqa ymm1, ymm7
vmovdqa ymm2, ymm3
vperm2i128 ymm8, ymm9, ymm1, 0x20
vpalignr ymm1, ymm1, ymm8, 14
vperm2i128 ymm8, ymm9, ymm2, 0x03
vpalignr ymm2, ymm8, ymm2, 2
vmovdqa ymm4, ymm3
vperm2i128 ymm4, ymm9, ymm4, 0x03
vpsrldq ymm4, ymm4, 14
vpor ymm1, ymm1, YMMWORD [wk(0)] ; merge left-edge carry
vpor ymm2, ymm2, YMMWORD [wk(2)] ; merge right-edge carry
vmovdqa YMMWORD [wk(0)], ymm4 ; carry last word to next block
vpmullw ymm7, ymm7, [rel PW_THREE]
vpmullw ymm3, ymm3, [rel PW_THREE]
vpaddw ymm1, ymm1, [rel PW_EIGHT]
vpaddw ymm5, ymm5, [rel PW_EIGHT]
vpaddw ymm0, ymm0, [rel PW_SEVEN]
vpaddw ymm2, [rel PW_SEVEN] ; NASM 2-operand form: ymm2 is also src1
vpaddw ymm1, ymm1, ymm7
vpaddw ymm5, ymm5, ymm3
vpsrlw ymm1, ymm1, 4
vpsrlw ymm5, ymm5, 4
vpaddw ymm0, ymm0, ymm7
vpaddw ymm2, ymm2, ymm3
vpsrlw ymm0, ymm0, 4
vpsrlw ymm2, ymm2, 4
; Re-interleave even results (low bytes) with odd results (high bytes).
vpsllw ymm0, ymm0, BYTE_BIT
vpsllw ymm2, ymm2, BYTE_BIT
vpor ymm1, ymm1, ymm0
vpor ymm5, ymm5, ymm2
vmovdqu YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
vmovdqu YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5
; -- process the lower row
; Identical horizontal pass using wk(1)/wk(3) for the edge words.
vmovdqu ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD]
vmovdqu ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD]
vmovdqa ymm7, ymm6
vmovdqa ymm3, ymm4
vperm2i128 ymm8, ymm9, ymm7, 0x03
vpalignr ymm7, ymm8, ymm7, 2
vperm2i128 ymm3, ymm9, ymm3, 0x20
vpslldq ymm3, ymm3, 14
vmovdqa ymm0, ymm6
vmovdqa ymm2, ymm4
vperm2i128 ymm0, ymm9, ymm0, 0x03
vpsrldq ymm0, ymm0, 14
vperm2i128 ymm8, ymm9, ymm2, 0x20
vpalignr ymm2, ymm2, ymm8, 14
vpor ymm7, ymm7, ymm3
vpor ymm0, ymm0, ymm2
vmovdqa ymm1, ymm6
vmovdqa ymm5, ymm4
vperm2i128 ymm8, ymm9, ymm1, 0x20
vpalignr ymm1, ymm1, ymm8, 14
vperm2i128 ymm8, ymm9, ymm5, 0x03
vpalignr ymm5, ymm8, ymm5, 2
vmovdqa ymm3, ymm4
vperm2i128 ymm3, ymm9, ymm3, 0x03
vpsrldq ymm3, ymm3, 14
vpor ymm1, ymm1, YMMWORD [wk(1)]
vpor ymm5, ymm5, YMMWORD [wk(3)]
vmovdqa YMMWORD [wk(1)], ymm3
vpmullw ymm6, ymm6, [rel PW_THREE]
vpmullw ymm4, ymm4, [rel PW_THREE]
vpaddw ymm1, ymm1, [rel PW_EIGHT]
vpaddw ymm0, ymm0, [rel PW_EIGHT]
vpaddw ymm7, ymm7, [rel PW_SEVEN]
vpaddw ymm5, ymm5, [rel PW_SEVEN]
vpaddw ymm1, ymm1, ymm6
vpaddw ymm0, ymm0, ymm4
vpsrlw ymm1, ymm1, 4
vpsrlw ymm0, ymm0, 4
vpaddw ymm7, ymm7, ymm6
vpaddw ymm5, ymm5, ymm4
vpsrlw ymm7, ymm7, 4
vpsrlw ymm5, ymm5, 4
vpsllw ymm7, ymm7, BYTE_BIT
vpsllw ymm5, ymm5, BYTE_BIT
vpor ymm1, ymm1, ymm7
vpor ymm0, ymm0, ymm5
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0
; Advance: 1 YMM word of each input row consumed, 2 YMM words of each
; output row produced.
sub rax, byte SIZEOF_YMMWORD
add rcx, byte 1*SIZEOF_YMMWORD
add rbx, byte 1*SIZEOF_YMMWORD
add rsi, byte 1*SIZEOF_YMMWORD
add rdx, byte 2*SIZEOF_YMMWORD
add rdi, byte 2*SIZEOF_YMMWORD
cmp rax, byte SIZEOF_YMMWORD
ja near .columnloop
test rax, rax
jnz near .columnloop_last
pop rsi
pop rdi
pop rcx
pop rax
add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
sub rcx, byte 2 ; rowctr
jg near .rowloop
.return:
pop rbx
uncollect_args 4
pop_xmm 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_avx2 (int max_v_samp_factor,
; JDIMENSION output_width,
; JSAMPARRAY input_data,
; JSAMPARRAY *output_data_ptr);
;
; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
align 32
global EXTN(jsimd_h2v1_upsample_avx2)
EXTN(jsimd_h2v1_upsample_avx2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 4
; Round the output width up to a whole number of YMM words; assumes the
; row buffers are padded to that size -- TODO confirm with the caller.
; (Width 0 leaves rdx 0 and returns immediately.)
mov edx, r11d
add rdx, byte (SIZEOF_YMMWORD-1)
and rdx, -SIZEOF_YMMWORD
jz near .return
mov rcx, r10 ; rowctr
test rcx, rcx
jz short .return
mov rsi, r12 ; input_data
mov rdi, r13
mov rdi, JSAMPARRAY [rdi] ; output_data
.rowloop:
push rdi
push rsi
mov rsi, JSAMPROW [rsi] ; inptr
mov rdi, JSAMPROW [rdi] ; outptr
mov rax, rdx ; colctr
.columnloop:
; <= 32 output bytes remain: one 16-byte load is enough.
cmp rax, byte SIZEOF_YMMWORD
ja near .above_16
vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
; Box filter: unpacking a register with itself duplicates every byte.
vpunpckhbw xmm1, xmm0, xmm0
vpunpcklbw xmm0, xmm0, xmm0
vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
vmovdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
jmp short .nextrow
.above_16:
vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
; Pre-swap the middle 128-bit quarters so the in-lane unpacks below
; emit the doubled bytes in linear order.
vpermq ymm0, ymm0, 0xd8
vpunpckhbw ymm1, ymm0, ymm0
vpunpcklbw ymm0, ymm0, ymm0
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
sub rax, byte 2*SIZEOF_YMMWORD ; 2 YMM words of output written
jz short .nextrow
add rsi, byte SIZEOF_YMMWORD ; inptr
add rdi, byte 2*SIZEOF_YMMWORD ; outptr
jmp short .columnloop
.nextrow:
pop rsi
pop rdi
add rsi, byte SIZEOF_JSAMPROW ; input_data
add rdi, byte SIZEOF_JSAMPROW ; output_data
dec rcx ; rowctr
jg short .rowloop
.return:
uncollect_args 4
pop rbp
ret
; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_avx2 (int max_v_samp_factor,
; JDIMENSION output_width,
; JSAMPARRAY input_data,
; JSAMPARRAY *output_data_ptr);
;
; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
align 32
global EXTN(jsimd_h2v2_upsample_avx2)
EXTN(jsimd_h2v2_upsample_avx2):
push rbp
mov rax, rsp
mov rbp, rsp
collect_args 4
push rbx ; rbx (callee-saved) holds outptr0 below
; Round the output width up to a whole number of YMM words; assumes the
; row buffers are padded to that size -- TODO confirm with the caller.
mov edx, r11d
add rdx, byte (SIZEOF_YMMWORD-1)
and rdx, -SIZEOF_YMMWORD
jz near .return
mov rcx, r10 ; rowctr
test rcx, rcx
jz near .return
mov rsi, r12 ; input_data
mov rdi, r13
mov rdi, JSAMPARRAY [rdi] ; output_data
.rowloop:
; Box filter: one input row expands into two identical output rows.
push rdi
push rsi
mov rsi, JSAMPROW [rsi] ; inptr
mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
mov rax, rdx ; colctr
.columnloop:
; <= 32 output bytes remain: one 16-byte load is enough.
cmp rax, byte SIZEOF_YMMWORD
ja short .above_16
vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
; Unpacking a register with itself duplicates every byte.
vpunpckhbw xmm1, xmm0, xmm0
vpunpcklbw xmm0, xmm0, xmm0
vmovdqu XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
vmovdqu XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
vmovdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
jmp near .nextrow
.above_16:
vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
; Pre-swap the middle 128-bit quarters so the in-lane unpacks below
; emit the doubled bytes in linear order.
vpermq ymm0, ymm0, 0xd8
vpunpckhbw ymm1, ymm0, ymm0
vpunpcklbw ymm0, ymm0, ymm0
vmovdqu YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0
vmovdqu YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
sub rax, byte 2*SIZEOF_YMMWORD ; 2 YMM words of output written
jz short .nextrow
add rsi, byte SIZEOF_YMMWORD ; inptr
add rbx, 2*SIZEOF_YMMWORD ; outptr0
add rdi, 2*SIZEOF_YMMWORD ; outptr1
jmp short .columnloop
.nextrow:
pop rsi
pop rdi
add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
sub rcx, byte 2 ; rowctr
jg near .rowloop
.return:
pop rbx
uncollect_args 4
pop rbp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32

View File

@@ -518,6 +518,13 @@ EXTERN(void) jsimd_h2v2_upsample_sse2
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v1_upsample_avx2
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_upsample_avx2
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v1_upsample_mips_dspr2
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
@@ -553,6 +560,14 @@ EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
(int max_v_samp_factor, JDIMENSION downsampled_width,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
extern const int jconst_fancy_upsample_avx2[];
EXTERN(void) jsimd_h2v1_fancy_upsample_avx2
(int max_v_samp_factor, JDIMENSION downsampled_width,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_fancy_upsample_avx2
(int max_v_samp_factor, JDIMENSION downsampled_width,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v1_fancy_upsample_neon
(int max_v_samp_factor, JDIMENSION downsampled_width,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);

View File

@@ -374,6 +374,8 @@ jsimd_can_h2v2_upsample (void)
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_AVX2)
return 1;
if (simd_support & JSIMD_SSE2)
return 1;
@@ -391,6 +393,8 @@ jsimd_can_h2v1_upsample (void)
if (sizeof(JDIMENSION) != 4)
return 0;
if (simd_support & JSIMD_AVX2)
return 1;
if (simd_support & JSIMD_SSE2)
return 1;
@@ -403,8 +407,12 @@ jsimd_h2v2_upsample (j_decompress_ptr cinfo,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr)
{
jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
if (simd_support & JSIMD_AVX2)
jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
else
jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
}
GLOBAL(void)
@@ -413,8 +421,12 @@ jsimd_h2v1_upsample (j_decompress_ptr cinfo,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr)
{
jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
if (simd_support & JSIMD_AVX2)
jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
else
jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
}
GLOBAL(int)
@@ -428,6 +440,9 @@ jsimd_can_h2v2_fancy_upsample (void)
if (sizeof(JDIMENSION) != 4)
return 0;
if ((simd_support & JSIMD_AVX2) &&
IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
return 1;
if ((simd_support & JSIMD_SSE2) &&
IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
return 1;
@@ -446,6 +461,9 @@ jsimd_can_h2v1_fancy_upsample (void)
if (sizeof(JDIMENSION) != 4)
return 0;
if ((simd_support & JSIMD_AVX2) &&
IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
return 1;
if ((simd_support & JSIMD_SSE2) &&
IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
return 1;
@@ -459,9 +477,14 @@ jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr)
{
jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
output_data_ptr);
if (simd_support & JSIMD_AVX2)
jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
output_data_ptr);
else
jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
output_data_ptr);
}
GLOBAL(void)
@@ -470,9 +493,14 @@ jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr)
{
jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
output_data_ptr);
if (simd_support & JSIMD_AVX2)
jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
output_data_ptr);
else
jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
output_data_ptr);
}
GLOBAL(int)

View File

@@ -363,6 +363,34 @@ const_base:
add rsp, SIZEOF_XMMWORD
%endmacro
; Save %1 (1..4) callee-saved XMM registers, xmm8 upward, on the stack.
; Stores are issued relative to the PRE-adjustment rsp, so after the
; final sub, xmm8 sits at [rsp+(%1-1)*SIZEOF_XMMWORD] and xmm8+k at
; [rsp+(%1-1-k)*SIZEOF_XMMWORD].  NOTE(review): movaps requires rsp to
; be 16-byte aligned here -- confirm at every call site.
%imacro push_xmm 1
movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
%if %1 > 1
movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
%endif
%if %1 > 2
movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10
%endif
%if %1 > 3
movaps XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11
%endif
sub rsp, %1 * SIZEOF_XMMWORD
%endmacro
; Restore the %1 (1..4) callee-saved XMM registers saved by push_xmm and
; release the stack space.  push_xmm leaves xmm8 at
; [rsp+(%1-1)*SIZEOF_XMMWORD], xmm9 at [rsp+(%1-2)*SIZEOF_XMMWORD], etc.,
; so the reload offsets must be expressed in terms of %1.  (The previous
; fixed offsets 0..3 matched that layout only for %1 == 4; for smaller
; counts they reloaded stack garbage from above the saved area.)  The
; generated code is unchanged for the existing pop_xmm 4 call sites.
%imacro pop_xmm 1
%if %1 > 3
movaps xmm11, XMMWORD [rsp+(%1-4)*SIZEOF_XMMWORD]
%endif
%if %1 > 2
movaps xmm10, XMMWORD [rsp+(%1-3)*SIZEOF_XMMWORD]
%endif
%if %1 > 1
movaps xmm9, XMMWORD [rsp+(%1-2)*SIZEOF_XMMWORD]
%endif
movaps xmm8, XMMWORD [rsp+(%1-1)*SIZEOF_XMMWORD]
add rsp, %1 * SIZEOF_XMMWORD
%endmacro
%else
%imacro collect_args 1
@@ -409,6 +437,12 @@ const_base:
pop r10
%endmacro
%imacro push_xmm 1
%endmacro
%imacro pop_xmm 1
%endmacro
%endif
%endif