64-bit AVX2 impl. of YCC->RGB color conversion
This commit is contained in:
@@ -25,7 +25,8 @@ if(SIMD_X86_64)
|
||||
jchuff-sse2-64 jcsample-sse2-64 jdcolor-sse2-64 jdmerge-sse2-64
|
||||
jdsample-sse2-64 jfdctfst-sse2-64 jfdctint-sse2-64 jidctflt-sse2-64
|
||||
jidctfst-sse2-64 jidctint-sse2-64 jidctred-sse2-64 jquantf-sse2-64
|
||||
jquanti-sse2-64 jccolor-avx2-64 jcgray-avx2-64 jcsample-avx2-64)
|
||||
jquanti-sse2-64 jccolor-avx2-64 jcgray-avx2-64 jcsample-avx2-64
|
||||
jdcolor-avx2-64)
|
||||
message(STATUS "Building x86_64 SIMD extensions")
|
||||
else()
|
||||
set(SIMD_BASENAMES jsimdcpu jfdctflt-3dn jidctflt-3dn jquant-3dn jccolor-mmx
|
||||
|
||||
@@ -19,7 +19,8 @@ libsimd_la_SOURCES = jsimd_x86_64.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
|
||||
jdsample-sse2-64.asm jfdctfst-sse2-64.asm jfdctint-sse2-64.asm \
|
||||
jidctflt-sse2-64.asm jidctfst-sse2-64.asm jidctint-sse2-64.asm \
|
||||
jidctred-sse2-64.asm jquantf-sse2-64.asm jquanti-sse2-64.asm \
|
||||
jccolor-avx2-64.asm jcgray-avx2-64.asm jcsample-avx2-64.asm
|
||||
jccolor-avx2-64.asm jcgray-avx2-64.asm jcsample-avx2-64.asm \
|
||||
jdcolor-avx2-64.asm
|
||||
|
||||
jccolor-sse2-64.lo: jccolext-sse2-64.asm
|
||||
jcgray-sse2-64.lo: jcgryext-sse2-64.asm
|
||||
@@ -27,6 +28,7 @@ jdcolor-sse2-64.lo: jdcolext-sse2-64.asm
|
||||
jdmerge-sse2-64.lo: jdmrgext-sse2-64.asm
|
||||
jccolor-avx2-64.lo: jccolext-avx2-64.asm
|
||||
jcgray-avx2-64.lo: jcgryext-avx2-64.asm
|
||||
jdcolor-avx2-64.lo: jdcolext-avx2-64.asm
|
||||
|
||||
endif
|
||||
|
||||
|
||||
429
simd/jdcolext-avx2-64.asm
Normal file
429
simd/jdcolext-avx2-64.asm
Normal file
@@ -0,0 +1,429 @@
|
||||
;
|
||||
; jdcolext.asm - colorspace conversion (64-bit AVX2)
|
||||
;
|
||||
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jcolsamp.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
;
|
||||
; Convert some rows of samples to the output colorspace.
|
||||
;
|
||||
; GLOBAL(void)
|
||||
; jsimd_ycc_rgb_convert_avx2 (JDIMENSION out_width,
|
||||
; JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
; JSAMPARRAY output_buf, int num_rows)
|
||||
;
|
||||
|
||||
; r10d = JDIMENSION out_width
|
||||
; r11 = JSAMPIMAGE input_buf
|
||||
; r12d = JDIMENSION input_row
|
||||
; r13 = JSAMPARRAY output_buf
|
||||
; r14d = int num_rows
|
||||
|
||||
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
|
||||
%define WK_NUM 2
|
||||
|
||||
align 32
|
||||
global EXTN(jsimd_ycc_rgb_convert_avx2)
|
||||
|
||||
EXTN(jsimd_ycc_rgb_convert_avx2):
|
||||
push rbp
|
||||
mov rax, rsp ; rax = original rbp
|
||||
sub rsp, byte 4
|
||||
and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
|
||||
mov [rsp], rax
|
||||
mov rbp, rsp ; rbp = aligned rbp
|
||||
lea rsp, [wk(0)]
|
||||
collect_args 5
|
||||
push rbx
|
||||
|
||||
mov ecx, r10d ; num_cols
|
||||
test rcx, rcx
|
||||
jz near .return
|
||||
|
||||
push rcx
|
||||
|
||||
mov rdi, r11
|
||||
mov ecx, r12d
|
||||
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
|
||||
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
|
||||
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
|
||||
lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
|
||||
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
|
||||
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
|
||||
|
||||
pop rcx
|
||||
|
||||
mov rdi, r13
|
||||
mov eax, r14d
|
||||
test rax, rax
|
||||
jle near .return
|
||||
.rowloop:
|
||||
push rax
|
||||
push rdi
|
||||
push rdx
|
||||
push rbx
|
||||
push rsi
|
||||
push rcx ; col
|
||||
|
||||
mov rsi, JSAMPROW [rsi] ; inptr0
|
||||
mov rbx, JSAMPROW [rbx] ; inptr1
|
||||
mov rdx, JSAMPROW [rdx] ; inptr2
|
||||
mov rdi, JSAMPROW [rdi] ; outptr
|
||||
.columnloop:
|
||||
|
||||
vmovdqu ymm5, YMMWORD [rbx]
|
||||
vmovdqu ymm1, YMMWORD [rdx]
|
||||
|
||||
vpcmpeqw ymm0, ymm0, ymm0
|
||||
vpcmpeqw ymm7, ymm7, ymm7
|
||||
vpsrlw ymm0, ymm0, BYTE_BIT
|
||||
vpsllw ymm7, ymm7, 7
|
||||
|
||||
vpand ymm4, ymm0, ymm5
|
||||
vpsrlw ymm5, ymm5, BYTE_BIT
|
||||
vpand ymm0, ymm0, ymm1
|
||||
vpsrlw ymm1, ymm1, BYTE_BIT
|
||||
|
||||
vpaddw ymm2, ymm4, ymm7
|
||||
vpaddw ymm3, ymm5, ymm7
|
||||
vpaddw ymm6, ymm0, ymm7
|
||||
vpaddw ymm7, ymm1, ymm7
|
||||
|
||||
vpaddw ymm4, ymm2, ymm2
|
||||
vpaddw ymm5, ymm3, ymm3
|
||||
vpaddw ymm0, ymm6, ymm6
|
||||
vpaddw ymm1, ymm7, ymm7
|
||||
|
||||
vpmulhw ymm4, ymm4, [rel PW_MF0228]
|
||||
vpmulhw ymm5, ymm5, [rel PW_MF0228]
|
||||
vpmulhw ymm0, ymm0, [rel PW_F0402]
|
||||
vpmulhw ymm1, ymm1, [rel PW_F0402]
|
||||
|
||||
vpaddw ymm4, ymm4, [rel PW_ONE]
|
||||
vpaddw ymm5, ymm5, [rel PW_ONE]
|
||||
vpsraw ymm4, ymm4, 1
|
||||
vpsraw ymm5, ymm5, 1
|
||||
vpaddw ymm0, ymm0, [rel PW_ONE]
|
||||
vpaddw ymm1, ymm1, [rel PW_ONE]
|
||||
vpsraw ymm0, ymm0, 1
|
||||
vpsraw ymm1, ymm1, 1
|
||||
|
||||
vpaddw ymm4, ymm4, ymm2
|
||||
vpaddw ymm5, ymm5, ymm3
|
||||
vpaddw ymm4, ymm4, ymm2
|
||||
vpaddw ymm5, ymm5, ymm3
|
||||
vpaddw ymm0, ymm0, ymm6
|
||||
vpaddw ymm1, ymm1, ymm7
|
||||
|
||||
vmovdqa YMMWORD [wk(0)], ymm4
|
||||
vmovdqa YMMWORD [wk(1)], ymm5
|
||||
|
||||
vpunpckhwd ymm4, ymm2, ymm6
|
||||
vpunpcklwd ymm2, ymm2, ymm6
|
||||
vpmaddwd ymm2, ymm2, [rel PW_MF0344_F0285]
|
||||
vpmaddwd ymm4, ymm4, [rel PW_MF0344_F0285]
|
||||
vpunpckhwd ymm5, ymm3, ymm7
|
||||
vpunpcklwd ymm3, ymm3, ymm7
|
||||
vpmaddwd ymm3, ymm3, [rel PW_MF0344_F0285]
|
||||
vpmaddwd ymm5, ymm5, [rel PW_MF0344_F0285]
|
||||
|
||||
vpaddd ymm2, ymm2, [rel PD_ONEHALF]
|
||||
vpaddd ymm4, ymm4, [rel PD_ONEHALF]
|
||||
vpsrad ymm2, ymm2, SCALEBITS
|
||||
vpsrad ymm4, ymm4, SCALEBITS
|
||||
vpaddd ymm3, ymm3, [rel PD_ONEHALF]
|
||||
vpaddd ymm5, ymm5, [rel PD_ONEHALF]
|
||||
vpsrad ymm3, ymm3, SCALEBITS
|
||||
vpsrad ymm5, ymm5, SCALEBITS
|
||||
|
||||
vpackssdw ymm2, ymm2, ymm4
|
||||
vpackssdw ymm3, ymm3, ymm5
|
||||
vpsubw ymm2, ymm2, ymm6
|
||||
vpsubw ymm3, ymm3, ymm7
|
||||
|
||||
vmovdqu ymm5, YMMWORD [rsi]
|
||||
|
||||
vpcmpeqw ymm4, ymm4, ymm4
|
||||
vpsrlw ymm4, ymm4, BYTE_BIT
|
||||
vpand ymm4, ymm4, ymm5
|
||||
vpsrlw ymm5, ymm5, BYTE_BIT
|
||||
|
||||
vpaddw ymm0, ymm0, ymm4
|
||||
vpaddw ymm1, ymm1, ymm5
|
||||
vpackuswb ymm0, ymm0, ymm0
|
||||
vpackuswb ymm1, ymm1, ymm1
|
||||
|
||||
vpaddw ymm2, ymm2, ymm4
|
||||
vpaddw ymm3, ymm3, ymm5
|
||||
vpackuswb ymm2, ymm2, ymm2
|
||||
vpackuswb ymm3, ymm3, ymm3
|
||||
|
||||
vpaddw ymm4, ymm4, YMMWORD [wk(0)]
|
||||
vpaddw ymm5, ymm5, YMMWORD [wk(1)]
|
||||
vpackuswb ymm4, ymm4, ymm4
|
||||
vpackuswb ymm5, ymm5, ymm5
|
||||
|
||||
%if RGB_PIXELSIZE == 3 ; ---------------
|
||||
|
||||
vpunpcklbw ymmA, ymmA, ymmC
|
||||
vpunpcklbw ymmE, ymmE, ymmB
|
||||
vpunpcklbw ymmD, ymmD, ymmF
|
||||
|
||||
vpsrldq ymmH, ymmA, 2
|
||||
vpunpckhwd ymmG, ymmA, ymmE
|
||||
vpunpcklwd ymmA, ymmA, ymmE
|
||||
|
||||
vpsrldq ymmE, ymmE, 2
|
||||
|
||||
vmovdqa ymmC, ymmD
|
||||
vpsrldq ymmB, ymmD, 2
|
||||
vpunpckhwd ymmC, ymmD, ymmH
|
||||
vpunpcklwd ymmD, ymmD, ymmH
|
||||
|
||||
vpunpckhwd ymmF, ymmE, ymmB
|
||||
vpunpcklwd ymmE, ymmE, ymmB
|
||||
|
||||
vpshufd ymmH, ymmA, 0x4E
|
||||
vpunpckldq ymmA, ymmA, ymmD
|
||||
vpunpckhdq ymmD, ymmD, ymmE
|
||||
vpunpckldq ymmE, ymmE, ymmH
|
||||
|
||||
vpshufd ymmH, ymmG, 0x4E
|
||||
vpunpckldq ymmG, ymmG, ymmC
|
||||
vpunpckhdq ymmC, ymmC, ymmF
|
||||
vpunpckldq ymmF, ymmF, ymmH
|
||||
|
||||
vpunpcklqdq ymmH, ymmA, ymmE
|
||||
vpunpcklqdq ymmG, ymmD, ymmG
|
||||
vpunpcklqdq ymmC, ymmF, ymmC
|
||||
|
||||
vperm2i128 ymmA, ymmH, ymmG, 0x20
|
||||
vperm2i128 ymmD, ymmC, ymmH, 0x30
|
||||
vperm2i128 ymmF, ymmG, ymmC, 0x31
|
||||
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jb short .column_st64
|
||||
|
||||
test rdi, SIZEOF_YMMWORD-1
|
||||
jnz short .out1
|
||||
; --(aligned)-------------------
|
||||
vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
|
||||
.out0:
|
||||
add rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_YMMWORD
|
||||
jz near .nextrow
|
||||
|
||||
add rsi, byte SIZEOF_YMMWORD ; inptr0
|
||||
add rbx, byte SIZEOF_YMMWORD ; inptr1
|
||||
add rdx, byte SIZEOF_YMMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
|
||||
.column_st64:
|
||||
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
|
||||
cmp rcx, byte 2*SIZEOF_YMMWORD
|
||||
jb short .column_st32
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
add rdi, byte 2*SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmF
|
||||
sub rcx, byte 2*SIZEOF_YMMWORD
|
||||
jmp short .column_st31
|
||||
.column_st32:
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jb short .column_st31
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
add rdi, byte SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmD
|
||||
sub rcx, byte SIZEOF_YMMWORD
|
||||
jmp short .column_st31
|
||||
.column_st31:
|
||||
cmp rcx, byte SIZEOF_XMMWORD
|
||||
jb short .column_st15
|
||||
vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
vperm2i128 ymmA, ymmA, ymmA, 1
|
||||
sub rcx, byte SIZEOF_XMMWORD
|
||||
.column_st15:
|
||||
; Store the lower 8 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp rcx, byte SIZEOF_MMWORD
|
||||
jb short .column_st7
|
||||
vmovq XMM_MMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_MMWORD
|
||||
sub rcx, byte SIZEOF_MMWORD
|
||||
vpsrldq xmmA, xmmA, SIZEOF_MMWORD
|
||||
.column_st7:
|
||||
; Store the lower 4 bytes of xmmA to the output when it has enough
|
||||
; space.
|
||||
cmp rcx, byte SIZEOF_DWORD
|
||||
jb short .column_st3
|
||||
vmovd XMM_DWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_DWORD
|
||||
sub rcx, byte SIZEOF_DWORD
|
||||
vpsrldq xmmA, xmmA, SIZEOF_DWORD
|
||||
.column_st3:
|
||||
; Store the lower 2 bytes of rax to the output when it has enough
|
||||
; space.
|
||||
vmovd eax, xmmA
|
||||
cmp rcx, byte SIZEOF_WORD
|
||||
jb short .column_st1
|
||||
mov WORD [rdi], ax
|
||||
add rdi, byte SIZEOF_WORD
|
||||
sub rcx, byte SIZEOF_WORD
|
||||
shr rax, 16
|
||||
.column_st1:
|
||||
; Store the lower 1 byte of rax to the output when it has enough
|
||||
; space.
|
||||
test rcx, rcx
|
||||
jz short .nextrow
|
||||
mov BYTE [rdi], al
|
||||
|
||||
%else ; RGB_PIXELSIZE == 4 ; -----------
|
||||
|
||||
%ifdef RGBX_FILLER_0XFF
|
||||
vpcmpeqb ymm6, ymm6, ymm6
|
||||
vpcmpeqb ymm7, ymm7, ymm7
|
||||
%else
|
||||
vpxor ymm6, ymm6, ymm6
|
||||
vpxor ymm7, ymm7, ymm7
|
||||
%endif
|
||||
|
||||
vpunpcklbw ymmA, ymmA, ymmC
|
||||
vpunpcklbw ymmE, ymmE, ymmG
|
||||
vpunpcklbw ymmB, ymmB, ymmD
|
||||
vpunpcklbw ymmF, ymmF, ymmH
|
||||
|
||||
vpunpckhwd ymmC, ymmA, ymmE
|
||||
vpunpcklwd ymmA, ymmA, ymmE
|
||||
vpunpckhwd ymmG, ymmB, ymmF
|
||||
vpunpcklwd ymmB, ymmB, ymmF
|
||||
|
||||
vpunpckhdq ymmE, ymmA, ymmB
|
||||
vpunpckldq ymmB, ymmA, ymmB
|
||||
vpunpckhdq ymmF, ymmC, ymmG
|
||||
vpunpckldq ymmG, ymmC, ymmG
|
||||
|
||||
|
||||
vperm2i128 ymmA, ymmB, ymmE, 0x20
|
||||
vperm2i128 ymmD, ymmG, ymmF, 0x20
|
||||
vperm2i128 ymmC, ymmB, ymmE, 0x31
|
||||
vperm2i128 ymmH, ymmG, ymmF, 0x31
|
||||
|
||||
cmp rcx, byte SIZEOF_YMMWORD
|
||||
jb short .column_st64
|
||||
|
||||
test rdi, SIZEOF_YMMWORD-1
|
||||
jnz short .out1
|
||||
; --(aligned)-------------------
|
||||
vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
|
||||
vmovntdq YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
|
||||
jmp short .out0
|
||||
.out1: ; --(unaligned)-----------------
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
|
||||
vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
|
||||
.out0:
|
||||
add rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_YMMWORD
|
||||
jz near .nextrow
|
||||
|
||||
add rsi, byte SIZEOF_YMMWORD ; inptr0
|
||||
add rbx, byte SIZEOF_YMMWORD ; inptr1
|
||||
add rdx, byte SIZEOF_YMMWORD ; inptr2
|
||||
jmp near .columnloop
|
||||
|
||||
.column_st64:
|
||||
cmp rcx, byte SIZEOF_YMMWORD/2
|
||||
jb short .column_st32
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
|
||||
add rdi, byte 2*SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmC
|
||||
vmovdqa ymmD, ymmH
|
||||
sub rcx, byte SIZEOF_YMMWORD/2
|
||||
.column_st32:
|
||||
cmp rcx, byte SIZEOF_YMMWORD/4
|
||||
jb short .column_st16
|
||||
vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
|
||||
add rdi, byte SIZEOF_YMMWORD ; outptr
|
||||
vmovdqa ymmA, ymmD
|
||||
sub rcx, byte SIZEOF_YMMWORD/4
|
||||
.column_st16:
|
||||
cmp rcx, byte SIZEOF_YMMWORD/8
|
||||
jb short .column_st15
|
||||
vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
||||
vperm2i128 ymmA, ymmA, ymmA, 1
|
||||
add rdi, byte SIZEOF_XMMWORD ; outptr
|
||||
sub rcx, byte SIZEOF_YMMWORD/8
|
||||
.column_st15:
|
||||
; Store two pixels (8 bytes) of ymmA to the output when it has enough
|
||||
; space.
|
||||
cmp rcx, byte SIZEOF_YMMWORD/16
|
||||
jb short .column_st7
|
||||
vmovq MMWORD [rdi], xmmA
|
||||
add rdi, byte SIZEOF_YMMWORD/16*4
|
||||
sub rcx, byte SIZEOF_YMMWORD/16
|
||||
vpsrldq xmmA, SIZEOF_YMMWORD/16*4
|
||||
.column_st7:
|
||||
; Store one pixel (4 bytes) of ymmA to the output when it has enough
|
||||
; space.
|
||||
test rcx, rcx
|
||||
jz short .nextrow
|
||||
vmovd XMM_DWORD [rdi], xmmA
|
||||
|
||||
%endif ; RGB_PIXELSIZE ; ---------------
|
||||
|
||||
.nextrow:
|
||||
pop rcx
|
||||
pop rsi
|
||||
pop rbx
|
||||
pop rdx
|
||||
pop rdi
|
||||
pop rax
|
||||
|
||||
add rsi, byte SIZEOF_JSAMPROW
|
||||
add rbx, byte SIZEOF_JSAMPROW
|
||||
add rdx, byte SIZEOF_JSAMPROW
|
||||
add rdi, byte SIZEOF_JSAMPROW ; output_buf
|
||||
dec rax ; num_rows
|
||||
jg near .rowloop
|
||||
|
||||
sfence ; flush the write buffer
|
||||
|
||||
.return:
|
||||
pop rbx
|
||||
uncollect_args 5
|
||||
mov rsp, rbp ; rsp <- aligned rbp
|
||||
pop rsp ; rsp <- original rbp
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
; For some reason, the OS X linker does not honor the request to align the
|
||||
; segment unless we do this.
|
||||
align 32
|
||||
120
simd/jdcolor-avx2-64.asm
Normal file
120
simd/jdcolor-avx2-64.asm
Normal file
@@ -0,0 +1,120 @@
|
||||
;
|
||||
; jdcolor.asm - colorspace conversion (64-bit AVX2)
|
||||
;
|
||||
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
; Copyright (C) 2009, 2016, D. R. Commander.
|
||||
; Copyright (C) 2015, Intel Corporation.
|
||||
;
|
||||
; Based on the x86 SIMD extension for IJG JPEG library
|
||||
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
;
|
||||
; This file should be assembled with NASM (Netwide Assembler),
|
||||
; can *not* be assembled with Microsoft's MASM or any compatible
|
||||
; assembler (including Borland's Turbo Assembler).
|
||||
; NASM is available from http://nasm.sourceforge.net/ or
|
||||
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
||||
;
|
||||
; [TAB8]
|
||||
|
||||
%include "jsimdext.inc"
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
|
||||
%define SCALEBITS 16
|
||||
|
||||
F_0_344 equ 22554 ; FIX(0.34414)
|
||||
F_0_714 equ 46802 ; FIX(0.71414)
|
||||
F_1_402 equ 91881 ; FIX(1.40200)
|
||||
F_1_772 equ 116130 ; FIX(1.77200)
|
||||
F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
|
||||
F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
|
||||
F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_CONST
|
||||
|
||||
alignz 32
|
||||
global EXTN(jconst_ycc_rgb_convert_avx2)
|
||||
|
||||
EXTN(jconst_ycc_rgb_convert_avx2):
|
||||
|
||||
PW_F0402 times 16 dw F_0_402
|
||||
PW_MF0228 times 16 dw -F_0_228
|
||||
PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
|
||||
PW_ONE times 16 dw 1
|
||||
PD_ONEHALF times 8 dd 1 << (SCALEBITS-1)
|
||||
|
||||
alignz 32
|
||||
|
||||
; --------------------------------------------------------------------------
|
||||
SECTION SEG_TEXT
|
||||
BITS 64
|
||||
|
||||
%include "jdcolext-avx2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGB_RED
|
||||
%define RGB_GREEN EXT_RGB_GREEN
|
||||
%define RGB_BLUE EXT_RGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2
|
||||
%include "jdcolext-avx2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_RGBX_RED
|
||||
%define RGB_GREEN EXT_RGBX_GREEN
|
||||
%define RGB_BLUE EXT_RGBX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2
|
||||
%include "jdcolext-avx2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGR_RED
|
||||
%define RGB_GREEN EXT_BGR_GREEN
|
||||
%define RGB_BLUE EXT_BGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2
|
||||
%include "jdcolext-avx2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_BGRX_RED
|
||||
%define RGB_GREEN EXT_BGRX_GREEN
|
||||
%define RGB_BLUE EXT_BGRX_BLUE
|
||||
%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2
|
||||
%include "jdcolext-avx2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XBGR_RED
|
||||
%define RGB_GREEN EXT_XBGR_GREEN
|
||||
%define RGB_BLUE EXT_XBGR_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2
|
||||
%include "jdcolext-avx2-64.asm"
|
||||
|
||||
%undef RGB_RED
|
||||
%undef RGB_GREEN
|
||||
%undef RGB_BLUE
|
||||
%undef RGB_PIXELSIZE
|
||||
%define RGB_RED EXT_XRGB_RED
|
||||
%define RGB_GREEN EXT_XRGB_GREEN
|
||||
%define RGB_BLUE EXT_XRGB_BLUE
|
||||
%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
|
||||
%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2
|
||||
%include "jdcolext-avx2-64.asm"
|
||||
23
simd/jsimd.h
23
simd/jsimd.h
@@ -329,6 +329,29 @@ EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
|
||||
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows);
|
||||
|
||||
extern const int jconst_ycc_rgb_convert_avx2[];
|
||||
EXTERN(void) jsimd_ycc_rgb_convert_avx2
|
||||
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows);
|
||||
EXTERN(void) jsimd_ycc_extrgb_convert_avx2
|
||||
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows);
|
||||
EXTERN(void) jsimd_ycc_extrgbx_convert_avx2
|
||||
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows);
|
||||
EXTERN(void) jsimd_ycc_extbgr_convert_avx2
|
||||
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows);
|
||||
EXTERN(void) jsimd_ycc_extbgrx_convert_avx2
|
||||
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows);
|
||||
EXTERN(void) jsimd_ycc_extxbgr_convert_avx2
|
||||
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows);
|
||||
EXTERN(void) jsimd_ycc_extxrgb_convert_avx2
|
||||
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows);
|
||||
|
||||
EXTERN(void) jsimd_ycc_rgb_convert_neon
|
||||
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows);
|
||||
|
||||
@@ -123,6 +123,9 @@ jsimd_can_ycc_rgb (void)
|
||||
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
|
||||
return 0;
|
||||
|
||||
if ((simd_support & JSIMD_AVX2) &&
|
||||
IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2))
|
||||
return 1;
|
||||
if ((simd_support & JSIMD_SSE2) &&
|
||||
IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
|
||||
return 1;
|
||||
@@ -239,37 +242,48 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows)
|
||||
{
|
||||
void (*avx2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
|
||||
void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
|
||||
|
||||
switch(cinfo->out_color_space) {
|
||||
case JCS_EXT_RGB:
|
||||
avx2fct=jsimd_ycc_extrgb_convert_avx2;
|
||||
sse2fct=jsimd_ycc_extrgb_convert_sse2;
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
case JCS_EXT_RGBA:
|
||||
avx2fct=jsimd_ycc_extrgbx_convert_avx2;
|
||||
sse2fct=jsimd_ycc_extrgbx_convert_sse2;
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
avx2fct=jsimd_ycc_extbgr_convert_avx2;
|
||||
sse2fct=jsimd_ycc_extbgr_convert_sse2;
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
case JCS_EXT_BGRA:
|
||||
avx2fct=jsimd_ycc_extbgrx_convert_avx2;
|
||||
sse2fct=jsimd_ycc_extbgrx_convert_sse2;
|
||||
break;
|
||||
case JCS_EXT_XBGR:
|
||||
case JCS_EXT_ABGR:
|
||||
avx2fct=jsimd_ycc_extxbgr_convert_avx2;
|
||||
sse2fct=jsimd_ycc_extxbgr_convert_sse2;
|
||||
break;
|
||||
case JCS_EXT_XRGB:
|
||||
case JCS_EXT_ARGB:
|
||||
avx2fct=jsimd_ycc_extxrgb_convert_avx2;
|
||||
sse2fct=jsimd_ycc_extxrgb_convert_sse2;
|
||||
break;
|
||||
default:
|
||||
avx2fct=jsimd_ycc_rgb_convert_avx2;
|
||||
sse2fct=jsimd_ycc_rgb_convert_sse2;
|
||||
break;
|
||||
}
|
||||
|
||||
sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
|
||||
if (simd_support & JSIMD_AVX2)
|
||||
avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
|
||||
else
|
||||
sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
|
||||
Reference in New Issue
Block a user