diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt index 97caf51d..492cbfe8 100755 --- a/simd/CMakeLists.txt +++ b/simd/CMakeLists.txt @@ -25,7 +25,8 @@ if(SIMD_X86_64) jchuff-sse2-64 jcsample-sse2-64 jdcolor-sse2-64 jdmerge-sse2-64 jdsample-sse2-64 jfdctfst-sse2-64 jfdctint-sse2-64 jidctflt-sse2-64 jidctfst-sse2-64 jidctint-sse2-64 jidctred-sse2-64 jquantf-sse2-64 - jquanti-sse2-64 jccolor-avx2-64 jcgray-avx2-64 jcsample-avx2-64) + jquanti-sse2-64 jccolor-avx2-64 jcgray-avx2-64 jcsample-avx2-64 + jdcolor-avx2-64) message(STATUS "Building x86_64 SIMD extensions") else() set(SIMD_BASENAMES jsimdcpu jfdctflt-3dn jidctflt-3dn jquant-3dn jccolor-mmx diff --git a/simd/Makefile.am b/simd/Makefile.am index edf29ff4..214f7012 100644 --- a/simd/Makefile.am +++ b/simd/Makefile.am @@ -19,7 +19,8 @@ libsimd_la_SOURCES = jsimd_x86_64.c jsimd.h jsimdcfg.inc.h jsimdext.inc \ jdsample-sse2-64.asm jfdctfst-sse2-64.asm jfdctint-sse2-64.asm \ jidctflt-sse2-64.asm jidctfst-sse2-64.asm jidctint-sse2-64.asm \ jidctred-sse2-64.asm jquantf-sse2-64.asm jquanti-sse2-64.asm \ - jccolor-avx2-64.asm jcgray-avx2-64.asm jcsample-avx2-64.asm + jccolor-avx2-64.asm jcgray-avx2-64.asm jcsample-avx2-64.asm \ + jdcolor-avx2-64.asm jccolor-sse2-64.lo: jccolext-sse2-64.asm jcgray-sse2-64.lo: jcgryext-sse2-64.asm @@ -27,6 +28,7 @@ jdcolor-sse2-64.lo: jdcolext-sse2-64.asm jdmerge-sse2-64.lo: jdmrgext-sse2-64.asm jccolor-avx2-64.lo: jccolext-avx2-64.asm jcgray-avx2-64.lo: jcgryext-avx2-64.asm +jdcolor-avx2-64.lo: jdcolext-avx2-64.asm endif diff --git a/simd/jdcolext-avx2-64.asm b/simd/jdcolext-avx2-64.asm new file mode 100644 index 00000000..28278bfc --- /dev/null +++ b/simd/jdcolext-avx2-64.asm @@ -0,0 +1,429 @@ +; +; jdcolext.asm - colorspace conversion (64-bit AVX2) +; +; Copyright 2009, 2012 Pierre Ossman for Cendio AB +; Copyright (C) 2009, 2012, 2016, D. R. Commander. +; Copyright (C) 2015, Intel Corporation. 
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+; Convert num_rows rows of planar Y/Cb/Cr samples, starting at input_row, to
+; packed RGB_PIXELSIZE-byte pixels (SIZEOF_YMMWORD pixels per loop iteration).
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2 (JDIMENSION out_width,
+;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                             JSAMPARRAY output_buf, int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i)   rbp-(WK_NUM-(i))*SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    global      EXTN(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+    push        rbp
+    mov         rax, rsp                ; rax = rsp right after saving rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+    mov         [rsp], rax              ; keep the unaligned rsp for the epilogue
+    mov         rbp, rsp                ; rbp = aligned frame base
+    lea         rsp, [wk(0)]            ; reserve WK_NUM ymmwords below rbp
+    collect_args 5                      ; macro defined outside this file; presumably loads r10-r14 as listed above
+    push        rbx
+
+    mov         ecx, r10d               ; num_cols
+    test        rcx, rcx
+    jz          near .return            ; nothing to do for zero-width output
+
+    push        rcx                     ; rcx is reused for input_row below
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] ; rsi = input_buf[0] (Y)
+    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] ; rbx = input_buf[1] (Cb)
+    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] ; rdx = input_buf[2] (Cr)
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW] ; advance each plane to input_row
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx                     ; rcx = num_cols again
+
+    mov         rdi, r13
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rax                     ; row-level state is restored at .nextrow
+    push        rdi
+    push        rdx
+    push        rbx
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsi, JSAMPROW [rsi]     ; inptr0
+    mov         rbx, JSAMPROW [rbx]     ; inptr1
+    mov         rdx, JSAMPROW [rdx]     ; inptr2
+    mov         rdi, JSAMPROW [rdi]     ; outptr
+.columnloop:
+    ; In the comments below, E/O = even-/odd-indexed samples held as words.
+    vmovdqu     ymm5, YMMWORD [rbx]     ; ymm5 = 32 Cb samples
+    vmovdqu     ymm1, YMMWORD [rdx]     ; ymm1 = 32 Cr samples
+
+    vpcmpeqw    ymm0, ymm0, ymm0        ; ymm0 = all ones
+    vpcmpeqw    ymm7, ymm7, ymm7        ; ymm7 = all ones
+    vpsrlw      ymm0, ymm0, BYTE_BIT    ; ymm0 = low-byte mask in every word
+    vpsllw      ymm7, ymm7, 7           ; ymm7 = 0xFF80 (-128) in every word
+
+    vpand       ymm4, ymm0, ymm5        ; ymm4 = CbE
+    vpsrlw      ymm5, ymm5, BYTE_BIT    ; ymm5 = CbO
+    vpand       ymm0, ymm0, ymm1        ; ymm0 = CrE
+    vpsrlw      ymm1, ymm1, BYTE_BIT    ; ymm1 = CrO
+
+    vpaddw      ymm2, ymm4, ymm7        ; ymm2 = CbE - 128
+    vpaddw      ymm3, ymm5, ymm7        ; ymm3 = CbO - 128
+    vpaddw      ymm6, ymm0, ymm7        ; ymm6 = CrE - 128
+    vpaddw      ymm7, ymm1, ymm7        ; ymm7 = CrO - 128
+
+    vpaddw      ymm4, ymm2, ymm2        ; ymm4 = 2*CbE (centred values from here on)
+    vpaddw      ymm5, ymm3, ymm3        ; ymm5 = 2*CbO
+    vpaddw      ymm0, ymm6, ymm6        ; ymm0 = 2*CrE
+    vpaddw      ymm1, ymm7, ymm7        ; ymm1 = 2*CrO
+
+    vpmulhw     ymm4, ymm4, [rel PW_MF0228] ; ymm4 = (2*CbE * -FIX(0.22800)) >> 16
+    vpmulhw     ymm5, ymm5, [rel PW_MF0228] ; ymm5 = (2*CbO * -FIX(0.22800)) >> 16
+    vpmulhw     ymm0, ymm0, [rel PW_F0402]  ; ymm0 = (2*CrE * FIX(0.40200)) >> 16
+    vpmulhw     ymm1, ymm1, [rel PW_F0402]  ; ymm1 = (2*CrO * FIX(0.40200)) >> 16
+
+    vpaddw      ymm4, ymm4, [rel PW_ONE]    ; +1 then >>1 below: rounded halving
+    vpaddw      ymm5, ymm5, [rel PW_ONE]
+    vpsraw      ymm4, ymm4, 1           ; ymm4 = CbE * -0.228
+    vpsraw      ymm5, ymm5, 1           ; ymm5 = CbO * -0.228
+    vpaddw      ymm0, ymm0, [rel PW_ONE]
+    vpaddw      ymm1, ymm1, [rel PW_ONE]
+    vpsraw      ymm0, ymm0, 1           ; ymm0 = CrE * 0.402
+    vpsraw      ymm1, ymm1, 1           ; ymm1 = CrO * 0.402
+
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm5, ymm5, ymm3
+    vpaddw      ymm4, ymm4, ymm2        ; ymm4 = CbE * 1.772 = (B-Y)E
+    vpaddw      ymm5, ymm5, ymm3        ; ymm5 = CbO * 1.772 = (B-Y)O
+    vpaddw      ymm0, ymm0, ymm6        ; ymm0 = CrE * 1.402 = (R-Y)E
+    vpaddw      ymm1, ymm1, ymm7        ; ymm1 = CrO * 1.402 = (R-Y)O
+
+    vmovdqa     YMMWORD [wk(0)], ymm4   ; wk(0) = (B-Y)E
+    vmovdqa     YMMWORD [wk(1)], ymm5   ; wk(1) = (B-Y)O
+
+    vpunpckhwd  ymm4, ymm2, ymm6        ; pair CbE/CrE words for vpmaddwd
+    vpunpcklwd  ymm2, ymm2, ymm6
+    vpmaddwd    ymm2, ymm2, [rel PW_MF0344_F0285] ; CbE*-FIX(0.344) + CrE*FIX(0.285)
+    vpmaddwd    ymm4, ymm4, [rel PW_MF0344_F0285]
+    vpunpckhwd  ymm5, ymm3, ymm7        ; pair CbO/CrO words for vpmaddwd
+    vpunpcklwd  ymm3, ymm3, ymm7
+    vpmaddwd    ymm3, ymm3, [rel PW_MF0344_F0285] ; CbO*-FIX(0.344) + CrO*FIX(0.285)
+    vpmaddwd    ymm5, ymm5, [rel PW_MF0344_F0285]
+
+    vpaddd      ymm2, ymm2, [rel PD_ONEHALF] ; round, then descale by SCALEBITS
+    vpaddd      ymm4, ymm4, [rel PD_ONEHALF]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm4, ymm4, SCALEBITS
+    vpaddd      ymm3, ymm3, [rel PD_ONEHALF]
+    vpaddd      ymm5, ymm5, [rel PD_ONEHALF]
+    vpsrad      ymm3, ymm3, SCALEBITS
+    vpsrad      ymm5, ymm5, SCALEBITS
+
+    vpackssdw   ymm2, ymm2, ymm4        ; ymm2 = CbE*-0.344 + CrE*0.285 (words)
+    vpackssdw   ymm3, ymm3, ymm5        ; ymm3 = CbO*-0.344 + CrO*0.285 (words)
+    vpsubw      ymm2, ymm2, ymm6        ; ymm2 = CbE*-0.344 + CrE*-0.714 = (G-Y)E
+    vpsubw      ymm3, ymm3, ymm7        ; ymm3 = CbO*-0.344 + CrO*-0.714 = (G-Y)O
+
+    vmovdqu     ymm5, YMMWORD [rsi]     ; ymm5 = 32 Y samples
+
+    vpcmpeqw    ymm4, ymm4, ymm4
+    vpsrlw      ymm4, ymm4, BYTE_BIT    ; ymm4 = low-byte mask in every word
+    vpand       ymm4, ymm4, ymm5        ; ymm4 = YE
+    vpsrlw      ymm5, ymm5, BYTE_BIT    ; ymm5 = YO
+
+    vpaddw      ymm0, ymm0, ymm4        ; ymm0 = (R-Y)E + YE
+    vpaddw      ymm1, ymm1, ymm5        ; ymm1 = (R-Y)O + YO
+    vpackuswb   ymm0, ymm0, ymm0        ; saturate R to unsigned bytes
+    vpackuswb   ymm1, ymm1, ymm1
+
+    vpaddw      ymm2, ymm2, ymm4        ; ymm2 = (G-Y)E + YE
+    vpaddw      ymm3, ymm3, ymm5        ; ymm3 = (G-Y)O + YO
+    vpackuswb   ymm2, ymm2, ymm2        ; saturate G to unsigned bytes
+    vpackuswb   ymm3, ymm3, ymm3
+
+    vpaddw      ymm4, ymm4, YMMWORD [wk(0)] ; ymm4 = YE + (B-Y)E
+    vpaddw      ymm5, ymm5, YMMWORD [wk(1)] ; ymm5 = YO + (B-Y)O
+    vpackuswb   ymm4, ymm4, ymm4        ; saturate B to unsigned bytes
+    vpackuswb   ymm5, ymm5, ymm5
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+    ; ymmA..ymmH are register aliases, presumably set up by jcolsamp.inc from RGB_RED/GREEN/BLUE.
+    vpunpcklbw  ymmA, ymmA, ymmC
+    vpunpcklbw  ymmE, ymmE, ymmB
+    vpunpcklbw  ymmD, ymmD, ymmF
+
+    vpsrldq     ymmH, ymmA, 2
+    vpunpckhwd  ymmG, ymmA, ymmE
+    vpunpcklwd  ymmA, ymmA, ymmE
+
+    vpsrldq     ymmE, ymmE, 2
+
+    vmovdqa     ymmC, ymmD              ; NOTE(review): ymmC is rewritten two lines below; looks redundant - confirm
+    vpsrldq     ymmB, ymmD, 2
+    vpunpckhwd  ymmC, ymmD, ymmH
+    vpunpcklwd  ymmD, ymmD, ymmH
+
+    vpunpckhwd  ymmF, ymmE, ymmB
+    vpunpcklwd  ymmE, ymmE, ymmB
+
+    vpshufd     ymmH, ymmA, 0x4E
+    vpunpckldq  ymmA, ymmA, ymmD
+    vpunpckhdq  ymmD, ymmD, ymmE
+    vpunpckldq  ymmE, ymmE, ymmH
+
+    vpshufd     ymmH, ymmG, 0x4E
+    vpunpckldq  ymmG, ymmG, ymmC
+    vpunpckhdq  ymmC, ymmC, ymmF
+    vpunpckldq  ymmF, ymmF, ymmH
+
+    vpunpcklqdq ymmH, ymmA, ymmE
+    vpunpcklqdq ymmG, ymmD, ymmG
+    vpunpcklqdq ymmC, ymmF, ymmC
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA/ymmD/ymmF = the three output ymmwords
+    vperm2i128  ymmD, ymmC, ymmH, 0x30
+    vperm2i128  ymmF, ymmG, ymmC, 0x31
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64      ; fewer than SIZEOF_YMMWORD pixels left: partial store
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1: ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD ; inptr0
+    add         rbx, byte SIZEOF_YMMWORD ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    lea         rcx, [rcx+rcx*2]        ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD ; outptr
+    vmovdqa     ymmA, ymmF              ; remaining bytes now come from ymmA
+    sub         rcx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD ; outptr
+    vmovdqa     ymmA, ymmD              ; remaining bytes now come from ymmA
+    sub         rcx, byte SIZEOF_YMMWORD ; falls through to .column_st31
+.column_st31:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1     ; swap 128-bit halves: next bytes move into xmmA
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         WORD [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    mov         BYTE [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; filler bytes = 0xFF
+    vpcmpeqb    ymm7, ymm7, ymm7
+%else
+    vpxor       ymm6, ymm6, ymm6        ; filler bytes = 0x00
+    vpxor       ymm7, ymm7, ymm7
+%endif
+    ; ymmA..ymmH are register aliases, presumably set up by jcolsamp.inc from RGB_RED/GREEN/BLUE.
+    vpunpcklbw  ymmA, ymmA, ymmC
+    vpunpcklbw  ymmE, ymmE, ymmG
+    vpunpcklbw  ymmB, ymmB, ymmD
+    vpunpcklbw  ymmF, ymmF, ymmH
+
+    vpunpckhwd  ymmC, ymmA, ymmE
+    vpunpcklwd  ymmA, ymmA, ymmE
+    vpunpckhwd  ymmG, ymmB, ymmF
+    vpunpcklwd  ymmB, ymmB, ymmF
+
+    vpunpckhdq  ymmE, ymmA, ymmB
+    vpunpckldq  ymmB, ymmA, ymmB
+    vpunpckhdq  ymmF, ymmC, ymmG
+    vpunpckldq  ymmG, ymmC, ymmG
+
+    ; ymmA/ymmD/ymmC/ymmH = the four output ymmwords, in store order
+    vperm2i128  ymmA, ymmB, ymmE, 0x20
+    vperm2i128  ymmD, ymmG, ymmF, 0x20
+    vperm2i128  ymmC, ymmB, ymmE, 0x31
+    vperm2i128  ymmH, ymmG, ymmF, 0x31
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64      ; fewer than SIZEOF_YMMWORD pixels left: partial store
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1: ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD ; inptr0
+    add         rbx, byte SIZEOF_YMMWORD ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    cmp         rcx, byte SIZEOF_YMMWORD/2 ; rcx counts pixels in this branch
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         rcx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    vperm2i128  ymmA, ymmA, ymmA, 1     ; swap 128-bit halves: next pixels move into xmmA
+    add         rdi, byte SIZEOF_XMMWORD ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_YMMWORD/16*4
+    sub         rcx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    vmovd       XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+    pop         rcx
+    pop         rsi
+    pop         rbx
+    pop         rdx
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    add         rdi, byte SIZEOF_JSAMPROW ; output_buf
+    dec         rax                     ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    vzeroupper                          ; clear upper YMM halves: avoids AVX->SSE transition stalls after return
+    pop         rbx
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- value saved in the prologue (rsp after push rbp)
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/jdcolor-avx2-64.asm b/simd/jdcolor-avx2-64.asm
new file mode 100644
index 00000000..b9ca499f
--- /dev/null
+++ b/simd/jdcolor-avx2-64.asm
@@ -0,0 +1,120 @@
+;
+; jdcolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+; Fixed-point YCbCr->RGB coefficients: FIX(x) = x * 2^SCALEBITS, rounded.
+%define SCALEBITS       16
+
+F_0_344 equ      22554                  ; FIX(0.34414)
+F_0_714 equ      46802                  ; FIX(0.71414)
+F_1_402 equ      91881                  ; FIX(1.40200)
+F_1_772 equ     116130                  ; FIX(1.77200)
+F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
+F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
+F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    global      EXTN(jconst_ycc_rgb_convert_avx2)
+
+EXTN(jconst_ycc_rgb_convert_avx2):
+; Each table below fills one 32-byte ymmword (16 words or 8 dwords).
+PW_F0402        times 16 dw  F_0_402    ; Cr multiplier (1.402 - 1)
+PW_MF0228       times 16 dw -F_0_228    ; Cb multiplier -(2 - 1.772)
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285 ; interleaved Cb/Cr word pairs for vpmaddwd
+PW_ONE          times 16 dw  1          ; rounding term added before the >>1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS-1) ; rounding term added before the >>SCALEBITS
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+; Default instantiation: uses the RGB_* layout in effect after jsimdext.inc.
+%include "jdcolext-avx2-64.asm"
+; EXT_RGB layout, emitted as jsimd_ycc_extrgb_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2-64.asm"
+; EXT_RGBX layout, emitted as jsimd_ycc_extrgbx_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2-64.asm"
+; EXT_BGR layout, emitted as jsimd_ycc_extbgr_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2-64.asm"
+; EXT_BGRX layout, emitted as jsimd_ycc_extbgrx_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2-64.asm"
+; EXT_XBGR layout, emitted as jsimd_ycc_extxbgr_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2-64.asm"
+; EXT_XRGB layout, emitted as jsimd_ycc_extxrgb_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2-64.asm"
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 9869aec2..ec643833 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -329,6 +329,29 @@ EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
         (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
          JSAMPARRAY output_buf, int num_rows);
 
+extern const int jconst_ycc_rgb_convert_avx2[];
+EXTERN(void) jsimd_ycc_rgb_convert_avx2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_avx2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_avx2
+        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+         JSAMPARRAY
output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgr_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extbgrx_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxbgr_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); +EXTERN(void) jsimd_ycc_extxrgb_convert_avx2 + (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows); + EXTERN(void) jsimd_ycc_rgb_convert_neon (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows); diff --git a/simd/jsimd_x86_64.c b/simd/jsimd_x86_64.c index ddad42b9..73723403 100644 --- a/simd/jsimd_x86_64.c +++ b/simd/jsimd_x86_64.c @@ -123,6 +123,9 @@ jsimd_can_ycc_rgb (void) if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) return 0; + if ((simd_support & JSIMD_AVX2) && + IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2)) + return 1; if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2)) return 1; @@ -239,37 +242,48 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows) { + void (*avx2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); switch(cinfo->out_color_space) { case JCS_EXT_RGB: + avx2fct=jsimd_ycc_extrgb_convert_avx2; sse2fct=jsimd_ycc_extrgb_convert_sse2; break; case JCS_EXT_RGBX: case JCS_EXT_RGBA: + avx2fct=jsimd_ycc_extrgbx_convert_avx2; sse2fct=jsimd_ycc_extrgbx_convert_sse2; break; case JCS_EXT_BGR: + avx2fct=jsimd_ycc_extbgr_convert_avx2; sse2fct=jsimd_ycc_extbgr_convert_sse2; break; case JCS_EXT_BGRX: case JCS_EXT_BGRA: + avx2fct=jsimd_ycc_extbgrx_convert_avx2; 
sse2fct=jsimd_ycc_extbgrx_convert_sse2; break; case JCS_EXT_XBGR: case JCS_EXT_ABGR: + avx2fct=jsimd_ycc_extxbgr_convert_avx2; sse2fct=jsimd_ycc_extxbgr_convert_sse2; break; case JCS_EXT_XRGB: case JCS_EXT_ARGB: + avx2fct=jsimd_ycc_extxrgb_convert_avx2; sse2fct=jsimd_ycc_extxrgb_convert_sse2; break; default: + avx2fct=jsimd_ycc_rgb_convert_avx2; sse2fct=jsimd_ycc_rgb_convert_sse2; break; } - sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); + if (simd_support & JSIMD_AVX2) + avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); + else + sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); } GLOBAL(void)