The IJG convention is to format copyright notices as: Copyright (C) YYYY, Owner. We try to maintain this convention for any code that is part of the libjpeg API library (with the exception of preserving the copyright notices from Cendio's code verbatim, since those predate libjpeg-turbo.) Note that the phrase "All Rights Reserved" is no longer necessary, since all Buenos Aires Convention signatories signed onto the Berne Convention in 2000. However, our convention is to retain this phrase for any files that have a self-contained copyright header but to leave it off of any files that refer to another file for conditions of distribution and use. For instance, all of the non-SIMD files in the libjpeg API library refer to README.ijg, and the copyright message in that file contains "All Rights Reserved", so it is unnecessary to add it to the individual files. The TurboJPEG code retains my preferred formatting convention for copyright notices, which is based on that of VirtualGL (where the TurboJPEG API originated.)
441 lines
17 KiB
NASM
441 lines
17 KiB
NASM
;
|
|
; jdcolext.asm - colorspace conversion (64-bit SSE2)
|
|
;
|
|
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
|
; Copyright (C) 2009, 2012, D. R. Commander.
|
|
;
|
|
; Based on the x86 SIMD extension for IJG JPEG library
|
|
; Copyright (C) 1999-2006, MIYASAKA Masaru.
|
|
; For conditions of distribution and use, see copyright notice in jsimdext.inc
|
|
;
|
|
; This file should be assembled with NASM (Netwide Assembler),
|
|
; can *not* be assembled with Microsoft's MASM or any compatible
|
|
; assembler (including Borland's Turbo Assembler).
|
|
; NASM is available from http://nasm.sourceforge.net/ or
|
|
; http://sourceforge.net/project/showfiles.php?group_id=6208
|
|
;
|
|
; [TAB8]
|
|
|
|
%include "jcolsamp.inc"
|
|
|
|
; --------------------------------------------------------------------------
|
|
;
|
|
; Convert some rows of samples to the output colorspace.
|
|
;
|
|
; GLOBAL(void)
|
|
; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
|
|
; JSAMPIMAGE input_buf, JDIMENSION input_row,
|
|
; JSAMPARRAY output_buf, int num_rows)
|
|
;
|
|
|
|
; r10 = JDIMENSION out_width
|
|
; r11 = JSAMPIMAGE input_buf
|
|
; r12 = JDIMENSION input_row
|
|
; r13 = JSAMPARRAY output_buf
|
|
; r14 = int num_rows
|
|
|
|
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
|
|
%define WK_NUM 2
|
|
|
|
align 16
|
|
global EXTN(jsimd_ycc_rgb_convert_sse2)
|
|
|
|
EXTN(jsimd_ycc_rgb_convert_sse2):
|
|
push rbp
|
|
mov rax,rsp ; rax = original rbp
|
|
sub rsp, byte 4
|
|
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
|
|
mov [rsp],rax
|
|
mov rbp,rsp ; rbp = aligned rbp
|
|
lea rsp, [wk(0)]
|
|
collect_args
|
|
push rbx
|
|
|
|
mov ecx, r10d ; num_cols
|
|
test rcx,rcx
|
|
jz near .return
|
|
|
|
push rcx
|
|
|
|
mov rdi, r11
|
|
mov ecx, r12d
|
|
mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
|
|
mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
|
|
mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
|
|
lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
|
|
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
|
|
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
|
|
|
|
pop rcx
|
|
|
|
mov rdi, r13
|
|
mov eax, r14d
|
|
test rax,rax
|
|
jle near .return
|
|
.rowloop:
|
|
push rax
|
|
push rdi
|
|
push rdx
|
|
push rbx
|
|
push rsi
|
|
push rcx ; col
|
|
|
|
mov rsi, JSAMPROW [rsi] ; inptr0
|
|
mov rbx, JSAMPROW [rbx] ; inptr1
|
|
mov rdx, JSAMPROW [rdx] ; inptr2
|
|
mov rdi, JSAMPROW [rdi] ; outptr
|
|
.columnloop:
|
|
|
|
movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
|
|
movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF)
|
|
|
|
pcmpeqw xmm4,xmm4
|
|
pcmpeqw xmm7,xmm7
|
|
psrlw xmm4,BYTE_BIT
|
|
psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
|
|
movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
|
|
|
|
pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
|
|
psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
|
|
pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
|
|
psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
|
|
|
|
paddw xmm4,xmm7
|
|
paddw xmm5,xmm7
|
|
paddw xmm0,xmm7
|
|
paddw xmm1,xmm7
|
|
|
|
; (Original)
|
|
; R = Y + 1.40200 * Cr
|
|
; G = Y - 0.34414 * Cb - 0.71414 * Cr
|
|
; B = Y + 1.77200 * Cb
|
|
;
|
|
; (This implementation)
|
|
; R = Y + 0.40200 * Cr + Cr
|
|
; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
|
|
; B = Y - 0.22800 * Cb + Cb + Cb
|
|
|
|
movdqa xmm2,xmm4 ; xmm2=CbE
|
|
movdqa xmm3,xmm5 ; xmm3=CbO
|
|
paddw xmm4,xmm4 ; xmm4=2*CbE
|
|
paddw xmm5,xmm5 ; xmm5=2*CbO
|
|
movdqa xmm6,xmm0 ; xmm6=CrE
|
|
movdqa xmm7,xmm1 ; xmm7=CrO
|
|
paddw xmm0,xmm0 ; xmm0=2*CrE
|
|
paddw xmm1,xmm1 ; xmm1=2*CrO
|
|
|
|
pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800))
|
|
pmulhw xmm5,[rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800))
|
|
pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200))
|
|
pmulhw xmm1,[rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200))
|
|
|
|
paddw xmm4,[rel PW_ONE]
|
|
paddw xmm5,[rel PW_ONE]
|
|
psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
|
|
psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
|
|
paddw xmm0,[rel PW_ONE]
|
|
paddw xmm1,[rel PW_ONE]
|
|
psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
|
|
psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
|
|
|
|
paddw xmm4,xmm2
|
|
paddw xmm5,xmm3
|
|
paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
|
|
paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
|
|
paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
|
|
paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
|
|
|
|
movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
|
|
movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
|
|
|
|
movdqa xmm4,xmm2
|
|
movdqa xmm5,xmm3
|
|
punpcklwd xmm2,xmm6
|
|
punpckhwd xmm4,xmm6
|
|
pmaddwd xmm2,[rel PW_MF0344_F0285]
|
|
pmaddwd xmm4,[rel PW_MF0344_F0285]
|
|
punpcklwd xmm3,xmm7
|
|
punpckhwd xmm5,xmm7
|
|
pmaddwd xmm3,[rel PW_MF0344_F0285]
|
|
pmaddwd xmm5,[rel PW_MF0344_F0285]
|
|
|
|
paddd xmm2,[rel PD_ONEHALF]
|
|
paddd xmm4,[rel PD_ONEHALF]
|
|
psrad xmm2,SCALEBITS
|
|
psrad xmm4,SCALEBITS
|
|
paddd xmm3,[rel PD_ONEHALF]
|
|
paddd xmm5,[rel PD_ONEHALF]
|
|
psrad xmm3,SCALEBITS
|
|
psrad xmm5,SCALEBITS
|
|
|
|
packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
|
|
packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
|
|
psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
|
|
psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
|
|
|
|
movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF)
|
|
|
|
pcmpeqw xmm4,xmm4
|
|
psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
|
|
pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
|
|
psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
|
|
|
|
paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
|
|
paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
|
|
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
|
|
packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
|
|
|
|
paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
|
|
paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
|
|
packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
|
|
packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
|
|
|
|
paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
|
|
paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
|
|
packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
|
|
packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
|
|
|
|
%if RGB_PIXELSIZE == 3 ; ---------------
|
|
|
|
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
|
|
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
|
|
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
|
; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
|
|
|
|
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
|
punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
|
|
punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
|
|
|
|
movdqa xmmG,xmmA
|
|
movdqa xmmH,xmmA
|
|
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
|
|
punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
|
|
|
|
psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
|
|
psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
|
|
|
|
movdqa xmmC,xmmD
|
|
movdqa xmmB,xmmD
|
|
punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
|
|
punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
|
|
|
|
psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
|
|
|
|
movdqa xmmF,xmmE
|
|
punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
|
|
punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
|
|
|
|
pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
|
|
movdqa xmmB,xmmE
|
|
punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
|
|
punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
|
|
punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
|
|
|
|
pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
|
|
movdqa xmmB,xmmF
|
|
punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
|
|
punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
|
|
punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
|
|
|
|
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
|
|
punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
|
|
punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
|
|
|
|
cmp rcx, byte SIZEOF_XMMWORD
|
|
jb short .column_st32
|
|
|
|
test rdi, SIZEOF_XMMWORD-1
|
|
jnz short .out1
|
|
; --(aligned)-------------------
|
|
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
|
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
|
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
|
jmp short .out0
|
|
.out1: ; --(unaligned)-----------------
|
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
|
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
|
|
.out0:
|
|
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
|
sub rcx, byte SIZEOF_XMMWORD
|
|
jz near .nextrow
|
|
|
|
add rsi, byte SIZEOF_XMMWORD ; inptr0
|
|
add rbx, byte SIZEOF_XMMWORD ; inptr1
|
|
add rdx, byte SIZEOF_XMMWORD ; inptr2
|
|
jmp near .columnloop
|
|
|
|
.column_st32:
|
|
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
|
|
cmp rcx, byte 2*SIZEOF_XMMWORD
|
|
jb short .column_st16
|
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
|
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
|
movdqa xmmA,xmmF
|
|
sub rcx, byte 2*SIZEOF_XMMWORD
|
|
jmp short .column_st15
|
|
.column_st16:
|
|
cmp rcx, byte SIZEOF_XMMWORD
|
|
jb short .column_st15
|
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
|
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
movdqa xmmA,xmmD
|
|
sub rcx, byte SIZEOF_XMMWORD
|
|
.column_st15:
|
|
; Store the lower 8 bytes of xmmA to the output when it has enough
|
|
; space.
|
|
cmp rcx, byte SIZEOF_MMWORD
|
|
jb short .column_st7
|
|
movq XMM_MMWORD [rdi], xmmA
|
|
add rdi, byte SIZEOF_MMWORD
|
|
sub rcx, byte SIZEOF_MMWORD
|
|
psrldq xmmA, SIZEOF_MMWORD
|
|
.column_st7:
|
|
; Store the lower 4 bytes of xmmA to the output when it has enough
|
|
; space.
|
|
cmp rcx, byte SIZEOF_DWORD
|
|
jb short .column_st3
|
|
movd XMM_DWORD [rdi], xmmA
|
|
add rdi, byte SIZEOF_DWORD
|
|
sub rcx, byte SIZEOF_DWORD
|
|
psrldq xmmA, SIZEOF_DWORD
|
|
.column_st3:
|
|
; Store the lower 2 bytes of rax to the output when it has enough
|
|
; space.
|
|
movd eax, xmmA
|
|
cmp rcx, byte SIZEOF_WORD
|
|
jb short .column_st1
|
|
mov WORD [rdi], ax
|
|
add rdi, byte SIZEOF_WORD
|
|
sub rcx, byte SIZEOF_WORD
|
|
shr rax, 16
|
|
.column_st1:
|
|
; Store the lower 1 byte of rax to the output when it has enough
|
|
; space.
|
|
test rcx, rcx
|
|
jz short .nextrow
|
|
mov BYTE [rdi], al
|
|
|
|
%else ; RGB_PIXELSIZE == 4 ; -----------
|
|
|
|
%ifdef RGBX_FILLER_0XFF
|
|
pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
|
|
pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
|
|
%else
|
|
pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
|
|
pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
|
|
%endif
|
|
; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
|
|
; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
|
|
; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
|
|
; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
|
|
|
|
punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
|
|
punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
|
|
punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
|
|
punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
|
|
|
|
movdqa xmmC,xmmA
|
|
punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
|
|
punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
|
|
movdqa xmmG,xmmB
|
|
punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
|
|
punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
|
|
|
|
movdqa xmmD,xmmA
|
|
punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
|
|
punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
|
|
movdqa xmmH,xmmC
|
|
punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
|
|
punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
|
|
|
|
cmp rcx, byte SIZEOF_XMMWORD
|
|
jb short .column_st32
|
|
|
|
test rdi, SIZEOF_XMMWORD-1
|
|
jnz short .out1
|
|
; --(aligned)-------------------
|
|
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
|
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
|
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
|
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
|
jmp short .out0
|
|
.out1: ; --(unaligned)-----------------
|
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
|
movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
|
|
movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
|
|
.out0:
|
|
add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
|
|
sub rcx, byte SIZEOF_XMMWORD
|
|
jz near .nextrow
|
|
|
|
add rsi, byte SIZEOF_XMMWORD ; inptr0
|
|
add rbx, byte SIZEOF_XMMWORD ; inptr1
|
|
add rdx, byte SIZEOF_XMMWORD ; inptr2
|
|
jmp near .columnloop
|
|
|
|
.column_st32:
|
|
cmp rcx, byte SIZEOF_XMMWORD/2
|
|
jb short .column_st16
|
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
|
movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
|
|
add rdi, byte 2*SIZEOF_XMMWORD ; outptr
|
|
movdqa xmmA,xmmC
|
|
movdqa xmmD,xmmH
|
|
sub rcx, byte SIZEOF_XMMWORD/2
|
|
.column_st16:
|
|
cmp rcx, byte SIZEOF_XMMWORD/4
|
|
jb short .column_st15
|
|
movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
|
|
add rdi, byte SIZEOF_XMMWORD ; outptr
|
|
movdqa xmmA,xmmD
|
|
sub rcx, byte SIZEOF_XMMWORD/4
|
|
.column_st15:
|
|
; Store two pixels (8 bytes) of xmmA to the output when it has enough
|
|
; space.
|
|
cmp rcx, byte SIZEOF_XMMWORD/8
|
|
jb short .column_st7
|
|
movq MMWORD [rdi], xmmA
|
|
add rdi, byte SIZEOF_XMMWORD/8*4
|
|
sub rcx, byte SIZEOF_XMMWORD/8
|
|
psrldq xmmA, SIZEOF_XMMWORD/8*4
|
|
.column_st7:
|
|
; Store one pixel (4 bytes) of xmmA to the output when it has enough
|
|
; space.
|
|
test rcx, rcx
|
|
jz short .nextrow
|
|
movd XMM_DWORD [rdi], xmmA
|
|
|
|
%endif ; RGB_PIXELSIZE ; ---------------
|
|
|
|
.nextrow:
|
|
pop rcx
|
|
pop rsi
|
|
pop rbx
|
|
pop rdx
|
|
pop rdi
|
|
pop rax
|
|
|
|
add rsi, byte SIZEOF_JSAMPROW
|
|
add rbx, byte SIZEOF_JSAMPROW
|
|
add rdx, byte SIZEOF_JSAMPROW
|
|
add rdi, byte SIZEOF_JSAMPROW ; output_buf
|
|
dec rax ; num_rows
|
|
jg near .rowloop
|
|
|
|
sfence ; flush the write buffer
|
|
|
|
.return:
|
|
pop rbx
|
|
uncollect_args
|
|
mov rsp,rbp ; rsp <- aligned rbp
|
|
pop rsp ; rsp <- original rbp
|
|
pop rbp
|
|
ret
|
|
|
|
; For some reason, the OS X linker does not honor the request to align the
|
|
; segment unless we do this.
|
|
align 16
|