Modify the ARM64 assembly file so that it uses only syntax that the clang assembler in Xcode 5.x can understand. These changes are all cosmetic in nature: they change neither the meaning nor the readability of the code, nor the ability to build it for Linux. In fact, the code now complies more closely with the ARM64 programming manual. In addition to these changes, there were a couple of instructions that clang simply doesn't support, so gas-preprocessor.pl was modified so that it now converts those into equivalent instructions that clang can handle.
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/branches/1.4.x@1450 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
@@ -6,6 +6,7 @@
|
||||
* Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
|
||||
* Copyright (C) 2013-2014, Linaro Limited
|
||||
* Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
|
||||
* Copyright (C) 2014, D. R. Commander. All rights reserved.
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
@@ -197,21 +198,21 @@ _\fname:
|
||||
tmp13 = q1; \
|
||||
}
|
||||
|
||||
#define XFIX_0_899976223 v0.4h[0]
|
||||
#define XFIX_0_541196100 v0.4h[1]
|
||||
#define XFIX_2_562915447 v0.4h[2]
|
||||
#define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3]
|
||||
#define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0]
|
||||
#define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1]
|
||||
#define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2]
|
||||
#define XFIX_1_175875602 v1.4h[3]
|
||||
#define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0]
|
||||
#define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1]
|
||||
#define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2]
|
||||
#define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3]
|
||||
#define XFIX_0_899976223 v0.h[0]
|
||||
#define XFIX_0_541196100 v0.h[1]
|
||||
#define XFIX_2_562915447 v0.h[2]
|
||||
#define XFIX_0_298631336_MINUS_0_899976223 v0.h[3]
|
||||
#define XFIX_1_501321110_MINUS_0_899976223 v1.h[0]
|
||||
#define XFIX_2_053119869_MINUS_2_562915447 v1.h[1]
|
||||
#define XFIX_0_541196100_PLUS_0_765366865 v1.h[2]
|
||||
#define XFIX_1_175875602 v1.h[3]
|
||||
#define XFIX_1_175875602_MINUS_0_390180644 v2.h[0]
|
||||
#define XFIX_0_541196100_MINUS_1_847759065 v2.h[1]
|
||||
#define XFIX_3_072711026_MINUS_2_562915447 v2.h[2]
|
||||
#define XFIX_1_175875602_MINUS_1_961570560 v2.h[3]
|
||||
|
||||
.balign 16
|
||||
jsimd_idct_islow_neon_consts:
|
||||
Ljsimd_idct_islow_neon_consts:
|
||||
.short FIX_0_899976223 /* d0[0] */
|
||||
.short FIX_0_541196100 /* d0[1] */
|
||||
.short FIX_2_562915447 /* d0[2] */
|
||||
@@ -256,54 +257,54 @@ asm_function jsimd_idct_islow_neon
|
||||
/* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
|
||||
sub sp, sp, 272
|
||||
str x15, [sp], 16
|
||||
adr x15, jsimd_idct_islow_neon_consts
|
||||
st1 {v0.8b - v3.8b}, [sp], 32
|
||||
st1 {v4.8b - v7.8b}, [sp], 32
|
||||
st1 {v8.8b - v11.8b}, [sp], 32
|
||||
st1 {v12.8b - v15.8b}, [sp], 32
|
||||
st1 {v16.8b - v19.8b}, [sp], 32
|
||||
st1 {v20.8b - v23.8b}, [sp], 32
|
||||
st1 {v24.8b - v27.8b}, [sp], 32
|
||||
st1 {v28.8b - v31.8b}, [sp], 32
|
||||
adr x15, Ljsimd_idct_islow_neon_consts
|
||||
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
|
||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
|
||||
ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
|
||||
mul v16.4h, v16.4h, v0.4h
|
||||
mul v17.4h, v17.4h, v1.4h
|
||||
ins v16.2d[1], v17.2d[0] /* 128 bit q8 */
|
||||
ins v16.d[1], v17.d[0] /* 128 bit q8 */
|
||||
ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
|
||||
mul v18.4h, v18.4h, v2.4h
|
||||
mul v19.4h, v19.4h, v3.4h
|
||||
ins v18.2d[1], v19.2d[0] /* 128 bit q9 */
|
||||
ins v18.d[1], v19.d[0] /* 128 bit q9 */
|
||||
ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
|
||||
mul v20.4h, v20.4h, v4.4h
|
||||
mul v21.4h, v21.4h, v5.4h
|
||||
ins v20.2d[1], v21.2d[0] /* 128 bit q10 */
|
||||
ins v20.d[1], v21.d[0] /* 128 bit q10 */
|
||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
|
||||
mul v22.4h, v22.4h, v6.4h
|
||||
mul v23.4h, v23.4h, v7.4h
|
||||
ins v22.2d[1], v23.2d[0] /* 128 bit q11 */
|
||||
ins v22.d[1], v23.d[0] /* 128 bit q11 */
|
||||
ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
|
||||
mul v24.4h, v24.4h, v0.4h
|
||||
mul v25.4h, v25.4h, v1.4h
|
||||
ins v24.2d[1], v25.2d[0] /* 128 bit q12 */
|
||||
ins v24.d[1], v25.d[0] /* 128 bit q12 */
|
||||
ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
|
||||
mul v28.4h, v28.4h, v4.4h
|
||||
mul v29.4h, v29.4h, v5.4h
|
||||
ins v28.2d[1], v29.2d[0] /* 128 bit q14 */
|
||||
ins v28.d[1], v29.d[0] /* 128 bit q14 */
|
||||
mul v26.4h, v26.4h, v2.4h
|
||||
mul v27.4h, v27.4h, v3.4h
|
||||
ins v26.2d[1], v27.2d[0] /* 128 bit q13 */
|
||||
ins v26.d[1], v27.d[0] /* 128 bit q13 */
|
||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */
|
||||
add x15, x15, #16
|
||||
mul v30.4h, v30.4h, v6.4h
|
||||
mul v31.4h, v31.4h, v7.4h
|
||||
ins v30.2d[1], v31.2d[0] /* 128 bit q15 */
|
||||
ins v30.d[1], v31.d[0] /* 128 bit q15 */
|
||||
/* Go to the bottom of the stack */
|
||||
sub sp, sp, 352
|
||||
stp x4, x5, [sp], 16
|
||||
st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */
|
||||
st1 {v12.4h - v15.4h}, [sp], 32
|
||||
st1 {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32 /* save NEON registers */
|
||||
st1 {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
|
||||
/* 1-D IDCT, pass 1, left 4x8 half */
|
||||
add v4.4h, ROW7L.4h, ROW3L.4h
|
||||
add v5.4h, ROW5L.4h, ROW1L.4h
|
||||
@@ -378,7 +379,7 @@ asm_function jsimd_idct_islow_neon
|
||||
rshrn ROW0L.4h, v12.4s, #11
|
||||
rshrn ROW4L.4h, v6.4s, #11
|
||||
|
||||
beq 3f /* Go to do some special handling for the sparse right 4x8 half */
|
||||
b.eq 3f /* Go to do some special handling for the sparse right 4x8 half */
|
||||
|
||||
/* 1-D IDCT, pass 1, right 4x8 half */
|
||||
ld1 {v2.4h}, [x15] /* reload constants */
|
||||
@@ -553,33 +554,33 @@ asm_function jsimd_idct_islow_neon
|
||||
shrn ROW4R.4h, v6.4s, #16
|
||||
|
||||
2: /* Descale to 8-bit and range limit */
|
||||
ins v16.2d[1], v17.2d[0]
|
||||
ins v18.2d[1], v19.2d[0]
|
||||
ins v20.2d[1], v21.2d[0]
|
||||
ins v22.2d[1], v23.2d[0]
|
||||
ins v16.d[1], v17.d[0]
|
||||
ins v18.d[1], v19.d[0]
|
||||
ins v20.d[1], v21.d[0]
|
||||
ins v22.d[1], v23.d[0]
|
||||
sqrshrn v16.8b, v16.8h, #2
|
||||
sqrshrn2 v16.16b, v18.8h, #2
|
||||
sqrshrn v18.8b, v20.8h, #2
|
||||
sqrshrn2 v18.16b, v22.8h, #2
|
||||
|
||||
/* vpop {v8.4h - d15.4h} */ /* restore NEON registers */
|
||||
ld1 {v8.4h - v11.4h}, [sp], 32
|
||||
ld1 {v12.4h - v15.4h}, [sp], 32
|
||||
ins v24.2d[1], v25.2d[0]
|
||||
ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32
|
||||
ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
|
||||
ins v24.d[1], v25.d[0]
|
||||
|
||||
sqrshrn v20.8b, v24.8h, #2
|
||||
/* Transpose the final 8-bit samples and do signed->unsigned conversion */
|
||||
/* trn1 v16.8h, v16.8h, v18.8h */
|
||||
transpose v16, v18, v3, .16b, .8h
|
||||
ins v26.2d[1], v27.2d[0]
|
||||
ins v28.2d[1], v29.2d[0]
|
||||
ins v30.2d[1], v31.2d[0]
|
||||
ins v26.d[1], v27.d[0]
|
||||
ins v28.d[1], v29.d[0]
|
||||
ins v30.d[1], v31.d[0]
|
||||
sqrshrn2 v20.16b, v26.8h, #2
|
||||
sqrshrn v22.8b, v28.8h, #2
|
||||
movi v0.16b, #(CENTERJSAMPLE)
|
||||
sqrshrn2 v22.16b, v30.8h, #2
|
||||
transpose_single v16, v17, v3, .2d, .8b
|
||||
transpose_single v18, v19, v3, .2d, .8b
|
||||
transpose_single v16, v17, v3, .d, .8b
|
||||
transpose_single v18, v19, v3, .d, .8b
|
||||
add v16.8b, v16.8b, v0.8b
|
||||
add v17.8b, v17.8b, v0.8b
|
||||
add v18.8b, v18.8b, v0.8b
|
||||
@@ -590,7 +591,7 @@ asm_function jsimd_idct_islow_neon
|
||||
add TMP1, TMP1, OUTPUT_COL
|
||||
add TMP2, TMP2, OUTPUT_COL
|
||||
st1 {v16.8b}, [TMP1]
|
||||
transpose_single v20, v21, v3, .2d, .8b
|
||||
transpose_single v20, v21, v3, .d, .8b
|
||||
st1 {v17.8b}, [TMP2]
|
||||
ldp TMP1, TMP2, [OUTPUT_BUF], 16
|
||||
add TMP1, TMP1, OUTPUT_COL
|
||||
@@ -605,7 +606,7 @@ asm_function jsimd_idct_islow_neon
|
||||
add TMP2, TMP2, OUTPUT_COL
|
||||
add TMP3, TMP3, OUTPUT_COL
|
||||
add TMP4, TMP4, OUTPUT_COL
|
||||
transpose_single v22, v23, v3, .2d, .8b
|
||||
transpose_single v22, v23, v3, .d, .8b
|
||||
st1 {v20.8b}, [TMP1]
|
||||
add v22.8b, v22.8b, v0.8b
|
||||
add v23.8b, v23.8b, v0.8b
|
||||
@@ -613,14 +614,14 @@ asm_function jsimd_idct_islow_neon
|
||||
st1 {v22.8b}, [TMP3]
|
||||
st1 {v23.8b}, [TMP4]
|
||||
ldr x15, [sp], 16
|
||||
ld1 {v0.8b - v3.8b}, [sp], 32
|
||||
ld1 {v4.8b - v7.8b}, [sp], 32
|
||||
ld1 {v8.8b - v11.8b}, [sp], 32
|
||||
ld1 {v12.8b - v15.8b}, [sp], 32
|
||||
ld1 {v16.8b - v19.8b}, [sp], 32
|
||||
ld1 {v20.8b - v23.8b}, [sp], 32
|
||||
ld1 {v24.8b - v27.8b}, [sp], 32
|
||||
ld1 {v28.8b - v31.8b}, [sp], 32
|
||||
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
blr x30
|
||||
|
||||
3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
|
||||
@@ -636,17 +637,17 @@ asm_function jsimd_idct_islow_neon
|
||||
transpose ROW0L, ROW2L, v3, .16b, .2s
|
||||
transpose ROW5L, ROW7L, v3, .16b, .2s
|
||||
cmp x0, #0
|
||||
beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
|
||||
b.eq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
|
||||
|
||||
/* Only row 0 is non-zero for the right 4x8 half */
|
||||
dup ROW1R.4h, ROW0R.4h[1]
|
||||
dup ROW2R.4h, ROW0R.4h[2]
|
||||
dup ROW3R.4h, ROW0R.4h[3]
|
||||
dup ROW4R.4h, ROW0R.4h[0]
|
||||
dup ROW5R.4h, ROW0R.4h[1]
|
||||
dup ROW6R.4h, ROW0R.4h[2]
|
||||
dup ROW7R.4h, ROW0R.4h[3]
|
||||
dup ROW0R.4h, ROW0R.4h[0]
|
||||
dup ROW1R.4h, ROW0R.h[1]
|
||||
dup ROW2R.4h, ROW0R.h[2]
|
||||
dup ROW3R.4h, ROW0R.h[3]
|
||||
dup ROW4R.4h, ROW0R.h[0]
|
||||
dup ROW5R.4h, ROW0R.h[1]
|
||||
dup ROW6R.4h, ROW0R.h[2]
|
||||
dup ROW7R.4h, ROW0R.h[3]
|
||||
dup ROW0R.4h, ROW0R.h[0]
|
||||
b 1b /* Go to 'normal' second pass */
|
||||
|
||||
4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
|
||||
@@ -770,13 +771,13 @@ asm_function jsimd_idct_islow_neon
|
||||
* per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
|
||||
*/
|
||||
|
||||
#define XFIX_1_082392200 v0.4h[0]
|
||||
#define XFIX_1_414213562 v0.4h[1]
|
||||
#define XFIX_1_847759065 v0.4h[2]
|
||||
#define XFIX_2_613125930 v0.4h[3]
|
||||
#define XFIX_1_082392200 v0.h[0]
|
||||
#define XFIX_1_414213562 v0.h[1]
|
||||
#define XFIX_1_847759065 v0.h[2]
|
||||
#define XFIX_2_613125930 v0.h[3]
|
||||
|
||||
.balign 16
|
||||
jsimd_idct_ifast_neon_consts:
|
||||
Ljsimd_idct_ifast_neon_consts:
|
||||
.short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
|
||||
.short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
|
||||
.short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
|
||||
@@ -810,12 +811,12 @@ asm_function jsimd_idct_ifast_neon
|
||||
/* Save NEON registers used in fast IDCT */
|
||||
sub sp, sp, #176
|
||||
stp x22, x23, [sp], 16
|
||||
adr x23, jsimd_idct_ifast_neon_consts
|
||||
st1 {v0.8b - v3.8b}, [sp], 32
|
||||
st1 {v4.8b - v7.8b}, [sp], 32
|
||||
st1 {v8.8b - v11.8b}, [sp], 32
|
||||
st1 {v12.8b - v15.8b}, [sp], 32
|
||||
st1 {v16.8b - v19.8b}, [sp], 32
|
||||
adr x23, Ljsimd_idct_ifast_neon_consts
|
||||
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
|
||||
ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
|
||||
ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
|
||||
@@ -909,24 +910,24 @@ asm_function jsimd_idct_ifast_neon
|
||||
trn2 v15.4s, v18.4s, v15.4s
|
||||
/* vswp v14.4h, v10-MSB.4h */
|
||||
umov x22, v14.d[0]
|
||||
ins v14.2d[0], v10.2d[1]
|
||||
ins v10.2d[1], x22
|
||||
ins v14.d[0], v10.d[1]
|
||||
ins v10.d[1], x22
|
||||
/* vswp v13.4h, v9MSB.4h */
|
||||
|
||||
umov x22, v13.d[0]
|
||||
ins v13.2d[0], v9.2d[1]
|
||||
ins v9.2d[1], x22
|
||||
ins v13.d[0], v9.d[1]
|
||||
ins v9.d[1], x22
|
||||
/* 1-D IDCT, pass 2 */
|
||||
sub v2.8h, v10.8h, v14.8h
|
||||
/* vswp v15.4h, v11MSB.4h */
|
||||
umov x22, v15.d[0]
|
||||
ins v15.2d[0], v11.2d[1]
|
||||
ins v11.2d[1], x22
|
||||
ins v15.d[0], v11.d[1]
|
||||
ins v11.d[1], x22
|
||||
add v14.8h, v10.8h, v14.8h
|
||||
/* vswp v12.4h, v8-MSB.4h */
|
||||
umov x22, v12.d[0]
|
||||
ins v12.2d[0], v8.2d[1]
|
||||
ins v8.2d[1], x22
|
||||
ins v12.d[0], v8.d[1]
|
||||
ins v8.d[1], x22
|
||||
sub v1.8h, v11.8h, v13.8h
|
||||
add v13.8h, v11.8h, v13.8h
|
||||
sub v5.8h, v9.8h, v15.8h
|
||||
@@ -997,13 +998,13 @@ asm_function jsimd_idct_ifast_neon
|
||||
trn1 v9.4s, v9.4s, v11.4s
|
||||
trn2 v11.4s, v18.4s, v11.4s
|
||||
/* make copy */
|
||||
ins v17.2d[0], v8.2d[1]
|
||||
ins v17.d[0], v8.d[1]
|
||||
/* Transpose d16-d17-msb */
|
||||
mov v18.16b, v8.16b
|
||||
trn1 v8.8b, v8.8b, v17.8b
|
||||
trn2 v17.8b, v18.8b, v17.8b
|
||||
/* make copy */
|
||||
ins v19.2d[0], v9.2d[1]
|
||||
ins v19.d[0], v9.d[1]
|
||||
mov v18.16b, v9.16b
|
||||
trn1 v9.8b, v9.8b, v19.8b
|
||||
trn2 v19.8b, v18.8b, v19.8b
|
||||
@@ -1018,7 +1019,7 @@ asm_function jsimd_idct_ifast_neon
|
||||
add TMP2, TMP2, OUTPUT_COL
|
||||
st1 {v9.8b}, [TMP1]
|
||||
/* make copy */
|
||||
ins v7.2d[0], v10.2d[1]
|
||||
ins v7.d[0], v10.d[1]
|
||||
mov v18.16b, v10.16b
|
||||
trn1 v10.8b, v10.8b, v7.8b
|
||||
trn2 v7.8b, v18.8b, v7.8b
|
||||
@@ -1031,7 +1032,7 @@ asm_function jsimd_idct_ifast_neon
|
||||
add TMP5, TMP5, OUTPUT_COL
|
||||
st1 {v10.8b}, [TMP1]
|
||||
/* make copy */
|
||||
ins v16.2d[0], v11.2d[1]
|
||||
ins v16.d[0], v11.d[1]
|
||||
mov v18.16b, v11.16b
|
||||
trn1 v11.8b, v11.8b, v16.8b
|
||||
trn2 v16.8b, v18.8b, v16.8b
|
||||
@@ -1040,11 +1041,11 @@ asm_function jsimd_idct_ifast_neon
|
||||
st1 {v16.8b}, [TMP5]
|
||||
sub sp, sp, #176
|
||||
ldp x22, x23, [sp], 16
|
||||
ld1 {v0.8b - v3.8b}, [sp], 32
|
||||
ld1 {v4.8b - v7.8b}, [sp], 32
|
||||
ld1 {v8.8b - v11.8b}, [sp], 32
|
||||
ld1 {v12.8b - v15.8b}, [sp], 32
|
||||
ld1 {v16.8b - v19.8b}, [sp], 32
|
||||
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
blr x30
|
||||
|
||||
.unreq DCT_TABLE
|
||||
@@ -1095,38 +1096,38 @@ asm_function jsimd_idct_ifast_neon
|
||||
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
|
||||
|
||||
.balign 16
|
||||
jsimd_idct_4x4_neon_consts:
|
||||
.short FIX_1_847759065 /* v0.4h[0] */
|
||||
.short -FIX_0_765366865 /* v0.4h[1] */
|
||||
.short -FIX_0_211164243 /* v0.4h[2] */
|
||||
.short FIX_1_451774981 /* v0.4h[3] */
|
||||
Ljsimd_idct_4x4_neon_consts:
|
||||
.short FIX_1_847759065 /* v0.h[0] */
|
||||
.short -FIX_0_765366865 /* v0.h[1] */
|
||||
.short -FIX_0_211164243 /* v0.h[2] */
|
||||
.short FIX_1_451774981 /* v0.h[3] */
|
||||
.short -FIX_2_172734803 /* d1[0] */
|
||||
.short FIX_1_061594337 /* d1[1] */
|
||||
.short -FIX_0_509795579 /* d1[2] */
|
||||
.short -FIX_0_601344887 /* d1[3] */
|
||||
.short FIX_0_899976223 /* v2.4h[0] */
|
||||
.short FIX_2_562915447 /* v2.4h[1] */
|
||||
.short 1 << (CONST_BITS+1) /* v2.4h[2] */
|
||||
.short 0 /* v2.4h[3] */
|
||||
.short FIX_0_899976223 /* v2.h[0] */
|
||||
.short FIX_2_562915447 /* v2.h[1] */
|
||||
.short 1 << (CONST_BITS+1) /* v2.h[2] */
|
||||
.short 0 /* v2.h[3] */
|
||||
|
||||
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
|
||||
smull v28.4s, \x4, v2.4h[2]
|
||||
smlal v28.4s, \x8, v0.4h[0]
|
||||
smlal v28.4s, \x14, v0.4h[1]
|
||||
smull v28.4s, \x4, v2.h[2]
|
||||
smlal v28.4s, \x8, v0.h[0]
|
||||
smlal v28.4s, \x14, v0.h[1]
|
||||
|
||||
smull v26.4s, \x16, v1.4h[2]
|
||||
smlal v26.4s, \x12, v1.4h[3]
|
||||
smlal v26.4s, \x10, v2.4h[0]
|
||||
smlal v26.4s, \x6, v2.4h[1]
|
||||
smull v26.4s, \x16, v1.h[2]
|
||||
smlal v26.4s, \x12, v1.h[3]
|
||||
smlal v26.4s, \x10, v2.h[0]
|
||||
smlal v26.4s, \x6, v2.h[1]
|
||||
|
||||
smull v30.4s, \x4, v2.4h[2]
|
||||
smlsl v30.4s, \x8, v0.4h[0]
|
||||
smlsl v30.4s, \x14, v0.4h[1]
|
||||
smull v30.4s, \x4, v2.h[2]
|
||||
smlsl v30.4s, \x8, v0.h[0]
|
||||
smlsl v30.4s, \x14, v0.h[1]
|
||||
|
||||
smull v24.4s, \x16, v0.4h[2]
|
||||
smlal v24.4s, \x12, v0.4h[3]
|
||||
smlal v24.4s, \x10, v1.4h[0]
|
||||
smlal v24.4s, \x6, v1.4h[1]
|
||||
smull v24.4s, \x16, v0.h[2]
|
||||
smlal v24.4s, \x12, v0.h[3]
|
||||
smlal v24.4s, \x10, v1.h[0]
|
||||
smlal v24.4s, \x6, v1.h[1]
|
||||
|
||||
add v20.4s, v28.4s, v26.4s
|
||||
sub v28.4s, v28.4s, v26.4s
|
||||
@@ -1171,15 +1172,15 @@ asm_function jsimd_idct_4x4_neon
|
||||
sub sp, sp, 272
|
||||
str x15, [sp], 16
|
||||
/* Load constants (v3.4h is just used for padding) */
|
||||
adr TMP4, jsimd_idct_4x4_neon_consts
|
||||
st1 {v0.8b - v3.8b}, [sp], 32
|
||||
st1 {v4.8b - v7.8b}, [sp], 32
|
||||
st1 {v8.8b - v11.8b}, [sp], 32
|
||||
st1 {v12.8b - v15.8b}, [sp], 32
|
||||
st1 {v16.8b - v19.8b}, [sp], 32
|
||||
st1 {v20.8b - v23.8b}, [sp], 32
|
||||
st1 {v24.8b - v27.8b}, [sp], 32
|
||||
st1 {v28.8b - v31.8b}, [sp], 32
|
||||
adr TMP4, Ljsimd_idct_4x4_neon_consts
|
||||
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
|
||||
|
||||
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
||||
@@ -1203,45 +1204,45 @@ asm_function jsimd_idct_4x4_neon
|
||||
ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
|
||||
mul v4.4h, v4.4h, v18.4h
|
||||
mul v5.4h, v5.4h, v19.4h
|
||||
ins v4.2d[1], v5.2d[0] /* 128 bit q4 */
|
||||
ins v4.d[1], v5.d[0] /* 128 bit q4 */
|
||||
ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
|
||||
mul v6.4h, v6.4h, v20.4h
|
||||
mul v7.4h, v7.4h, v21.4h
|
||||
ins v6.2d[1], v7.2d[0] /* 128 bit q6 */
|
||||
ins v6.d[1], v7.d[0] /* 128 bit q6 */
|
||||
mul v8.4h, v8.4h, v22.4h
|
||||
mul v9.4h, v9.4h, v23.4h
|
||||
ins v8.2d[1], v9.2d[0] /* 128 bit q8 */
|
||||
ins v8.d[1], v9.d[0] /* 128 bit q8 */
|
||||
add DCT_TABLE, DCT_TABLE, #16
|
||||
ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
|
||||
mul v10.4h, v10.4h, v24.4h
|
||||
mul v11.4h, v11.4h, v25.4h
|
||||
ins v10.2d[1], v11.2d[0] /* 128 bit q10 */
|
||||
ins v10.d[1], v11.d[0] /* 128 bit q10 */
|
||||
mul v12.4h, v12.4h, v26.4h
|
||||
mul v13.4h, v13.4h, v27.4h
|
||||
ins v12.2d[1], v13.2d[0] /* 128 bit q12 */
|
||||
ins v12.d[1], v13.d[0] /* 128 bit q12 */
|
||||
ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
|
||||
mul v14.4h, v14.4h, v28.4h
|
||||
mul v15.4h, v15.4h, v29.4h
|
||||
ins v14.2d[1], v15.2d[0] /* 128 bit q14 */
|
||||
ins v14.d[1], v15.d[0] /* 128 bit q14 */
|
||||
mul v16.4h, v16.4h, v30.4h
|
||||
mul v17.4h, v17.4h, v31.4h
|
||||
ins v16.2d[1], v17.2d[0] /* 128 bit q16 */
|
||||
ins v16.d[1], v17.d[0] /* 128 bit q16 */
|
||||
|
||||
/* Pass 1 */
|
||||
idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
|
||||
transpose_4x4 v4, v6, v8, v10, v3
|
||||
ins v10.2d[1], v11.2d[0]
|
||||
ins v10.d[1], v11.d[0]
|
||||
idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
|
||||
transpose_4x4 v5, v7, v9, v11, v3
|
||||
ins v10.2d[1], v11.2d[0]
|
||||
ins v10.d[1], v11.d[0]
|
||||
/* Pass 2 */
|
||||
idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
|
||||
transpose_4x4 v26, v27, v28, v29, v3
|
||||
|
||||
/* Range limit */
|
||||
movi v30.8h, #0x80
|
||||
ins v26.2d[1], v27.2d[0]
|
||||
ins v28.2d[1], v29.2d[0]
|
||||
ins v26.d[1], v27.d[0]
|
||||
ins v28.d[1], v29.d[0]
|
||||
add v26.8h, v26.8h, v30.8h
|
||||
add v28.8h, v28.8h, v30.8h
|
||||
sqxtun v26.8b, v26.8h
|
||||
@@ -1286,14 +1287,14 @@ asm_function jsimd_idct_4x4_neon
|
||||
/* vpop {v8.4h - v15.4h} ;not available */
|
||||
sub sp, sp, #272
|
||||
ldr x15, [sp], 16
|
||||
ld1 {v0.8b - v3.8b}, [sp], 32
|
||||
ld1 {v4.8b - v7.8b}, [sp], 32
|
||||
ld1 {v8.8b - v11.8b}, [sp], 32
|
||||
ld1 {v12.8b - v15.8b}, [sp], 32
|
||||
ld1 {v16.8b - v19.8b}, [sp], 32
|
||||
ld1 {v20.8b - v23.8b}, [sp], 32
|
||||
ld1 {v24.8b - v27.8b}, [sp], 32
|
||||
ld1 {v28.8b - v31.8b}, [sp], 32
|
||||
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
blr x30
|
||||
|
||||
.unreq DCT_TABLE
|
||||
@@ -1325,7 +1326,7 @@ asm_function jsimd_idct_4x4_neon
|
||||
*/
|
||||
|
||||
.balign 8
|
||||
jsimd_idct_2x2_neon_consts:
|
||||
Ljsimd_idct_2x2_neon_consts:
|
||||
.short -FIX_0_720959822 /* v14[0] */
|
||||
.short FIX_0_850430095 /* v14[1] */
|
||||
.short -FIX_1_272758580 /* v14[2] */
|
||||
@@ -1333,10 +1334,10 @@ jsimd_idct_2x2_neon_consts:
|
||||
|
||||
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
|
||||
sshll v15.4s, \x4, #15
|
||||
smull v26.4s, \x6, v14.4h[3]
|
||||
smlal v26.4s, \x10, v14.4h[2]
|
||||
smlal v26.4s, \x12, v14.4h[1]
|
||||
smlal v26.4s, \x16, v14.4h[0]
|
||||
smull v26.4s, \x6, v14.h[3]
|
||||
smlal v26.4s, \x10, v14.h[2]
|
||||
smlal v26.4s, \x12, v14.h[1]
|
||||
smlal v26.4s, \x16, v14.h[0]
|
||||
|
||||
add v20.4s, v15.4s, v26.4s
|
||||
sub v15.4s, v15.4s, v26.4s
|
||||
@@ -1367,14 +1368,14 @@ asm_function jsimd_idct_2x2_neon
|
||||
str x15, [sp], 16
|
||||
|
||||
/* Load constants */
|
||||
adr TMP2, jsimd_idct_2x2_neon_consts
|
||||
st1 {v4.8b - v7.8b}, [sp], 32
|
||||
st1 {v8.8b - v11.8b}, [sp], 32
|
||||
st1 {v12.8b - v15.8b}, [sp], 32
|
||||
st1 {v16.8b - v19.8b}, [sp], 32
|
||||
st1 {v21.8b - v22.8b}, [sp], 16
|
||||
st1 {v24.8b - v27.8b}, [sp], 32
|
||||
st1 {v30.8b - v31.8b}, [sp], 16
|
||||
adr TMP2, Ljsimd_idct_2x2_neon_consts
|
||||
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
st1 {v21.8b, v22.8b}, [sp], 16
|
||||
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
st1 {v30.8b, v31.8b}, [sp], 16
|
||||
ld1 {v14.4h}, [TMP2]
|
||||
|
||||
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
||||
@@ -1400,25 +1401,25 @@ asm_function jsimd_idct_2x2_neon
|
||||
ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
|
||||
mul v4.4h, v4.4h, v18.4h
|
||||
mul v5.4h, v5.4h, v19.4h
|
||||
ins v4.2d[1], v5.2d[0]
|
||||
ins v4.d[1], v5.d[0]
|
||||
mul v6.4h, v6.4h, v20.4h
|
||||
mul v7.4h, v7.4h, v21.4h
|
||||
ins v6.2d[1], v7.2d[0]
|
||||
ins v6.d[1], v7.d[0]
|
||||
add DCT_TABLE, DCT_TABLE, #16
|
||||
ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
|
||||
mul v10.4h, v10.4h, v24.4h
|
||||
mul v11.4h, v11.4h, v25.4h
|
||||
ins v10.2d[1], v11.2d[0]
|
||||
ins v10.d[1], v11.d[0]
|
||||
add DCT_TABLE, DCT_TABLE, #16
|
||||
ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
|
||||
mul v12.4h, v12.4h, v26.4h
|
||||
mul v13.4h, v13.4h, v27.4h
|
||||
ins v12.2d[1], v13.2d[0]
|
||||
ins v12.d[1], v13.d[0]
|
||||
add DCT_TABLE, DCT_TABLE, #16
|
||||
ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
|
||||
mul v16.4h, v16.4h, v30.4h
|
||||
mul v17.4h, v17.4h, v31.4h
|
||||
ins v16.2d[1], v17.2d[0]
|
||||
ins v16.d[1], v17.d[0]
|
||||
|
||||
/* Pass 1 */
|
||||
#if 0
|
||||
@@ -1427,14 +1428,14 @@ asm_function jsimd_idct_2x2_neon
|
||||
idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
|
||||
transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
|
||||
#else
|
||||
smull v26.4s, v6.4h, v14.4h[3]
|
||||
smlal v26.4s, v10.4h, v14.4h[2]
|
||||
smlal v26.4s, v12.4h, v14.4h[1]
|
||||
smlal v26.4s, v16.4h, v14.4h[0]
|
||||
smull v24.4s, v7.4h, v14.4h[3]
|
||||
smlal v24.4s, v11.4h, v14.4h[2]
|
||||
smlal v24.4s, v13.4h, v14.4h[1]
|
||||
smlal v24.4s, v17.4h, v14.4h[0]
|
||||
smull v26.4s, v6.4h, v14.h[3]
|
||||
smlal v26.4s, v10.4h, v14.h[2]
|
||||
smlal v26.4s, v12.4h, v14.h[1]
|
||||
smlal v26.4s, v16.4h, v14.h[0]
|
||||
smull v24.4s, v7.4h, v14.h[3]
|
||||
smlal v24.4s, v11.4h, v14.h[2]
|
||||
smlal v24.4s, v13.4h, v14.h[1]
|
||||
smlal v24.4s, v17.4h, v14.h[0]
|
||||
sshll v15.4s, v4.4h, #15
|
||||
sshll v30.4s, v5.4h, #15
|
||||
add v20.4s, v15.4s, v26.4s
|
||||
@@ -1445,12 +1446,12 @@ asm_function jsimd_idct_2x2_neon
|
||||
sub v15.4s, v30.4s, v24.4s
|
||||
rshrn v5.4h, v20.4s, #13
|
||||
rshrn v7.4h, v15.4s, #13
|
||||
ins v4.2d[1], v5.2d[0]
|
||||
ins v6.2d[1], v7.2d[0]
|
||||
ins v4.d[1], v5.d[0]
|
||||
ins v6.d[1], v7.d[0]
|
||||
transpose v4, v6, v3, .16b, .8h
|
||||
transpose v6, v10, v3, .16b, .4s
|
||||
ins v11.2d[0], v10.2d[1]
|
||||
ins v7.2d[0], v6.2d[1]
|
||||
ins v11.d[0], v10.d[1]
|
||||
ins v7.d[0], v6.d[1]
|
||||
#endif
|
||||
|
||||
/* Pass 2 */
|
||||
@@ -1458,10 +1459,10 @@ asm_function jsimd_idct_2x2_neon
|
||||
|
||||
/* Range limit */
|
||||
movi v30.8h, #0x80
|
||||
ins v26.2d[1], v27.2d[0]
|
||||
ins v26.d[1], v27.d[0]
|
||||
add v26.8h, v26.8h, v30.8h
|
||||
sqxtun v30.8b, v26.8h
|
||||
ins v26.2d[0], v30.2d[0]
|
||||
ins v26.d[0], v30.d[0]
|
||||
sqxtun v27.8b, v26.8h
|
||||
|
||||
/* Store results to the output buffer */
|
||||
@@ -1476,13 +1477,13 @@ asm_function jsimd_idct_2x2_neon
|
||||
|
||||
sub sp, sp, #208
|
||||
ldr x15, [sp], 16
|
||||
ld1 {v4.8b - v7.8b}, [sp], 32
|
||||
ld1 {v8.8b - v11.8b}, [sp], 32
|
||||
ld1 {v12.8b - v15.8b}, [sp], 32
|
||||
ld1 {v16.8b - v19.8b}, [sp], 32
|
||||
ld1 {v21.8b - v22.8b}, [sp], 16
|
||||
ld1 {v24.8b - v27.8b}, [sp], 32
|
||||
ld1 {v30.8b - v31.8b}, [sp], 16
|
||||
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
ld1 {v21.8b, v22.8b}, [sp], 16
|
||||
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
ld1 {v30.8b, v31.8b}, [sp], 16
|
||||
blr x30
|
||||
|
||||
.unreq DCT_TABLE
|
||||
@@ -1514,9 +1515,9 @@ asm_function jsimd_idct_2x2_neon
|
||||
ld1 {v4.8b}, [U], 8
|
||||
ld1 {v5.8b}, [V], 8
|
||||
ld1 {v0.8b}, [Y], 8
|
||||
prfm PLDL1KEEP, [U, #64]
|
||||
prfm PLDL1KEEP, [V, #64]
|
||||
prfm PLDL1KEEP, [Y, #64]
|
||||
prfm pldl1keep, [U, #64]
|
||||
prfm pldl1keep, [V, #64]
|
||||
prfm pldl1keep, [Y, #64]
|
||||
.elseif \size == 4
|
||||
ld1 {v4.b}[0], [U], 1
|
||||
ld1 {v4.b}[1], [U], 1
|
||||
@@ -1606,14 +1607,14 @@ asm_function jsimd_idct_2x2_neon
|
||||
/*
 * do_yuv_to_rgb_stage1 - first stage of the YCbCr -> RGB conversion for
 * one group of 8 pixels: bias the chroma samples and form the 32-bit
 * chroma products.
 *
 * Reads:
 *   v4.8b  - 8 chroma bytes, used as "u" below
 *   v5.8b  - 8 chroma bytes, used as "v" below
 *   v2.8h  - added to the widened chroma; the trailing comments treat it
 *            as a -128 bias (assumes every lane holds -128 - TODO confirm
 *            against the constants load in the calling function)
 *   v1     - halfword lanes 0..3 used as multipliers; per the trailing
 *            comments they are 22971, -11277, -23401 and 29033
 *
 * Writes:
 *   v6.8h, v8.8h    - biased u and v, widened to 16 bits
 *   v20.4s, v22.4s  - u * lane1 + v * lane2 (low / high four pixels)
 *   v24.4s, v26.4s  - v * lane0             (low / high four pixels)
 *   v28.4s, v30.4s  - u * lane3             (low / high four pixels)
 *
 * The plain forms (smull/smlal) consume the low four halfwords of the
 * source vector; the "2" forms (smull2/smlal2) consume the high four.
 *
 * NOTE(review): in this listing every multiply appears twice, once with
 * the "v1.4h[n]" lane spelling and once with "v1.h[n]". Both name the
 * same lane; presumably only one spelling exists in the assembled file.
 */
.macro do_yuv_to_rgb_stage1
|
||||
uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 (widening add of v2) */
|
||||
uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 (widening add of v2) */
|
||||
smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
|
||||
smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
|
||||
smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
|
||||
smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
|
||||
smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
|
||||
smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
|
||||
smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
|
||||
smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
|
||||
smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
|
||||
smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
|
||||
smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
|
||||
smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
|
||||
smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
|
||||
smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
|
||||
smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
|
||||
smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
|
||||
.endm
|
||||
|
||||
.macro do_yuv_to_rgb_stage2
|
||||
@@ -1656,18 +1657,18 @@ asm_function jsimd_idct_2x2_neon
|
||||
sqxtun v1\g_offs\defsize, v20.8h
|
||||
ld1 {v0.8b}, [Y], 8
|
||||
sqxtun v1\r_offs\defsize, v24.8h
|
||||
prfm PLDL1KEEP, [U, #64]
|
||||
prfm PLDL1KEEP, [V, #64]
|
||||
prfm PLDL1KEEP, [Y, #64]
|
||||
prfm pldl1keep, [U, #64]
|
||||
prfm pldl1keep, [V, #64]
|
||||
prfm pldl1keep, [Y, #64]
|
||||
sqxtun v1\b_offs\defsize, v28.8h
|
||||
uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
|
||||
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
|
||||
smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
|
||||
smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
|
||||
smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
|
||||
smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
|
||||
smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
|
||||
smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
|
||||
smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
|
||||
smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
|
||||
smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
|
||||
smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
|
||||
smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
|
||||
smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
|
||||
.else /**************************** rgb565 ***********************************/
|
||||
sqshlu v21.8h, v20.8h, #8
|
||||
sqshlu v25.8h, v24.8h, #8
|
||||
@@ -1675,21 +1676,21 @@ asm_function jsimd_idct_2x2_neon
|
||||
uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
|
||||
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
|
||||
ld1 {v0.8b}, [Y], 8
|
||||
smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
|
||||
smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
|
||||
smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
|
||||
smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
|
||||
smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
|
||||
smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
|
||||
smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
|
||||
smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
|
||||
sri v25.8h, v21.8h, #5
|
||||
smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
|
||||
smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
|
||||
prfm PLDL1KEEP, [U, #64]
|
||||
prfm PLDL1KEEP, [V, #64]
|
||||
prfm PLDL1KEEP, [Y, #64]
|
||||
smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
|
||||
smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
|
||||
prfm pldl1keep, [U, #64]
|
||||
prfm pldl1keep, [V, #64]
|
||||
prfm pldl1keep, [Y, #64]
|
||||
sri v25.8h, v29.8h, #11
|
||||
.endif
|
||||
do_store \bpp, 8
|
||||
smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
|
||||
smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
|
||||
smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
|
||||
smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
|
||||
.endm
|
||||
|
||||
.macro do_yuv_to_rgb
|
||||
@@ -1702,7 +1703,7 @@ asm_function jsimd_idct_2x2_neon
|
||||
*/
|
||||
|
||||
.balign 16
|
||||
jsimd_ycc_\colorid\()_neon_consts:
|
||||
Ljsimd_ycc_\colorid\()_neon_consts:
|
||||
.short 0, 0, 0, 0
|
||||
.short 22971, -11277, -23401, 29033
|
||||
.short -128, -128, -128, -128
|
||||
@@ -1717,7 +1718,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
|
||||
INPUT_BUF0 .req x5
|
||||
INPUT_BUF1 .req x6
|
||||
INPUT_BUF2 .req INPUT_BUF
|
||||
INPUT_BUF2 .req x1
|
||||
|
||||
RGB .req x7
|
||||
Y .req x8
|
||||
@@ -1728,16 +1729,16 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
sub sp, sp, 336
|
||||
str x15, [sp], 16
|
||||
/* Load constants to v1.4h and v2.8h (v0.4h is just used for padding) */
|
||||
adr x15, jsimd_ycc_\colorid\()_neon_consts
|
||||
adr x15, Ljsimd_ycc_\colorid\()_neon_consts
|
||||
/* Save NEON registers */
|
||||
st1 {v0.8b - v3.8b}, [sp], 32
|
||||
st1 {v4.8b - v7.8b}, [sp], 32
|
||||
st1 {v8.8b - v11.8b}, [sp], 32
|
||||
st1 {v12.8b - v15.8b}, [sp], 32
|
||||
st1 {v16.8b - v19.8b}, [sp], 32
|
||||
st1 {v20.8b - v23.8b}, [sp], 32
|
||||
st1 {v24.8b - v27.8b}, [sp], 32
|
||||
st1 {v28.8b - v31.8b}, [sp], 32
|
||||
st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
ld1 {v0.4h, v1.4h}, [x15], 16
|
||||
ld1 {v2.8h}, [x15]
|
||||
|
||||
@@ -1748,8 +1749,8 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
stp x8, x9, [sp], 16
|
||||
stp x10, x30, [sp], 16
|
||||
ldr INPUT_BUF0, [INPUT_BUF]
|
||||
ldr INPUT_BUF1, [INPUT_BUF, 8]
|
||||
ldr INPUT_BUF2, [INPUT_BUF, 16]
|
||||
ldr INPUT_BUF1, [INPUT_BUF, #8]
|
||||
ldr INPUT_BUF2, [INPUT_BUF, #16]
|
||||
.unreq INPUT_BUF
|
||||
|
||||
/* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
|
||||
@@ -1758,7 +1759,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
|
||||
/* Outer loop over scanlines */
|
||||
cmp NUM_ROWS, #1
|
||||
blt 9f
|
||||
b.lt 9f
|
||||
0:
|
||||
lsl x16, INPUT_ROW, #3
|
||||
ldr Y, [INPUT_BUF0, x16]
|
||||
@@ -1770,60 +1771,60 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
|
||||
/* Inner loop over pixels */
|
||||
subs N, N, #8
|
||||
blt 3f
|
||||
b.lt 3f
|
||||
do_load 8
|
||||
do_yuv_to_rgb_stage1
|
||||
subs N, N, #8
|
||||
blt 2f
|
||||
b.lt 2f
|
||||
1:
|
||||
do_yuv_to_rgb_stage2_store_load_stage1
|
||||
subs N, N, #8
|
||||
bge 1b
|
||||
b.ge 1b
|
||||
2:
|
||||
do_yuv_to_rgb_stage2
|
||||
do_store \bpp, 8
|
||||
tst N, #7
|
||||
beq 8f
|
||||
b.eq 8f
|
||||
3:
|
||||
tst N, #4
|
||||
beq 3f
|
||||
b.eq 3f
|
||||
do_load 4
|
||||
3:
|
||||
tst N, #2
|
||||
beq 4f
|
||||
b.eq 4f
|
||||
do_load 2
|
||||
4:
|
||||
tst N, #1
|
||||
beq 5f
|
||||
b.eq 5f
|
||||
do_load 1
|
||||
5:
|
||||
do_yuv_to_rgb
|
||||
tst N, #4
|
||||
beq 6f
|
||||
b.eq 6f
|
||||
do_store \bpp, 4
|
||||
6:
|
||||
tst N, #2
|
||||
beq 7f
|
||||
b.eq 7f
|
||||
do_store \bpp, 2
|
||||
7:
|
||||
tst N, #1
|
||||
beq 8f
|
||||
b.eq 8f
|
||||
do_store \bpp, 1
|
||||
8:
|
||||
subs NUM_ROWS, NUM_ROWS, #1
|
||||
bgt 0b
|
||||
b.gt 0b
|
||||
9:
|
||||
/* Restore all registers and return */
|
||||
sub sp, sp, #336
|
||||
ldr x15, [sp], 16
|
||||
ld1 {v0.8b - v3.8b}, [sp], 32
|
||||
ld1 {v4.8b - v7.8b}, [sp], 32
|
||||
ld1 {v8.8b - v11.8b}, [sp], 32
|
||||
ld1 {v12.8b - v15.8b}, [sp], 32
|
||||
ld1 {v16.8b - v19.8b}, [sp], 32
|
||||
ld1 {v20.8b - v23.8b}, [sp], 32
|
||||
ld1 {v24.8b - v27.8b}, [sp], 32
|
||||
ld1 {v28.8b - v31.8b}, [sp], 32
|
||||
ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
/* pop {x4, x5, x6, x7, x8, x9, x10, x30} */
|
||||
ldp x4, x5, [sp], 16
|
||||
ldp x6, x7, [sp], 16
|
||||
|
||||
Reference in New Issue
Block a user