Fix performance and other issues uncovered in testing with actual ARM64 hardware; formatting tweaks; remove NEON platform check (NEON is always available with ARMv8)

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1333 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
DRC
2014-07-23 14:14:14 +00:00
parent d762c19b98
commit 3728aa01d8
2 changed files with 193 additions and 270 deletions

View File

@@ -27,98 +27,29 @@
static unsigned int simd_support = ~0;
#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
/*
 * Report whether one line of text advertises a given CPU feature.
 *
 * buffer  - NUL-terminated line (the caller passes lines read from
 *           /proc/cpuinfo)
 * feature - NUL-terminated feature name to look for, e.g. "neon"
 *
 * Returns 1 only when the line starts with "Features" and 'feature' occurs
 * after that prefix as a complete whitespace-delimited word.  Returns 0
 * otherwise, including when 'feature' is the empty string.
 */
LOCAL(int)
check_feature (char *buffer, char *feature)
{
  char *start, *p;
  size_t len = strlen(feature);

  if (len == 0)
    return 0;
  if (strncmp(buffer, "Features", 8) != 0)
    return 0;
  buffer += 8;
  /* isspace() is only defined for values representable as unsigned char */
  while (isspace((unsigned char)*buffer))
    buffer++;

  /*
   * Keep the start of the searched text fixed.  Advancing the base pointer
   * one character at a time (as was done previously) eventually makes it
   * coincide with a rejected match, at which point the "preceded by
   * whitespace" test was skipped and e.g. "xneon" was accepted as "neon".
   */
  start = buffer;
  for (p = strstr(start, feature); p != NULL; p = strstr(p + 1, feature)) {
    int starts_word = (p == start || isspace((unsigned char)p[-1]));
    int ends_word = (p[len] == 0 || isspace((unsigned char)p[len]));

    if (starts_word && ends_word)
      return 1;
  }
  return 0;
}
/*
 * Scan /proc/cpuinfo line by line using a line buffer of 'bufsize' bytes and
 * set JSIMD_ARM_NEON in simd_support if any line reports the "neon" feature.
 *
 * simd_support is reset to 0 on every call.
 *
 * Returns 1 when the scan finished (also when /proc/cpuinfo could not be
 * opened), or 0 when the buffer could not be allocated or a line did not fit
 * into it, so that the caller can retry with a larger 'bufsize'.
 */
LOCAL(int)
parse_proc_cpuinfo (int bufsize)
{
  char *line;
  FILE *cpuinfo;
  int complete = 1;

  simd_support = 0;

  line = (char *)malloc(bufsize);
  if (line == NULL)
    return 0;

  cpuinfo = fopen("/proc/cpuinfo", "r");
  if (cpuinfo != NULL) {
    while (fgets(line, bufsize, cpuinfo) != NULL) {
      /* A line without '\n' before EOF means it was truncated */
      if (strchr(line, '\n') == NULL && !feof(cpuinfo)) {
        complete = 0;
        break;
      }
      if (check_feature(line, "neon"))
        simd_support |= JSIMD_ARM_NEON;
    }
    fclose(cpuinfo);
  }

  free(line);
  return complete;
}
#endif
/*
* Check what SIMD accelerations are supported.
*
* FIXME: This code is racy under a multi-threaded environment.
*/
/*
* ARMv8 architectures support NEON extensions by default.
* It is no longer optional as it was with ARMv7.
*/
LOCAL(void)
init_simd (void)
{
char *env = NULL;
#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
int bufsize = 1024; /* an initial guess for the line buffer size limit */
#endif
if (simd_support != ~0U)
return;
simd_support = 0;
#if defined(__ARM_NEON__)
simd_support |= JSIMD_ARM_NEON;
#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
/* We still have a chance to use NEON regardless of globally used
* -mcpu/-mfpu options passed to gcc by performing runtime detection via
* /proc/cpuinfo parsing on linux/android */
while (!parse_proc_cpuinfo(bufsize)) {
bufsize *= 2;
if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
break;
}
#endif
/* Force different settings through environment variables */
env = getenv("JSIMD_FORCENEON");

View File

@@ -34,7 +34,6 @@
#define RESPECT_STRICT_ALIGNMENT 1
#define RTSM_SQSHRN_SIM_ISSUE
/*****************************************************************************/
@@ -257,8 +256,18 @@ asm_function jsimd_idct_islow_neon
ROW6R .req v29
ROW7L .req v30
ROW7R .req v31
/* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
sub sp, sp, 272
str x15, [sp], 16
adr x15, jsimd_idct_islow_neon_consts
st1 {v0.8b - v3.8b}, [sp], 32
st1 {v4.8b - v7.8b}, [sp], 32
st1 {v8.8b - v11.8b}, [sp], 32
st1 {v12.8b - v15.8b}, [sp], 32
st1 {v16.8b - v19.8b}, [sp], 32
st1 {v20.8b - v23.8b}, [sp], 32
st1 {v24.8b - v27.8b}, [sp], 32
st1 {v28.8b - v31.8b}, [sp], 32
ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
@@ -277,7 +286,7 @@ asm_function jsimd_idct_islow_neon
mul v22.4h, v22.4h, v6.4h
mul v23.4h, v23.4h, v7.4h
ins v22.2d[1], v23.2d[0] /* 128 bit q11 */
ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK], 32
ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
mul v24.4h, v24.4h, v0.4h
mul v25.4h, v25.4h, v1.4h
ins v24.2d[1], v25.2d[0] /* 128 bit q12 */
@@ -293,10 +302,11 @@ asm_function jsimd_idct_islow_neon
mul v30.4h, v30.4h, v6.4h
mul v31.4h, v31.4h, v7.4h
ins v30.2d[1], v31.2d[0] /* 128 bit q15 */
sub sp, sp, #32
st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */
sub sp, sp, #32
st1 {v12.4h-v15.4h}, [sp]
/* Go to the bottom of the stack */
sub sp, sp, 352
stp x4, x5, [sp], 16
st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */
st1 {v12.4h - v15.4h}, [sp], 32
/* 1-D IDCT, pass 1, left 4x8 half */
add v4.4h, ROW7L.4h, ROW3L.4h
add v5.4h, ROW5L.4h, ROW1L.4h
@@ -304,25 +314,22 @@ asm_function jsimd_idct_islow_neon
smlal v12.4s, v5.4h, XFIX_1_175875602
smull v14.4s, v4.4h, XFIX_1_175875602
/* Check for the zero coefficients in the right 4x8 half */
/* push {x4, x5} */
stp x4, x5, [sp, -16]!
mov x5, #0
smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644
ssubl v6.4s, ROW0L.4h, ROW4L.4h
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
smull v4.4s, ROW2L.4h, XFIX_0_541196100
smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
orr x0, x4, x5
mov v8.16b, v12.16b
smlsl v12.4s, ROW5L.4h, XFIX_2_562915447
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
shl v6.4s, v6.4s, #13
orr x0, x0, x4
smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
orr x0, x0 , x5
add v2.4s, v6.4s, v4.4s
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
mov v10.16b, v14.16b
add v2.4s, v2.4s, v12.4s
orr x0, x0, x4
@@ -330,7 +337,7 @@ asm_function jsimd_idct_islow_neon
orr x0, x0, x5
smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
rshrn ROW1L.4h, v2.4s, #11
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
sub v2.4s, v2.4s, v12.4s
smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
orr x0, x0, x4
@@ -338,21 +345,21 @@ asm_function jsimd_idct_islow_neon
orr x0, x0, x5
sub v2.4s, v2.4s, v12.4s
smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
smlal v12.4s, ROW6L.4h, XFIX_0_541196100
sub v6.4s, v6.4s, v4.4s
orr x0, x0, x4
rshrn ROW6L.4h, v2.4s, #11
orr x0, x0, x5
add v2.4s, v6.4s, v10.4s
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
sub v6.4s, v6.4s, v10.4s
saddl v10.4s, ROW0L.4h, ROW4L.4h
orr x0, x0, x4
rshrn ROW2L.4h, v2.4s, #11
orr x0, x0, x5
rshrn ROW5L.4h, v6.4s, #11
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
shl v10.4s, v10.4s, #13
smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
orr x0, x0, x4
@@ -360,12 +367,13 @@ asm_function jsimd_idct_islow_neon
orr x0, x0, x5
sub v2.4s, v10.4s, v12.4s
add v12.4s, v4.4s, v14.4s
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
sub v4.4s, v4.4s, v14.4s
add v10.4s, v2.4s, v8.4s
orr x0, x4, x5
sub v6.4s, v2.4s, v8.4s
/* pop {x4, x5} */
sub sp, sp, 80
ldp x4, x5, [sp], 16
rshrn ROW7L.4h, v4.4s, #11
rshrn ROW3L.4h, v10.4s, #11
@@ -552,48 +560,27 @@ asm_function jsimd_idct_islow_neon
ins v18.2d[1], v19.2d[0]
ins v20.2d[1], v21.2d[0]
ins v22.2d[1], v23.2d[0]
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqrshrn v16.8b, v16.8h, #2
sqrshrn2 v16.16b, v18.8h, #2
sqrshrn v18.8b, v20.8h, #2
sqrshrn2 v18.16b, v22.8h, #2
#else
sqrshrn v16.4h, v16.4s, #2
sqrshrn2 v16.8h, v18.4s, #2
sqrshrn v18.4h, v20.4s, #2
sqrshrn2 v18.8h, v22.4s, #2
#endif
/* vpop {v8.4h-d15.4h} */ /* restore NEON registers */
ld1 {v12.4h-v15.4h}, [sp], 32
/* vpop {v8.4h - v15.4h} */ /* restore NEON registers */
ld1 {v8.4h - v11.4h}, [sp], 32
ld1 {v12.4h - v15.4h}, [sp], 32
ins v24.2d[1], v25.2d[0]
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqrshrn v20.8b, v24.8h, #2
#else
sqrshrn v20.4h, v24.4s, #2
#endif
/* Transpose the final 8-bit samples and do signed->unsigned conversion */
/* trn1 v16.8h, v16.8h, v18.8h */
transpose v16, v18, v3, .16b, .8h
ins v26.2d[1], v27.2d[0]
ins v28.2d[1], v29.2d[0]
ins v30.2d[1], v31.2d[0]
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqrshrn2 v20.16b, v26.8h, #2
sqrshrn v22.8b, v28.8h, #2
#else
sqrshrn2 v20.8h, v26.4s, #2
sqrshrn v22.4h, v28.4s, #2
#endif
movi v0.16b, #(CENTERJSAMPLE)
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqrshrn2 v22.16b, v30.8h, #2
#else
sqrshrn2 v22.8h, v30.4s, #2
#endif
transpose_single v16, v17, v3, .2d, .8b
transpose_single v18, v19, v3, .2d, .8b
add v16.8b, v16.8b, v0.8b
@@ -628,6 +615,15 @@ asm_function jsimd_idct_islow_neon
st1 {v21.8b}, [TMP2]
st1 {v22.8b}, [TMP3]
st1 {v23.8b}, [TMP4]
ldr x15, [sp], 16
ld1 {v0.8b - v3.8b}, [sp], 32
ld1 {v4.8b - v7.8b}, [sp], 32
ld1 {v8.8b - v11.8b}, [sp], 32
ld1 {v12.8b - v15.8b}, [sp], 32
ld1 {v16.8b - v19.8b}, [sp], 32
ld1 {v20.8b - v23.8b}, [sp], 32
ld1 {v24.8b - v27.8b}, [sp], 32
ld1 {v28.8b - v31.8b}, [sp], 32
blr x30
3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
@@ -799,7 +795,8 @@ asm_function jsimd_idct_ifast_neon
TMP1 .req x0
TMP2 .req x1
TMP3 .req x2
TMP4 .req x15
TMP4 .req x22
TMP5 .req x23
/* Load and dequantize coefficients into NEON registers
* with the following allocation:
@@ -814,7 +811,15 @@ asm_function jsimd_idct_ifast_neon
* 6 | d28 | d29 ( v14.8h )
* 7 | d30 | d31 ( v15.8h )
*/
adr x15, jsimd_idct_ifast_neon_consts
/* Save NEON registers used in fast IDCT */
sub sp, sp, #176
stp x22, x23, [sp], 16
adr x23, jsimd_idct_ifast_neon_consts
st1 {v0.8b - v3.8b}, [sp], 32
st1 {v4.8b - v7.8b}, [sp], 32
st1 {v8.8b - v11.8b}, [sp], 32
st1 {v12.8b - v15.8b}, [sp], 32
st1 {v16.8b - v19.8b}, [sp], 32
ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
@@ -830,14 +835,9 @@ asm_function jsimd_idct_ifast_neon
ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
mul v14.8h, v14.8h, v2.8h
mul v13.8h, v13.8h, v1.8h
ld1 {v0.4h}, [x15] /* load constants */
ld1 {v0.4h}, [x23] /* load constants */
mul v15.8h, v15.8h, v3.8h
/* vpush {v4.8h-v6.8h} */ /* save NEON registers */
sub sp, sp, #32
st1 {v4.8h-v5.8h}, [sp] /* save NEON registers */
sub sp, sp, #16
st1 {v6.8h}, [sp]
/* 1-D IDCT, pass 1 */
sub v2.8h, v10.8h, v14.8h
add v14.8h, v10.8h, v14.8h
@@ -912,25 +912,25 @@ asm_function jsimd_idct_ifast_neon
trn1 v13.4s, v13.4s, v15.4s
trn2 v15.4s, v18.4s, v15.4s
/* vswp v14.4h, v10-MSB.4h */
umov x10, v14.d[0]
umov x22, v14.d[0]
ins v14.2d[0], v10.2d[1]
ins v10.2d[1], x10
ins v10.2d[1], x22
/* vswp v13.4h, v9MSB.4h */
umov x10, v13.d[0]
umov x22, v13.d[0]
ins v13.2d[0], v9.2d[1]
ins v9.2d[1], x10
ins v9.2d[1], x22
/* 1-D IDCT, pass 2 */
sub v2.8h, v10.8h, v14.8h
/* vswp v15.4h, v11MSB.4h */
umov x10, v15.d[0]
umov x22, v15.d[0]
ins v15.2d[0], v11.2d[1]
ins v11.2d[1], x10
ins v11.2d[1], x22
add v14.8h, v10.8h, v14.8h
/* vswp v12.4h, v8-MSB.4h */
umov x10, v12.d[0]
umov x22, v12.d[0]
ins v12.2d[0], v8.2d[1]
ins v8.2d[1], x10
ins v8.2d[1], x22
sub v1.8h, v11.8h, v13.8h
add v13.8h, v11.8h, v13.8h
sub v5.8h, v9.8h, v15.8h
@@ -966,15 +966,11 @@ asm_function jsimd_idct_ifast_neon
add v14.8h, v5.8h, v3.8h
sub v9.8h, v5.8h, v3.8h
sub v13.8h, v10.8h, v2.8h
/* vpop {v4.8h-v7.4h} */ /* restore NEON registers...not available */
ld1 {v6.8h}, [sp], 16
ld1 {v4.8h-v5.8h}, [sp], 32
add v10.8h, v10.8h, v2.8h
sub v11.8h, v12.8h, v1.8h
add v12.8h, v12.8h, v1.8h
/* Descale to 8-bit and range limit */
movi v0.16b, #0x80
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqshrn v8.8b, v8.8h, #5
sqshrn2 v8.16b, v9.8h, #5
sqshrn v9.8b, v10.8h, #5
@@ -983,16 +979,6 @@ asm_function jsimd_idct_ifast_neon
sqshrn2 v10.16b, v13.8h, #5
sqshrn v11.8b, v14.8h, #5
sqshrn2 v11.16b, v15.8h, #5
#else
sqshrn v8.4h, v8.4s, #5
sqshrn2 v8.8h, v9.4s, #5
sqshrn v9.4h, v10.4s, #5
sqshrn2 v9.8h, v11.4s, #5
sqshrn v10.4h, v12.4s, #5
sqshrn2 v10.8h, v13.4s, #5
sqshrn v11.4h, v14.4s, #5
sqshrn2 v11.8h, v15.4s, #5
#endif
add v8.16b, v8.16b, v0.16b
add v9.16b, v9.16b, v0.16b
add v10.16b, v10.16b, v0.16b
@@ -1036,26 +1022,33 @@ asm_function jsimd_idct_ifast_neon
add TMP2, TMP2, OUTPUT_COL
st1 {v9.8b}, [TMP1]
/* make copy */
ins v21.2d[0], v10.2d[1]
ins v7.2d[0], v10.2d[1]
mov v18.16b, v10.16b
trn1 v10.8b, v10.8b, v21.8b
trn2 v21.8b, v18.8b, v21.8b
trn1 v10.8b, v10.8b, v7.8b
trn2 v7.8b, v18.8b, v7.8b
st1 {v19.8b}, [TMP2]
ldp TMP1, TMP2, [OUTPUT_BUF], 16
ldp TMP3, TMP4, [OUTPUT_BUF]
ldp TMP4, TMP5, [OUTPUT_BUF], 16
add TMP1, TMP1, OUTPUT_COL
add TMP2, TMP2, OUTPUT_COL
add TMP3, TMP3, OUTPUT_COL
add TMP4, TMP4, OUTPUT_COL
add TMP5, TMP5, OUTPUT_COL
st1 {v10.8b}, [TMP1]
/* make copy */
ins v23.2d[0], v11.2d[1]
ins v16.2d[0], v11.2d[1]
mov v18.16b, v11.16b
trn1 v11.8b, v11.8b, v23.8b
trn2 v23.8b, v18.8b, v23.8b
st1 {v21.8b}, [TMP2]
st1 {v11.8b}, [TMP3]
st1 {v23.8b}, [TMP4]
trn1 v11.8b, v11.8b, v16.8b
trn2 v16.8b, v18.8b, v16.8b
st1 {v7.8b}, [TMP2]
st1 {v11.8b}, [TMP4]
st1 {v16.8b}, [TMP5]
sub sp, sp, #176
ldp x22, x23, [sp], 16
ld1 {v0.8b - v3.8b}, [sp], 32
ld1 {v4.8b - v7.8b}, [sp], 32
ld1 {v8.8b - v11.8b}, [sp], 32
ld1 {v12.8b - v15.8b}, [sp], 32
ld1 {v16.8b - v19.8b}, [sp], 32
blr x30
.unreq DCT_TABLE
@@ -1179,14 +1172,19 @@ asm_function jsimd_idct_4x4_neon
TMP3 .req x2
TMP4 .req x15
/* vpush {v8.4h-v15.4h} */
sub sp, sp, #32
st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */
sub sp, sp, #32
st1 {v12.4h-v15.4h}, [sp]
/* Save all used NEON registers */
sub sp, sp, 272
str x15, [sp], 16
/* Load constants (v3.4h is just used for padding) */
adr TMP4, jsimd_idct_4x4_neon_consts
st1 {v0.8b - v3.8b}, [sp], 32
st1 {v4.8b - v7.8b}, [sp], 32
st1 {v8.8b - v11.8b}, [sp], 32
st1 {v12.8b - v15.8b}, [sp], 32
st1 {v16.8b - v19.8b}, [sp], 32
st1 {v20.8b - v23.8b}, [sp], 32
st1 {v24.8b - v27.8b}, [sp], 32
st1 {v28.8b - v31.8b}, [sp], 32
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
/* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1291,9 +1289,16 @@ asm_function jsimd_idct_4x4_neon
#endif
/* vpop {v8.4h - v15.4h} ;not available */
ld1 {v12.4h-v15.4h}, [sp], 32
ld1 {v8.4h-v11.4h}, [sp], 32
sub sp, sp, #272
ldr x15, [sp], 16
ld1 {v0.8b - v3.8b}, [sp], 32
ld1 {v4.8b - v7.8b}, [sp], 32
ld1 {v8.8b - v11.8b}, [sp], 32
ld1 {v12.8b - v15.8b}, [sp], 32
ld1 {v16.8b - v19.8b}, [sp], 32
ld1 {v20.8b - v23.8b}, [sp], 32
ld1 {v24.8b - v27.8b}, [sp], 32
ld1 {v28.8b - v31.8b}, [sp], 32
blr x30
.unreq DCT_TABLE
@@ -1333,23 +1338,23 @@ jsimd_idct_2x2_neon_consts:
.short FIX_3_624509785 /* d0[3] */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
sshll v28.4s, \x4, #15
sshll v15.4s, \x4, #15
smull v26.4s, \x6, v0.4h[3]
smlal v26.4s, \x10, v0.4h[2]
smlal v26.4s, \x12, v0.4h[1]
smlal v26.4s, \x16, v0.4h[0]
add v20.4s, v28.4s, v26.4s
sub v28.4s, v28.4s, v26.4s
add v20.4s, v15.4s, v26.4s
sub v15.4s, v15.4s, v26.4s
.if \shift > 16
srshr v20.4s, v20.4s, #\shift
srshr v28.4s, v28.4s, #\shift
srshr v15.4s, v15.4s, #\shift
xtn \y26, v20.4s
xtn \y27, v28.4s
xtn \y27, v15.4s
.else
rshrn \y26, v20.4s, #\shift
rshrn \y27, v28.4s, #\shift
rshrn \y27, v15.4s, #\shift
.endif
.endm
@@ -1364,14 +1369,19 @@ asm_function jsimd_idct_2x2_neon
TMP2 .req x15
/* vpush {v8.4h - v15.4h} ; not available */
sub sp, sp, #32
st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */
sub sp, sp, #32
st1 {v12.4h-v15.4h}, [sp]
sub sp, sp, 208
str x15, [sp], 16
/* Load constants */
adr TMP2, jsimd_idct_2x2_neon_consts
ld1 {v0.4h}, [TMP2]
st1 {v4.8b - v7.8b}, [sp], 32
st1 {v8.8b - v11.8b}, [sp], 32
st1 {v12.8b - v15.8b}, [sp], 32
st1 {v16.8b - v19.8b}, [sp], 32
st1 {v21.8b - v22.8b}, [sp], 16
st1 {v24.8b - v27.8b}, [sp], 32
st1 {v30.8b - v31.8b}, [sp], 16
ld1 {v14.4h}, [TMP2]
/* Load all COEF_BLOCK into NEON registers with the following allocation:
* 0 1 2 3 | 4 5 6 7
@@ -1423,24 +1433,24 @@ asm_function jsimd_idct_2x2_neon
idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
#else
smull v26.4s, v6.4h, v0.4h[3]
smlal v26.4s, v10.4h, v0.4h[2]
smlal v26.4s, v12.4h, v0.4h[1]
smlal v26.4s, v16.4h, v0.4h[0]
smull v24.4s, v7.4h, v0.4h[3]
smlal v24.4s, v11.4h, v0.4h[2]
smlal v24.4s, v13.4h, v0.4h[1]
smlal v24.4s, v17.4h, v0.4h[0]
sshll v28.4s, v4.4h, #15
smull v26.4s, v6.4h, v14.4h[3]
smlal v26.4s, v10.4h, v14.4h[2]
smlal v26.4s, v12.4h, v14.4h[1]
smlal v26.4s, v16.4h, v14.4h[0]
smull v24.4s, v7.4h, v14.4h[3]
smlal v24.4s, v11.4h, v14.4h[2]
smlal v24.4s, v13.4h, v14.4h[1]
smlal v24.4s, v17.4h, v14.4h[0]
sshll v15.4s, v4.4h, #15
sshll v30.4s, v5.4h, #15
add v20.4s, v28.4s, v26.4s
sub v28.4s, v28.4s, v26.4s
add v20.4s, v15.4s, v26.4s
sub v15.4s, v15.4s, v26.4s
rshrn v4.4h, v20.4s, #13
rshrn v6.4h, v28.4s, #13
rshrn v6.4h, v15.4s, #13
add v20.4s, v30.4s, v24.4s
sub v28.4s, v30.4s, v24.4s
sub v15.4s, v30.4s, v24.4s
rshrn v5.4h, v20.4s, #13
rshrn v7.4h, v28.4s, #13
rshrn v7.4h, v15.4s, #13
transpose v4, v6, v3, .16b, .8h
transpose v6, v10, v3, .16b, .4s
#endif
@@ -1466,11 +1476,15 @@ asm_function jsimd_idct_2x2_neon
st1 {v26.b}[1], [TMP2], 1
st1 {v27.b}[5], [TMP2], 1
/* vpop {v8.4h-v15.4h} ;not available */
ld1 {v12.4h-v15.4h}, [sp], 32
ld1 {v8.4h-v11.4h}, [sp], 32
sub sp, sp, #208
ldr x15, [sp], 16
ld1 {v4.8b - v7.8b}, [sp], 32
ld1 {v8.8b - v11.8b}, [sp], 32
ld1 {v12.8b - v15.8b}, [sp], 32
ld1 {v16.8b - v19.8b}, [sp], 32
ld1 {v21.8b - v22.8b}, [sp], 16
ld1 {v24.8b - v27.8b}, [sp], 32
ld1 {v30.8b - v31.8b}, [sp], 16
blr x30
.unreq DCT_TABLE
@@ -1572,13 +1586,11 @@ asm_function jsimd_idct_2x2_neon
.error unsupported bpp
.endif
.endm
#ifdef RTSM_SQSHRN_SIM_ISSUE
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
#else
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize
#endif
/*
* 2 stage pipelined YCbCr->RGB conversion
* 2-stage pipelined YCbCr->RGB conversion
*/
.macro do_yuv_to_rgb_stage1
@@ -1604,16 +1616,10 @@ asm_function jsimd_idct_2x2_neon
uaddw v20.8h, v20.8h, v0.8b
uaddw v24.8h, v24.8h, v0.8b
uaddw v28.8h, v28.8h, v0.8b
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqxtun v1\g_offs\defsize, v20.8h
sqxtun v1\r_offs\defsize, v24.8h
sqxtun v1\b_offs\defsize, v28.8h
#else
sqxtun v1\g_offs\gsize, v20.4s
sqxtun v1\r_offs\rsize, v24.4s
sqxtun v1\b_offs\bsize, v28.4s
#endif
.endm
.macro do_yuv_to_rgb_stage2_store_load_stage1
@@ -1628,25 +1634,13 @@ asm_function jsimd_idct_2x2_neon
uaddw v20.8h, v20.8h, v0.8b
uaddw v24.8h, v24.8h, v0.8b
uaddw v28.8h, v28.8h, v0.8b
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqxtun v1\g_offs\defsize, v20.8h
#else
sqxtun v1\g_offs\gsize, v20.4s
#endif
ld1 {v0.8b}, [Y], 8
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqxtun v1\r_offs\defsize, v24.8h
#else
sqxtun v1\r_offs\rsize, v24.4s
#endif
prfm PLDL1KEEP, [U, #64]
prfm PLDL1KEEP, [V, #64]
prfm PLDL1KEEP, [Y, #64]
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqxtun v1\b_offs\defsize, v28.8h
#else
sqxtun v1\b_offs\gsize, v28.4s
#endif
uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
do_store \bpp, 8
@@ -1693,29 +1687,33 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
V .req x10
N .req x15
sub sp, sp, 336
str x15, [sp], 16
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
adr x15, jsimd_ycc_\colorid\()_neon_consts
/* Save NEON registers */
st1 {v0.8b - v3.8b}, [sp], 32
st1 {v4.8b - v7.8b}, [sp], 32
st1 {v8.8b - v11.8b}, [sp], 32
st1 {v12.8b - v15.8b}, [sp], 32
st1 {v16.8b - v19.8b}, [sp], 32
st1 {v20.8b - v23.8b}, [sp], 32
st1 {v24.8b - v27.8b}, [sp], 32
st1 {v28.8b - v31.8b}, [sp], 32
ld1 {v0.4h, v1.4h}, [x15], 16
ld1 {v2.8h}, [x15]
/* Save ARM registers and handle input arguments */
/* push {x4, x5, x6, x7, x8, x9, x10, x30} */
stp x4, x5, [sp,-16]!
stp x6, x7, [sp,-16]!
stp x8, x9, [sp,-16]!
stp x10, x30, [sp,-16]!
stp x4, x5, [sp], 16
stp x6, x7, [sp], 16
stp x8, x9, [sp], 16
stp x10, x30, [sp], 16
ldr INPUT_BUF0, [INPUT_BUF]
ldr INPUT_BUF1, [INPUT_BUF, 8]
ldr INPUT_BUF2, [INPUT_BUF, 16]
.unreq INPUT_BUF
/* Save NEON registers */
/* vpush {v8.4h-v15.4h} */
sub sp, sp, #32
st1 {v8.4h-v11.4h}, [sp]
sub sp, sp, #32
st1 {v12.4h-v15.4h}, [sp]
/* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
movi v10.16b, #255
movi v12.16b, #255
@@ -1778,14 +1776,21 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
bgt 0b
9:
/* Restore all registers and return */
/* vpop {v8.4h-v15.4h} */
ld1 {v12.4h-v15.4h}, [sp], #32
ld1 {v8.4h-v11.4h}, [sp], #32
sub sp, sp, #336
ldr x15, [sp], 16
ld1 {v0.8b - v3.8b}, [sp], 32
ld1 {v4.8b - v7.8b}, [sp], 32
ld1 {v8.8b - v11.8b}, [sp], 32
ld1 {v12.8b - v15.8b}, [sp], 32
ld1 {v16.8b - v19.8b}, [sp], 32
ld1 {v20.8b - v23.8b}, [sp], 32
ld1 {v24.8b - v27.8b}, [sp], 32
ld1 {v28.8b - v31.8b}, [sp], 32
/* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
ldp x10, x30, [sp], #16
ldp x8, x9, [sp], #16
ldp x6, x5, [sp], #16
ldp x4, x5, [sp], #16
ldp x4, x5, [sp], 16
ldp x6, x7, [sp], 16
ldp x8, x9, [sp], 16
ldp x10, x30, [sp], 16
br x30
.unreq OUTPUT_WIDTH
.unreq INPUT_ROW
@@ -1807,10 +1812,6 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
.purgem do_yuv_to_rgb_stage2_store_load_stage1
.endm
/* RTSM simulator workaround: integer saturation works on an 8b boundary, so a
 * new parameter is added to the macro to work around the simulator issue
 */
#ifdef RTSM_SQSHRN_SIM_ISSUE
/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b
@@ -1818,15 +1819,6 @@ generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b
#else
/*--------------------------------- id ----- bpp R rsize G gsize B bsize */
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h
#endif
.purgem do_load
.purgem do_store