Fix performance and other issues uncovered in testing with actual ARM64 hardware; formatting tweaks; remove NEON platform check (NEON is always available with ARMv8)

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1333 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
DRC
2014-07-23 14:14:14 +00:00
parent d762c19b98
commit 3728aa01d8
2 changed files with 193 additions and 270 deletions

View File

@@ -27,98 +27,29 @@
/* Bitmask of detected SIMD capabilities.  The all-ones initial value marks it
 * as "not probed yet": init_simd() compares against ~0U and returns early once
 * a real value has been stored. */
static unsigned int simd_support = ~0;
#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
/* Upper bound on the line-buffer size that init_simd() is willing to try when
 * reading /proc/cpuinfo; the buffer is doubled until it exceeds this limit. */
#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
/*
 * Test whether a /proc/cpuinfo line is a "Features" line that lists 'feature'
 * as a separate, whitespace-delimited word.
 *
 * buffer:  one NUL-terminated line of text.
 * feature: NUL-terminated feature name to look for.
 *
 * Returns 1 if the line starts with "Features" and contains 'feature' as a
 * whole word, 0 otherwise (including when 'feature' is the empty string).
 */
LOCAL(int)
check_feature (char *buffer, char *feature)
{
  char *start, *p;
  size_t len = strlen(feature);

  if (len == 0)
    return 0;
  if (strncmp(buffer, "Features", 8) != 0)
    return 0;
  buffer += 8;
  /* <ctype.h> functions require a value representable as unsigned char */
  while (isspace((unsigned char)*buffer))
    buffer++;
  /* Remember where the searched text begins.  The left-boundary test must be
   * made against this fixed point, not against the moving search cursor;
   * otherwise a match that is merely the tail of a longer word (e.g. "neon"
   * inside "xneon") is accepted once the cursor has caught up with it. */
  start = buffer;

  /* Check if 'feature' is present in the buffer as a separate word */
  while ((p = strstr(buffer, feature)) != NULL) {
    int left_ok = (p == start) || isspace((unsigned char)p[-1]);
    int right_ok = (p[len] == 0) || isspace((unsigned char)p[len]);

    if (left_ok && right_ok)
      return 1;
    /* Not a whole word here; resume the search just past this occurrence */
    buffer = p + 1;
  }
  return 0;
}
/*
 * Scan /proc/cpuinfo line by line, using a line buffer of 'bufsize' bytes, and
 * record NEON availability in simd_support (which is reset to 0 first).
 *
 * Returns 0 if the buffer could not be allocated or a line did not fit into
 * it, so the caller can retry with a larger size; returns 1 otherwise, which
 * includes the case where /proc/cpuinfo could not be opened.
 */
LOCAL(int)
parse_proc_cpuinfo (int bufsize)
{
  int complete = 1;
  FILE *cpuinfo;
  char *line = (char *)malloc(bufsize);

  simd_support = 0;
  if (line == NULL)
    return 0;

  cpuinfo = fopen("/proc/cpuinfo", "r");
  if (cpuinfo != NULL) {
    while (complete && fgets(line, bufsize, cpuinfo) != NULL) {
      if (strchr(line, '\n') == NULL && !feof(cpuinfo)) {
        /* The line was truncated: the buffer is too small, stop reading */
        complete = 0;
      } else if (check_feature(line, "neon")) {
        simd_support |= JSIMD_ARM_NEON;
      }
    }
    fclose(cpuinfo);
  }

  free(line);
  return complete;
}
#endif
/*
* Check what SIMD accelerations are supported.
*
* FIXME: This code is racy under a multi-threaded environment.
*/
/*
* ARMv8 architectures support NEON extensions by default.
* It is no longer optional as it was with ARMv7.
*/
LOCAL(void)
init_simd (void)
{
char *env = NULL;
#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
int bufsize = 1024; /* an initial guess for the line buffer size limit */
#endif
if (simd_support != ~0U)
return;
simd_support = 0;
#if defined(__ARM_NEON__)
simd_support |= JSIMD_ARM_NEON;
#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
/* We still have a chance to use NEON regardless of globally used
* -mcpu/-mfpu options passed to gcc by performing runtime detection via
* /proc/cpuinfo parsing on linux/android */
while (!parse_proc_cpuinfo(bufsize)) {
bufsize *= 2;
if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
break;
}
#endif
/* Force different settings through environment variables */
env = getenv("JSIMD_FORCENEON");

View File

@@ -34,7 +34,6 @@
#define RESPECT_STRICT_ALIGNMENT 1
#define RTSM_SQSHRN_SIM_ISSUE
/*****************************************************************************/
@@ -257,8 +256,18 @@ asm_function jsimd_idct_islow_neon
ROW6R .req v29
ROW7L .req v30
ROW7R .req v31
/* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
sub sp, sp, 272
str x15, [sp], 16
adr x15, jsimd_idct_islow_neon_consts
st1 {v0.8b - v3.8b}, [sp], 32
st1 {v4.8b - v7.8b}, [sp], 32
st1 {v8.8b - v11.8b}, [sp], 32
st1 {v12.8b - v15.8b}, [sp], 32
st1 {v16.8b - v19.8b}, [sp], 32
st1 {v20.8b - v23.8b}, [sp], 32
st1 {v24.8b - v27.8b}, [sp], 32
st1 {v28.8b - v31.8b}, [sp], 32
ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
@@ -277,7 +286,7 @@ asm_function jsimd_idct_islow_neon
mul v22.4h, v22.4h, v6.4h
mul v23.4h, v23.4h, v7.4h
ins v22.2d[1], v23.2d[0] /* 128 bit q11 */
ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK], 32
ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
mul v24.4h, v24.4h, v0.4h
mul v25.4h, v25.4h, v1.4h
ins v24.2d[1], v25.2d[0] /* 128 bit q12 */
@@ -293,80 +302,79 @@ asm_function jsimd_idct_islow_neon
mul v30.4h, v30.4h, v6.4h
mul v31.4h, v31.4h, v7.4h
ins v30.2d[1], v31.2d[0] /* 128 bit q15 */
sub sp, sp, #32
st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */
sub sp, sp, #32
st1 {v12.4h-v15.4h}, [sp]
/* Go to the bottom of the stack */
sub sp, sp, 352
stp x4, x5, [sp], 16
st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */
st1 {v12.4h - v15.4h}, [sp], 32
/* 1-D IDCT, pass 1, left 4x8 half */
add v4.4h, ROW7L.4h, ROW3L.4h
add v5.4h, ROW5L.4h, ROW1L.4h
smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560
smlal v12.4s, v5.4h, XFIX_1_175875602
smull v14.4s, v4.4h, XFIX_1_175875602
/* Check for the zero coefficients in the right 4x8 half */
/* push {x4, x5} */
stp x4, x5, [sp, -16]!
mov x5, #0
/* Check for the zero coefficients in the right 4x8 half */
smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644
ssubl v6.4s, ROW0L.4h, ROW4L.4h
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
smull v4.4s, ROW2L.4h, XFIX_0_541196100
smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
orr x0, x4, x5
orr x0, x4, x5
mov v8.16b, v12.16b
smlsl v12.4s, ROW5L.4h, XFIX_2_562915447
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
shl v6.4s, v6.4s, #13
orr x0, x0, x4
orr x0, x0, x4
smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
orr x0, x0 , x5
orr x0, x0 , x5
add v2.4s, v6.4s, v4.4s
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
mov v10.16b, v14.16b
add v2.4s, v2.4s, v12.4s
orr x0, x0, x4
orr x0, x0, x4
smlsl v14.4s, ROW7L.4h, XFIX_0_899976223
orr x0, x0, x5
orr x0, x0, x5
smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
rshrn ROW1L.4h, v2.4s, #11
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
sub v2.4s, v2.4s, v12.4s
smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
orr x0, x0, x4
orr x0, x0, x4
smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
orr x0, x0, x5
orr x0, x0, x5
sub v2.4s, v2.4s, v12.4s
smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
smlal v12.4s, ROW6L.4h, XFIX_0_541196100
sub v6.4s, v6.4s, v4.4s
orr x0, x0, x4
orr x0, x0, x4
rshrn ROW6L.4h, v2.4s, #11
orr x0, x0, x5
orr x0, x0, x5
add v2.4s, v6.4s, v10.4s
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
sub v6.4s, v6.4s, v10.4s
saddl v10.4s, ROW0L.4h, ROW4L.4h
orr x0, x0, x4
orr x0, x0, x4
rshrn ROW2L.4h, v2.4s, #11
orr x0, x0, x5
orr x0, x0, x5
rshrn ROW5L.4h, v6.4s, #11
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
shl v10.4s, v10.4s, #13
smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
orr x0, x0, x4
orr x0, x0, x4
add v4.4s, v10.4s, v12.4s
orr x0, x0, x5
orr x0, x0, x5
sub v2.4s, v10.4s, v12.4s
add v12.4s, v4.4s, v14.4s
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
sub v4.4s, v4.4s, v14.4s
add v10.4s, v2.4s, v8.4s
orr x0, x4, x5
orr x0, x4, x5
sub v6.4s, v2.4s, v8.4s
/* pop {x4, x5} */
ldp x4, x5, [sp], 16
sub sp, sp, 80
ldp x4, x5, [sp], 16
rshrn ROW7L.4h, v4.4s, #11
rshrn ROW3L.4h, v10.4s, #11
rshrn ROW0L.4h, v12.4s, #11
@@ -552,48 +560,27 @@ asm_function jsimd_idct_islow_neon
ins v18.2d[1], v19.2d[0]
ins v20.2d[1], v21.2d[0]
ins v22.2d[1], v23.2d[0]
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqrshrn v16.8b, v16.8h, #2
sqrshrn2 v16.16b, v18.8h, #2
sqrshrn v18.8b, v20.8h, #2
sqrshrn2 v18.16b, v22.8h, #2
#else
sqrshrn v16.4h, v16.4s, #2
sqrshrn2 v16.8h, v18.4s, #2
sqrshrn v18.4h, v20.4s, #2
sqrshrn2 v18.8h, v22.4s, #2
#endif
/* vpop {v8.4h-d15.4h} */ /* restore NEON registers */
ld1 {v12.4h-v15.4h}, [sp], 32
ld1 {v8.4h-v11.4h}, [sp], 32
/* vpop {v8.4h - d15.4h} */ /* restore NEON registers */
ld1 {v8.4h - v11.4h}, [sp], 32
ld1 {v12.4h - v15.4h}, [sp], 32
ins v24.2d[1], v25.2d[0]
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqrshrn v20.8b, v24.8h, #2
#else
sqrshrn v20.4h, v24.4s, #2
#endif
/* Transpose the final 8-bit samples and do signed->unsigned conversion */
/* trn1 v16.8h, v16.8h, v18.8h */
transpose v16, v18, v3, .16b, .8h
ins v26.2d[1], v27.2d[0]
ins v28.2d[1], v29.2d[0]
ins v30.2d[1], v31.2d[0]
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqrshrn2 v20.16b, v26.8h, #2
sqrshrn v22.8b, v28.8h, #2
#else
sqrshrn2 v20.8h, v26.4s, #2
sqrshrn v22.4h, v28.4s, #2
#endif
movi v0.16b, #(CENTERJSAMPLE)
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqrshrn2 v22.16b, v30.8h, #2
#else
sqrshrn2 v22.8h, v30.4s, #2
#endif
transpose_single v16, v17, v3, .2d, .8b
transpose_single v18, v19, v3, .2d, .8b
add v16.8b, v16.8b, v0.8b
@@ -628,6 +615,15 @@ asm_function jsimd_idct_islow_neon
st1 {v21.8b}, [TMP2]
st1 {v22.8b}, [TMP3]
st1 {v23.8b}, [TMP4]
ldr x15, [sp], 16
ld1 {v0.8b - v3.8b}, [sp], 32
ld1 {v4.8b - v7.8b}, [sp], 32
ld1 {v8.8b - v11.8b}, [sp], 32
ld1 {v12.8b - v15.8b}, [sp], 32
ld1 {v16.8b - v19.8b}, [sp], 32
ld1 {v20.8b - v23.8b}, [sp], 32
ld1 {v24.8b - v27.8b}, [sp], 32
ld1 {v28.8b - v31.8b}, [sp], 32
blr x30
3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
@@ -799,7 +795,8 @@ asm_function jsimd_idct_ifast_neon
TMP1 .req x0
TMP2 .req x1
TMP3 .req x2
TMP4 .req x15
TMP4 .req x22
TMP5 .req x23
/* Load and dequantize coefficients into NEON registers
* with the following allocation:
@@ -814,7 +811,15 @@ asm_function jsimd_idct_ifast_neon
* 6 | d28 | d29 ( v14.8h )
* 7 | d30 | d31 ( v15.8h )
*/
adr x15, jsimd_idct_ifast_neon_consts
/* Save NEON registers used in fast IDCT */
sub sp, sp, #176
stp x22, x23, [sp], 16
adr x23, jsimd_idct_ifast_neon_consts
st1 {v0.8b - v3.8b}, [sp], 32
st1 {v4.8b - v7.8b}, [sp], 32
st1 {v8.8b - v11.8b}, [sp], 32
st1 {v12.8b - v15.8b}, [sp], 32
st1 {v16.8b - v19.8b}, [sp], 32
ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
@@ -830,14 +835,9 @@ asm_function jsimd_idct_ifast_neon
ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
mul v14.8h, v14.8h, v2.8h
mul v13.8h, v13.8h, v1.8h
ld1 {v0.4h}, [x15] /* load constants */
ld1 {v0.4h}, [x23] /* load constants */
mul v15.8h, v15.8h, v3.8h
/* vpush {v4.8h-v6.8h} */ /* save NEON registers */
sub sp, sp, #32
st1 {v4.8h-v5.8h}, [sp] /* save NEON registers */
sub sp, sp, #16
st1 {v6.8h}, [sp]
/* 1-D IDCT, pass 1 */
sub v2.8h, v10.8h, v14.8h
add v14.8h, v10.8h, v14.8h
@@ -912,25 +912,25 @@ asm_function jsimd_idct_ifast_neon
trn1 v13.4s, v13.4s, v15.4s
trn2 v15.4s, v18.4s, v15.4s
/* vswp v14.4h, v10-MSB.4h */
umov x10, v14.d[0]
umov x22, v14.d[0]
ins v14.2d[0], v10.2d[1]
ins v10.2d[1], x10
ins v10.2d[1], x22
/* vswp v13.4h, v9MSB.4h */
umov x10, v13.d[0]
umov x22, v13.d[0]
ins v13.2d[0], v9.2d[1]
ins v9.2d[1], x10
ins v9.2d[1], x22
/* 1-D IDCT, pass 2 */
sub v2.8h, v10.8h, v14.8h
/* vswp v15.4h, v11MSB.4h */
umov x10, v15.d[0]
umov x22, v15.d[0]
ins v15.2d[0], v11.2d[1]
ins v11.2d[1], x10
ins v11.2d[1], x22
add v14.8h, v10.8h, v14.8h
/* vswp v12.4h, v8-MSB.4h */
umov x10, v12.d[0]
umov x22, v12.d[0]
ins v12.2d[0], v8.2d[1]
ins v8.2d[1], x10
ins v8.2d[1], x22
sub v1.8h, v11.8h, v13.8h
add v13.8h, v11.8h, v13.8h
sub v5.8h, v9.8h, v15.8h
@@ -966,15 +966,11 @@ asm_function jsimd_idct_ifast_neon
add v14.8h, v5.8h, v3.8h
sub v9.8h, v5.8h, v3.8h
sub v13.8h, v10.8h, v2.8h
/* vpop {v4.8h-v7.4h} */ /* restore NEON registers...not available */
ld1 {v6.8h}, [sp], 16
ld1 {v4.8h-v5.8h}, [sp], 32
add v10.8h, v10.8h, v2.8h
sub v11.8h, v12.8h, v1.8h
add v12.8h, v12.8h, v1.8h
/* Descale to 8-bit and range limit */
movi v0.16b, #0x80
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqshrn v8.8b, v8.8h, #5
sqshrn2 v8.16b, v9.8h, #5
sqshrn v9.8b, v10.8h, #5
@@ -983,16 +979,6 @@ asm_function jsimd_idct_ifast_neon
sqshrn2 v10.16b, v13.8h, #5
sqshrn v11.8b, v14.8h, #5
sqshrn2 v11.16b, v15.8h, #5
#else
sqshrn v8.4h, v8.4s, #5
sqshrn2 v8.8h, v9.4s, #5
sqshrn v9.4h, v10.4s, #5
sqshrn2 v9.8h, v11.4s, #5
sqshrn v10.4h, v12.4s, #5
sqshrn2 v10.8h, v13.4s, #5
sqshrn v11.4h, v14.4s, #5
sqshrn2 v11.8h, v15.4s, #5
#endif
add v8.16b, v8.16b, v0.16b
add v9.16b, v9.16b, v0.16b
add v10.16b, v10.16b, v0.16b
@@ -1036,26 +1022,33 @@ asm_function jsimd_idct_ifast_neon
add TMP2, TMP2, OUTPUT_COL
st1 {v9.8b}, [TMP1]
/* make copy */
ins v21.2d[0], v10.2d[1]
ins v7.2d[0], v10.2d[1]
mov v18.16b, v10.16b
trn1 v10.8b, v10.8b, v21.8b
trn2 v21.8b, v18.8b, v21.8b
trn1 v10.8b, v10.8b, v7.8b
trn2 v7.8b, v18.8b, v7.8b
st1 {v19.8b}, [TMP2]
ldp TMP1, TMP2, [OUTPUT_BUF], 16
ldp TMP3, TMP4, [OUTPUT_BUF]
ldp TMP4, TMP5, [OUTPUT_BUF], 16
add TMP1, TMP1, OUTPUT_COL
add TMP2, TMP2, OUTPUT_COL
add TMP3, TMP3, OUTPUT_COL
add TMP4, TMP4, OUTPUT_COL
add TMP5, TMP5, OUTPUT_COL
st1 {v10.8b}, [TMP1]
/* make copy */
ins v23.2d[0], v11.2d[1]
ins v16.2d[0], v11.2d[1]
mov v18.16b, v11.16b
trn1 v11.8b, v11.8b, v23.8b
trn2 v23.8b, v18.8b, v23.8b
st1 {v21.8b}, [TMP2]
st1 {v11.8b}, [TMP3]
st1 {v23.8b}, [TMP4]
trn1 v11.8b, v11.8b, v16.8b
trn2 v16.8b, v18.8b, v16.8b
st1 {v7.8b}, [TMP2]
st1 {v11.8b}, [TMP4]
st1 {v16.8b}, [TMP5]
sub sp, sp, #176
ldp x22, x23, [sp], 16
ld1 {v0.8b - v3.8b}, [sp], 32
ld1 {v4.8b - v7.8b}, [sp], 32
ld1 {v8.8b - v11.8b}, [sp], 32
ld1 {v12.8b - v15.8b}, [sp], 32
ld1 {v16.8b - v19.8b}, [sp], 32
blr x30
.unreq DCT_TABLE
@@ -1179,14 +1172,19 @@ asm_function jsimd_idct_4x4_neon
TMP3 .req x2
TMP4 .req x15
/* vpush {v8.4h-v15.4h} */
sub sp, sp, #32
st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */
sub sp, sp, #32
st1 {v12.4h-v15.4h}, [sp]
/* Save all used NEON registers */
sub sp, sp, 272
str x15, [sp], 16
/* Load constants (v3.4h is just used for padding) */
adr TMP4, jsimd_idct_4x4_neon_consts
st1 {v0.8b - v3.8b}, [sp], 32
st1 {v4.8b - v7.8b}, [sp], 32
st1 {v8.8b - v11.8b}, [sp], 32
st1 {v12.8b - v15.8b}, [sp], 32
st1 {v16.8b - v19.8b}, [sp], 32
st1 {v20.8b - v23.8b}, [sp], 32
st1 {v24.8b - v27.8b}, [sp], 32
st1 {v28.8b - v31.8b}, [sp], 32
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
/* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1290,10 +1288,17 @@ asm_function jsimd_idct_4x4_neon
st1 {v27.b}[7], [TMP4], 1
#endif
/* vpop {v8.4h-v15.4h} ;not available */
ld1 {v12.4h-v15.4h}, [sp], 32
ld1 {v8.4h-v11.4h}, [sp], 32
/* vpop {v8.4h - v15.4h} ;not available */
sub sp, sp, #272
ldr x15, [sp], 16
ld1 {v0.8b - v3.8b}, [sp], 32
ld1 {v4.8b - v7.8b}, [sp], 32
ld1 {v8.8b - v11.8b}, [sp], 32
ld1 {v12.8b - v15.8b}, [sp], 32
ld1 {v16.8b - v19.8b}, [sp], 32
ld1 {v20.8b - v23.8b}, [sp], 32
ld1 {v24.8b - v27.8b}, [sp], 32
ld1 {v28.8b - v31.8b}, [sp], 32
blr x30
.unreq DCT_TABLE
@@ -1333,23 +1338,23 @@ jsimd_idct_2x2_neon_consts:
.short FIX_3_624509785 /* d0[3] */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
sshll v28.4s, \x4, #15
sshll v15.4s, \x4, #15
smull v26.4s, \x6, v0.4h[3]
smlal v26.4s, \x10, v0.4h[2]
smlal v26.4s, \x12, v0.4h[1]
smlal v26.4s, \x16, v0.4h[0]
add v20.4s, v28.4s, v26.4s
sub v28.4s, v28.4s, v26.4s
add v20.4s, v15.4s, v26.4s
sub v15.4s, v15.4s, v26.4s
.if \shift > 16
srshr v20.4s, v20.4s, #\shift
srshr v28.4s, v28.4s, #\shift
srshr v15.4s, v15.4s, #\shift
xtn \y26, v20.4s
xtn \y27, v28.4s
xtn \y27, v15.4s
.else
rshrn \y26, v20.4s, #\shift
rshrn \y27, v28.4s, #\shift
rshrn \y27, v15.4s, #\shift
.endif
.endm
@@ -1363,15 +1368,20 @@ asm_function jsimd_idct_2x2_neon
TMP1 .req x0
TMP2 .req x15
/* vpush {v8.4h-v15.4h} ; not available */
sub sp, sp, #32
st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */
sub sp, sp, #32
st1 {v12.4h-v15.4h}, [sp]
/* vpush {v8.4h - v15.4h} ; not available */
sub sp, sp, 208
str x15, [sp], 16
/* Load constants */
adr TMP2, jsimd_idct_2x2_neon_consts
ld1 {v0.4h}, [TMP2]
st1 {v4.8b - v7.8b}, [sp], 32
st1 {v8.8b - v11.8b}, [sp], 32
st1 {v12.8b - v15.8b}, [sp], 32
st1 {v16.8b - v19.8b}, [sp], 32
st1 {v21.8b - v22.8b}, [sp], 16
st1 {v24.8b - v27.8b}, [sp], 32
st1 {v30.8b - v31.8b}, [sp], 16
ld1 {v14.4h}, [TMP2]
/* Load all COEF_BLOCK into NEON registers with the following allocation:
* 0 1 2 3 | 4 5 6 7
@@ -1423,24 +1433,24 @@ asm_function jsimd_idct_2x2_neon
idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
#else
smull v26.4s, v6.4h, v0.4h[3]
smlal v26.4s, v10.4h, v0.4h[2]
smlal v26.4s, v12.4h, v0.4h[1]
smlal v26.4s, v16.4h, v0.4h[0]
smull v24.4s, v7.4h, v0.4h[3]
smlal v24.4s, v11.4h, v0.4h[2]
smlal v24.4s, v13.4h, v0.4h[1]
smlal v24.4s, v17.4h, v0.4h[0]
sshll v28.4s, v4.4h, #15
smull v26.4s, v6.4h, v14.4h[3]
smlal v26.4s, v10.4h, v14.4h[2]
smlal v26.4s, v12.4h, v14.4h[1]
smlal v26.4s, v16.4h, v14.4h[0]
smull v24.4s, v7.4h, v14.4h[3]
smlal v24.4s, v11.4h, v14.4h[2]
smlal v24.4s, v13.4h, v14.4h[1]
smlal v24.4s, v17.4h, v14.4h[0]
sshll v15.4s, v4.4h, #15
sshll v30.4s, v5.4h, #15
add v20.4s, v28.4s, v26.4s
sub v28.4s, v28.4s, v26.4s
add v20.4s, v15.4s, v26.4s
sub v15.4s, v15.4s, v26.4s
rshrn v4.4h, v20.4s, #13
rshrn v6.4h, v28.4s, #13
rshrn v6.4h, v15.4s, #13
add v20.4s, v30.4s, v24.4s
sub v28.4s, v30.4s, v24.4s
sub v15.4s, v30.4s, v24.4s
rshrn v5.4h, v20.4s, #13
rshrn v7.4h, v28.4s, #13
rshrn v7.4h, v15.4s, #13
transpose v4, v6, v3, .16b, .8h
transpose v6, v10, v3, .16b, .4s
#endif
@@ -1466,11 +1476,15 @@ asm_function jsimd_idct_2x2_neon
st1 {v26.b}[1], [TMP2], 1
st1 {v27.b}[5], [TMP2], 1
/* vpop {v8.4h-v15.4h} ;not available */
ld1 {v12.4h-v15.4h}, [sp], 32
ld1 {v8.4h-v11.4h}, [sp], 32
sub sp, sp, #208
ldr x15, [sp], 16
ld1 {v4.8b - v7.8b}, [sp], 32
ld1 {v8.8b - v11.8b}, [sp], 32
ld1 {v12.8b - v15.8b}, [sp], 32
ld1 {v16.8b - v19.8b}, [sp], 32
ld1 {v21.8b - v22.8b}, [sp], 16
ld1 {v24.8b - v27.8b}, [sp], 32
ld1 {v30.8b - v31.8b}, [sp], 16
blr x30
.unreq DCT_TABLE
@@ -1572,13 +1586,11 @@ asm_function jsimd_idct_2x2_neon
.error unsupported bpp
.endif
.endm
#ifdef RTSM_SQSHRN_SIM_ISSUE
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
#else
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize
#endif
/*
* 2 stage pipelined YCbCr->RGB conversion
* 2-stage pipelined YCbCr->RGB conversion
*/
.macro do_yuv_to_rgb_stage1
@@ -1604,16 +1616,10 @@ asm_function jsimd_idct_2x2_neon
uaddw v20.8h, v20.8h, v0.8b
uaddw v24.8h, v24.8h, v0.8b
uaddw v28.8h, v28.8h, v0.8b
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqxtun v1\g_offs\defsize, v20.8h
sqxtun v1\r_offs\defsize, v24.8h
sqxtun v1\b_offs\defsize, v28.8h
#else
sqxtun v1\g_offs\gsize, v20.4s
sqxtun v1\r_offs\rsize, v24.4s
sqxtun v1\b_offs\bsize, v28.4s
#endif
.endm
.macro do_yuv_to_rgb_stage2_store_load_stage1
@@ -1628,25 +1634,13 @@ asm_function jsimd_idct_2x2_neon
uaddw v20.8h, v20.8h, v0.8b
uaddw v24.8h, v24.8h, v0.8b
uaddw v28.8h, v28.8h, v0.8b
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqxtun v1\g_offs\defsize, v20.8h
#else
sqxtun v1\g_offs\gsize, v20.4s
#endif
ld1 {v0.8b}, [Y], 8
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqxtun v1\r_offs\defsize, v24.8h
#else
sqxtun v1\r_offs\rsize, v24.4s
#endif
prfm PLDL1KEEP, [U, #64]
prfm PLDL1KEEP, [V, #64]
prfm PLDL1KEEP, [Y, #64]
#ifdef RTSM_SQSHRN_SIM_ISSUE
sqxtun v1\b_offs\defsize, v28.8h
#else
sqxtun v1\b_offs\gsize, v28.4s
#endif
uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
do_store \bpp, 8
@@ -1693,29 +1687,33 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
V .req x10
N .req x15
sub sp, sp, 336
str x15, [sp], 16
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
adr x15, jsimd_ycc_\colorid\()_neon_consts
/* Save NEON registers */
st1 {v0.8b - v3.8b}, [sp], 32
st1 {v4.8b - v7.8b}, [sp], 32
st1 {v8.8b - v11.8b}, [sp], 32
st1 {v12.8b - v15.8b}, [sp], 32
st1 {v16.8b - v19.8b}, [sp], 32
st1 {v20.8b - v23.8b}, [sp], 32
st1 {v24.8b - v27.8b}, [sp], 32
st1 {v28.8b - v31.8b}, [sp], 32
ld1 {v0.4h, v1.4h}, [x15], 16
ld1 {v2.8h}, [x15]
/* Save ARM registers and handle input arguments */
/* push {x4, x5, x6, x7, x8, x9, x10, x30} */
stp x4, x5, [sp,-16]!
stp x6, x7, [sp,-16]!
stp x8, x9, [sp,-16]!
stp x10, x30, [sp,-16]!
stp x4, x5, [sp], 16
stp x6, x7, [sp], 16
stp x8, x9, [sp], 16
stp x10, x30, [sp], 16
ldr INPUT_BUF0, [INPUT_BUF]
ldr INPUT_BUF1, [INPUT_BUF, 8]
ldr INPUT_BUF2, [INPUT_BUF, 16]
.unreq INPUT_BUF
/* Save NEON registers */
/* vpush {v8.4h-v15.4h} */
sub sp, sp, #32
st1 {v8.4h-v11.4h}, [sp]
sub sp, sp, #32
st1 {v12.4h-v15.4h}, [sp]
/* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
movi v10.16b, #255
movi v12.16b, #255
@@ -1778,14 +1776,21 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
bgt 0b
9:
/* Restore all registers and return */
/* vpop {v8.4h-v15.4h} */
ld1 {v12.4h-v15.4h}, [sp], #32
ld1 {v8.4h-v11.4h}, [sp], #32
sub sp, sp, #336
ldr x15, [sp], 16
ld1 {v0.8b - v3.8b}, [sp], 32
ld1 {v4.8b - v7.8b}, [sp], 32
ld1 {v8.8b - v11.8b}, [sp], 32
ld1 {v12.8b - v15.8b}, [sp], 32
ld1 {v16.8b - v19.8b}, [sp], 32
ld1 {v20.8b - v23.8b}, [sp], 32
ld1 {v24.8b - v27.8b}, [sp], 32
ld1 {v28.8b - v31.8b}, [sp], 32
/* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
ldp x10, x30, [sp], #16
ldp x8, x9, [sp], #16
ldp x6, x5, [sp], #16
ldp x4, x5, [sp], #16
ldp x4, x5, [sp], 16
ldp x6, x7, [sp], 16
ldp x8, x9, [sp], 16
ldp x10, x30, [sp], 16
br x30
.unreq OUTPUT_WIDTH
.unreq INPUT_ROW
@@ -1807,10 +1812,6 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
.purgem do_yuv_to_rgb_stage2_store_load_stage1
.endm
/* RTSM simulator fix: integer saturation works on an 8b boundary, so add a new
* parameter as a workaround for the simulator
*/
#ifdef RTSM_SQSHRN_SIM_ISSUE
/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b
@@ -1818,15 +1819,6 @@ generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b
#else
/*--------------------------------- id ----- bpp R rsize G gsize B bsize */
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h
#endif
.purgem do_load
.purgem do_store