Fix performance and other issues uncovered in testing with actual ARM64 hardware; formatting tweaks; remove NEON platform check (NEON is always available with ARMv8)
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1333 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
@@ -27,98 +27,29 @@
|
|||||||
|
|
||||||
static unsigned int simd_support = ~0;
|
static unsigned int simd_support = ~0;
|
||||||
|
|
||||||
#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
|
|
||||||
|
|
||||||
#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
|
|
||||||
|
|
||||||
LOCAL(int)
|
|
||||||
check_feature (char *buffer, char *feature)
|
|
||||||
{
|
|
||||||
char *p;
|
|
||||||
if (*feature == 0)
|
|
||||||
return 0;
|
|
||||||
if (strncmp(buffer, "Features", 8) != 0)
|
|
||||||
return 0;
|
|
||||||
buffer += 8;
|
|
||||||
while (isspace(*buffer))
|
|
||||||
buffer++;
|
|
||||||
|
|
||||||
/* Check if 'feature' is present in the buffer as a separate word */
|
|
||||||
while ((p = strstr(buffer, feature))) {
|
|
||||||
if (p > buffer && !isspace(*(p - 1))) {
|
|
||||||
buffer++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
p += strlen(feature);
|
|
||||||
if (*p != 0 && !isspace(*p)) {
|
|
||||||
buffer++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
LOCAL(int)
|
|
||||||
parse_proc_cpuinfo (int bufsize)
|
|
||||||
{
|
|
||||||
char *buffer = (char *)malloc(bufsize);
|
|
||||||
FILE *fd;
|
|
||||||
simd_support = 0;
|
|
||||||
|
|
||||||
if (!buffer)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
fd = fopen("/proc/cpuinfo", "r");
|
|
||||||
if (fd) {
|
|
||||||
while (fgets(buffer, bufsize, fd)) {
|
|
||||||
if (!strchr(buffer, '\n') && !feof(fd)) {
|
|
||||||
/* "impossible" happened - insufficient size of the buffer! */
|
|
||||||
fclose(fd);
|
|
||||||
free(buffer);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if (check_feature(buffer, "neon"))
|
|
||||||
simd_support |= JSIMD_ARM_NEON;
|
|
||||||
}
|
|
||||||
fclose(fd);
|
|
||||||
}
|
|
||||||
free(buffer);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check what SIMD accelerations are supported.
|
* Check what SIMD accelerations are supported.
|
||||||
*
|
*
|
||||||
* FIXME: This code is racy under a multi-threaded environment.
|
* FIXME: This code is racy under a multi-threaded environment.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ARMv8 architectures support NEON extensions by default.
|
||||||
|
* It is no longer optional as it was with ARMv7.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
LOCAL(void)
|
LOCAL(void)
|
||||||
init_simd (void)
|
init_simd (void)
|
||||||
{
|
{
|
||||||
char *env = NULL;
|
char *env = NULL;
|
||||||
#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
|
|
||||||
int bufsize = 1024; /* an initial guess for the line buffer size limit */
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (simd_support != ~0U)
|
if (simd_support != ~0U)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
simd_support = 0;
|
simd_support = 0;
|
||||||
|
|
||||||
#if defined(__ARM_NEON__)
|
|
||||||
simd_support |= JSIMD_ARM_NEON;
|
simd_support |= JSIMD_ARM_NEON;
|
||||||
#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
|
|
||||||
/* We still have a chance to use NEON regardless of globally used
|
|
||||||
* -mcpu/-mfpu options passed to gcc by performing runtime detection via
|
|
||||||
* /proc/cpuinfo parsing on linux/android */
|
|
||||||
while (!parse_proc_cpuinfo(bufsize)) {
|
|
||||||
bufsize *= 2;
|
|
||||||
if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Force different settings through environment variables */
|
/* Force different settings through environment variables */
|
||||||
env = getenv("JSIMD_FORCENEON");
|
env = getenv("JSIMD_FORCENEON");
|
||||||
|
|||||||
@@ -34,7 +34,6 @@
|
|||||||
|
|
||||||
#define RESPECT_STRICT_ALIGNMENT 1
|
#define RESPECT_STRICT_ALIGNMENT 1
|
||||||
|
|
||||||
#define RTSM_SQSHRN_SIM_ISSUE
|
|
||||||
|
|
||||||
|
|
||||||
/*****************************************************************************/
|
/*****************************************************************************/
|
||||||
@@ -257,8 +256,18 @@ asm_function jsimd_idct_islow_neon
|
|||||||
ROW6R .req v29
|
ROW6R .req v29
|
||||||
ROW7L .req v30
|
ROW7L .req v30
|
||||||
ROW7R .req v31
|
ROW7R .req v31
|
||||||
|
/* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
|
||||||
|
sub sp, sp, 272
|
||||||
|
str x15, [sp], 16
|
||||||
adr x15, jsimd_idct_islow_neon_consts
|
adr x15, jsimd_idct_islow_neon_consts
|
||||||
|
st1 {v0.8b - v3.8b}, [sp], 32
|
||||||
|
st1 {v4.8b - v7.8b}, [sp], 32
|
||||||
|
st1 {v8.8b - v11.8b}, [sp], 32
|
||||||
|
st1 {v12.8b - v15.8b}, [sp], 32
|
||||||
|
st1 {v16.8b - v19.8b}, [sp], 32
|
||||||
|
st1 {v20.8b - v23.8b}, [sp], 32
|
||||||
|
st1 {v24.8b - v27.8b}, [sp], 32
|
||||||
|
st1 {v28.8b - v31.8b}, [sp], 32
|
||||||
ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
|
ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
|
||||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
|
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
|
||||||
ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
|
ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
|
||||||
@@ -277,7 +286,7 @@ asm_function jsimd_idct_islow_neon
|
|||||||
mul v22.4h, v22.4h, v6.4h
|
mul v22.4h, v22.4h, v6.4h
|
||||||
mul v23.4h, v23.4h, v7.4h
|
mul v23.4h, v23.4h, v7.4h
|
||||||
ins v22.2d[1], v23.2d[0] /* 128 bit q11 */
|
ins v22.2d[1], v23.2d[0] /* 128 bit q11 */
|
||||||
ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK], 32
|
ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
|
||||||
mul v24.4h, v24.4h, v0.4h
|
mul v24.4h, v24.4h, v0.4h
|
||||||
mul v25.4h, v25.4h, v1.4h
|
mul v25.4h, v25.4h, v1.4h
|
||||||
ins v24.2d[1], v25.2d[0] /* 128 bit q12 */
|
ins v24.2d[1], v25.2d[0] /* 128 bit q12 */
|
||||||
@@ -293,80 +302,79 @@ asm_function jsimd_idct_islow_neon
|
|||||||
mul v30.4h, v30.4h, v6.4h
|
mul v30.4h, v30.4h, v6.4h
|
||||||
mul v31.4h, v31.4h, v7.4h
|
mul v31.4h, v31.4h, v7.4h
|
||||||
ins v30.2d[1], v31.2d[0] /* 128 bit q15 */
|
ins v30.2d[1], v31.2d[0] /* 128 bit q15 */
|
||||||
sub sp, sp, #32
|
/* Go to the bottom of the stack */
|
||||||
st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */
|
sub sp, sp, 352
|
||||||
sub sp, sp, #32
|
stp x4, x5, [sp], 16
|
||||||
st1 {v12.4h-v15.4h}, [sp]
|
st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */
|
||||||
|
st1 {v12.4h - v15.4h}, [sp], 32
|
||||||
/* 1-D IDCT, pass 1, left 4x8 half */
|
/* 1-D IDCT, pass 1, left 4x8 half */
|
||||||
add v4.4h, ROW7L.4h, ROW3L.4h
|
add v4.4h, ROW7L.4h, ROW3L.4h
|
||||||
add v5.4h, ROW5L.4h, ROW1L.4h
|
add v5.4h, ROW5L.4h, ROW1L.4h
|
||||||
smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560
|
smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560
|
||||||
smlal v12.4s, v5.4h, XFIX_1_175875602
|
smlal v12.4s, v5.4h, XFIX_1_175875602
|
||||||
smull v14.4s, v4.4h, XFIX_1_175875602
|
smull v14.4s, v4.4h, XFIX_1_175875602
|
||||||
/* Check for the zero coefficients in the right 4x8 half */
|
/* Check for the zero coefficients in the right 4x8 half */
|
||||||
/* push {x4, x5} */
|
|
||||||
stp x4, x5, [sp, -16]!
|
|
||||||
mov x5, #0
|
|
||||||
smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644
|
smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644
|
||||||
ssubl v6.4s, ROW0L.4h, ROW4L.4h
|
ssubl v6.4s, ROW0L.4h, ROW4L.4h
|
||||||
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
|
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
|
||||||
smull v4.4s, ROW2L.4h, XFIX_0_541196100
|
smull v4.4s, ROW2L.4h, XFIX_0_541196100
|
||||||
smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
|
smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
|
||||||
orr x0, x4, x5
|
orr x0, x4, x5
|
||||||
mov v8.16b, v12.16b
|
mov v8.16b, v12.16b
|
||||||
smlsl v12.4s, ROW5L.4h, XFIX_2_562915447
|
smlsl v12.4s, ROW5L.4h, XFIX_2_562915447
|
||||||
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
|
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
|
||||||
smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
|
smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
|
||||||
shl v6.4s, v6.4s, #13
|
shl v6.4s, v6.4s, #13
|
||||||
orr x0, x0, x4
|
orr x0, x0, x4
|
||||||
smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
|
smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
|
||||||
orr x0, x0 , x5
|
orr x0, x0 , x5
|
||||||
add v2.4s, v6.4s, v4.4s
|
add v2.4s, v6.4s, v4.4s
|
||||||
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
|
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
|
||||||
mov v10.16b, v14.16b
|
mov v10.16b, v14.16b
|
||||||
add v2.4s, v2.4s, v12.4s
|
add v2.4s, v2.4s, v12.4s
|
||||||
orr x0, x0, x4
|
orr x0, x0, x4
|
||||||
smlsl v14.4s, ROW7L.4h, XFIX_0_899976223
|
smlsl v14.4s, ROW7L.4h, XFIX_0_899976223
|
||||||
orr x0, x0, x5
|
orr x0, x0, x5
|
||||||
smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
|
smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
|
||||||
rshrn ROW1L.4h, v2.4s, #11
|
rshrn ROW1L.4h, v2.4s, #11
|
||||||
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
|
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
|
||||||
sub v2.4s, v2.4s, v12.4s
|
sub v2.4s, v2.4s, v12.4s
|
||||||
smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
|
smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
|
||||||
orr x0, x0, x4
|
orr x0, x0, x4
|
||||||
smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
|
smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
|
||||||
orr x0, x0, x5
|
orr x0, x0, x5
|
||||||
sub v2.4s, v2.4s, v12.4s
|
sub v2.4s, v2.4s, v12.4s
|
||||||
smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
|
smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
|
||||||
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
|
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
|
||||||
smlal v12.4s, ROW6L.4h, XFIX_0_541196100
|
smlal v12.4s, ROW6L.4h, XFIX_0_541196100
|
||||||
sub v6.4s, v6.4s, v4.4s
|
sub v6.4s, v6.4s, v4.4s
|
||||||
orr x0, x0, x4
|
orr x0, x0, x4
|
||||||
rshrn ROW6L.4h, v2.4s, #11
|
rshrn ROW6L.4h, v2.4s, #11
|
||||||
orr x0, x0, x5
|
orr x0, x0, x5
|
||||||
add v2.4s, v6.4s, v10.4s
|
add v2.4s, v6.4s, v10.4s
|
||||||
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
|
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
|
||||||
sub v6.4s, v6.4s, v10.4s
|
sub v6.4s, v6.4s, v10.4s
|
||||||
saddl v10.4s, ROW0L.4h, ROW4L.4h
|
saddl v10.4s, ROW0L.4h, ROW4L.4h
|
||||||
orr x0, x0, x4
|
orr x0, x0, x4
|
||||||
rshrn ROW2L.4h, v2.4s, #11
|
rshrn ROW2L.4h, v2.4s, #11
|
||||||
orr x0, x0, x5
|
orr x0, x0, x5
|
||||||
rshrn ROW5L.4h, v6.4s, #11
|
rshrn ROW5L.4h, v6.4s, #11
|
||||||
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
|
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
|
||||||
shl v10.4s, v10.4s, #13
|
shl v10.4s, v10.4s, #13
|
||||||
smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
|
smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
|
||||||
orr x0, x0, x4
|
orr x0, x0, x4
|
||||||
add v4.4s, v10.4s, v12.4s
|
add v4.4s, v10.4s, v12.4s
|
||||||
orr x0, x0, x5
|
orr x0, x0, x5
|
||||||
sub v2.4s, v10.4s, v12.4s
|
sub v2.4s, v10.4s, v12.4s
|
||||||
add v12.4s, v4.4s, v14.4s
|
add v12.4s, v4.4s, v14.4s
|
||||||
ldr x4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
|
ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
|
||||||
sub v4.4s, v4.4s, v14.4s
|
sub v4.4s, v4.4s, v14.4s
|
||||||
add v10.4s, v2.4s, v8.4s
|
add v10.4s, v2.4s, v8.4s
|
||||||
orr x0, x4, x5
|
orr x0, x4, x5
|
||||||
sub v6.4s, v2.4s, v8.4s
|
sub v6.4s, v2.4s, v8.4s
|
||||||
/* pop {x4, x5} */
|
/* pop {x4, x5} */
|
||||||
ldp x4, x5, [sp], 16
|
sub sp, sp, 80
|
||||||
|
ldp x4, x5, [sp], 16
|
||||||
rshrn ROW7L.4h, v4.4s, #11
|
rshrn ROW7L.4h, v4.4s, #11
|
||||||
rshrn ROW3L.4h, v10.4s, #11
|
rshrn ROW3L.4h, v10.4s, #11
|
||||||
rshrn ROW0L.4h, v12.4s, #11
|
rshrn ROW0L.4h, v12.4s, #11
|
||||||
@@ -552,48 +560,27 @@ asm_function jsimd_idct_islow_neon
|
|||||||
ins v18.2d[1], v19.2d[0]
|
ins v18.2d[1], v19.2d[0]
|
||||||
ins v20.2d[1], v21.2d[0]
|
ins v20.2d[1], v21.2d[0]
|
||||||
ins v22.2d[1], v23.2d[0]
|
ins v22.2d[1], v23.2d[0]
|
||||||
#ifdef RTSM_SQSHRN_SIM_ISSUE
|
|
||||||
sqrshrn v16.8b, v16.8h, #2
|
sqrshrn v16.8b, v16.8h, #2
|
||||||
sqrshrn2 v16.16b, v18.8h, #2
|
sqrshrn2 v16.16b, v18.8h, #2
|
||||||
sqrshrn v18.8b, v20.8h, #2
|
sqrshrn v18.8b, v20.8h, #2
|
||||||
sqrshrn2 v18.16b, v22.8h, #2
|
sqrshrn2 v18.16b, v22.8h, #2
|
||||||
#else
|
|
||||||
sqrshrn v16.4h, v16.4s, #2
|
|
||||||
sqrshrn2 v16.8h, v18.4s, #2
|
|
||||||
sqrshrn v18.4h, v20.4s, #2
|
|
||||||
sqrshrn2 v18.8h, v22.4s, #2
|
|
||||||
#endif
|
|
||||||
/* vpop {v8.4h-d15.4h} */ /* restore NEON registers */
|
|
||||||
|
|
||||||
ld1 {v12.4h-v15.4h}, [sp], 32
|
/* vpop {v8.4h - d15.4h} */ /* restore NEON registers */
|
||||||
ld1 {v8.4h-v11.4h}, [sp], 32
|
ld1 {v8.4h - v11.4h}, [sp], 32
|
||||||
|
ld1 {v12.4h - v15.4h}, [sp], 32
|
||||||
ins v24.2d[1], v25.2d[0]
|
ins v24.2d[1], v25.2d[0]
|
||||||
#ifdef RTSM_SQSHRN_SIM_ISSUE
|
|
||||||
|
|
||||||
sqrshrn v20.8b, v24.8h, #2
|
sqrshrn v20.8b, v24.8h, #2
|
||||||
#else
|
|
||||||
|
|
||||||
sqrshrn v20.4h, v24.4s, #2
|
|
||||||
#endif
|
|
||||||
/* Transpose the final 8-bit samples and do signed->unsigned conversion */
|
/* Transpose the final 8-bit samples and do signed->unsigned conversion */
|
||||||
/* trn1 v16.8h, v16.8h, v18.8h */
|
/* trn1 v16.8h, v16.8h, v18.8h */
|
||||||
transpose v16, v18, v3, .16b, .8h
|
transpose v16, v18, v3, .16b, .8h
|
||||||
ins v26.2d[1], v27.2d[0]
|
ins v26.2d[1], v27.2d[0]
|
||||||
ins v28.2d[1], v29.2d[0]
|
ins v28.2d[1], v29.2d[0]
|
||||||
ins v30.2d[1], v31.2d[0]
|
ins v30.2d[1], v31.2d[0]
|
||||||
#ifdef RTSM_SQSHRN_SIM_ISSUE
|
|
||||||
sqrshrn2 v20.16b, v26.8h, #2
|
sqrshrn2 v20.16b, v26.8h, #2
|
||||||
sqrshrn v22.8b, v28.8h, #2
|
sqrshrn v22.8b, v28.8h, #2
|
||||||
#else
|
|
||||||
sqrshrn2 v20.8h, v26.4s, #2
|
|
||||||
sqrshrn v22.4h, v28.4s, #2
|
|
||||||
#endif
|
|
||||||
movi v0.16b, #(CENTERJSAMPLE)
|
movi v0.16b, #(CENTERJSAMPLE)
|
||||||
#ifdef RTSM_SQSHRN_SIM_ISSUE
|
|
||||||
sqrshrn2 v22.16b, v30.8h, #2
|
sqrshrn2 v22.16b, v30.8h, #2
|
||||||
#else
|
|
||||||
sqrshrn2 v22.8h, v30.4s, #2
|
|
||||||
#endif
|
|
||||||
transpose_single v16, v17, v3, .2d, .8b
|
transpose_single v16, v17, v3, .2d, .8b
|
||||||
transpose_single v18, v19, v3, .2d, .8b
|
transpose_single v18, v19, v3, .2d, .8b
|
||||||
add v16.8b, v16.8b, v0.8b
|
add v16.8b, v16.8b, v0.8b
|
||||||
@@ -628,6 +615,15 @@ asm_function jsimd_idct_islow_neon
|
|||||||
st1 {v21.8b}, [TMP2]
|
st1 {v21.8b}, [TMP2]
|
||||||
st1 {v22.8b}, [TMP3]
|
st1 {v22.8b}, [TMP3]
|
||||||
st1 {v23.8b}, [TMP4]
|
st1 {v23.8b}, [TMP4]
|
||||||
|
ldr x15, [sp], 16
|
||||||
|
ld1 {v0.8b - v3.8b}, [sp], 32
|
||||||
|
ld1 {v4.8b - v7.8b}, [sp], 32
|
||||||
|
ld1 {v8.8b - v11.8b}, [sp], 32
|
||||||
|
ld1 {v12.8b - v15.8b}, [sp], 32
|
||||||
|
ld1 {v16.8b - v19.8b}, [sp], 32
|
||||||
|
ld1 {v20.8b - v23.8b}, [sp], 32
|
||||||
|
ld1 {v24.8b - v27.8b}, [sp], 32
|
||||||
|
ld1 {v28.8b - v31.8b}, [sp], 32
|
||||||
blr x30
|
blr x30
|
||||||
|
|
||||||
3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
|
3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
|
||||||
@@ -799,7 +795,8 @@ asm_function jsimd_idct_ifast_neon
|
|||||||
TMP1 .req x0
|
TMP1 .req x0
|
||||||
TMP2 .req x1
|
TMP2 .req x1
|
||||||
TMP3 .req x2
|
TMP3 .req x2
|
||||||
TMP4 .req x15
|
TMP4 .req x22
|
||||||
|
TMP5 .req x23
|
||||||
|
|
||||||
/* Load and dequantize coefficients into NEON registers
|
/* Load and dequantize coefficients into NEON registers
|
||||||
* with the following allocation:
|
* with the following allocation:
|
||||||
@@ -814,7 +811,15 @@ asm_function jsimd_idct_ifast_neon
|
|||||||
* 6 | d28 | d29 ( v14.8h )
|
* 6 | d28 | d29 ( v14.8h )
|
||||||
* 7 | d30 | d31 ( v15.8h )
|
* 7 | d30 | d31 ( v15.8h )
|
||||||
*/
|
*/
|
||||||
adr x15, jsimd_idct_ifast_neon_consts
|
/* Save NEON registers used in fast IDCT */
|
||||||
|
sub sp, sp, #176
|
||||||
|
stp x22, x23, [sp], 16
|
||||||
|
adr x23, jsimd_idct_ifast_neon_consts
|
||||||
|
st1 {v0.8b - v3.8b}, [sp], 32
|
||||||
|
st1 {v4.8b - v7.8b}, [sp], 32
|
||||||
|
st1 {v8.8b - v11.8b}, [sp], 32
|
||||||
|
st1 {v12.8b - v15.8b}, [sp], 32
|
||||||
|
st1 {v16.8b - v19.8b}, [sp], 32
|
||||||
ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
|
ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
|
||||||
ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
|
ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
|
||||||
ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
|
ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
|
||||||
@@ -830,14 +835,9 @@ asm_function jsimd_idct_ifast_neon
|
|||||||
ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
|
ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
|
||||||
mul v14.8h, v14.8h, v2.8h
|
mul v14.8h, v14.8h, v2.8h
|
||||||
mul v13.8h, v13.8h, v1.8h
|
mul v13.8h, v13.8h, v1.8h
|
||||||
ld1 {v0.4h}, [x15] /* load constants */
|
ld1 {v0.4h}, [x23] /* load constants */
|
||||||
mul v15.8h, v15.8h, v3.8h
|
mul v15.8h, v15.8h, v3.8h
|
||||||
|
|
||||||
/* vpush {v4.8h-v6.8h} */ /* save NEON registers */
|
|
||||||
sub sp, sp, #32
|
|
||||||
st1 {v4.8h-v5.8h}, [sp] /* save NEON registers */
|
|
||||||
sub sp, sp, #16
|
|
||||||
st1 {v6.8h}, [sp]
|
|
||||||
/* 1-D IDCT, pass 1 */
|
/* 1-D IDCT, pass 1 */
|
||||||
sub v2.8h, v10.8h, v14.8h
|
sub v2.8h, v10.8h, v14.8h
|
||||||
add v14.8h, v10.8h, v14.8h
|
add v14.8h, v10.8h, v14.8h
|
||||||
@@ -912,25 +912,25 @@ asm_function jsimd_idct_ifast_neon
|
|||||||
trn1 v13.4s, v13.4s, v15.4s
|
trn1 v13.4s, v13.4s, v15.4s
|
||||||
trn2 v15.4s, v18.4s, v15.4s
|
trn2 v15.4s, v18.4s, v15.4s
|
||||||
/* vswp v14.4h, v10-MSB.4h */
|
/* vswp v14.4h, v10-MSB.4h */
|
||||||
umov x10, v14.d[0]
|
umov x22, v14.d[0]
|
||||||
ins v14.2d[0], v10.2d[1]
|
ins v14.2d[0], v10.2d[1]
|
||||||
ins v10.2d[1], x10
|
ins v10.2d[1], x22
|
||||||
/* vswp v13.4h, v9MSB.4h */
|
/* vswp v13.4h, v9MSB.4h */
|
||||||
|
|
||||||
umov x10, v13.d[0]
|
umov x22, v13.d[0]
|
||||||
ins v13.2d[0], v9.2d[1]
|
ins v13.2d[0], v9.2d[1]
|
||||||
ins v9.2d[1], x10
|
ins v9.2d[1], x22
|
||||||
/* 1-D IDCT, pass 2 */
|
/* 1-D IDCT, pass 2 */
|
||||||
sub v2.8h, v10.8h, v14.8h
|
sub v2.8h, v10.8h, v14.8h
|
||||||
/* vswp v15.4h, v11MSB.4h */
|
/* vswp v15.4h, v11MSB.4h */
|
||||||
umov x10, v15.d[0]
|
umov x22, v15.d[0]
|
||||||
ins v15.2d[0], v11.2d[1]
|
ins v15.2d[0], v11.2d[1]
|
||||||
ins v11.2d[1], x10
|
ins v11.2d[1], x22
|
||||||
add v14.8h, v10.8h, v14.8h
|
add v14.8h, v10.8h, v14.8h
|
||||||
/* vswp v12.4h, v8-MSB.4h */
|
/* vswp v12.4h, v8-MSB.4h */
|
||||||
umov x10, v12.d[0]
|
umov x22, v12.d[0]
|
||||||
ins v12.2d[0], v8.2d[1]
|
ins v12.2d[0], v8.2d[1]
|
||||||
ins v8.2d[1], x10
|
ins v8.2d[1], x22
|
||||||
sub v1.8h, v11.8h, v13.8h
|
sub v1.8h, v11.8h, v13.8h
|
||||||
add v13.8h, v11.8h, v13.8h
|
add v13.8h, v11.8h, v13.8h
|
||||||
sub v5.8h, v9.8h, v15.8h
|
sub v5.8h, v9.8h, v15.8h
|
||||||
@@ -966,15 +966,11 @@ asm_function jsimd_idct_ifast_neon
|
|||||||
add v14.8h, v5.8h, v3.8h
|
add v14.8h, v5.8h, v3.8h
|
||||||
sub v9.8h, v5.8h, v3.8h
|
sub v9.8h, v5.8h, v3.8h
|
||||||
sub v13.8h, v10.8h, v2.8h
|
sub v13.8h, v10.8h, v2.8h
|
||||||
/* vpop {v4.8h-v7.4h} */ /* restore NEON registers...not available */
|
|
||||||
ld1 {v6.8h}, [sp], 16
|
|
||||||
ld1 {v4.8h-v5.8h}, [sp], 32
|
|
||||||
add v10.8h, v10.8h, v2.8h
|
add v10.8h, v10.8h, v2.8h
|
||||||
sub v11.8h, v12.8h, v1.8h
|
sub v11.8h, v12.8h, v1.8h
|
||||||
add v12.8h, v12.8h, v1.8h
|
add v12.8h, v12.8h, v1.8h
|
||||||
/* Descale to 8-bit and range limit */
|
/* Descale to 8-bit and range limit */
|
||||||
movi v0.16b, #0x80
|
movi v0.16b, #0x80
|
||||||
#ifdef RTSM_SQSHRN_SIM_ISSUE
|
|
||||||
sqshrn v8.8b, v8.8h, #5
|
sqshrn v8.8b, v8.8h, #5
|
||||||
sqshrn2 v8.16b, v9.8h, #5
|
sqshrn2 v8.16b, v9.8h, #5
|
||||||
sqshrn v9.8b, v10.8h, #5
|
sqshrn v9.8b, v10.8h, #5
|
||||||
@@ -983,16 +979,6 @@ asm_function jsimd_idct_ifast_neon
|
|||||||
sqshrn2 v10.16b, v13.8h, #5
|
sqshrn2 v10.16b, v13.8h, #5
|
||||||
sqshrn v11.8b, v14.8h, #5
|
sqshrn v11.8b, v14.8h, #5
|
||||||
sqshrn2 v11.16b, v15.8h, #5
|
sqshrn2 v11.16b, v15.8h, #5
|
||||||
#else
|
|
||||||
sqshrn v8.4h, v8.4s, #5
|
|
||||||
sqshrn2 v8.8h, v9.4s, #5
|
|
||||||
sqshrn v9.4h, v10.4s, #5
|
|
||||||
sqshrn2 v9.8h, v11.4s, #5
|
|
||||||
sqshrn v10.4h, v12.4s, #5
|
|
||||||
sqshrn2 v10.8h, v13.4s, #5
|
|
||||||
sqshrn v11.4h, v14.4s, #5
|
|
||||||
sqshrn2 v11.8h, v15.4s, #5
|
|
||||||
#endif
|
|
||||||
add v8.16b, v8.16b, v0.16b
|
add v8.16b, v8.16b, v0.16b
|
||||||
add v9.16b, v9.16b, v0.16b
|
add v9.16b, v9.16b, v0.16b
|
||||||
add v10.16b, v10.16b, v0.16b
|
add v10.16b, v10.16b, v0.16b
|
||||||
@@ -1036,26 +1022,33 @@ asm_function jsimd_idct_ifast_neon
|
|||||||
add TMP2, TMP2, OUTPUT_COL
|
add TMP2, TMP2, OUTPUT_COL
|
||||||
st1 {v9.8b}, [TMP1]
|
st1 {v9.8b}, [TMP1]
|
||||||
/* make copy */
|
/* make copy */
|
||||||
ins v21.2d[0], v10.2d[1]
|
ins v7.2d[0], v10.2d[1]
|
||||||
mov v18.16b, v10.16b
|
mov v18.16b, v10.16b
|
||||||
trn1 v10.8b, v10.8b, v21.8b
|
trn1 v10.8b, v10.8b, v7.8b
|
||||||
trn2 v21.8b, v18.8b, v21.8b
|
trn2 v7.8b, v18.8b, v7.8b
|
||||||
st1 {v19.8b}, [TMP2]
|
st1 {v19.8b}, [TMP2]
|
||||||
ldp TMP1, TMP2, [OUTPUT_BUF], 16
|
ldp TMP1, TMP2, [OUTPUT_BUF], 16
|
||||||
ldp TMP3, TMP4, [OUTPUT_BUF]
|
ldp TMP4, TMP5, [OUTPUT_BUF], 16
|
||||||
add TMP1, TMP1, OUTPUT_COL
|
add TMP1, TMP1, OUTPUT_COL
|
||||||
add TMP2, TMP2, OUTPUT_COL
|
add TMP2, TMP2, OUTPUT_COL
|
||||||
add TMP3, TMP3, OUTPUT_COL
|
|
||||||
add TMP4, TMP4, OUTPUT_COL
|
add TMP4, TMP4, OUTPUT_COL
|
||||||
|
add TMP5, TMP5, OUTPUT_COL
|
||||||
st1 {v10.8b}, [TMP1]
|
st1 {v10.8b}, [TMP1]
|
||||||
/* make copy */
|
/* make copy */
|
||||||
ins v23.2d[0], v11.2d[1]
|
ins v16.2d[0], v11.2d[1]
|
||||||
mov v18.16b, v11.16b
|
mov v18.16b, v11.16b
|
||||||
trn1 v11.8b, v11.8b, v23.8b
|
trn1 v11.8b, v11.8b, v16.8b
|
||||||
trn2 v23.8b, v18.8b, v23.8b
|
trn2 v16.8b, v18.8b, v16.8b
|
||||||
st1 {v21.8b}, [TMP2]
|
st1 {v7.8b}, [TMP2]
|
||||||
st1 {v11.8b}, [TMP3]
|
st1 {v11.8b}, [TMP4]
|
||||||
st1 {v23.8b}, [TMP4]
|
st1 {v16.8b}, [TMP5]
|
||||||
|
sub sp, sp, #176
|
||||||
|
ldp x22, x23, [sp], 16
|
||||||
|
ld1 {v0.8b - v3.8b}, [sp], 32
|
||||||
|
ld1 {v4.8b - v7.8b}, [sp], 32
|
||||||
|
ld1 {v8.8b - v11.8b}, [sp], 32
|
||||||
|
ld1 {v12.8b - v15.8b}, [sp], 32
|
||||||
|
ld1 {v16.8b - v19.8b}, [sp], 32
|
||||||
blr x30
|
blr x30
|
||||||
|
|
||||||
.unreq DCT_TABLE
|
.unreq DCT_TABLE
|
||||||
@@ -1179,14 +1172,19 @@ asm_function jsimd_idct_4x4_neon
|
|||||||
TMP3 .req x2
|
TMP3 .req x2
|
||||||
TMP4 .req x15
|
TMP4 .req x15
|
||||||
|
|
||||||
/* vpush {v8.4h-v15.4h} */
|
/* Save all used NEON registers */
|
||||||
sub sp, sp, #32
|
sub sp, sp, 272
|
||||||
st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */
|
str x15, [sp], 16
|
||||||
sub sp, sp, #32
|
|
||||||
st1 {v12.4h-v15.4h}, [sp]
|
|
||||||
|
|
||||||
/* Load constants (v3.4h is just used for padding) */
|
/* Load constants (v3.4h is just used for padding) */
|
||||||
adr TMP4, jsimd_idct_4x4_neon_consts
|
adr TMP4, jsimd_idct_4x4_neon_consts
|
||||||
|
st1 {v0.8b - v3.8b}, [sp], 32
|
||||||
|
st1 {v4.8b - v7.8b}, [sp], 32
|
||||||
|
st1 {v8.8b - v11.8b}, [sp], 32
|
||||||
|
st1 {v12.8b - v15.8b}, [sp], 32
|
||||||
|
st1 {v16.8b - v19.8b}, [sp], 32
|
||||||
|
st1 {v20.8b - v23.8b}, [sp], 32
|
||||||
|
st1 {v24.8b - v27.8b}, [sp], 32
|
||||||
|
st1 {v28.8b - v31.8b}, [sp], 32
|
||||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
|
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
|
||||||
|
|
||||||
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
||||||
@@ -1290,10 +1288,17 @@ asm_function jsimd_idct_4x4_neon
|
|||||||
st1 {v27.b}[7], [TMP4], 1
|
st1 {v27.b}[7], [TMP4], 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* vpop {v8.4h-v15.4h} ;not available */
|
/* vpop {v8.4h - v15.4h} ;not available */
|
||||||
ld1 {v12.4h-v15.4h}, [sp], 32
|
sub sp, sp, #272
|
||||||
ld1 {v8.4h-v11.4h}, [sp], 32
|
ldr x15, [sp], 16
|
||||||
|
ld1 {v0.8b - v3.8b}, [sp], 32
|
||||||
|
ld1 {v4.8b - v7.8b}, [sp], 32
|
||||||
|
ld1 {v8.8b - v11.8b}, [sp], 32
|
||||||
|
ld1 {v12.8b - v15.8b}, [sp], 32
|
||||||
|
ld1 {v16.8b - v19.8b}, [sp], 32
|
||||||
|
ld1 {v20.8b - v23.8b}, [sp], 32
|
||||||
|
ld1 {v24.8b - v27.8b}, [sp], 32
|
||||||
|
ld1 {v28.8b - v31.8b}, [sp], 32
|
||||||
blr x30
|
blr x30
|
||||||
|
|
||||||
.unreq DCT_TABLE
|
.unreq DCT_TABLE
|
||||||
@@ -1333,23 +1338,23 @@ jsimd_idct_2x2_neon_consts:
|
|||||||
.short FIX_3_624509785 /* d0[3] */
|
.short FIX_3_624509785 /* d0[3] */
|
||||||
|
|
||||||
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
|
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
|
||||||
sshll v28.4s, \x4, #15
|
sshll v15.4s, \x4, #15
|
||||||
smull v26.4s, \x6, v0.4h[3]
|
smull v26.4s, \x6, v0.4h[3]
|
||||||
smlal v26.4s, \x10, v0.4h[2]
|
smlal v26.4s, \x10, v0.4h[2]
|
||||||
smlal v26.4s, \x12, v0.4h[1]
|
smlal v26.4s, \x12, v0.4h[1]
|
||||||
smlal v26.4s, \x16, v0.4h[0]
|
smlal v26.4s, \x16, v0.4h[0]
|
||||||
|
|
||||||
add v20.4s, v28.4s, v26.4s
|
add v20.4s, v15.4s, v26.4s
|
||||||
sub v28.4s, v28.4s, v26.4s
|
sub v15.4s, v15.4s, v26.4s
|
||||||
|
|
||||||
.if \shift > 16
|
.if \shift > 16
|
||||||
srshr v20.4s, v20.4s, #\shift
|
srshr v20.4s, v20.4s, #\shift
|
||||||
srshr v28.4s, v28.4s, #\shift
|
srshr v15.4s, v15.4s, #\shift
|
||||||
xtn \y26, v20.4s
|
xtn \y26, v20.4s
|
||||||
xtn \y27, v28.4s
|
xtn \y27, v15.4s
|
||||||
.else
|
.else
|
||||||
rshrn \y26, v20.4s, #\shift
|
rshrn \y26, v20.4s, #\shift
|
||||||
rshrn \y27, v28.4s, #\shift
|
rshrn \y27, v15.4s, #\shift
|
||||||
.endif
|
.endif
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
@@ -1363,15 +1368,20 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
TMP1 .req x0
|
TMP1 .req x0
|
||||||
TMP2 .req x15
|
TMP2 .req x15
|
||||||
|
|
||||||
/* vpush {v8.4h-v15.4h} ; not available */
|
/* vpush {v8.4h - v15.4h} ; not available */
|
||||||
sub sp, sp, #32
|
sub sp, sp, 208
|
||||||
st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */
|
str x15, [sp], 16
|
||||||
sub sp, sp, #32
|
|
||||||
st1 {v12.4h-v15.4h}, [sp]
|
|
||||||
|
|
||||||
/* Load constants */
|
/* Load constants */
|
||||||
adr TMP2, jsimd_idct_2x2_neon_consts
|
adr TMP2, jsimd_idct_2x2_neon_consts
|
||||||
ld1 {v0.4h}, [TMP2]
|
st1 {v4.8b - v7.8b}, [sp], 32
|
||||||
|
st1 {v8.8b - v11.8b}, [sp], 32
|
||||||
|
st1 {v12.8b - v15.8b}, [sp], 32
|
||||||
|
st1 {v16.8b - v19.8b}, [sp], 32
|
||||||
|
st1 {v21.8b - v22.8b}, [sp], 16
|
||||||
|
st1 {v24.8b - v27.8b}, [sp], 32
|
||||||
|
st1 {v30.8b - v31.8b}, [sp], 16
|
||||||
|
ld1 {v14.4h}, [TMP2]
|
||||||
|
|
||||||
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
||||||
* 0 1 2 3 | 4 5 6 7
|
* 0 1 2 3 | 4 5 6 7
|
||||||
@@ -1423,24 +1433,24 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
|
idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
|
||||||
transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
|
transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
|
||||||
#else
|
#else
|
||||||
smull v26.4s, v6.4h, v0.4h[3]
|
smull v26.4s, v6.4h, v14.4h[3]
|
||||||
smlal v26.4s, v10.4h, v0.4h[2]
|
smlal v26.4s, v10.4h, v14.4h[2]
|
||||||
smlal v26.4s, v12.4h, v0.4h[1]
|
smlal v26.4s, v12.4h, v14.4h[1]
|
||||||
smlal v26.4s, v16.4h, v0.4h[0]
|
smlal v26.4s, v16.4h, v14.4h[0]
|
||||||
smull v24.4s, v7.4h, v0.4h[3]
|
smull v24.4s, v7.4h, v14.4h[3]
|
||||||
smlal v24.4s, v11.4h, v0.4h[2]
|
smlal v24.4s, v11.4h, v14.4h[2]
|
||||||
smlal v24.4s, v13.4h, v0.4h[1]
|
smlal v24.4s, v13.4h, v14.4h[1]
|
||||||
smlal v24.4s, v17.4h, v0.4h[0]
|
smlal v24.4s, v17.4h, v14.4h[0]
|
||||||
sshll v28.4s, v4.4h, #15
|
sshll v15.4s, v4.4h, #15
|
||||||
sshll v30.4s, v5.4h, #15
|
sshll v30.4s, v5.4h, #15
|
||||||
add v20.4s, v28.4s, v26.4s
|
add v20.4s, v15.4s, v26.4s
|
||||||
sub v28.4s, v28.4s, v26.4s
|
sub v15.4s, v15.4s, v26.4s
|
||||||
rshrn v4.4h, v20.4s, #13
|
rshrn v4.4h, v20.4s, #13
|
||||||
rshrn v6.4h, v28.4s, #13
|
rshrn v6.4h, v15.4s, #13
|
||||||
add v20.4s, v30.4s, v24.4s
|
add v20.4s, v30.4s, v24.4s
|
||||||
sub v28.4s, v30.4s, v24.4s
|
sub v15.4s, v30.4s, v24.4s
|
||||||
rshrn v5.4h, v20.4s, #13
|
rshrn v5.4h, v20.4s, #13
|
||||||
rshrn v7.4h, v28.4s, #13
|
rshrn v7.4h, v15.4s, #13
|
||||||
transpose v4, v6, v3, .16b, .8h
|
transpose v4, v6, v3, .16b, .8h
|
||||||
transpose v6, v10, v3, .16b, .4s
|
transpose v6, v10, v3, .16b, .4s
|
||||||
#endif
|
#endif
|
||||||
@@ -1466,11 +1476,15 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
st1 {v26.b}[1], [TMP2], 1
|
st1 {v26.b}[1], [TMP2], 1
|
||||||
st1 {v27.b}[5], [TMP2], 1
|
st1 {v27.b}[5], [TMP2], 1
|
||||||
|
|
||||||
/* vpop {v8.4h-v15.4h} ;not available */
|
sub sp, sp, #208
|
||||||
|
ldr x15, [sp], 16
|
||||||
ld1 {v12.4h-v15.4h}, [sp], 32
|
ld1 {v4.8b - v7.8b}, [sp], 32
|
||||||
ld1 {v8.4h-v11.4h}, [sp], 32
|
ld1 {v8.8b - v11.8b}, [sp], 32
|
||||||
|
ld1 {v12.8b - v15.8b}, [sp], 32
|
||||||
|
ld1 {v16.8b - v19.8b}, [sp], 32
|
||||||
|
ld1 {v21.8b - v22.8b}, [sp], 16
|
||||||
|
ld1 {v24.8b - v27.8b}, [sp], 32
|
||||||
|
ld1 {v30.8b - v31.8b}, [sp], 16
|
||||||
blr x30
|
blr x30
|
||||||
|
|
||||||
.unreq DCT_TABLE
|
.unreq DCT_TABLE
|
||||||
@@ -1572,13 +1586,11 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
.error unsupported bpp
|
.error unsupported bpp
|
||||||
.endif
|
.endif
|
||||||
.endm
|
.endm
|
||||||
#ifdef RTSM_SQSHRN_SIM_ISSUE
|
|
||||||
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
|
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
|
||||||
#else
|
|
||||||
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize
|
|
||||||
#endif
|
|
||||||
/*
|
/*
|
||||||
* 2 stage pipelined YCbCr->RGB conversion
|
* 2-stage pipelined YCbCr->RGB conversion
|
||||||
*/
|
*/
|
||||||
|
|
||||||
.macro do_yuv_to_rgb_stage1
|
.macro do_yuv_to_rgb_stage1
|
||||||
@@ -1604,16 +1616,10 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
uaddw v20.8h, v20.8h, v0.8b
|
uaddw v20.8h, v20.8h, v0.8b
|
||||||
uaddw v24.8h, v24.8h, v0.8b
|
uaddw v24.8h, v24.8h, v0.8b
|
||||||
uaddw v28.8h, v28.8h, v0.8b
|
uaddw v28.8h, v28.8h, v0.8b
|
||||||
#ifdef RTSM_SQSHRN_SIM_ISSUE
|
|
||||||
sqxtun v1\g_offs\defsize, v20.8h
|
sqxtun v1\g_offs\defsize, v20.8h
|
||||||
sqxtun v1\r_offs\defsize, v24.8h
|
sqxtun v1\r_offs\defsize, v24.8h
|
||||||
sqxtun v1\b_offs\defsize, v28.8h
|
sqxtun v1\b_offs\defsize, v28.8h
|
||||||
|
|
||||||
#else
|
|
||||||
sqxtun v1\g_offs\gsize, v20.4s
|
|
||||||
sqxtun v1\r_offs\rsize, v24.4s
|
|
||||||
sqxtun v1\b_offs\bsize, v28.4s
|
|
||||||
#endif
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro do_yuv_to_rgb_stage2_store_load_stage1
|
.macro do_yuv_to_rgb_stage2_store_load_stage1
|
||||||
@@ -1628,25 +1634,13 @@ asm_function jsimd_idct_2x2_neon
|
|||||||
uaddw v20.8h, v20.8h, v0.8b
|
uaddw v20.8h, v20.8h, v0.8b
|
||||||
uaddw v24.8h, v24.8h, v0.8b
|
uaddw v24.8h, v24.8h, v0.8b
|
||||||
uaddw v28.8h, v28.8h, v0.8b
|
uaddw v28.8h, v28.8h, v0.8b
|
||||||
#ifdef RTSM_SQSHRN_SIM_ISSUE
|
|
||||||
sqxtun v1\g_offs\defsize, v20.8h
|
sqxtun v1\g_offs\defsize, v20.8h
|
||||||
#else
|
|
||||||
sqxtun v1\g_offs\gsize, v20.4s
|
|
||||||
#endif
|
|
||||||
ld1 {v0.8b}, [Y], 8
|
ld1 {v0.8b}, [Y], 8
|
||||||
#ifdef RTSM_SQSHRN_SIM_ISSUE
|
|
||||||
sqxtun v1\r_offs\defsize, v24.8h
|
sqxtun v1\r_offs\defsize, v24.8h
|
||||||
#else
|
|
||||||
sqxtun v1\r_offs\rsize, v24.4s
|
|
||||||
#endif
|
|
||||||
prfm PLDL1KEEP, [U, #64]
|
prfm PLDL1KEEP, [U, #64]
|
||||||
prfm PLDL1KEEP, [V, #64]
|
prfm PLDL1KEEP, [V, #64]
|
||||||
prfm PLDL1KEEP, [Y, #64]
|
prfm PLDL1KEEP, [Y, #64]
|
||||||
#ifdef RTSM_SQSHRN_SIM_ISSUE
|
|
||||||
sqxtun v1\b_offs\defsize, v28.8h
|
sqxtun v1\b_offs\defsize, v28.8h
|
||||||
#else
|
|
||||||
sqxtun v1\b_offs\gsize, v28.4s
|
|
||||||
#endif
|
|
||||||
uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
|
uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
|
||||||
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
|
uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
|
||||||
do_store \bpp, 8
|
do_store \bpp, 8
|
||||||
@@ -1693,29 +1687,33 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
|||||||
V .req x10
|
V .req x10
|
||||||
N .req x15
|
N .req x15
|
||||||
|
|
||||||
|
sub sp, sp, 336
|
||||||
|
str x15, [sp], 16
|
||||||
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
|
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
|
||||||
adr x15, jsimd_ycc_\colorid\()_neon_consts
|
adr x15, jsimd_ycc_\colorid\()_neon_consts
|
||||||
|
/* Save NEON registers */
|
||||||
|
st1 {v0.8b - v3.8b}, [sp], 32
|
||||||
|
st1 {v4.8b - v7.8b}, [sp], 32
|
||||||
|
st1 {v8.8b - v11.8b}, [sp], 32
|
||||||
|
st1 {v12.8b - v15.8b}, [sp], 32
|
||||||
|
st1 {v16.8b - v19.8b}, [sp], 32
|
||||||
|
st1 {v20.8b - v23.8b}, [sp], 32
|
||||||
|
st1 {v24.8b - v27.8b}, [sp], 32
|
||||||
|
st1 {v28.8b - v31.8b}, [sp], 32
|
||||||
ld1 {v0.4h, v1.4h}, [x15], 16
|
ld1 {v0.4h, v1.4h}, [x15], 16
|
||||||
ld1 {v2.8h}, [x15]
|
ld1 {v2.8h}, [x15]
|
||||||
|
|
||||||
/* Save ARM registers and handle input arguments */
|
/* Save ARM registers and handle input arguments */
|
||||||
/* push {x4, x5, x6, x7, x8, x9, x10, x30} */
|
/* push {x4, x5, x6, x7, x8, x9, x10, x30} */
|
||||||
stp x4, x5, [sp,-16]!
|
stp x4, x5, [sp], 16
|
||||||
stp x6, x7, [sp,-16]!
|
stp x6, x7, [sp], 16
|
||||||
stp x8, x9, [sp,-16]!
|
stp x8, x9, [sp], 16
|
||||||
stp x10, x30, [sp,-16]!
|
stp x10, x30, [sp], 16
|
||||||
ldr INPUT_BUF0, [INPUT_BUF]
|
ldr INPUT_BUF0, [INPUT_BUF]
|
||||||
ldr INPUT_BUF1, [INPUT_BUF, 8]
|
ldr INPUT_BUF1, [INPUT_BUF, 8]
|
||||||
ldr INPUT_BUF2, [INPUT_BUF, 16]
|
ldr INPUT_BUF2, [INPUT_BUF, 16]
|
||||||
.unreq INPUT_BUF
|
.unreq INPUT_BUF
|
||||||
|
|
||||||
/* Save NEON registers */
|
|
||||||
/* vpush {v8.4h-v15.4h} */
|
|
||||||
sub sp, sp, #32
|
|
||||||
st1 {v8.4h-v11.4h}, [sp]
|
|
||||||
sub sp, sp, #32
|
|
||||||
st1 {v12.4h-v15.4h}, [sp]
|
|
||||||
|
|
||||||
/* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
|
/* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
|
||||||
movi v10.16b, #255
|
movi v10.16b, #255
|
||||||
movi v12.16b, #255
|
movi v12.16b, #255
|
||||||
@@ -1778,14 +1776,21 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
|||||||
bgt 0b
|
bgt 0b
|
||||||
9:
|
9:
|
||||||
/* Restore all registers and return */
|
/* Restore all registers and return */
|
||||||
/* vpop {v8.4h-v15.4h} */
|
sub sp, sp, #336
|
||||||
ld1 {v12.4h-v15.4h}, [sp], #32
|
ldr x15, [sp], 16
|
||||||
ld1 {v8.4h-v11.4h}, [sp], #32
|
ld1 {v0.8b - v3.8b}, [sp], 32
|
||||||
|
ld1 {v4.8b - v7.8b}, [sp], 32
|
||||||
|
ld1 {v8.8b - v11.8b}, [sp], 32
|
||||||
|
ld1 {v12.8b - v15.8b}, [sp], 32
|
||||||
|
ld1 {v16.8b - v19.8b}, [sp], 32
|
||||||
|
ld1 {v20.8b - v23.8b}, [sp], 32
|
||||||
|
ld1 {v24.8b - v27.8b}, [sp], 32
|
||||||
|
ld1 {v28.8b - v31.8b}, [sp], 32
|
||||||
/* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
|
/* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
|
||||||
ldp x10, x30, [sp], #16
|
ldp x4, x5, [sp], 16
|
||||||
ldp x8, x9, [sp], #16
|
ldp x6, x7, [sp], 16
|
||||||
ldp x6, x5, [sp], #16
|
ldp x8, x9, [sp], 16
|
||||||
ldp x4, x5, [sp], #16
|
ldp x10, x30, [sp], 16
|
||||||
br x30
|
br x30
|
||||||
.unreq OUTPUT_WIDTH
|
.unreq OUTPUT_WIDTH
|
||||||
.unreq INPUT_ROW
|
.unreq INPUT_ROW
|
||||||
@@ -1807,10 +1812,6 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
|||||||
.purgem do_yuv_to_rgb_stage2_store_load_stage1
|
.purgem do_yuv_to_rgb_stage2_store_load_stage1
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
/* RTSM simulator fix integer saturation works on 8b boundry add a new parameter
|
|
||||||
* as a workaround for the simulator fix
|
|
||||||
*/
|
|
||||||
#ifdef RTSM_SQSHRN_SIM_ISSUE
|
|
||||||
/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */
|
/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */
|
||||||
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b
|
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b
|
||||||
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b
|
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b
|
||||||
@@ -1818,15 +1819,6 @@ generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .
|
|||||||
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b
|
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b
|
||||||
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b
|
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b
|
||||||
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b
|
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b
|
||||||
#else
|
|
||||||
/*--------------------------------- id ----- bpp R rsize G gsize B bsize */
|
|
||||||
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h
|
|
||||||
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h
|
|
||||||
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h
|
|
||||||
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h
|
|
||||||
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h
|
|
||||||
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h
|
|
||||||
#endif
|
|
||||||
|
|
||||||
.purgem do_load
|
.purgem do_load
|
||||||
.purgem do_store
|
.purgem do_store
|
||||||
|
|||||||
Reference in New Issue
Block a user