ARM NEON support
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@607 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
@@ -40,6 +40,10 @@ still is-- see README-turbo.txt.)
|
||||
|
||||
[11] libjpeg-turbo can now be built with YASM.
|
||||
|
||||
[12] Added SIMD-accelerated fast integer inverse DCT and YCbCr-to-RGB color
|
||||
conversion routines to accelerate JPEG decoding on ARM Linux platforms that
|
||||
have NEON instructions.
|
||||
|
||||
|
||||
1.1.1
|
||||
=====
|
||||
|
||||
@@ -149,7 +149,7 @@ endif
|
||||
./cjpeg -dct fast -quality 100 -opt -outfile testoutfst100.jpg $(srcdir)/testorig.ppm
|
||||
cmp $(srcdir)/testimgfst100.jpg testoutfst100.jpg
|
||||
./cjpeg -dct float -outfile testoutflt.jpg $(srcdir)/testorig.ppm
|
||||
if WITH_SIMD
|
||||
if WITH_SSE_FLOAT_DCT
|
||||
cmp $(srcdir)/testimgflt.jpg testoutflt.jpg
|
||||
else
|
||||
cmp $(srcdir)/testimgflt-nosimd.jpg testoutflt.jpg
|
||||
@@ -161,7 +161,7 @@ endif
|
||||
./djpeg -dct fast -ppm -outfile testoutfst.ppm $(srcdir)/testorig.jpg
|
||||
cmp $(srcdir)/testimgfst.ppm testoutfst.ppm
|
||||
./djpeg -dct float -ppm -outfile testoutflt.ppm $(srcdir)/testorig.jpg
|
||||
if WITH_SIMD
|
||||
if WITH_SSE_FLOAT_DCT
|
||||
cmp $(srcdir)/testimgflt.ppm testoutflt.ppm
|
||||
else
|
||||
cmp $(srcdir)/testorig.ppm testoutflt.ppm
|
||||
|
||||
24
acinclude.m4
24
acinclude.m4
@@ -134,3 +134,27 @@ else
|
||||
fi
|
||||
|
||||
])
|
||||
|
||||
# AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE([ACTION-IF-OK], [ACTION-IF-NOT-OK])
# ---------------------------------------------------------------------------
# Test whether the assembler is suitable and supports NEON instructions.
#
# A small assembly snippet using the same directives as the NEON sources
# (.fpu/.arch/.object_arch/.altmacro) plus one PLD and one NEON instruction
# is run through the C compiler driver with "-x assembler-with-cpp".
# CFLAGS is restored afterwards.  Either action may be left empty.
AC_DEFUN([AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE],[
  ac_good_gnu_arm_assembler=no
  ac_save_CFLAGS="$CFLAGS"
  CFLAGS="-x assembler-with-cpp $CFLAGS"
  AC_COMPILE_IFELSE([[
    .text
    .fpu neon
    .arch armv7a
    .object_arch armv4
    .arm
    .altmacro
    pld [r0]
    vmovn.u16 d0, q0]], ac_good_gnu_arm_assembler=yes)
  CFLAGS="$ac_save_CFLAGS"
  # AS_IF (unlike a hand-written if/else/fi) stays valid shell when one of
  # the action arguments is empty.
  AS_IF([test "x$ac_good_gnu_arm_assembler" = "xyes"], [$1], [$2])
])
|
||||
|
||||
13
configure.ac
13
configure.ac
@@ -16,6 +16,7 @@ SAVED_CFLAGS=${CFLAGS}
|
||||
SAVED_CPPFLAGS=${CPPFLAGS}
|
||||
AC_PROG_CPP
|
||||
AC_PROG_CC
|
||||
AM_PROG_AS
|
||||
AC_PROG_INSTALL
|
||||
AC_PROG_LIBTOOL
|
||||
AC_PROG_LN_S
|
||||
@@ -276,6 +277,16 @@ if test "x${with_simd}" != "xno"; then
|
||||
AC_PROG_NASM
|
||||
simd_arch=i386
|
||||
;;
|
||||
arm*)
|
||||
AC_MSG_RESULT([yes (arm)])
|
||||
AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
|
||||
AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE(
|
||||
[AC_MSG_RESULT([yes])
|
||||
simd_arch=arm],
|
||||
[AC_MSG_RESULT([no])
|
||||
with_simd=no
|
||||
AC_MSG_WARN([SIMD support can't be enabled. Performance will suffer.])])
|
||||
;;
|
||||
*)
|
||||
AC_MSG_RESULT([no ("$host_cpu")])
|
||||
AC_MSG_WARN([SIMD support not available for this CPU. Performance will suffer.])
|
||||
@@ -289,8 +300,10 @@ if test "x${with_simd}" != "xno"; then
|
||||
fi
|
||||
|
||||
AM_CONDITIONAL([WITH_SIMD], [test "x$with_simd" != "xno"])
|
||||
AM_CONDITIONAL([WITH_SSE_FLOAT_DCT], [test "x$simd_arch" = "xx86_64" -o "x$simd_arch" = "xi386"])
|
||||
AM_CONDITIONAL([SIMD_I386], [test "x$simd_arch" = "xi386"])
|
||||
AM_CONDITIONAL([SIMD_X86_64], [test "x$simd_arch" = "xx86_64"])
|
||||
AM_CONDITIONAL([SIMD_ARM], [test "x$simd_arch" = "xarm"])
|
||||
AM_CONDITIONAL([X86_64], [test "x$host_cpu" = "xx86_64" -o "x$host_cpu" = "xamd64"])
|
||||
|
||||
case "$host_cpu" in
|
||||
|
||||
@@ -52,6 +52,12 @@ jdmermmx.lo: jdmrgmmx.asm
|
||||
jdmerss2.lo: jdmrgss2.asm
|
||||
endif
|
||||
|
||||
if SIMD_ARM
|
||||
|
||||
libsimd_la_SOURCES = jsimd_arm.c jsimd_arm_neon.S
|
||||
|
||||
endif
|
||||
|
||||
AM_CPPFLAGS = -I$(top_srcdir)
|
||||
|
||||
.asm.lo:
|
||||
|
||||
45
simd/jsimd.h
45
simd/jsimd.h
@@ -12,11 +12,12 @@
|
||||
|
||||
/* Bitmask for supported acceleration methods */
|
||||
|
||||
#define JSIMD_NONE 0x00
|
||||
#define JSIMD_MMX 0x01
|
||||
#define JSIMD_3DNOW 0x02
|
||||
#define JSIMD_SSE 0x04
|
||||
#define JSIMD_SSE2 0x08
|
||||
#define JSIMD_NONE 0x00
|
||||
#define JSIMD_MMX 0x01
|
||||
#define JSIMD_3DNOW 0x02
|
||||
#define JSIMD_SSE 0x04
|
||||
#define JSIMD_SSE2 0x08
|
||||
#define JSIMD_ARM_NEON 0x10
|
||||
|
||||
/* Short forms of external names for systems with brain-damaged linkers. */
|
||||
|
||||
@@ -327,6 +328,35 @@ EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows));
|
||||
|
||||
EXTERN(void) jsimd_ycc_rgb_convert_neon
|
||||
JPP((JDIMENSION out_width,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows));
|
||||
EXTERN(void) jsimd_ycc_extrgb_convert_neon
|
||||
JPP((JDIMENSION out_width,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows));
|
||||
EXTERN(void) jsimd_ycc_extrgbx_convert_neon
|
||||
JPP((JDIMENSION out_width,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows));
|
||||
EXTERN(void) jsimd_ycc_extbgr_convert_neon
|
||||
JPP((JDIMENSION out_width,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows));
|
||||
EXTERN(void) jsimd_ycc_extbgrx_convert_neon
|
||||
JPP((JDIMENSION out_width,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows));
|
||||
EXTERN(void) jsimd_ycc_extxbgr_convert_neon
|
||||
JPP((JDIMENSION out_width,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows));
|
||||
EXTERN(void) jsimd_ycc_extxrgb_convert_neon
|
||||
JPP((JDIMENSION out_width,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows));
|
||||
|
||||
/* SIMD Downsample */
|
||||
EXTERN(void) jsimd_h2v2_downsample_mmx
|
||||
JPP((JDIMENSION image_width, int max_v_samp_factor,
|
||||
@@ -560,6 +590,11 @@ EXTERN(void) jsimd_idct_ifast_sse2 JPP((void * dct_table,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col));
|
||||
|
||||
EXTERN(void) jsimd_idct_ifast_neon JPP((void * dct_table,
|
||||
JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col));
|
||||
|
||||
EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table,
|
||||
JCOEFPTR coef_block,
|
||||
JSAMPARRAY output_buf,
|
||||
|
||||
524
simd/jsimd_arm.c
Normal file
524
simd/jsimd_arm.c
Normal file
@@ -0,0 +1,524 @@
|
||||
/*
|
||||
* jsimd_arm.c
|
||||
*
|
||||
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
|
||||
* Copyright 2009-2011 D. R. Commander
|
||||
*
|
||||
* Based on the x86 SIMD extension for IJG JPEG library,
|
||||
* Copyright (C) 1999-2006, MIYASAKA Masaru.
|
||||
* For conditions of distribution and use, see copyright notice in jsimdext.inc
|
||||
*
|
||||
* This file contains the interface between the "normal" portions
|
||||
* of the library and the SIMD implementations when running on
|
||||
* ARM architecture.
|
||||
*
|
||||
* Based on the stubs from 'jsimd_none.c'
|
||||
*/
|
||||
|
||||
#define JPEG_INTERNALS
|
||||
#include "../jinclude.h"
|
||||
#include "../jpeglib.h"
|
||||
#include "../jsimd.h"
|
||||
#include "../jdct.h"
|
||||
#include "../jsimddct.h"
|
||||
#include "jsimd.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
|
||||
static unsigned int simd_support = ~0;
|
||||
|
||||
#ifdef __linux__
|
||||
|
||||
#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
|
||||
|
||||
/*
 * Decide whether one line of /proc/cpuinfo advertises a CPU feature.
 *
 * buffer   NUL-terminated line of text.
 * feature  feature name to look for (e.g. "neon").
 *
 * Returns 1 only if the line starts with "Features" and 'feature' occurs
 * after that prefix as a separate word, i.e. delimited by whitespace (or
 * by the end of the string) on both sides.  Returns 0 otherwise, including
 * when 'feature' is the empty string.
 */
LOCAL(int)
check_feature (char *buffer, char *feature)
{
  char *p;
  size_t len = strlen(feature);

  if (len == 0)
    return 0;
  if (strncmp(buffer, "Features", 8) != 0)
    return 0;
  buffer += 8;
  /* <ctype.h> functions require a value representable as unsigned char */
  while (isspace((unsigned char) *buffer))
    buffer++;

  /*
   * Visit every occurrence of 'feature'.  The character in front of a match
   * is always inspected: p[-1] is valid because every match lies behind the
   * 8-character "Features" prefix.  (Advancing the search start one byte at
   * a time and testing "p > buffer" would eventually skip this check and
   * accept e.g. "xneon" as "neon".)
   */
  for (p = strstr(buffer, feature); p != NULL; p = strstr(p + 1, feature)) {
    if (!isspace((unsigned char) p[-1]))
      continue;                 /* suffix of a longer word */
    if (p[len] != 0 && !isspace((unsigned char) p[len]))
      continue;                 /* prefix of a longer word */
    return 1;
  }
  return 0;
}
|
||||
|
||||
LOCAL(int)
|
||||
parse_proc_cpuinfo (int bufsize)
|
||||
{
|
||||
char *buffer = (char *)malloc(bufsize);
|
||||
FILE *fd;
|
||||
simd_support = 0;
|
||||
|
||||
if (!buffer)
|
||||
return 0;
|
||||
|
||||
fd = fopen("/proc/cpuinfo", "r");
|
||||
if (fd) {
|
||||
while (fgets(buffer, bufsize, fd)) {
|
||||
if (!strchr(buffer, '\n') && !feof(fd)) {
|
||||
/* "impossible" happened - insufficient size of the buffer! */
|
||||
fclose(fd);
|
||||
free(buffer);
|
||||
return 0;
|
||||
}
|
||||
if (check_feature(buffer, "neon"))
|
||||
simd_support |= JSIMD_ARM_NEON;
|
||||
}
|
||||
fclose(fd);
|
||||
}
|
||||
free(buffer);
|
||||
return 1;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Check what SIMD accelerations are supported.
|
||||
*
|
||||
* FIXME: This code is racy under a multi-threaded environment.
|
||||
*/
|
||||
LOCAL(void)
|
||||
init_simd (void)
|
||||
{
|
||||
char *env = NULL;
|
||||
int bufsize = 1024; /* an initial guess for the line buffer size limit */
|
||||
|
||||
if (simd_support != ~0)
|
||||
return;
|
||||
|
||||
simd_support = 0;
|
||||
|
||||
#ifdef __linux__
|
||||
while (!parse_proc_cpuinfo(bufsize)) {
|
||||
bufsize *= 2;
|
||||
if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Force different settings through environment variables */
|
||||
env = getenv("JSIMD_FORCE_ARM_NEON");
|
||||
if ((env != NULL) && (strcmp(env, "1") == 0))
|
||||
simd_support &= JSIMD_ARM_NEON;
|
||||
env = getenv("JSIMD_FORCE_NO_SIMD");
|
||||
if ((env != NULL) && (strcmp(env, "1") == 0))
|
||||
simd_support = 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_rgb_ycc (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_rgb_gray (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_ycc_rgb (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
|
||||
return 0;
|
||||
if (simd_support & JSIMD_ARM_NEON)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_rgb_gray_convert (j_compress_ptr cinfo,
|
||||
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
|
||||
JDIMENSION output_row, int num_rows)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
|
||||
JSAMPIMAGE input_buf, JDIMENSION input_row,
|
||||
JSAMPARRAY output_buf, int num_rows)
|
||||
{
|
||||
void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
|
||||
|
||||
switch(cinfo->out_color_space)
|
||||
{
|
||||
case JCS_EXT_RGB:
|
||||
neonfct=jsimd_ycc_extrgb_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_RGBX:
|
||||
neonfct=jsimd_ycc_extrgbx_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_BGR:
|
||||
neonfct=jsimd_ycc_extbgr_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_BGRX:
|
||||
neonfct=jsimd_ycc_extbgrx_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_XBGR:
|
||||
neonfct=jsimd_ycc_extxbgr_convert_neon;
|
||||
break;
|
||||
case JCS_EXT_XRGB:
|
||||
neonfct=jsimd_ycc_extxrgb_convert_neon;
|
||||
break;
|
||||
default:
|
||||
neonfct=jsimd_ycc_extrgb_convert_neon;
|
||||
break;
|
||||
}
|
||||
|
||||
if (simd_support & JSIMD_ARM_NEON)
|
||||
neonfct(cinfo->output_width, input_buf,
|
||||
input_row, output_buf, num_rows);
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v2_downsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v1_downsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY output_data)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JSAMPARRAY input_data, JSAMPARRAY output_data)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v2_upsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v1_upsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v2_upsample (j_decompress_ptr cinfo,
|
||||
jpeg_component_info * compptr,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY * output_data_ptr)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v1_upsample (j_decompress_ptr cinfo,
|
||||
jpeg_component_info * compptr,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY * output_data_ptr)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v2_fancy_upsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v1_fancy_upsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
|
||||
jpeg_component_info * compptr,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY * output_data_ptr)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
|
||||
jpeg_component_info * compptr,
|
||||
JSAMPARRAY input_data,
|
||||
JSAMPARRAY * output_data_ptr)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v2_merged_upsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_h2v1_merged_upsample (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
|
||||
JSAMPIMAGE input_buf,
|
||||
JDIMENSION in_row_group_ctr,
|
||||
JSAMPARRAY output_buf)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
|
||||
JSAMPIMAGE input_buf,
|
||||
JDIMENSION in_row_group_ctr,
|
||||
JSAMPARRAY output_buf)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_convsamp (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_convsamp_float (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
DCTELEM * workspace)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
|
||||
FAST_FLOAT * workspace)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_fdct_islow (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_fdct_ifast (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_fdct_float (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_fdct_islow (DCTELEM * data)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_fdct_ifast (DCTELEM * data)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_fdct_float (FAST_FLOAT * data)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_quantize (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_quantize_float (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
|
||||
DCTELEM * workspace)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
|
||||
FAST_FLOAT * workspace)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_idct_2x2 (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_idct_4x4 (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_idct_islow (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_idct_ifast (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
/* The code is optimised for these values only */
|
||||
if (DCTSIZE != 8)
|
||||
return 0;
|
||||
if (sizeof(JCOEF) != 2)
|
||||
return 0;
|
||||
if (BITS_IN_JSAMPLE != 8)
|
||||
return 0;
|
||||
if (sizeof(JDIMENSION) != 4)
|
||||
return 0;
|
||||
if (sizeof(IFAST_MULT_TYPE) != 2)
|
||||
return 0;
|
||||
if (IFAST_SCALE_BITS != 2)
|
||||
return 0;
|
||||
|
||||
if ((simd_support & JSIMD_ARM_NEON))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(int)
|
||||
jsimd_can_idct_float (void)
|
||||
{
|
||||
init_simd();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
if ((simd_support & JSIMD_ARM_NEON))
|
||||
jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, output_col);
|
||||
}
|
||||
|
||||
GLOBAL(void)
|
||||
jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
|
||||
JCOEFPTR coef_block, JSAMPARRAY output_buf,
|
||||
JDIMENSION output_col)
|
||||
{
|
||||
}
|
||||
|
||||
484
simd/jsimd_arm_neon.S
Normal file
484
simd/jsimd_arm_neon.S
Normal file
@@ -0,0 +1,484 @@
|
||||
/*
|
||||
* ARM NEON optimizations for libjpeg-turbo
|
||||
*
|
||||
* Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
|
||||
* All rights reserved.
|
||||
* Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the authors be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
|
||||
#endif
|
||||
|
||||
.text
|
||||
.fpu neon
|
||||
.arch armv7a
|
||||
.object_arch armv4
|
||||
.altmacro
|
||||
.arm
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
/*
 * Supplementary macro for setting function attributes.
 *
 * Opens a .func scope for \fname (the function body must end with
 * .endfunc), exports the symbol and, on ELF targets, marks it hidden and
 * of type %function.  Finally emits the entry label itself.
 */
.macro asm_function fname
    .func \fname
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
.endm
|
||||
|
||||
/*
 * Transpose a block of 4x4 coefficients in four 64-bit registers.
 *
 * x0..x3 are the four D registers holding one row of 16-bit elements each;
 * the transpose is done in place with two 16-bit and two 32-bit VTRNs.
 */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16 \x0, \x1
    vtrn.16 \x2, \x3
    vtrn.32 \x0, \x2
    vtrn.32 \x1, \x3
.endm
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
/*
|
||||
* jsimd_idct_ifast_neon
|
||||
*
|
||||
* This function contains a fast, not so accurate integer implementation of
|
||||
* the inverse DCT (Discrete Cosine Transform). It uses the same calculations
|
||||
* and produces exactly the same output as IJG's original 'jpeg_idct_fast'
|
||||
* function from jidctfst.c
|
||||
*
|
||||
* TODO: a bit better instructions scheduling is needed.
|
||||
*/
|
||||
|
||||
/*
 * Fixed-point multipliers for the 1-D IDCT.  The table below is loaded
 * into d0 by jsimd_idct_ifast_neon, so each XFIX_* name denotes one
 * 16-bit lane of d0.
 *
 * Every constant is stored with its integer part removed and scaled by
 * 2^15 (N * 128 - I * 256 * 128 == (N / 256 - I) * 2^15), because it is
 * consumed by VQDMULH, which returns the high half of the doubled product,
 * i.e. (a * c) >> 15.  The integer part is restored by adding the operand
 * itself back (once, or twice for 2.613...).
 */
#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */

/*
 * 1-D IDCT helper macro
 *
 * x0..x7    eight Q registers, one per input line; the eight result
 *           lines are written back in place
 * t10..t14  five scratch Q registers (clobbered)
 *
 * Expects the XFIX_* multipliers in d0.
 */
.macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7, \
                   t10, t11, t12, t13, t14

    /* Butterflies: (a, b) <- (a - b, a + b) for the pairs
     * (x0, x4), (x2, x6), (x3, x5) and (x1, x7) */
    vsub.s16        \t10, \x0, \x4
    vadd.s16        \x4,  \x0, \x4
    vswp.s16        \t10, \x0
    vsub.s16        \t11, \x2, \x6
    vadd.s16        \x6,  \x2, \x6
    vswp.s16        \t11, \x2
    vsub.s16        \t10, \x3, \x5
    vadd.s16        \x5,  \x3, \x5
    vswp.s16        \t10, \x3
    vsub.s16        \t11, \x1, \x7
    vadd.s16        \x7,  \x1, \x7
    vswp.s16        \t11, \x1

    /* Multiplications: vqdmulh supplies the fractional part of each
     * product, the following vadd supplies the integer part */
    vqdmulh.s16     \t13, \x2,  XFIX_1_414213562
    vadd.s16        \t12, \x3,  \x3
    vadd.s16        \x2,  \x2,  \t13
    vqdmulh.s16     \t13, \x3,  XFIX_2_613125930
    vsub.s16        \t10, \x1,  \x3
    vadd.s16        \t12, \t12, \t13
    vqdmulh.s16     \t13, \t10, XFIX_1_847759065
    vsub.s16        \t11, \x7,  \x5
    vadd.s16        \t10, \t10, \t13
    vqdmulh.s16     \t13, \t11, XFIX_1_414213562
    vadd.s16        \t11, \t11, \t13

    vqdmulh.s16     \t13, \x1,  XFIX_1_082392200
    vsub.s16        \x2,  \x6,  \x2
    vsub.s16        \t14, \x0,  \x2
    vadd.s16        \x2,  \x0,  \x2
    vadd.s16        \x0,  \x4,  \x6
    vsub.s16        \x4,  \x4,  \x6
    vadd.s16        \x1,  \x1,  \t13
    vadd.s16        \t13, \x7,  \x5
    vsub.s16        \t12, \t13, \t12
    vsub.s16        \t12, \t12, \t10
    vadd.s16        \t11, \t12, \t11
    vsub.s16        \t10, \x1,  \t10
    vadd.s16        \t10, \t10, \t11

    /* Final butterflies producing the eight output lines */
    vsub.s16        \x7,  \x0,  \t13
    vadd.s16        \x0,  \x0,  \t13
    vadd.s16        \x6,  \t14, \t12
    vsub.s16        \x1,  \t14, \t12
    vsub.s16        \x5,  \x2,  \t11
    vadd.s16        \x2,  \x2,  \t11
    vsub.s16        \x3,  \x4,  \t10
    vadd.s16        \x4,  \x4,  \t10
.endm
|
||||
|
||||
/*
 * void jsimd_idct_ifast_neon (void * dct_table, JCOEFPTR coef_block,
 *                             JSAMPARRAY output_buf, JDIMENSION output_col)
 *
 * In:     r0 = dct_table, r1 = coef_block, r2 = output_buf, r3 = output_col
 * Clobb:  r0-r3 and ip; d0, d4-d7, d16-d31.  d8-d15 are used as well but
 *         saved on entry and restored before returning.
 */
asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP             .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP, jsimd_idct_ifast_neon_consts
    vld1.16         {d0}, [TMP, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | d8      | d9
     *   3 | d10     | d11
     *   4 | d12     | d13
     *   5 | d14     | d15
     *   6 | d16     | d17
     *   7 | d18     | d19
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK]!
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK]!
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK]!

    /* Dequantize: multiply every coefficient by its table entry.  Table
     * loads are interleaved with the multiplies; q10/q11 are reloaded
     * with the last two table rows once rows 0 and 1 are done. */
    vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]!
    vmul.s16        q2, q2, q10
    vld1.16         {d24, d25, d26, d27}, [DCT_TABLE]!
    vmul.s16        q3, q3, q11
    vmul.s16        q4, q4, q12
    vld1.16         {d28, d29, d30, d31}, [DCT_TABLE]!
    vmul.s16        q5, q5, q13
    vmul.s16        q6, q6, q14
    vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]!
    vmul.s16        q7, q7, q15
    vmul.s16        q8, q8, q10
    vmul.s16        q9, q9, q11

    /* Pass 1 */
    idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
    /* Transpose: four 4x4 quadrant transposes, then exchange the two
     * off-diagonal quadrants */
    transpose_4x4   d4,  d6,  d8,  d10
    transpose_4x4   d5,  d7,  d9,  d11
    transpose_4x4   d12, d14, d16, d18
    transpose_4x4   d13, d15, d17, d19
    vswp            d12, d5
    vswp            d14, d7
    vswp            d16, d9
    vswp            d18, d11

    /* Pass 2 */
    idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
    /* Transpose */
    transpose_4x4   d4,  d6,  d8,  d10
    transpose_4x4   d5,  d7,  d9,  d11
    transpose_4x4   d12, d14, d16, d18
    transpose_4x4   d13, d15, d17, d19
    vswp            d12, d5
    vswp            d14, d7
    vswp            d16, d9
    vswp            d18, d11

    /* Descale and range limit: add 0x80 pre-shifted by the 5 bits that
     * are dropped next, then shift right by 5 while narrowing to unsigned
     * 8 bits with saturation.  Each result row lands in the low D half of
     * its Q register. */
    vmov.s16        q15, #(0x80 << 5)
    vqadd.s16       q2, q2, q15
    vqadd.s16       q3, q3, q15
    vqadd.s16       q4, q4, q15
    vqadd.s16       q5, q5, q15
    vqadd.s16       q6, q6, q15
    vqadd.s16       q7, q7, q15
    vqadd.s16       q8, q8, q15
    vqadd.s16       q9, q9, q15
    vqshrun.s16     d4,  q2, #5
    vqshrun.s16     d6,  q3, #5
    vqshrun.s16     d8,  q4, #5
    vqshrun.s16     d10, q5, #5
    vqshrun.s16     d12, q6, #5
    vqshrun.s16     d14, q7, #5
    vqshrun.s16     d16, q8, #5
    vqshrun.s16     d18, q9, #5

    /* Store results to the output buffer: for each of the eight rows,
     * fetch the next row pointer from OUTPUT_BUF, add the column offset
     * and write 8 bytes there */
    .irp            x, d4, d6, d8, d10, d12, d14, d16, d18
    ldr             TMP, [OUTPUT_BUF], #4
    add             TMP, TMP, OUTPUT_COL
    vst1.8          {\x}, [TMP]!
    .endr

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP
.endfunc
|
||||
|
||||
.purgem idct_helper
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
/*
|
||||
* jsimd_ycc_extrgb_convert_neon
|
||||
* jsimd_ycc_extbgr_convert_neon
|
||||
* jsimd_ycc_extrgbx_convert_neon
|
||||
* jsimd_ycc_extbgrx_convert_neon
|
||||
* jsimd_ycc_extxbgr_convert_neon
|
||||
* jsimd_ycc_extxrgb_convert_neon
|
||||
*
|
||||
* Colorspace conversion YCbCr -> RGB
|
||||
*/
|
||||
|
||||
/*
 * Constants for the YCbCr -> RGB converters; the whole table is loaded
 * into d0-d3 at function entry.
 *   row 0 (d0): padding only
 *   row 1 (d1): multipliers, addressed as d1[0]..d1[3]:
 *                 22971 ~=  1.40200 * 2^14   (result shifted right by 14)
 *                -11277 ~= -0.34414 * 2^15   (result shifted right by 15)
 *                -23401 ~= -0.71414 * 2^15   (result shifted right by 15)
 *                 29033 ~=  1.77200 * 2^14   (result shifted right by 14)
 *   rows 2-3 (d2/d3 = q1): -128, added to the U and V samples
 */
.balign 16
|
||||
jsimd_ycc_rgb_neon_consts:
|
||||
.short 0, 0, 0, 0
|
||||
.short 22971, -11277, -23401, 29033
|
||||
.short -128, -128, -128, -128
|
||||
.short -128, -128, -128, -128
|
||||
|
||||
/*
 * Load \size pixels worth of samples: U into d4, V into d5 and Y into d0,
 * advancing the U, V and Y pointers.
 *
 * size == 8 fills the whole registers and prefetches 64 bytes ahead on
 * each plane.  The partial sizes fill disjoint lanes (4 -> lanes 0-3,
 * 2 -> lanes 4-5, 1 -> lane 6), so they can be combined to load a row
 * tail of 1 to 7 pixels.
 */
.macro do_load size
    .if \size == 8
        vld1.8  {d4}, [U]!
        vld1.8  {d5}, [V]!
        vld1.8  {d0}, [Y]!
        pld     [Y, #64]
        pld     [U, #64]
        pld     [V, #64]
    .elseif \size == 4
        vld1.8  {d4[0]}, [U]!
        vld1.8  {d4[1]}, [U]!
        vld1.8  {d4[2]}, [U]!
        vld1.8  {d4[3]}, [U]!
        vld1.8  {d5[0]}, [V]!
        vld1.8  {d5[1]}, [V]!
        vld1.8  {d5[2]}, [V]!
        vld1.8  {d5[3]}, [V]!
        vld1.8  {d0[0]}, [Y]!
        vld1.8  {d0[1]}, [Y]!
        vld1.8  {d0[2]}, [Y]!
        vld1.8  {d0[3]}, [Y]!
    .elseif \size == 2
        vld1.8  {d4[4]}, [U]!
        vld1.8  {d4[5]}, [U]!
        vld1.8  {d5[4]}, [V]!
        vld1.8  {d5[5]}, [V]!
        vld1.8  {d0[4]}, [Y]!
        vld1.8  {d0[5]}, [Y]!
    .elseif \size == 1
        vld1.8  {d4[6]}, [U]!
        vld1.8  {d5[6]}, [V]!
        vld1.8  {d0[6]}, [Y]!
    .else
        .error unsupported macroblock size
    .endif
.endm
|
||||
|
||||
/*
 * Store \size converted pixels at RGB, advancing the pointer.
 *
 * bpp   24: interleave d10, d11, d12 (3 bytes per pixel)
 *       32: interleave d10, d11, d12, d13 (4 bytes per pixel)
 * size  8 writes all lanes; 4, 2 and 1 write lanes 0-3, 4-5 and 6
 *       respectively, mirroring the lane layout used by do_load.
 */
.macro do_store bpp, size
    .if \bpp == 24
        .if \size == 8
            vst3.8  {d10, d11, d12}, [RGB]!
        .elseif \size == 4
            vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
            vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
            vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
            vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
            vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vst4.8  {d10, d11, d12, d13}, [RGB]!
        .elseif \size == 4
            vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm
|
||||
|
||||
/*
 * Emit the function jsimd_ycc_<colorid>_convert_neon.
 *
 * colorid                 name fragment of the generated function
 * bpp                     output bits per pixel (24 or 32), handed to do_store
 * r_offs, g_offs, b_offs  single digits appended to "d1" to select the
 *                         output register (d10..d13) of each channel
 *
 * Generated function:
 *   In:  r0 = output width, r1 = input_buf (array of three plane
 *        pointers), r2 = input_row, r3 = output_buf, [sp] = num_rows
 *   r4-r10, lr and d8-d15 are saved on entry and restored on return.
 */
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * Convert the 8 pixels held in d0 (Y), d4 (U), d5 (V); expects the
 * multipliers in d1 and -128 in q1.  Results go to the channel registers
 * selected by the offsets.
 */
.macro do_yuv_to_rgb
    vaddw.u8        q3, q1, d4          /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5          /* q4 = v - 128 */
    vmull.s16       q10, d6, d1[1]      /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2]      /* multiply by -23401 */
    vmull.s16       q11, d7, d1[1]      /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2]      /* multiply by -23401 */
    vmull.s16       q12, d8, d1[0]      /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0]      /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3]      /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3]      /* multiply by 29033 */
    /* Round and narrow the 32-bit products back to 16 bits */
    vrshrn.s32      d20, q10, #15
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14
    vrshrn.s32      d29, q15, #14
    /* Add luma, then narrow to unsigned 8 bits with saturation */
    vaddw.u8        q10, q10, d0
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
    vqmovun.s16     d1&g_offs, q10
    vqmovun.s16     d1&r_offs, q12
    vqmovun.s16     d1&b_offs, q14
.endm

asm_function jsimd_ycc_&colorid&_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    INPUT_ROW       .req r2
    OUTPUT_BUF      .req r3
    NUM_ROWS        .req r4

    INPUT_BUF0      .req r5
    INPUT_BUF1      .req r6
    INPUT_BUF2      .req INPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adrl            ip, jsimd_ycc_rgb_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments; the fifth argument
     * sits just above the eight words pushed here */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #4]
    ldr             INPUT_BUF2, [INPUT_BUF, #8]
    .unreq          INPUT_BUF

    /* Save NEON registers */
    vpush           {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF */
    vmov.u8         q5, #255
    vmov.u8         q6, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #4

    /* Inner loop over pixels, 8 at a time */
    subs            N, N, #8
    blt             2f
1:
    do_load         8
    do_yuv_to_rgb
    do_store        bpp, 8
    subs            N, N, #8
    bge             1b
    tst             N, #7
    beq             8f
2:
    /* Row tail: the low three bits of N hold the number of pixels left.
     * Load them in chunks of 4, 2 and 1, convert once, store likewise. */
    tst             N, #4
    beq             3f
    do_load         4
3:
    tst             N, #2
    beq             4f
    do_load         2
4:
    tst             N, #1
    beq             5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    beq             6f
    do_store        bpp, 4
6:
    tst             N, #2
    beq             7f
    do_store        bpp, 2
7:
    tst             N, #1
    beq             8f
    do_store        bpp, 1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N
.endfunc

.purgem do_yuv_to_rgb

.endm
|
||||
|
||||
/*
 * Instantiate one converter per supported output pixel layout.  The R, G
 * and B columns are the digits that the generator macro appends to "d1" to
 * pick the register (d10..d13) receiving that channel, i.e. the channel's
 * position within the stored pixel.
 */
/*--------------------------------- id ----- bpp R G B */
|
||||
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
|
||||
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
|
||||
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
|
||||
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
|
||||
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
|
||||
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
|
||||
|
||||
/* The load/store helper macros are not needed past this point */
.purgem do_load
|
||||
.purgem do_store
|
||||
|
||||
/*****************************************************************************/
|
||||
Reference in New Issue
Block a user