Use AltiVec intrinsics (vec_ld()/vec_st()) for loading/storing data in the forward and inverse DCT functions. This has no effect on the performance of the aligned loads/stores, but it makes it more obvious what that code is doing. Using intrinsics (vec_ste()) for the unaligned stores in the inverse DCT functions increases overall decompression performance by 1-2%.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1472 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
DRC
2014-12-22 16:04:17 +00:00
parent 243aba148e
commit 510e67c542
4 changed files with 122 additions and 82 deletions

View File

@@ -108,14 +108,14 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
/* Pass 1: process rows */ /* Pass 1: process rows */
row0 = *(__vector short *)&data[0]; row0 = vec_ld(0, data);
row1 = *(__vector short *)&data[8]; row1 = vec_ld(16, data);
row2 = *(__vector short *)&data[16]; row2 = vec_ld(32, data);
row3 = *(__vector short *)&data[24]; row3 = vec_ld(48, data);
row4 = *(__vector short *)&data[32]; row4 = vec_ld(64, data);
row5 = *(__vector short *)&data[40]; row5 = vec_ld(80, data);
row6 = *(__vector short *)&data[48]; row6 = vec_ld(96, data);
row7 = *(__vector short *)&data[56]; row7 = vec_ld(112, data);
TRANSPOSE(row, col); TRANSPOSE(row, col);
@@ -145,12 +145,12 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
DO_FDCT(); DO_FDCT();
*(__vector short *)&data[0] = out0; vec_st(out0, 0, data);
*(__vector short *)&data[8] = out1; vec_st(out1, 16, data);
*(__vector short *)&data[16] = out2; vec_st(out2, 32, data);
*(__vector short *)&data[24] = out3; vec_st(out3, 48, data);
*(__vector short *)&data[32] = out4; vec_st(out4, 64, data);
*(__vector short *)&data[40] = out5; vec_st(out5, 80, data);
*(__vector short *)&data[48] = out6; vec_st(out6, 96, data);
*(__vector short *)&data[56] = out7; vec_st(out7, 112, data);
} }

View File

@@ -177,14 +177,14 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
/* Pass 1: process rows */ /* Pass 1: process rows */
row0 = *(__vector short *)&data[0]; row0 = vec_ld(0, data);
row1 = *(__vector short *)&data[8]; row1 = vec_ld(16, data);
row2 = *(__vector short *)&data[16]; row2 = vec_ld(32, data);
row3 = *(__vector short *)&data[24]; row3 = vec_ld(48, data);
row4 = *(__vector short *)&data[32]; row4 = vec_ld(64, data);
row5 = *(__vector short *)&data[40]; row5 = vec_ld(80, data);
row6 = *(__vector short *)&data[48]; row6 = vec_ld(96, data);
row7 = *(__vector short *)&data[56]; row7 = vec_ld(112, data);
TRANSPOSE(row, col); TRANSPOSE(row, col);
@@ -214,12 +214,12 @@ jsimd_fdct_islow_altivec (DCTELEM *data)
DO_FDCT_COLS(); DO_FDCT_COLS();
*(__vector short *)&data[0] = out0; vec_st(out0, 0, data);
*(__vector short *)&data[8] = out1; vec_st(out1, 16, data);
*(__vector short *)&data[16] = out2; vec_st(out2, 32, data);
*(__vector short *)&data[24] = out3; vec_st(out3, 48, data);
*(__vector short *)&data[32] = out4; vec_st(out4, 64, data);
*(__vector short *)&data[40] = out5; vec_st(out5, 80, data);
*(__vector short *)&data[48] = out6; vec_st(out6, 96, data);
*(__vector short *)&data[56] = out7; vec_st(out7, 112, data);
} }

View File

@@ -112,7 +112,7 @@ jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
z5, z10, z10s, z11, z12s, z13, z5, z10, z10s, z11, z12s, z13,
out0, out1, out2, out3, out4, out5, out6, out7; out0, out1, out2, out3, out4, out5, out6, out7;
__vector signed char outb; __vector signed char outb;
long long *outptr, *outbptr = (long long *)(&outb); int *outptr;
/* Constants */ /* Constants */
__vector short zero = { __8X(0) }, __vector short zero = { __8X(0) },
@@ -127,14 +127,14 @@ jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
/* Pass 1: process columns */ /* Pass 1: process columns */
col0 = *(__vector short *)&coef_block[0]; col0 = vec_ld(0, coef_block);
col1 = *(__vector short *)&coef_block[8]; col1 = vec_ld(16, coef_block);
col2 = *(__vector short *)&coef_block[16]; col2 = vec_ld(32, coef_block);
col3 = *(__vector short *)&coef_block[24]; col3 = vec_ld(48, coef_block);
col4 = *(__vector short *)&coef_block[32]; col4 = vec_ld(64, coef_block);
col5 = *(__vector short *)&coef_block[40]; col5 = vec_ld(80, coef_block);
col6 = *(__vector short *)&coef_block[48]; col6 = vec_ld(96, coef_block);
col7 = *(__vector short *)&coef_block[56]; col7 = vec_ld(112, coef_block);
tmp1 = vec_or(col1, col2); tmp1 = vec_or(col1, col2);
tmp2 = vec_or(col3, col4); tmp2 = vec_or(col3, col4);
@@ -196,31 +196,51 @@ jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
TRANSPOSE(out, col); TRANSPOSE(out, col);
outb = vec_packs(col0, col1); outb = vec_packs(col0, col0);
outb = vec_add(outb, pb_centerjsamp); outb = vec_add(outb, pb_centerjsamp);
outptr = (long long *)(output_buf[0] + output_col); outptr = (int *)(output_buf[0] + output_col);
*outptr = outbptr[0]; vec_ste((__vector int)outb, 0, outptr);
outptr = (long long *)(output_buf[1] + output_col); vec_ste((__vector int)outb, 4, outptr);
*outptr = outbptr[1];
outb = vec_packs(col2, col3); outb = vec_packs(col1, col1);
outb = vec_add(outb, pb_centerjsamp); outb = vec_add(outb, pb_centerjsamp);
outptr = (long long *)(output_buf[2] + output_col); outptr = (int *)(output_buf[1] + output_col);
*outptr = outbptr[0]; vec_ste((__vector int)outb, 0, outptr);
outptr = (long long *)(output_buf[3] + output_col); vec_ste((__vector int)outb, 4, outptr);
*outptr = outbptr[1];
outb = vec_packs(col4, col5); outb = vec_packs(col2, col2);
outb = vec_add(outb, pb_centerjsamp); outb = vec_add(outb, pb_centerjsamp);
outptr = (long long *)(output_buf[4] + output_col); outptr = (int *)(output_buf[2] + output_col);
*outptr = outbptr[0]; vec_ste((__vector int)outb, 0, outptr);
outptr = (long long *)(output_buf[5] + output_col); vec_ste((__vector int)outb, 4, outptr);
*outptr = outbptr[1];
outb = vec_packs(col6, col7); outb = vec_packs(col3, col3);
outb = vec_add(outb, pb_centerjsamp); outb = vec_add(outb, pb_centerjsamp);
outptr = (long long *)(output_buf[6] + output_col); outptr = (int *)(output_buf[3] + output_col);
*outptr = outbptr[0]; vec_ste((__vector int)outb, 0, outptr);
outptr = (long long *)(output_buf[7] + output_col); vec_ste((__vector int)outb, 4, outptr);
*outptr = outbptr[1];
outb = vec_packs(col4, col4);
outb = vec_add(outb, pb_centerjsamp);
outptr = (int *)(output_buf[4] + output_col);
vec_ste((__vector int)outb, 0, outptr);
vec_ste((__vector int)outb, 4, outptr);
outb = vec_packs(col5, col5);
outb = vec_add(outb, pb_centerjsamp);
outptr = (int *)(output_buf[5] + output_col);
vec_ste((__vector int)outb, 0, outptr);
vec_ste((__vector int)outb, 4, outptr);
outb = vec_packs(col6, col6);
outb = vec_add(outb, pb_centerjsamp);
outptr = (int *)(output_buf[6] + output_col);
vec_ste((__vector int)outb, 0, outptr);
vec_ste((__vector int)outb, 4, outptr);
outb = vec_packs(col7, col7);
outb = vec_add(outb, pb_centerjsamp);
outptr = (int *)(output_buf[7] + output_col);
vec_ste((__vector int)outb, 0, outptr);
vec_ste((__vector int)outb, 4, outptr);
} }

View File

@@ -186,7 +186,7 @@ jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h, out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
out5l, out5h, out6l, out6h, out7l, out7h; out5l, out5h, out6l, out6h, out7l, out7h;
__vector signed char outb; __vector signed char outb;
long long *outptr, *outbptr = (long long *)(&outb); int *outptr;
/* Constants */ /* Constants */
__vector short zero16 = { __8X(0) }, __vector short zero16 = { __8X(0) },
@@ -271,31 +271,51 @@ jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
TRANSPOSE(out, col); TRANSPOSE(out, col);
outb = vec_packs(col0, col1); outb = vec_packs(col0, col0);
outb = vec_add(outb, pb_centerjsamp); outb = vec_add(outb, pb_centerjsamp);
outptr = (long long *)(output_buf[0] + output_col); outptr = (int *)(output_buf[0] + output_col);
*outptr = outbptr[0]; vec_ste((__vector int)outb, 0, outptr);
outptr = (long long *)(output_buf[1] + output_col); vec_ste((__vector int)outb, 4, outptr);
*outptr = outbptr[1];
outb = vec_packs(col2, col3); outb = vec_packs(col1, col1);
outb = vec_add(outb, pb_centerjsamp); outb = vec_add(outb, pb_centerjsamp);
outptr = (long long *)(output_buf[2] + output_col); outptr = (int *)(output_buf[1] + output_col);
*outptr = outbptr[0]; vec_ste((__vector int)outb, 0, outptr);
outptr = (long long *)(output_buf[3] + output_col); vec_ste((__vector int)outb, 4, outptr);
*outptr = outbptr[1];
outb = vec_packs(col4, col5); outb = vec_packs(col2, col2);
outb = vec_add(outb, pb_centerjsamp); outb = vec_add(outb, pb_centerjsamp);
outptr = (long long *)(output_buf[4] + output_col); outptr = (int *)(output_buf[2] + output_col);
*outptr = outbptr[0]; vec_ste((__vector int)outb, 0, outptr);
outptr = (long long *)(output_buf[5] + output_col); vec_ste((__vector int)outb, 4, outptr);
*outptr = outbptr[1];
outb = vec_packs(col6, col7); outb = vec_packs(col3, col3);
outb = vec_add(outb, pb_centerjsamp); outb = vec_add(outb, pb_centerjsamp);
outptr = (long long *)(output_buf[6] + output_col); outptr = (int *)(output_buf[3] + output_col);
*outptr = outbptr[0]; vec_ste((__vector int)outb, 0, outptr);
outptr = (long long *)(output_buf[7] + output_col); vec_ste((__vector int)outb, 4, outptr);
*outptr = outbptr[1];
outb = vec_packs(col4, col4);
outb = vec_add(outb, pb_centerjsamp);
outptr = (int *)(output_buf[4] + output_col);
vec_ste((__vector int)outb, 0, outptr);
vec_ste((__vector int)outb, 4, outptr);
outb = vec_packs(col5, col5);
outb = vec_add(outb, pb_centerjsamp);
outptr = (int *)(output_buf[5] + output_col);
vec_ste((__vector int)outb, 0, outptr);
vec_ste((__vector int)outb, 4, outptr);
outb = vec_packs(col6, col6);
outb = vec_add(outb, pb_centerjsamp);
outptr = (int *)(output_buf[6] + output_col);
vec_ste((__vector int)outb, 0, outptr);
vec_ste((__vector int)outb, 4, outptr);
outb = vec_packs(col7, col7);
outb = vec_add(outb, pb_centerjsamp);
outptr = (int *)(output_buf[7] + output_col);
vec_ste((__vector int)outb, 0, outptr);
vec_ste((__vector int)outb, 4, outptr);
} }