Use intrinsics for loading and storing data in the DCT/IDCT functions. This has no effect on the performance of the aligned loads/stores, but it makes it more obvious what the code is doing. Using intrinsics for the unaligned stores in the inverse DCT functions increases overall decompression performance by 1-2%.

git-svn-id: svn://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1472 632fc199-4ca6-4c93-a231-07263d6284db
This commit is contained in:
DRC
2014-12-22 16:04:17 +00:00
parent 243aba148e
commit 510e67c542
4 changed files with 122 additions and 82 deletions

View File

@@ -108,14 +108,14 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
/* Pass 1: process rows */
row0 = *(__vector short *)&data[0];
row1 = *(__vector short *)&data[8];
row2 = *(__vector short *)&data[16];
row3 = *(__vector short *)&data[24];
row4 = *(__vector short *)&data[32];
row5 = *(__vector short *)&data[40];
row6 = *(__vector short *)&data[48];
row7 = *(__vector short *)&data[56];
row0 = vec_ld(0, data);
row1 = vec_ld(16, data);
row2 = vec_ld(32, data);
row3 = vec_ld(48, data);
row4 = vec_ld(64, data);
row5 = vec_ld(80, data);
row6 = vec_ld(96, data);
row7 = vec_ld(112, data);
TRANSPOSE(row, col);
@@ -145,12 +145,12 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
DO_FDCT();
*(__vector short *)&data[0] = out0;
*(__vector short *)&data[8] = out1;
*(__vector short *)&data[16] = out2;
*(__vector short *)&data[24] = out3;
*(__vector short *)&data[32] = out4;
*(__vector short *)&data[40] = out5;
*(__vector short *)&data[48] = out6;
*(__vector short *)&data[56] = out7;
vec_st(out0, 0, data);
vec_st(out1, 16, data);
vec_st(out2, 32, data);
vec_st(out3, 48, data);
vec_st(out4, 64, data);
vec_st(out5, 80, data);
vec_st(out6, 96, data);
vec_st(out7, 112, data);
}