AVX2: Avoid expensive AVX-SSE transitions

Refer to
https://software.intel.com/sites/default/files/m/d/4/1/d/8/11MC12_Avoiding_2BAVX-SSE_2BTransition_2BPenalties_2Brh_2Bfinal.pdf
for more information.  This adds a vzeroupper instruction to the return
path of each AVX2 SIMD function touched here, which eliminates all AVX-SSE
transitions detected with the Intel SDE tool.
This commit is contained in:
DRC
2016-07-08 20:10:24 -05:00
parent e06ccbe3f7
commit a7c2f97939
12 changed files with 20 additions and 0 deletions

View File

@@ -550,6 +550,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
.return:
pop rbx
vzeroupper
uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp

View File

@@ -565,6 +565,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
jg near .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved

View File

@@ -428,6 +428,7 @@ EXTN(jsimd_rgb_gray_convert_avx2):
.return:
pop rbx
vzeroupper
uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp

View File

@@ -443,6 +443,7 @@ EXTN(jsimd_rgb_gray_convert_avx2):
jg near .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved

View File

@@ -177,6 +177,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
jg near .rowloop
.return:
vzeroupper
uncollect_args 6
pop rbp
ret
@@ -355,6 +356,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
jg near .rowloop
.return:
vzeroupper
uncollect_args 6
pop rbp
ret

View File

@@ -184,6 +184,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
jg near .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved
@@ -373,6 +374,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
jg near .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved

View File

@@ -485,6 +485,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
.return:
pop rbx
vzeroupper
uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp

View File

@@ -500,6 +500,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
sfence ; flush the write buffer
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved

View File

@@ -479,6 +479,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
.return:
pop rbx
vzeroupper
uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp

View File

@@ -493,6 +493,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
sfence ; flush the write buffer
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved

View File

@@ -186,6 +186,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
jg near .rowloop
.return:
vzeroupper
uncollect_args 4
pop_xmm 3
pop rbp
@@ -497,6 +498,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
.return:
pop rbx
vzeroupper
uncollect_args 4
pop_xmm 3
mov rsp, rbp ; rsp <- aligned rbp
@@ -590,6 +592,7 @@ EXTN(jsimd_h2v1_upsample_avx2):
jg short .rowloop
.return:
vzeroupper
uncollect_args 4
pop rbp
ret
@@ -688,6 +691,7 @@ EXTN(jsimd_h2v2_upsample_avx2):
.return:
pop rbx
vzeroupper
uncollect_args 4
pop rbp
ret

View File

@@ -193,6 +193,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
jg near .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved
@@ -540,6 +541,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
jg near .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved
@@ -642,6 +644,7 @@ EXTN(jsimd_h2v1_upsample_avx2):
jg short .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved
@@ -748,6 +751,7 @@ EXTN(jsimd_h2v2_upsample_avx2):
jg near .rowloop
.return:
vzeroupper
pop edi
pop esi
; pop edx ; need not be preserved