radeonsi: emit PS exports last
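This effectively removes the s_waitcnt expcnt(0) instructions that used to follow each FP16 export. In the example below, the old code reuses v0/v1 for every export and so must wait for the previous export to finish reading them; the new code packs each export's data into its own VGPR pair, so no wait is needed.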
Before:
v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
exp 15, 0, 1, 0, 0, v0, v1, v0, v0 ; F800040F 00000100
s_waitcnt expcnt(0) ; BF8C0F0F
v_cvt_pkrtz_f16_f32_e32 v0, v4, v5 ; 5E000B04
v_cvt_pkrtz_f16_f32_e32 v1, v6, v7 ; 5E020F06
exp 15, 1, 1, 0, 0, v0, v1, v0, v0 ; F800041F 00000100
s_waitcnt expcnt(0) ; BF8C0F0F
v_cvt_pkrtz_f16_f32_e32 v0, v8, v9 ; 5E001308
v_cvt_pkrtz_f16_f32_e32 v1, v10, v11 ; 5E02170A
exp 15, 2, 1, 0, 0, v0, v1, v0, v0 ; F800042F 00000100
s_waitcnt expcnt(0) ; BF8C0F0F
v_cvt_pkrtz_f16_f32_e32 v0, v12, v13 ; 5E001B0C
v_cvt_pkrtz_f16_f32_e32 v1, v14, v15 ; 5E021F0E
exp 15, 3, 1, 1, 1, v0, v1, v0, v0 ; F8001C3F 00000100
s_endpgm ; BF810000
After:
v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
v_cvt_pkrtz_f16_f32_e32 v2, v4, v5 ; 5E040B04
v_cvt_pkrtz_f16_f32_e32 v3, v6, v7 ; 5E060F06
exp 15, 0, 1, 0, 0, v0, v1, v0, v0 ; F800040F 00000100
v_cvt_pkrtz_f16_f32_e32 v4, v8, v9 ; 5E081308
v_cvt_pkrtz_f16_f32_e32 v5, v10, v11 ; 5E0A170A
exp 15, 1, 1, 0, 0, v2, v3, v0, v0 ; F800041F 00000302
v_cvt_pkrtz_f16_f32_e32 v6, v12, v13 ; 5E0C1B0C
v_cvt_pkrtz_f16_f32_e32 v7, v14, v15 ; 5E0E1F0E
exp 15, 2, 1, 0, 0, v4, v5, v0, v0 ; F800042F 00000504
exp 15, 3, 1, 1, 1, v6, v7, v0, v0 ; F8001C3F 00000706
s_endpgm ; BF810000
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>