radeonsi: move instance divisors into a constant buffer
authorMarek Olšák <marek.olsak@amd.com>
Fri, 9 Jun 2017 16:46:07 +0000 (18:46 +0200)
committerMarek Olšák <marek.olsak@amd.com>
Tue, 27 Jun 2017 17:55:09 +0000 (19:55 +0200)
commit4a10d6154e1a6086e1eecf0e17ab63cc41862ea6
treed8166d615f4771bc96147185dc12a52e59776c4a
parentaef998fe4b7551faf8a44409aa74554b45d2b67c
radeonsi: move instance divisors into a constant buffer

Shader key size: 107 -> 47

Divisors of 0 and 1 are encoded in the shader key. Greater instance divisors
are loaded from a constant buffer.

The shader code doing the division is huge. Is it something we need to
worry about? Does any app use instance divisors >= 2?

VS prolog disassembly:
    s_load_dwordx4 s[12:15], s[0:1], 0x80  ; C00A0300 00000080
    s_nop 0                                ; BF800000
    s_waitcnt lgkmcnt(0)                   ; BF8C007F
    s_buffer_load_dword s14, s[12:15], 0x4 ; C0220386 00000004
    s_waitcnt lgkmcnt(0)                   ; BF8C007F
    v_cvt_f32_u32_e32 v4, s14              ; 7E080C0E
    v_rcp_iflag_f32_e32 v4, v4             ; 7E084704
    v_mul_f32_e32 v4, 0x4f800000, v4       ; 0A0808FF 4F800000
    v_cvt_u32_f32_e32 v4, v4               ; 7E080F04
    v_mul_hi_u32 v5, v4, s14               ; D2860005 00001D04
    v_mul_lo_i32 v6, v4, s14               ; D2850006 00001D04
    v_cmp_eq_u32_e64 s[12:13], 0, v5       ; D0CA000C 00020A80
    v_sub_i32_e32 v5, vcc, 0, v6           ; 340A0C80
    v_cndmask_b32_e64 v5, v6, v5, s[12:13] ; D1000005 00320B06
    v_mul_hi_u32 v5, v5, v4                ; D2860005 00020905
    v_add_i32_e32 v6, vcc, v5, v4          ; 320C0905
    v_subrev_i32_e32 v4, vcc, v5, v4       ; 36080905
    v_cndmask_b32_e64 v4, v4, v6, s[12:13] ; D1000004 00320D04
    v_mul_hi_u32 v5, v4, v1                ; D2860005 00020304
    v_add_i32_e32 v4, vcc, s8, v0          ; 32080008
    v_mul_lo_i32 v6, v5, s14               ; D2850006 00001D05
    v_add_i32_e32 v7, vcc, 1, v5           ; 320E0A81
    v_cmp_ge_u32_e64 s[12:13], v1, v6      ; D0CE000C 00020D01
    v_sub_i32_e32 v6, vcc, v1, v6          ; 340C0D01
    v_cmp_le_u32_e32 vcc, s14, v6          ; 7D960C0E
    v_cndmask_b32_e64 v8, 0, -1, s[12:13]  ; D1000008 00318280
    v_cndmask_b32_e64 v6, 0, -1, vcc       ; D1000006 01A98280
    v_and_b32_e32 v6, v8, v6               ; 260C0D08
    v_cmp_eq_u32_e32 vcc, 0, v6            ; 7D940C80
    v_cndmask_b32_e32 v6, v7, v5, vcc      ; 000C0B07
    v_add_i32_e32 v5, vcc, -1, v5          ; 320A0AC1
    v_cmp_eq_u32_e32 vcc, 0, v8            ; 7D941080
    v_cndmask_b32_e32 v5, v6, v5, vcc      ; 000A0B06
    v_add_i32_e32 v5, vcc, s9, v5          ; 320A0A09

v2: set prefer_mono for fetched instance divisors

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
src/gallium/drivers/radeonsi/si_descriptors.c
src/gallium/drivers/radeonsi/si_pipe.c
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h
src/gallium/drivers/radeonsi/si_state.c
src/gallium/drivers/radeonsi/si_state.h
src/gallium/drivers/radeonsi/si_state_shaders.c