diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 85def2cb02c..127b04136d1 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -33,6 +33,8 @@
 #include "util/macros.h"
 #include "common/v3d_debug.h"
+#include "common/v3d_device_info.h"
+#include "common/v3d_limits.h"
 #include "compiler/nir/nir.h"
 #include "util/list.h"
 #include "util/u_math.h"
@@ -40,11 +42,6 @@
 #include "qpu/qpu_instr.h"
 #include "pipe/p_state.h"
 
-#define V3D_MAX_TEXTURE_SAMPLERS 32
-#define V3D_MAX_SAMPLES 4
-#define V3D_MAX_FS_INPUTS 64
-#define V3D_MAX_VS_INPUTS 64
-
 struct nir_builder;
 
 struct v3d_fs_inputs {
@@ -72,7 +69,6 @@ enum qfile {
         * or physical registers later.
         */
        QFILE_TEMP,
-       QFILE_VARY,
        QFILE_UNIF,
        QFILE_TLB,
        QFILE_TLBU,
@@ -115,6 +111,7 @@ static inline struct qreg vir_reg(enum qfile file, uint32_t index)
  */
 struct qpu_reg {
        bool magic;
+       bool smimm;
        int index;
 };
 
@@ -134,6 +131,7 @@ struct qinst {
        struct qreg src[3];
        bool cond_is_exec_mask;
        bool has_implicit_uniform;
+       bool is_last_thrsw;
 
        /* After vir_to_qpu.c: If instr reads a uniform, which uniform from
         * the uncompiled stream it is.
@@ -171,7 +169,7 @@ enum quniform_contents {
        QUNIFORM_USER_CLIP_PLANE,
 
        /**
-        * A reference to a texture config parameter 0 uniform.
+        * A reference to a V3D 3.x texture config parameter 0 uniform.
         *
         * This is a uniform implicitly loaded with a QPU_W_TMU* write, which
         * defines texture type, miplevels, and such. It will be found as a
@@ -212,7 +210,7 @@ enum quniform_contents {
        QUNIFORM_TEXTURE_CONFIG_P0_32,
 
        /**
-        * A reference to a texture config parameter 1 uniform.
+        * A reference to a V3D 3.x texture config parameter 1 uniform.
         *
         * This is a uniform implicitly loaded with a QPU_W_TMU* write, which
         * has the pointer to the indirect texture state. Our data[] field
@@ -221,6 +219,15 @@ enum quniform_contents {
         */
        QUNIFORM_TEXTURE_CONFIG_P1,
 
+       /* A V3D 4.x texture config parameter. The high 8 bits will be
+        * which texture or sampler is being sampled, and the driver must
+        * replace the address field with the appropriate address.
+        */
+       QUNIFORM_TMU_CONFIG_P0,
+       QUNIFORM_TMU_CONFIG_P1,
+
+       QUNIFORM_IMAGE_TMU_CONFIG_P0,
+
        QUNIFORM_TEXTURE_FIRST_LEVEL,
 
        QUNIFORM_TEXTURE_WIDTH,
@@ -234,14 +241,55 @@ enum quniform_contents {
        QUNIFORM_TEXRECT_SCALE_X,
        QUNIFORM_TEXRECT_SCALE_Y,
 
-       QUNIFORM_TEXTURE_BORDER_COLOR,
+       /* Returns the base offset of the SSBO given by the data value. */
+       QUNIFORM_SSBO_OFFSET,
+
+       /* Returns the size of the SSBO given by the data value. */
+       QUNIFORM_GET_BUFFER_SIZE,
 
-       QUNIFORM_STENCIL,
+       /* Sizes (in pixels) of a shader image given by the data value. */
+       QUNIFORM_IMAGE_WIDTH,
+       QUNIFORM_IMAGE_HEIGHT,
+       QUNIFORM_IMAGE_DEPTH,
+       QUNIFORM_IMAGE_ARRAY_SIZE,
 
        QUNIFORM_ALPHA_REF,
-       QUNIFORM_SAMPLE_MASK,
+
+       /* Number of workgroups passed to glDispatchCompute in the dimension
+        * selected by the data value.
+        */
+       QUNIFORM_NUM_WORK_GROUPS,
+
+       /**
+        * Returns the offset of the scratch buffer for register spilling.
+        */
+       QUNIFORM_SPILL_OFFSET,
+       QUNIFORM_SPILL_SIZE_PER_THREAD,
+
+       /**
+        * Returns the offset of the shared memory for compute shaders.
+        *
+        * This will be accessed using TMU general memory operations, so the
+        * L2T cache will effectively be the shared memory area.
+        */
+       QUNIFORM_SHARED_OFFSET,
 };
 
+static inline uint32_t v3d_tmu_config_data_create(uint32_t unit, uint32_t value)
+{
+        return unit << 24 | value;
+}
+
+static inline uint32_t v3d_tmu_config_data_get_unit(uint32_t data)
+{
+        return data >> 24;
+}
+
+static inline uint32_t v3d_tmu_config_data_get_value(uint32_t data)
+{
+        return data & 0xffffff;
+}
+
 struct v3d_varying_slot {
        uint8_t slot_and_component;
 };
@@ -288,18 +336,9 @@ struct v3d_key {
                uint8_t swizzle[4];
                uint8_t return_size;
                uint8_t return_channels;
-               union {
-                       struct {
-                               unsigned compare_mode:1;
-                               unsigned compare_func:3;
-                               bool clamp_s:1;
-                               bool clamp_t:1;
-                               bool clamp_r:1;
-                       };
-                       struct {
-                               uint16_t msaa_width, msaa_height;
-                       };
-               };
+               bool clamp_s:1;
+               bool clamp_t:1;
+               bool clamp_r:1;
        } tex[V3D_MAX_TEXTURE_SAMPLERS];
        uint8_t ucp_enables;
 };
@@ -322,6 +361,11 @@ struct v3d_fs_key {
        uint8_t swap_color_rb;
        /* Mask of which render targets need to be written as 32-bit floats */
        uint8_t f32_color_rb;
+       /* Masks of which render targets need to be written as ints/uints.
+        * Used by gallium to work around lost information in TGSI.
+        */
+       uint8_t int_color_rb;
+       uint8_t uint_color_rb;
        uint8_t alpha_test_func;
        uint8_t logicop_func;
        uint32_t point_sprite_mask;
@@ -376,6 +420,48 @@ struct qblock {
        /** @} */
 };
 
+/** Which util/list.h add mode we should use when inserting an instruction. */
+enum vir_cursor_mode {
+        vir_cursor_add,
+        vir_cursor_addtail,
+};
+
+/**
+ * Tracking structure for where new instructions should be inserted. Create
+ * with one of the vir_after_inst()-style helper functions.
+ *
+ * This does not protect against removal of the block or instruction, so we
+ * have an assert in instruction removal to try to catch it.
+ */
+struct vir_cursor {
+        enum vir_cursor_mode mode;
+        struct list_head *link;
+};
+
+static inline struct vir_cursor
+vir_before_inst(struct qinst *inst)
+{
+        return (struct vir_cursor){ vir_cursor_addtail, &inst->link };
+}
+
+static inline struct vir_cursor
+vir_after_inst(struct qinst *inst)
+{
+        return (struct vir_cursor){ vir_cursor_add, &inst->link };
+}
+
+static inline struct vir_cursor
+vir_before_block(struct qblock *block)
+{
+        return (struct vir_cursor){ vir_cursor_add, &block->instructions };
+}
+
+static inline struct vir_cursor
+vir_after_block(struct qblock *block)
+{
+        return (struct vir_cursor){ vir_cursor_addtail, &block->instructions };
+}
+
 /**
  * Compiler state saved across compiler invocations, for any expensive global
  * setup.
@@ -383,7 +469,8 @@ struct qblock {
 struct v3d_compiler {
        const struct v3d_device_info *devinfo;
        struct ra_regs *regs;
-       unsigned int reg_class[3];
+       unsigned int reg_class_phys[3];
+       unsigned int reg_class_phys_or_acc[3];
 };
 
 struct v3d_compile {
@@ -393,6 +480,10 @@ struct v3d_compile {
        struct exec_list *cf_node_list;
        const struct v3d_compiler *compiler;
 
+       void (*debug_output)(const char *msg,
+                            void *debug_output_data);
+       void *debug_output_data;
+
        /**
         * Mapping from nir_register * or nir_ssa_def * to array of struct
         * qreg for the values.
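The V3D 4.x TMU config uniforms above carry the texture/sampler unit being sampled in the top 8 bits of their data word and a 24-bit payload in the rest, via the v3d_tmu_config_data_*() helpers added in the hunk above. A minimal standalone round-trip check of that packing (the three helpers are copied from this header; the main() harness is illustrative only and not part of the driver):

#include <assert.h>
#include <stdint.h>

static inline uint32_t v3d_tmu_config_data_create(uint32_t unit, uint32_t value)
{
        /* unit lives in bits 31:24, value in bits 23:0. */
        return unit << 24 | value;
}

static inline uint32_t v3d_tmu_config_data_get_unit(uint32_t data)
{
        return data >> 24;
}

static inline uint32_t v3d_tmu_config_data_get_value(uint32_t data)
{
        return data & 0xffffff;
}

int main(void)
{
        /* Pack unit 5 with an arbitrary 24-bit value, then unpack both. */
        uint32_t data = v3d_tmu_config_data_create(5, 0x123456);

        assert(v3d_tmu_config_data_get_unit(data) == 5);
        assert(v3d_tmu_config_data_get_value(data) == 0x123456);
        return 0;
}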
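The vir_cursor helpers above encode an insertion point as a util/list.h link plus an add mode: list_add() inserts right after the link (hence vir_after_inst() and vir_before_block()), while list_addtail() inserts right before it (vir_before_inst() and vir_after_block()), which is why vir_before_inst() uses vir_cursor_addtail. A hedged usage sketch, assuming instruction emission honors the c->cursor field added to struct v3d_compile below; insert_mov_before() is a hypothetical helper, not part of this patch:

static void
insert_mov_before(struct v3d_compile *c, struct qinst *inst,
                  struct qreg dst, struct qreg src)
{
        struct vir_cursor old_cursor = c->cursor;

        /* Redirect emission to just before 'inst', emit, then restore. */
        c->cursor = vir_before_inst(inst);
        vir_MOV_dest(c, dst, src);
        c->cursor = old_cursor;
}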
@@ -423,6 +514,12 @@ struct v3d_compile {
         */
        uint32_t flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
 
+       uint32_t noperspective_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
+
+       uint32_t centroid_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
+
+       bool uses_center_w;
+
        struct v3d_ubo_range *ubo_ranges;
        bool *ubo_range_used;
        uint32_t ubo_ranges_array_size;
@@ -452,9 +549,27 @@ struct v3d_compile {
        /* Fragment shader payload regs. */
        struct qreg payload_w, payload_w_centroid, payload_z;
 
-       uint8_t vattr_sizes[V3D_MAX_VS_INPUTS];
+       struct qreg cs_payload[2];
+       struct qreg cs_shared_offset;
+       int local_invocation_index_bits;
+
+       uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4];
        uint32_t num_vpm_writes;
 
+       /* Size in bytes of registers that have been spilled. This is how much
+        * space needs to be available in the spill BO per thread per QPU.
+        */
+       uint32_t spill_size;
+       /* Shader-db stats */
+       uint32_t spills, fills, loops;
+       /**
+        * Register spilling's per-thread base address, shared between each
+        * spill/fill's addressing calculations.
+        */
+       struct qreg spill_base;
+       /* Bit vector of which temps may be spilled */
+       BITSET_WORD *spillable;
+
        /**
         * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
         *
@@ -477,6 +592,7 @@ struct v3d_compile {
 
        /* Live ranges of temps. */
        int *temp_start, *temp_end;
+       bool live_intervals_valid;
 
        uint32_t *uniform_data;
        enum quniform_contents *uniform_contents;
@@ -491,6 +607,7 @@ struct v3d_compile {
        struct qreg undef;
        uint32_t num_temps;
 
+       struct vir_cursor cursor;
        struct list_head blocks;
        int next_block_index;
        struct qblock *cur_block;
@@ -521,12 +638,16 @@ struct v3d_compile {
        uint32_t program_id;
        uint32_t variant_id;
 
-       /* Set to compile program in threaded FS mode, where SIG_THREAD_SWITCH
-        * is used to hide texturing latency at the cost of limiting ourselves
-        * to the bottom half of physical reg space.
+       /* Set to compile program in 1x, 2x, or 4x threaded mode, where
+        * SIG_THREAD_SWITCH is used to hide texturing latency at the cost of
+        * limiting ourselves to part of the physical reg space.
+        *
+        * On V3D 3.x, 2x or 4x divide the physical reg space by 2x or 4x. On
+        * V3D 4.x, all shaders are 2x threaded, and 4x only divides the
+        * physical reg space in half.
         */
-       bool fs_threaded;
-
+       uint8_t threads;
+       struct qinst *last_thrsw;
        bool last_thrsw_at_top_level;
 
        bool failed;
@@ -544,9 +665,15 @@ struct v3d_prog_data {
        struct v3d_ubo_range *ubo_ranges;
        uint32_t num_ubo_ranges;
        uint32_t ubo_size;
+       uint32_t spill_size;
 
        uint8_t num_inputs;
+       uint8_t threads;
+
+       /* For threads > 1, whether the program should be dispatched in the
+        * after-final-THRSW state.
+        */
+       bool single_seg;
 };
 
 struct v3d_vs_prog_data {
@@ -555,13 +682,21 @@ struct v3d_vs_prog_data {
        bool uses_iid, uses_vid;
 
        /* Number of components read from each vertex attribute. */
-       uint8_t vattr_sizes[32];
+       uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4];
 
        /* Total number of components read, for the shader state record. */
        uint32_t vpm_input_size;
 
        /* Total number of components written, for the shader state record. */
        uint32_t vpm_output_size;
+
+       /* Set if there should be separate VPM segments for input and output.
+        * If unset, vpm_input_size will be 0.
+        */
+       bool separate_segments;
+
+       /* Value to be programmed in VCM_CACHE_SIZE. */
+       uint8_t vcm_cache_size;
 };
 
 struct v3d_fs_prog_data {
@@ -576,8 +711,13 @@ struct v3d_fs_prog_data {
         */
        uint32_t flat_shade_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];
 
+       uint32_t noperspective_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];
+
+       uint32_t centroid_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];
+
        bool writes_z;
        bool discard;
+       bool uses_center_w;
 };
 
 /* Special nir_load_input intrinsic index for loading the current TLB
@@ -593,19 +733,15 @@ const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devin
 void v3d_compiler_free(const struct v3d_compiler *compiler);
 void v3d_optimize_nir(struct nir_shader *s);
 
-uint64_t *v3d_compile_vs(const struct v3d_compiler *compiler,
-                         struct v3d_vs_key *key,
-                         struct v3d_vs_prog_data *prog_data,
-                         nir_shader *s,
-                         int program_id, int variant_id,
-                         uint32_t *final_assembly_size);
-
-uint64_t *v3d_compile_fs(const struct v3d_compiler *compiler,
-                         struct v3d_fs_key *key,
-                         struct v3d_fs_prog_data *prog_data,
-                         nir_shader *s,
-                         int program_id, int variant_id,
-                         uint32_t *final_assembly_size);
+uint64_t *v3d_compile(const struct v3d_compiler *compiler,
+                      struct v3d_key *key,
+                      struct v3d_prog_data **prog_data,
+                      nir_shader *s,
+                      void (*debug_output)(const char *msg,
+                                           void *debug_output_data),
+                      void *debug_output_data,
+                      int program_id, int variant_id,
+                      uint32_t *final_assembly_size);
 
 void v3d_nir_to_vir(struct v3d_compile *c);
 
@@ -632,10 +768,12 @@ struct qreg vir_emit_def(struct v3d_compile *c, struct qinst *inst);
 struct qinst *vir_emit_nondef(struct v3d_compile *c, struct qinst *inst);
 void vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond);
 void vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf);
+void vir_set_uf(struct qinst *inst, enum v3d_qpu_uf uf);
 void vir_set_unpack(struct qinst *inst, int src,
                     enum v3d_qpu_input_unpack unpack);
 
 struct qreg vir_get_temp(struct v3d_compile *c);
+void vir_emit_last_thrsw(struct v3d_compile *c);
 void vir_calculate_live_intervals(struct v3d_compile *c);
 bool vir_has_implicit_uniform(struct qinst *inst);
 int vir_get_implicit_uniform_src(struct qinst *inst);
@@ -649,14 +787,18 @@ bool vir_is_tex(struct qinst *inst);
 bool vir_is_add(struct qinst *inst);
 bool vir_is_mul(struct qinst *inst);
 bool vir_is_float_input(struct qinst *inst);
-bool vir_depends_on_flags(struct qinst *inst);
-bool vir_writes_r3(struct qinst *inst);
-bool vir_writes_r4(struct qinst *inst);
+bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst);
+bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst);
 struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
 uint8_t vir_channels_written(struct qinst *inst);
+struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
+void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
+                    struct qreg result);
+void vir_emit_thrsw(struct v3d_compile *c);
 
 void vir_dump(struct v3d_compile *c);
 void vir_dump_inst(struct v3d_compile *c, struct qinst *inst);
+void vir_dump_uniform(enum quniform_contents contents, uint32_t data);
 
 void vir_validate(struct v3d_compile *c);
 
@@ -671,14 +813,24 @@ bool vir_opt_vpm(struct v3d_compile *c);
 void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c);
 void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
 void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
+void v3d_nir_lower_image_load_store(nir_shader *s);
 void vir_lower_uniforms(struct v3d_compile *c);
 
-void v3d_vir_to_qpu(struct v3d_compile *c);
+void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components);
+void v3d33_vir_vpm_write_setup(struct v3d_compile *c);
+void v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
+void v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
+void v3d40_vir_emit_image_load_store(struct v3d_compile *c,
+                                     nir_intrinsic_instr *instr);
+
+void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers);
 uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
 void qpu_validate(struct v3d_compile *c);
-struct qpu_reg *v3d_register_allocate(struct v3d_compile *c);
+struct qpu_reg *v3d_register_allocate(struct v3d_compile *c, bool *spilled);
 bool vir_init_reg_sets(struct v3d_compiler *compiler);
 
+bool v3d_gl_format_is_return_32(GLenum format);
+
 void vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf);
 
 static inline bool
@@ -743,6 +895,14 @@ vir_##name##_dest(struct v3d_compile *c, struct qreg dest,              \
         return vir_emit_nondef(c, vir_inst(op, dest, a, b));            \
 }
 
+#define VIR_NODST_0(name, vir_inst, op)                                 \
+static inline struct qinst *                                            \
+vir_##name(struct v3d_compile *c)                                       \
+{                                                                       \
+        return vir_emit_nondef(c, vir_inst(op, c->undef,                \
+                                           c->undef, c->undef));        \
+}
+
 #define VIR_NODST_1(name, vir_inst, op)                                 \
 static inline struct qinst *                                            \
 vir_##name(struct v3d_compile *c, struct qreg a)                        \
@@ -759,6 +919,33 @@ vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b)        \
                                            a, b));                      \
 }
 
+#define VIR_SFU(name)                                                   \
+static inline struct qreg                                               \
+vir_##name(struct v3d_compile *c, struct qreg a)                        \
+{                                                                       \
+        if (c->devinfo->ver >= 41) {                                    \
+                return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name,   \
+                                                    c->undef,           \
+                                                    a, c->undef));      \
+        } else {                                                        \
+                vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
+                return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
+        }                                                               \
+}                                                                       \
+static inline struct qinst *                                            \
+vir_##name##_dest(struct v3d_compile *c, struct qreg dest,              \
+                  struct qreg a)                                        \
+{                                                                       \
+        if (c->devinfo->ver >= 41) {                                    \
+                return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \
+                                                       dest,            \
+                                                       a, c->undef));   \
+        } else {                                                        \
+                vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
+                return vir_FMOV_dest(c, dest, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
+        }                                                               \
+}
+
 #define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name)
 #define VIR_M_ALU2(name) VIR_ALU2(name, vir_mul_inst, V3D_QPU_M_##name)
 #define VIR_A_ALU1(name) VIR_ALU1(name, vir_add_inst, V3D_QPU_A_##name)
@@ -769,6 +956,7 @@ vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b)  \
 #define VIR_M_NODST_2(name) VIR_NODST_2(name, vir_mul_inst, V3D_QPU_M_##name)
 #define VIR_A_NODST_1(name) VIR_NODST_1(name, vir_add_inst, V3D_QPU_A_##name)
 #define VIR_M_NODST_1(name) VIR_NODST_1(name, vir_mul_inst, V3D_QPU_M_##name)
+#define VIR_A_NODST_0(name) VIR_NODST_0(name, vir_add_inst, V3D_QPU_A_##name)
 
 VIR_A_ALU2(FADD)
 VIR_A_ALU2(VFPACK)
@@ -791,15 +979,19 @@ VIR_A_ALU2(OR)
 VIR_A_ALU2(XOR)
 VIR_A_ALU2(VADD)
 VIR_A_ALU2(VSUB)
+VIR_A_NODST_2(STVPMV)
 VIR_A_ALU1(NOT)
 VIR_A_ALU1(NEG)
 VIR_A_ALU1(FLAPUSH)
 VIR_A_ALU1(FLBPUSH)
-VIR_A_ALU1(FLBPOP)
+VIR_A_ALU1(FLPOP)
 VIR_A_ALU1(SETMSF)
 VIR_A_ALU1(SETREVF)
-VIR_A_ALU1(TIDX)
-VIR_A_ALU1(EIDX)
+VIR_A_ALU0(TIDX)
+VIR_A_ALU0(EIDX)
+VIR_A_ALU1(LDVPMV_IN)
+VIR_A_ALU1(LDVPMV_OUT)
+VIR_A_ALU0(TMUWT)
 
 VIR_A_ALU0(FXCD)
 VIR_A_ALU0(XCD)
@@ -807,7 +999,9 @@ VIR_A_ALU0(FYCD)
 VIR_A_ALU0(YCD)
 VIR_A_ALU0(MSF)
 VIR_A_ALU0(REVF)
+VIR_A_ALU0(BARRIERID)
 VIR_A_NODST_1(VPMSETUP)
+VIR_A_NODST_0(VPMWT)
 
 VIR_A_ALU2(FCMP)
 VIR_A_ALU2(VFMAX)
@@ -835,6 +1029,13 @@ VIR_M_NODST_2(MULTOP)
 
 VIR_M_ALU1(MOV)
 VIR_M_ALU1(FMOV)
 
+VIR_SFU(RECIP) +VIR_SFU(RSQRT) +VIR_SFU(EXP) +VIR_SFU(LOG) +VIR_SFU(SIN) +VIR_SFU(RSQRT2) + static inline struct qinst * vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond, struct qreg dest, struct qreg src) @@ -854,18 +1055,35 @@ vir_SEL(struct v3d_compile *c, enum v3d_qpu_cond cond, return t; } -static inline void -vir_VPM_WRITE(struct v3d_compile *c, struct qreg val) -{ - vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); -} - static inline struct qinst * vir_NOP(struct v3d_compile *c) { return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef)); } + +static inline struct qreg +vir_LDTMU(struct v3d_compile *c) +{ + if (c->devinfo->ver >= 41) { + struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldtmu->qpu.sig.ldtmu = true; + + return vir_emit_def(c, ldtmu); + } else { + vir_NOP(c)->qpu.sig.ldtmu = true; + return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); + } +} + +static inline struct qreg +vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg src1) +{ + vir_MULTOP(c, src0, src1); + return vir_UMUL24(c, src0, src1); +} + /* static inline struct qreg vir_LOAD_IMM(struct v3d_compile *c, uint32_t val) @@ -891,7 +1109,7 @@ vir_LOAD_IMM_I2(struct v3d_compile *c, uint32_t val) */ static inline struct qinst * -vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_cond cond) +vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) { /* The actual uniform_data value will be set at scheduling time */ return vir_emit_nondef(c, vir_branch_inst(cond, vir_uniform_ui(c, 0)));
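With the stage-specific entry points removed, here is a hedged sketch of driving the unified v3d_compile() declared above. It assumes this header is reachable as "broadcom/compiler/v3d_compiler.h"; log_cb, compile_shader(), and the zero program/variant IDs are placeholders, and the key passed in must really be the v3d_key embedded in a stage-matched v3d_vs_key/v3d_fs_key:

#include <stdio.h>
#include "broadcom/compiler/v3d_compiler.h"

static void
log_cb(const char *msg, void *debug_output_data)
{
        (void)debug_output_data;
        fprintf(stderr, "v3d: %s\n", msg);
}

static uint64_t *
compile_shader(const struct v3d_compiler *compiler, struct v3d_key *key,
               nir_shader *s, struct v3d_prog_data **prog_data,
               uint32_t *qpu_insts_size)
{
        /* One entry point now serves all stages; prog_data is allocated by
         * the compiler and handed back through the out pointer. */
        return v3d_compile(compiler, key, prog_data, s,
                           log_cb, NULL /* debug_output_data */,
                           0 /* program_id */, 0 /* variant_id */,
                           qpu_insts_size);
}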
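The threads/last_thrsw fields and the vir_emit_thrsw()/vir_LDTMU() helpers combine into the usual TMU sequence: issue the request, switch threads so another thread's work hides the latency, then collect the result, which vir_LDTMU() above reads from the ldtmu signal's own destination on V3D 4.1+ or from r4 on 3.x. A hedged sketch of the tail end of that sequence; the TMU register writes that start the request are omitted, and fetch_tmu_result() is hypothetical:

static struct qreg
fetch_tmu_result(struct v3d_compile *c)
{
        /* Switch threads while the TMU request is in flight... */
        vir_emit_thrsw(c);

        /* ...then pick up the returned value. */
        return vir_LDTMU(c);
}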
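The VIR_SFU() instances above hide the same generation split for special-function ops: on V3D 4.1+ they emit a plain add-channel ALU instruction, while on 3.x they write the operand to the magic SFU register and read the result back from r4. A hedged example, with normalize_rsqrt() being a hypothetical caller:

static struct qreg
normalize_rsqrt(struct v3d_compile *c, struct qreg len_squared)
{
        /* 1/sqrt(x); VIR_SFU() picks the right encoding per generation. */
        return vir_RSQRT(c, len_squared);
}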