X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Fbroadcom%2Fcompiler%2Fv3d_compiler.h;h=4249c181bf113f57974dfcd934004e5ece46c718;hp=e0eeefe245a9f2f02f8efb0e18709dd9ba6ed330;hb=76fc8c8bb1979122af40ed143fed726050b293b9;hpb=ade416d02369cc0942d53ad3cce601d66344f9c3 diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index e0eeefe245a..4249c181bf1 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -33,6 +33,8 @@ #include "util/macros.h" #include "common/v3d_debug.h" +#include "common/v3d_device_info.h" +#include "common/v3d_limits.h" #include "compiler/nir/nir.h" #include "util/list.h" #include "util/u_math.h" @@ -40,11 +42,6 @@ #include "qpu/qpu_instr.h" #include "pipe/p_state.h" -#define V3D_MAX_TEXTURE_SAMPLERS 32 -#define V3D_MAX_SAMPLES 4 -#define V3D_MAX_FS_INPUTS 64 -#define V3D_MAX_VS_INPUTS 64 - struct nir_builder; struct v3d_fs_inputs { @@ -72,10 +69,6 @@ enum qfile { * or physical registers later. */ QFILE_TEMP, - QFILE_VARY, - QFILE_UNIF, - QFILE_TLB, - QFILE_TLBU, /** * VPM reads use this with an index value to say what part of the VPM @@ -109,12 +102,23 @@ static inline struct qreg vir_reg(enum qfile file, uint32_t index) return (struct qreg){file, index}; } +static inline struct qreg vir_magic_reg(uint32_t index) +{ + return (struct qreg){QFILE_MAGIC, index}; +} + +static inline struct qreg vir_nop_reg(void) +{ + return (struct qreg){QFILE_NULL, 0}; +} + /** * A reference to an actual register at the QPU level, for register * allocation. */ struct qpu_reg { bool magic; + bool smimm; int index; }; @@ -132,11 +136,11 @@ struct qinst { /* Pre-register-allocation references to src/dst registers */ struct qreg dst; struct qreg src[3]; - bool cond_is_exec_mask; - bool has_implicit_uniform; + bool is_last_thrsw; - /* After vir_to_qpu.c: If instr reads a uniform, which uniform from - * the uncompiled stream it is. 
+ /* If the instruction reads a uniform (other than through src[i].file + * == QFILE_UNIF), that uniform's index in c->uniform_contents. ~0 + * otherwise. */ int uniform; }; @@ -171,7 +175,7 @@ enum quniform_contents { QUNIFORM_USER_CLIP_PLANE, /** - * A reference to a texture config parameter 0 uniform. + * A reference to a V3D 3.x texture config parameter 0 uniform. * * This is a uniform implicitly loaded with a QPU_W_TMU* write, which * defines texture type, miplevels, and such. It will be found as a @@ -212,15 +216,24 @@ enum quniform_contents { QUNIFORM_TEXTURE_CONFIG_P0_32, /** - * A reference to a texture config parameter 1 uniform. + * A reference to a V3D 3.x texture config parameter 1 uniform. * * This is a uniform implicitly loaded with a QPU_W_TMU* write, which - * defines texture width, height, filters, and wrap modes. It will be - * found as a parameter to the second QOP_TEX_[STRB] instruction in a - * sequence. + * has the pointer to the indirect texture state. Our data[] field + * will have a packed p1 value, but the address field will be just + * which texture unit's texture should be referenced. */ QUNIFORM_TEXTURE_CONFIG_P1, + /* A V3D 4.x texture config parameter. The high 8 bits will be + * which texture or sampler is being sampled, and the driver must + * replace the address field with the appropriate address. + */ + QUNIFORM_TMU_CONFIG_P0, + QUNIFORM_TMU_CONFIG_P1, + + QUNIFORM_IMAGE_TMU_CONFIG_P0, + QUNIFORM_TEXTURE_FIRST_LEVEL, QUNIFORM_TEXTURE_WIDTH, @@ -229,21 +242,61 @@ enum quniform_contents { QUNIFORM_TEXTURE_ARRAY_SIZE, QUNIFORM_TEXTURE_LEVELS, - QUNIFORM_TEXTURE_MSAA_ADDR, - QUNIFORM_UBO_ADDR, QUNIFORM_TEXRECT_SCALE_X, QUNIFORM_TEXRECT_SCALE_Y, - QUNIFORM_TEXTURE_BORDER_COLOR, + /* Returns the base offset of the SSBO given by the data value. */ + QUNIFORM_SSBO_OFFSET, - QUNIFORM_STENCIL, + /* Returns the size of the SSBO given by the data value. 
*/ + QUNIFORM_GET_BUFFER_SIZE, + + /* Sizes (in pixels) of a shader image given by the data value. */ + QUNIFORM_IMAGE_WIDTH, + QUNIFORM_IMAGE_HEIGHT, + QUNIFORM_IMAGE_DEPTH, + QUNIFORM_IMAGE_ARRAY_SIZE, QUNIFORM_ALPHA_REF, - QUNIFORM_SAMPLE_MASK, + + /* Number of workgroups passed to glDispatchCompute in the dimension + * selected by the data value. + */ + QUNIFORM_NUM_WORK_GROUPS, + + /** + * Returns the offset of the scratch buffer for register spilling. + */ + QUNIFORM_SPILL_OFFSET, + QUNIFORM_SPILL_SIZE_PER_THREAD, + + /** + * Returns the offset of the shared memory for compute shaders. + * + * This will be accessed using TMU general memory operations, so the + * L2T cache will effectively be the shared memory area. + */ + QUNIFORM_SHARED_OFFSET, }; +static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value) +{ + assert(value < (1 << 24)); + return unit << 24 | value; +} + +static inline uint32_t v3d_unit_data_get_unit(uint32_t data) +{ + return data >> 24; +} + +static inline uint32_t v3d_unit_data_get_offset(uint32_t data) +{ + return data & 0xffffff; +} + struct v3d_varying_slot { uint8_t slot_and_component; }; @@ -265,44 +318,18 @@ static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot) return slot.slot_and_component & 3; } -struct v3d_ubo_range { - /** - * offset in bytes from the start of the ubo where this range is - * uploaded. - * - * Only set once used is set. - */ - uint32_t dst_offset; - - /** - * offset in bytes from the start of the gallium uniforms where the - * data comes from. 
- */ - uint32_t src_offset; - - /** size in bytes of this ubo range */ - uint32_t size; -}; - struct v3d_key { void *shader_state; struct { uint8_t swizzle[4]; uint8_t return_size; uint8_t return_channels; - union { - struct { - unsigned compare_mode:1; - unsigned compare_func:3; - unsigned wrap_s:3; - unsigned wrap_t:3; - }; - struct { - uint16_t msaa_width, msaa_height; - }; - }; + bool clamp_s:1; + bool clamp_t:1; + bool clamp_r:1; } tex[V3D_MAX_TEXTURE_SAMPLERS]; uint8_t ucp_enables; + bool is_last_geometry_stage; }; struct v3d_fs_key { @@ -318,7 +345,26 @@ struct v3d_fs_key { bool sample_alpha_to_coverage; bool sample_alpha_to_one; bool clamp_color; - bool swap_color_rb; + bool shade_model_flat; + /* Mask of which color render targets are present. */ + uint8_t cbufs; + uint8_t swap_color_rb; + /* Mask of which render targets need to be written as 32-bit floats */ + uint8_t f32_color_rb; + /* Masks of which render targets need to be written as ints/uints. + * Used by gallium to work around lost information in TGSI. + */ + uint8_t int_color_rb; + uint8_t uint_color_rb; + + /* Color format information per render target. Only set when logic + * operations are enabled. 
+ */ + struct { + enum pipe_format format; + const uint8_t *swizzle; + } color_fmt[V3D_MAX_DRAW_BUFFERS]; + uint8_t alpha_test_func; uint8_t logicop_func; uint32_t point_sprite_mask; @@ -326,11 +372,21 @@ struct v3d_fs_key { struct pipe_rt_blend_state blend; }; +struct v3d_gs_key { + struct v3d_key base; + + struct v3d_varying_slot used_outputs[V3D_MAX_FS_INPUTS]; + uint8_t num_used_outputs; + + bool is_coord; + bool per_vertex_point_size; +}; + struct v3d_vs_key { struct v3d_key base; - struct v3d_varying_slot fs_inputs[V3D_MAX_FS_INPUTS]; - uint8_t num_fs_inputs; + struct v3d_varying_slot used_outputs[V3D_MAX_ANY_STAGE_INPUTS]; + uint8_t num_used_outputs; bool is_coord; bool per_vertex_point_size; @@ -366,6 +422,8 @@ struct qblock { /** @{ used by v3d_vir_live_variables.c */ BITSET_WORD *def; + BITSET_WORD *defin; + BITSET_WORD *defout; BITSET_WORD *use; BITSET_WORD *live_in; BITSET_WORD *live_out; @@ -373,6 +431,48 @@ struct qblock { /** @} */ }; +/** Which util/list.h add mode we should use when inserting an instruction. */ +enum vir_cursor_mode { + vir_cursor_add, + vir_cursor_addtail, +}; + +/** + * Tracking structure for where new instructions should be inserted. Create + * with one of the vir_after_inst()-style helper functions. + * + * This does not protect against removal of the block or instruction, so we + * have an assert in instruction removal to try to catch it. 
+ */ +struct vir_cursor { + enum vir_cursor_mode mode; + struct list_head *link; +}; + +static inline struct vir_cursor +vir_before_inst(struct qinst *inst) +{ + return (struct vir_cursor){ vir_cursor_addtail, &inst->link }; +} + +static inline struct vir_cursor +vir_after_inst(struct qinst *inst) +{ + return (struct vir_cursor){ vir_cursor_add, &inst->link }; +} + +static inline struct vir_cursor +vir_before_block(struct qblock *block) +{ + return (struct vir_cursor){ vir_cursor_add, &block->instructions }; +} + +static inline struct vir_cursor +vir_after_block(struct qblock *block) +{ + return (struct vir_cursor){ vir_cursor_addtail, &block->instructions }; +} + /** * Compiler state saved across compiler invocations, for any expensive global * setup. @@ -380,7 +480,10 @@ struct qblock { struct v3d_compiler { const struct v3d_device_info *devinfo; struct ra_regs *regs; - unsigned int reg_class[3]; + unsigned int reg_class_any[3]; + unsigned int reg_class_r5[3]; + unsigned int reg_class_phys[3]; + unsigned int reg_class_phys_or_acc[3]; }; struct v3d_compile { @@ -390,6 +493,10 @@ struct v3d_compile { struct exec_list *cf_node_list; const struct v3d_compiler *compiler; + void (*debug_output)(const char *msg, + void *debug_output_data); + void *debug_output_data; + /** * Mapping from nir_register * or nir_ssa_def * to array of struct * qreg for the values. @@ -408,30 +515,31 @@ struct v3d_compile { struct qreg *inputs; struct qreg *outputs; bool msaa_per_sample_output; - struct qreg color_reads[V3D_MAX_SAMPLES]; - struct qreg sample_colors[V3D_MAX_SAMPLES]; + struct qreg color_reads[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4]; + struct qreg sample_colors[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4]; uint32_t inputs_array_size; uint32_t outputs_array_size; uint32_t uniforms_array_size; /* Booleans for whether the corresponding QFILE_VARY[i] is - * flat-shaded. 
This doesn't count gl_FragColor flat-shading, which is - * controlled by shader->color_inputs and rasterizer->flatshade in the - * gallium driver. + * flat-shaded. This includes gl_FragColor flat-shading, which is + * customized based on the shademodel_flat shader key. */ - BITSET_WORD flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; + uint32_t flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; + + uint32_t noperspective_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; + + uint32_t centroid_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; - struct v3d_ubo_range *ubo_ranges; - bool *ubo_range_used; - uint32_t ubo_ranges_array_size; - /** Number of uniform areas tracked in ubo_ranges. */ - uint32_t num_ubo_ranges; - uint32_t next_ubo_dst_offset; + bool uses_center_w; + bool writes_z; + bool uses_implicit_point_line_varyings; /* State for whether we're executing on each channel currently. 0 if * yes, otherwise a block number + 1 that the channel jumped to. */ struct qreg execute; + bool in_control_flow; struct qreg line_x, point_x, point_y; @@ -450,11 +558,27 @@ struct v3d_compile { /* Fragment shader payload regs. */ struct qreg payload_w, payload_w_centroid, payload_z; - /** boolean (~0 -> true) if the fragment has been discarded. */ - struct qreg discard; + struct qreg cs_payload[2]; + struct qreg cs_shared_offset; + int local_invocation_index_bits; + + uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4]; + uint8_t gs_input_sizes[V3D_MAX_GS_INPUTS]; + uint32_t vpm_output_size; - uint8_t vattr_sizes[V3D_MAX_VS_INPUTS]; - uint32_t num_vpm_writes; + /* Size in bytes of registers that have been spilled. This is how much + * space needs to be available in the spill BO per thread per QPU. + */ + uint32_t spill_size; + /* Shader-db stats */ + uint32_t spills, fills, loops; + /** + * Register spilling's per-thread base address, shared between each + * spill/fill's addressing calculations. 
+ */ + struct qreg spill_base; + /* Bit vector of which temps may be spilled */ + BITSET_WORD *spillable; /** * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads. @@ -474,24 +598,25 @@ struct v3d_compile { struct pipe_shader_state *shader_state; struct v3d_key *key; struct v3d_fs_key *fs_key; + struct v3d_gs_key *gs_key; struct v3d_vs_key *vs_key; /* Live ranges of temps. */ int *temp_start, *temp_end; + bool live_intervals_valid; uint32_t *uniform_data; enum quniform_contents *uniform_contents; uint32_t uniform_array_size; uint32_t num_uniforms; - uint32_t num_outputs; uint32_t output_position_index; - nir_variable *output_color_var; - uint32_t output_point_size_index; + nir_variable *output_color_var[4]; uint32_t output_sample_mask_index; struct qreg undef; uint32_t num_temps; + struct vir_cursor cursor; struct list_head blocks; int next_block_index; struct qblock *cur_block; @@ -501,36 +626,34 @@ struct v3d_compile { uint64_t *qpu_insts; uint32_t qpu_inst_count; uint32_t qpu_inst_size; + uint32_t qpu_inst_stalled_count; /* For the FS, the number of varying inputs not counting the * point/line varyings payload */ uint32_t num_inputs; - /** - * Number of inputs from num_inputs remaining to be queued to the read - * FIFO in the VS/CS. - */ - uint32_t num_inputs_remaining; - - /* Number of inputs currently in the read FIFO for the VS/CS */ - uint32_t num_inputs_in_fifo; - - /** Next offset in the VPM to read from in the VS/CS */ - uint32_t vpm_read_offset; - uint32_t program_id; uint32_t variant_id; - /* Set to compile program in threaded FS mode, where SIG_THREAD_SWITCH - * is used to hide texturing latency at the cost of limiting ourselves - * to the bottom half of physical reg space. + /* Set to compile program in 1x, 2x, or 4x threaded mode, where + * SIG_THREAD_SWITCH is used to hide texturing latency at the cost of + * limiting ourselves to the part of the physical reg space. + * + * On V3D 3.x, 2x or 4x divide the physical reg space by 2x or 4x. 
On + * V3D 4.x, all shaders are 2x threaded, and 4x only divides the + * physical reg space in half. */ - bool fs_threaded; - + uint8_t threads; + struct qinst *last_thrsw; bool last_thrsw_at_top_level; + bool emitted_tlb_load; + bool lock_scoreboard_on_first_thrsw; + bool failed; + + bool tmu_dirty_rcl; }; struct v3d_uniform_list { @@ -542,12 +665,16 @@ struct v3d_uniform_list { struct v3d_prog_data { struct v3d_uniform_list uniforms; - struct v3d_ubo_range *ubo_ranges; - uint32_t num_ubo_ranges; - uint32_t ubo_size; + uint32_t spill_size; - uint8_t num_inputs; + uint8_t threads; + /* For threads > 1, whether the program should be dispatched in the + * after-final-THRSW state. + */ + bool single_seg; + + bool tmu_dirty_rcl; }; struct v3d_vs_prog_data { @@ -556,13 +683,51 @@ struct v3d_vs_prog_data { bool uses_iid, uses_vid; /* Number of components read from each vertex attribute. */ - uint8_t vattr_sizes[32]; + uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4]; /* Total number of components read, for the shader state record. */ uint32_t vpm_input_size; /* Total number of components written, for the shader state record. */ uint32_t vpm_output_size; + + /* Set if there should be separate VPM segments for input and output. + * If unset, vpm_input_size will be 0. + */ + bool separate_segments; + + /* Value to be programmed in VCM_CACHE_SIZE. */ + uint8_t vcm_cache_size; +}; + +struct v3d_gs_prog_data { + struct v3d_prog_data base; + + /* Whether the program reads gl_PrimitiveIDIn */ + bool uses_pid; + + /* Number of components read from each input varying. */ + uint8_t input_sizes[V3D_MAX_GS_INPUTS / 4]; + + /* Number of inputs */ + uint8_t num_inputs; + struct v3d_varying_slot input_slots[V3D_MAX_GS_INPUTS]; + + /* Total number of components written, for the shader state record. */ + uint32_t vpm_output_size; + + /* Maximum SIMD dispatch width to not exceed VPM output size limits + * in the geometry shader. 
Notice that the final dispatch width has to + * be decided at draw time and could be lower based on the VPM pressure + * added by other shader stages. + */ + uint8_t simd_width; + + /* Output primitive type */ + uint8_t out_prim_type; + + /* Number of GS invocations */ + uint8_t num_invocations; }; struct v3d_fs_prog_data { @@ -570,23 +735,36 @@ struct v3d_fs_prog_data { struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS]; - /** bitmask of which inputs are color inputs, for flat shade handling. */ - uint32_t color_inputs[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; - - /* Bitmask for whether the corresponding input is flat-shaded, - * independent of rasterizer (gl_FragColor) flat-shading. + /* Array of flat shade flags. + * + * Each entry is only 24 bits (high 8 bits 0), to match the hardware + * packet layout. */ - BITSET_WORD flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; + uint32_t flat_shade_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1]; + uint32_t noperspective_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1]; + + uint32_t centroid_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1]; + + uint8_t num_inputs; bool writes_z; + bool disable_ez; + bool uses_center_w; + bool uses_implicit_point_line_varyings; + bool lock_scoreboard_on_first_thrsw; }; -/* Special nir_load_input intrinsic index for loading the current TLB - * destination color. - */ -#define V3D_NIR_TLB_COLOR_READ_INPUT 2000000000 +struct v3d_compute_prog_data { + struct v3d_prog_data base; + /* Size in bytes of the workgroup's shared space. 
*/ + uint32_t shared_size; +}; -#define V3D_NIR_MS_MASK_OUTPUT 2000000000 +static inline bool +vir_has_uniform(struct qinst *inst) +{ + return inst->uniform != ~0; +} extern const nir_shader_compiler_options v3d_nir_options; @@ -594,19 +772,15 @@ const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devin void v3d_compiler_free(const struct v3d_compiler *compiler); void v3d_optimize_nir(struct nir_shader *s); -uint64_t *v3d_compile_vs(const struct v3d_compiler *compiler, - struct v3d_vs_key *key, - struct v3d_vs_prog_data *prog_data, - nir_shader *s, - int program_id, int variant_id, - uint32_t *final_assembly_size); - -uint64_t *v3d_compile_fs(const struct v3d_compiler *compiler, - struct v3d_fs_key *key, - struct v3d_fs_prog_data *prog_data, - nir_shader *s, - int program_id, int variant_id, - uint32_t *final_assembly_size); +uint64_t *v3d_compile(const struct v3d_compiler *compiler, + struct v3d_key *key, + struct v3d_prog_data **prog_data, + nir_shader *s, + void (*debug_output)(const char *msg, + void *debug_output_data), + void *debug_output_data, + int program_id, int variant_id, + uint32_t *final_assembly_size); void v3d_nir_to_vir(struct v3d_compile *c); @@ -621,26 +795,30 @@ struct qinst *vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1); struct qinst *vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1); -struct qinst *vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src0); +struct qinst *vir_branch_inst(struct v3d_compile *c, + enum v3d_qpu_branch_cond cond); void vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst); +uint32_t vir_get_uniform_index(struct v3d_compile *c, + enum quniform_contents contents, + uint32_t data); struct qreg vir_uniform(struct v3d_compile *c, enum quniform_contents contents, uint32_t data); void vir_schedule_instructions(struct v3d_compile *c); +void v3d_setup_spill_base(struct v3d_compile *c); struct 
v3d_qpu_instr v3d_qpu_nop(void); struct qreg vir_emit_def(struct v3d_compile *c, struct qinst *inst); struct qinst *vir_emit_nondef(struct v3d_compile *c, struct qinst *inst); void vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond); void vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf); +void vir_set_uf(struct qinst *inst, enum v3d_qpu_uf uf); void vir_set_unpack(struct qinst *inst, int src, enum v3d_qpu_input_unpack unpack); struct qreg vir_get_temp(struct v3d_compile *c); +void vir_emit_last_thrsw(struct v3d_compile *c); void vir_calculate_live_intervals(struct v3d_compile *c); -bool vir_has_implicit_uniform(struct qinst *inst); -int vir_get_implicit_uniform_src(struct qinst *inst); -int vir_get_non_sideband_nsrc(struct qinst *inst); int vir_get_nsrc(struct qinst *inst); bool vir_has_side_effects(struct v3d_compile *c, struct qinst *inst); bool vir_get_add_op(struct qinst *inst, enum v3d_qpu_add_op *op); @@ -649,15 +827,18 @@ bool vir_is_raw_mov(struct qinst *inst); bool vir_is_tex(struct qinst *inst); bool vir_is_add(struct qinst *inst); bool vir_is_mul(struct qinst *inst); -bool vir_is_float_input(struct qinst *inst); -bool vir_depends_on_flags(struct qinst *inst); -bool vir_writes_r3(struct qinst *inst); -bool vir_writes_r4(struct qinst *inst); +bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst); +bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst); struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg); uint8_t vir_channels_written(struct qinst *inst); +struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i); +void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, + struct qreg result); +void vir_emit_thrsw(struct v3d_compile *c); void vir_dump(struct v3d_compile *c); void vir_dump_inst(struct v3d_compile *c, struct qinst *inst); +void vir_dump_uniform(enum quniform_contents contents, uint32_t data); void vir_validate(struct v3d_compile *c); @@ -667,20 +848,34 
@@ bool vir_opt_constant_folding(struct v3d_compile *c); bool vir_opt_copy_propagate(struct v3d_compile *c); bool vir_opt_dead_code(struct v3d_compile *c); bool vir_opt_peephole_sf(struct v3d_compile *c); +bool vir_opt_redundant_flags(struct v3d_compile *c); bool vir_opt_small_immediates(struct v3d_compile *c); bool vir_opt_vpm(struct v3d_compile *c); void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c); void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c); +void v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c); +void v3d_nir_lower_scratch(nir_shader *s); void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c); +void v3d_nir_lower_image_load_store(nir_shader *s); void vir_lower_uniforms(struct v3d_compile *c); -void v3d_vir_to_qpu(struct v3d_compile *c); +void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components); +void v3d33_vir_vpm_write_setup(struct v3d_compile *c); +void v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr); +void v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr); +void v3d40_vir_emit_image_load_store(struct v3d_compile *c, + nir_intrinsic_instr *instr); + +void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers); uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c); void qpu_validate(struct v3d_compile *c); -struct qpu_reg *v3d_register_allocate(struct v3d_compile *c); +struct qpu_reg *v3d_register_allocate(struct v3d_compile *c, bool *spilled); bool vir_init_reg_sets(struct v3d_compiler *compiler); -void vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf); +bool v3d_gl_format_is_return_32(GLenum format); + +uint32_t +v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src); static inline bool quniform_contents_is_texture_p0(enum quniform_contents contents) @@ -690,6 +885,12 @@ quniform_contents_is_texture_p0(enum quniform_contents contents) V3D_MAX_TEXTURE_SAMPLERS)); } +static inline bool 
+vir_in_nonuniform_control_flow(struct v3d_compile *c) +{ + return c->execute.file != QFILE_NULL; +} + static inline struct qreg vir_uniform_ui(struct v3d_compile *c, uint32_t ui) { @@ -744,6 +945,14 @@ vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ return vir_emit_nondef(c, vir_inst(op, dest, a, b)); \ } +#define VIR_NODST_0(name, vir_inst, op) \ +static inline struct qinst * \ +vir_##name(struct v3d_compile *c) \ +{ \ + return vir_emit_nondef(c, vir_inst(op, c->undef, \ + c->undef, c->undef)); \ +} + #define VIR_NODST_1(name, vir_inst, op) \ static inline struct qinst * \ vir_##name(struct v3d_compile *c, struct qreg a) \ @@ -760,6 +969,33 @@ vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ a, b)); \ } +#define VIR_SFU(name) \ +static inline struct qreg \ +vir_##name(struct v3d_compile *c, struct qreg a) \ +{ \ + if (c->devinfo->ver >= 41) { \ + return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \ + c->undef, \ + a, c->undef)); \ + } else { \ + vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \ + return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \ + } \ +} \ +static inline struct qinst * \ +vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \ + struct qreg a) \ +{ \ + if (c->devinfo->ver >= 41) { \ + return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \ + dest, \ + a, c->undef)); \ + } else { \ + vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \ + return vir_FMOV_dest(c, dest, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \ + } \ +} + #define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name) #define VIR_M_ALU2(name) VIR_ALU2(name, vir_mul_inst, V3D_QPU_M_##name) #define VIR_A_ALU1(name) VIR_ALU1(name, vir_add_inst, V3D_QPU_A_##name) @@ -770,6 +1006,7 @@ vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \ #define VIR_M_NODST_2(name) VIR_NODST_2(name, vir_mul_inst, V3D_QPU_M_##name) #define VIR_A_NODST_1(name) VIR_NODST_1(name, vir_add_inst, 
V3D_QPU_A_##name) #define VIR_M_NODST_1(name) VIR_NODST_1(name, vir_mul_inst, V3D_QPU_M_##name) +#define VIR_A_NODST_0(name) VIR_NODST_0(name, vir_add_inst, V3D_QPU_A_##name) VIR_A_ALU2(FADD) VIR_A_ALU2(VFPACK) @@ -792,23 +1029,34 @@ VIR_A_ALU2(OR) VIR_A_ALU2(XOR) VIR_A_ALU2(VADD) VIR_A_ALU2(VSUB) +VIR_A_NODST_2(STVPMV) VIR_A_ALU1(NOT) VIR_A_ALU1(NEG) VIR_A_ALU1(FLAPUSH) VIR_A_ALU1(FLBPUSH) -VIR_A_ALU1(FLBPOP) +VIR_A_ALU1(FLPOP) VIR_A_ALU1(SETMSF) VIR_A_ALU1(SETREVF) -VIR_A_ALU1(TIDX) -VIR_A_ALU1(EIDX) - +VIR_A_ALU0(TIDX) +VIR_A_ALU0(EIDX) +VIR_A_ALU1(LDVPMV_IN) +VIR_A_ALU1(LDVPMV_OUT) +VIR_A_ALU1(LDVPMD_IN) +VIR_A_ALU1(LDVPMD_OUT) +VIR_A_ALU2(LDVPMG_IN) +VIR_A_ALU2(LDVPMG_OUT) +VIR_A_ALU0(TMUWT) + +VIR_A_ALU0(IID) VIR_A_ALU0(FXCD) VIR_A_ALU0(XCD) VIR_A_ALU0(FYCD) VIR_A_ALU0(YCD) VIR_A_ALU0(MSF) VIR_A_ALU0(REVF) +VIR_A_ALU0(BARRIERID) VIR_A_NODST_1(VPMSETUP) +VIR_A_NODST_0(VPMWT) VIR_A_ALU2(FCMP) VIR_A_ALU2(VFMAX) @@ -836,6 +1084,13 @@ VIR_M_NODST_2(MULTOP) VIR_M_ALU1(MOV) VIR_M_ALU1(FMOV) +VIR_SFU(RECIP) +VIR_SFU(RSQRT) +VIR_SFU(EXP) +VIR_SFU(LOG) +VIR_SFU(SIN) +VIR_SFU(RSQRT2) + static inline struct qinst * vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond, struct qreg dest, struct qreg src) @@ -855,18 +1110,59 @@ vir_SEL(struct v3d_compile *c, enum v3d_qpu_cond cond, return t; } -static inline void -vir_VPM_WRITE(struct v3d_compile *c, struct qreg val) -{ - vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); -} - static inline struct qinst * vir_NOP(struct v3d_compile *c) { return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef)); } + +static inline struct qreg +vir_LDTMU(struct v3d_compile *c) +{ + if (c->devinfo->ver >= 41) { + struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldtmu->qpu.sig.ldtmu = true; + + return vir_emit_def(c, ldtmu); + } else { + vir_NOP(c)->qpu.sig.ldtmu = true; + return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); + } +} + +static inline struct 
qreg +vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg src1) +{ + vir_MULTOP(c, src0, src1); + return vir_UMUL24(c, src0, src1); +} + +static inline struct qreg +vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config) +{ + assert(c->devinfo->ver >= 41); /* XXX */ + assert((config & 0xffffff00) == 0xffffff00); + + struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldtlb->qpu.sig.ldtlbu = true; + ldtlb->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, config); + return vir_emit_def(c, ldtlb); +} + +static inline struct qreg +vir_TLB_COLOR_READ(struct v3d_compile *c) +{ + assert(c->devinfo->ver >= 41); /* XXX */ + + struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldtlb->qpu.sig.ldtlb = true; + return vir_emit_def(c, ldtlb); +} + /* static inline struct qreg vir_LOAD_IMM(struct v3d_compile *c, uint32_t val) @@ -892,10 +1188,10 @@ vir_LOAD_IMM_I2(struct v3d_compile *c, uint32_t val) */ static inline struct qinst * -vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_cond cond) +vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) { /* The actual uniform_data value will be set at scheduling time */ - return vir_emit_nondef(c, vir_branch_inst(cond, vir_uniform_ui(c, 0))); + return vir_emit_nondef(c, vir_branch_inst(c, cond)); } #define vir_for_each_block(block, c) \ @@ -924,4 +1220,8 @@ vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_cond cond) vir_for_each_block(_block, c) \ vir_for_each_inst(inst, _block) +#define vir_for_each_inst_inorder_safe(inst, c) \ + vir_for_each_block(_block, c) \ + vir_for_each_inst_safe(inst, _block) + #endif /* V3D_COMPILER_H */