intel/fs,vec4: Stuff the constant data from NIR in the end of the program
[mesa.git] / src / intel / compiler / brw_fs.h
index f51a4d8889bd651b1ed2583782b52caaed5b078b..0843f6e73fc5b2b79785c501e7bccc26b2763e89 100644 (file)
@@ -31,6 +31,8 @@
 #include "brw_shader.h"
 #include "brw_ir_fs.h"
 #include "brw_fs_builder.h"
+#include "brw_fs_live_variables.h"
+#include "brw_ir_performance.h"
 #include "compiler/nir/nir.h"
 
 struct bblock_t;
@@ -38,8 +40,34 @@ namespace {
    struct acp_entry;
 }
 
+class fs_visitor;
+
 namespace brw {
-   class fs_live_variables;
+   /**
+    * Register pressure analysis of a shader.  Estimates how many registers
+    * are live at any point of the program in GRF units.
+    */
+   struct register_pressure {
+      register_pressure(const fs_visitor *v);
+      ~register_pressure();
+
+      analysis_dependency_class
+      dependency_class() const
+      {
+         return (DEPENDENCY_INSTRUCTION_IDENTITY |
+                 DEPENDENCY_INSTRUCTION_DATA_FLOW |
+                 DEPENDENCY_VARIABLES);
+      }
+
+      bool
+      validate(const fs_visitor *) const
+      {
+         /* FINISHME */
+         return true;
+      }
+
+      unsigned *regs_live_at_ip;
+   };
 }
 
 struct brw_gs_compile;
@@ -52,6 +80,11 @@ offset(const fs_reg &reg, const brw::fs_builder &bld, unsigned delta)
 
 #define UBO_START ((1 << 16) - 4)
 
+struct shader_stats {
+   const char *scheduler_mode;
+   unsigned promoted_constants;
+};
+
 /**
  * The fragment shader front-end.
  *
@@ -62,9 +95,8 @@ class fs_visitor : public backend_shader
 public:
    fs_visitor(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
-              const void *key,
+              const brw_base_prog_key *key,
               struct brw_stage_prog_data *prog_data,
-              struct gl_program *prog,
               const nir_shader *shader,
               unsigned dispatch_width,
               int shader_time_index,
@@ -80,12 +112,6 @@ public:
 
    fs_reg vgrf(const glsl_type *const type);
    void import_uniforms(fs_visitor *v);
-   void setup_uniform_clipplane_values();
-   void compute_clip_distance();
-
-   fs_inst *get_instruction_generating_reg(fs_inst *start,
-                                          fs_inst *end,
-                                          const fs_reg &reg);
 
    void VARYING_PULL_CONSTANT_LOAD(const brw::fs_builder &bld,
                                    const fs_reg &dst,
@@ -96,48 +122,44 @@ public:
 
    bool run_fs(bool allow_spilling, bool do_rep_send);
    bool run_vs();
-   bool run_tcs_single_patch();
+   bool run_tcs();
    bool run_tes();
    bool run_gs();
-   bool run_cs(unsigned min_dispatch_width);
+   bool run_cs(bool allow_spilling);
    void optimize();
-   void allocate_registers(unsigned min_dispatch_width, bool allow_spilling);
+   void allocate_registers(bool allow_spilling);
    void setup_fs_payload_gen4();
    void setup_fs_payload_gen6();
    void setup_vs_payload();
    void setup_gs_payload();
    void setup_cs_payload();
+   bool fixup_sends_duplicate_payload();
    void fixup_3src_null_dest();
+   bool fixup_nomask_control_flow();
    void assign_curb_setup();
-   void calculate_urb_setup();
    void assign_urb_setup();
    void convert_attr_sources_to_hw_regs(fs_inst *inst);
    void assign_vs_urb_setup();
-   void assign_tcs_single_patch_urb_setup();
+   void assign_tcs_urb_setup();
    void assign_tes_urb_setup();
    void assign_gs_urb_setup();
    bool assign_regs(bool allow_spilling, bool spill_all);
    void assign_regs_trivial();
    void calculate_payload_ranges(int payload_node_count,
-                                 int *payload_last_use_ip);
-   void setup_payload_interference(struct ra_graph *g, int payload_reg_count,
-                                   int first_payload_node);
-   int choose_spill_reg(struct ra_graph *g);
-   void spill_reg(int spill_reg);
+                                 int *payload_last_use_ip) const;
    void split_virtual_grfs();
    bool compact_virtual_grfs();
    void assign_constant_locations();
    bool get_pull_locs(const fs_reg &src, unsigned *out_surf_index,
                       unsigned *out_pull_index);
    void lower_constant_loads();
-   void invalidate_live_intervals();
-   void calculate_live_intervals();
-   void calculate_register_pressure();
+   virtual void invalidate_analysis(brw::analysis_dependency_class c);
    void validate();
    bool opt_algebraic();
    bool opt_redundant_discard_jumps();
    bool opt_cse();
-   bool opt_cse_local(bblock_t *block);
+   bool opt_cse_local(const brw::fs_live_variables &live, bblock_t *block, int &ip);
+
    bool opt_copy_propagation();
    bool try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry);
    bool try_constant_propagate(fs_inst *inst, acp_entry *entry);
@@ -145,14 +167,14 @@ public:
                                    exec_list *acp);
    bool opt_drop_redundant_mov_to_flags();
    bool opt_register_renaming();
+   bool opt_bank_conflicts();
    bool register_coalesce();
    bool compute_to_mrf();
    bool eliminate_find_live_channel();
    bool dead_code_eliminate();
    bool remove_duplicate_mrf_writes();
+   bool remove_extra_rounding_modes();
 
-   bool opt_sampler_eot();
-   bool virtual_grf_interferes(int a, int b);
    void schedule_instructions(instruction_scheduler_mode mode);
    void insert_gen4_send_dependency_workarounds();
    void insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
@@ -165,11 +187,14 @@ public:
    void lower_uniform_pull_constant_loads();
    bool lower_load_payload();
    bool lower_pack();
-   bool lower_conversions();
+   bool lower_regioning();
    bool lower_logical_sends();
    bool lower_integer_multiplication();
    bool lower_minmax();
    bool lower_simd_width();
+   bool lower_barycentrics();
+   bool lower_scoreboard();
+   bool lower_sub_sat();
    bool opt_combine_constants();
 
    void emit_dummy_fs();
@@ -183,16 +208,22 @@ public:
    void emit_interpolation_setup_gen6();
    void compute_sample_position(fs_reg dst, fs_reg int_sample_pos);
    fs_reg emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
-                         const fs_reg &sampler);
+                         const fs_reg &texture,
+                         const fs_reg &texture_handle);
    void emit_gen6_gather_wa(uint8_t wa, fs_reg dst);
    fs_reg resolve_source_modifiers(const fs_reg &src);
    void emit_discard_jump();
+   void emit_fsign(const class brw::fs_builder &, const nir_alu_instr *instr,
+                   fs_reg result, fs_reg *op, unsigned fsign_src);
+   void emit_shader_float_controls_execution_mode();
    bool opt_peephole_sel();
    bool opt_peephole_predicated_break();
    bool opt_saturate_propagation();
    bool opt_cmod_propagation();
    bool opt_zero_samples();
 
+   void set_tcs_invocation_id();
+
    void emit_nir_code();
    void nir_setup_outputs();
    void nir_setup_uniforms();
@@ -203,7 +234,10 @@ public:
    void nir_emit_loop(nir_loop *loop);
    void nir_emit_block(nir_block *block);
    void nir_emit_instr(nir_instr *instr);
-   void nir_emit_alu(const brw::fs_builder &bld, nir_alu_instr *instr);
+   void nir_emit_alu(const brw::fs_builder &bld, nir_alu_instr *instr,
+                     bool need_dest);
+   bool try_emit_b2fi_of_inot(const brw::fs_builder &bld, fs_reg result,
+                              nir_alu_instr *instr);
    void nir_emit_load_const(const brw::fs_builder &bld,
                             nir_load_const_instr *instr);
    void nir_emit_vs_intrinsic(const brw::fs_builder &bld,
@@ -216,14 +250,29 @@ public:
                               nir_intrinsic_instr *instr);
    void nir_emit_cs_intrinsic(const brw::fs_builder &bld,
                               nir_intrinsic_instr *instr);
+   fs_reg get_nir_image_intrinsic_image(const brw::fs_builder &bld,
+                                        nir_intrinsic_instr *instr);
+   fs_reg get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld,
+                                       nir_intrinsic_instr *instr);
+   fs_reg swizzle_nir_scratch_addr(const brw::fs_builder &bld,
+                                   const fs_reg &addr,
+                                   bool in_dwords);
    void nir_emit_intrinsic(const brw::fs_builder &bld,
                            nir_intrinsic_instr *instr);
    void nir_emit_tes_intrinsic(const brw::fs_builder &bld,
                                nir_intrinsic_instr *instr);
    void nir_emit_ssbo_atomic(const brw::fs_builder &bld,
                              int op, nir_intrinsic_instr *instr);
+   void nir_emit_ssbo_atomic_float(const brw::fs_builder &bld,
+                                   int op, nir_intrinsic_instr *instr);
    void nir_emit_shared_atomic(const brw::fs_builder &bld,
                                int op, nir_intrinsic_instr *instr);
+   void nir_emit_shared_atomic_float(const brw::fs_builder &bld,
+                                     int op, nir_intrinsic_instr *instr);
+   void nir_emit_global_atomic(const brw::fs_builder &bld,
+                               int op, nir_intrinsic_instr *instr);
+   void nir_emit_global_atomic_float(const brw::fs_builder &bld,
+                                     int op, nir_intrinsic_instr *instr);
    void nir_emit_texture(const brw::fs_builder &bld,
                          nir_tex_instr *instr);
    void nir_emit_jump(const brw::fs_builder &bld,
@@ -231,8 +280,13 @@ public:
    fs_reg get_nir_src(const nir_src &src);
    fs_reg get_nir_src_imm(const nir_src &src);
    fs_reg get_nir_dest(const nir_dest &dest);
-   fs_reg get_nir_image_deref(const nir_deref_var *deref);
    fs_reg get_indirect_offset(nir_intrinsic_instr *instr);
+   fs_reg get_tcs_single_patch_icp_handle(const brw::fs_builder &bld,
+                                          nir_intrinsic_instr *instr);
+   fs_reg get_tcs_eight_patch_icp_handle(const brw::fs_builder &bld,
+                                         nir_intrinsic_instr *instr);
+   struct brw_reg get_tcs_output_urb_handle();
+
    void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
                      unsigned wr_mask);
 
@@ -245,6 +299,7 @@ public:
    fs_inst *emit_single_fb_write(const brw::fs_builder &bld,
                                  fs_reg color1, fs_reg color2,
                                  fs_reg src0_alpha, unsigned components);
+   void emit_alpha_to_coverage_workaround(const fs_reg &src0_alpha);
    void emit_fb_writes();
    fs_inst *emit_non_coherent_fb_read(const brw::fs_builder &bld,
                                       const fs_reg &dst, unsigned target);
@@ -272,30 +327,30 @@ public:
 
    fs_reg get_timestamp(const brw::fs_builder &bld);
 
-   struct brw_reg interp_reg(int location, int channel);
+   fs_reg interp_reg(int location, int channel);
 
-   int implied_mrf_writes(fs_inst *inst);
+   virtual void dump_instructions() const;
+   virtual void dump_instructions(const char *name) const;
+   void dump_instruction(const backend_instruction *inst) const;
+   void dump_instruction(const backend_instruction *inst, FILE *file) const;
 
-   virtual void dump_instructions();
-   virtual void dump_instructions(const char *name);
-   void dump_instruction(backend_instruction *inst);
-   void dump_instruction(backend_instruction *inst, FILE *file);
-
-   const void *const key;
+   const brw_base_prog_key *const key;
    const struct brw_sampler_prog_key_data *key_tex;
 
    struct brw_gs_compile *gs_compile;
 
    struct brw_stage_prog_data *prog_data;
-   struct gl_program *prog;
 
    const struct brw_vue_map *input_vue_map;
 
-   int *virtual_grf_start;
-   int *virtual_grf_end;
-   brw::fs_live_variables *live_intervals;
+   int *param_size;
 
-   int *regs_live_at_ip;
+   BRW_ANALYSIS(live_analysis, brw::fs_live_variables,
+                backend_shader *) live_analysis;
+   BRW_ANALYSIS(regpressure_analysis, brw::register_pressure,
+                fs_visitor *) regpressure_analysis;
+   BRW_ANALYSIS(performance_analysis, brw::performance,
+                fs_visitor *) performance_analysis;
 
    /** Number of uniform variable components visited. */
    unsigned uniforms;
@@ -315,7 +370,9 @@ public:
     */
    int *push_constant_loc;
 
-   fs_reg thread_local_id;
+   fs_reg subgroup_id;
+   fs_reg group_size[3];
+   fs_reg scratch_base;
    fs_reg frag_depth;
    fs_reg frag_stencil;
    fs_reg sample_mask;
@@ -334,14 +391,15 @@ public:
 
    /** Register numbers for thread payload fields. */
    struct thread_payload {
-      uint8_t source_depth_reg;
-      uint8_t source_w_reg;
-      uint8_t aa_dest_stencil_reg;
-      uint8_t dest_depth_reg;
-      uint8_t sample_pos_reg;
-      uint8_t sample_mask_in_reg;
-      uint8_t barycentric_coord_reg[BRW_BARYCENTRIC_MODE_COUNT];
-      uint8_t local_invocation_id_reg;
+      uint8_t subspan_coord_reg[2];
+      uint8_t source_depth_reg[2];
+      uint8_t source_w_reg[2];
+      uint8_t aa_dest_stencil_reg[2];
+      uint8_t dest_depth_reg[2];
+      uint8_t sample_pos_reg[2];
+      uint8_t sample_mask_in_reg[2];
+      uint8_t barycentric_coord_reg[BRW_BARYCENTRIC_MODE_COUNT][2];
+      uint8_t local_invocation_id_reg[2];
 
       /** The number of thread payload registers the hardware will supply. */
       uint8_t num_regs;
@@ -356,7 +414,6 @@ public:
    fs_reg pixel_w;
    fs_reg delta_xy[BRW_BARYCENTRIC_MODE_COUNT];
    fs_reg shader_start_time;
-   fs_reg userplane[MAX_CLIP_PLANES];
    fs_reg final_gs_vertex_count;
    fs_reg control_data_bits;
    fs_reg invocation_id;
@@ -369,10 +426,38 @@ public:
 
    int shader_time_index;
 
-   unsigned promoted_constants;
+   struct shader_stats shader_stats;
+
    brw::fs_builder bld;
+
+private:
+   fs_reg prepare_alu_destination_and_sources(const brw::fs_builder &bld,
+                                              nir_alu_instr *instr,
+                                              fs_reg *op,
+                                              bool need_dest);
+
+   void resolve_inot_sources(const brw::fs_builder &bld, nir_alu_instr *instr,
+                             fs_reg *op);
+   void lower_mul_dword_inst(fs_inst *inst, bblock_t *block);
+   void lower_mul_qword_inst(fs_inst *inst, bblock_t *block);
+   void lower_mulh_inst(fs_inst *inst, bblock_t *block);
+
+   unsigned workgroup_size() const;
 };
 
+/**
+ * Return the flag register used in fragment shaders to keep track of live
+ * samples.  On Gen7+ we use f1.0-f1.1 to allow discard jumps in SIMD32
+ * dispatch mode, while earlier generations are constrained to f0.1, which
+ * limits the dispatch width to SIMD16 for fragment shaders that use discard.
+ */
+static inline unsigned
+sample_mask_flag_subreg(const fs_visitor *shader)
+{
+   assert(shader->stage == MESA_SHADER_FRAGMENT);
+   return shader->devinfo->gen >= 7 ? 2 : 1;
+}
+
 /**
  * The fragment shader code generator.
  *
@@ -383,22 +468,30 @@ class fs_generator
 public:
    fs_generator(const struct brw_compiler *compiler, void *log_data,
                 void *mem_ctx,
-                const void *key,
                 struct brw_stage_prog_data *prog_data,
-                unsigned promoted_constants,
                 bool runtime_check_aads_emit,
                 gl_shader_stage stage);
    ~fs_generator();
 
    void enable_debug(const char *shader_name);
-   int generate_code(const cfg_t *cfg, int dispatch_width);
-   const unsigned *get_assembly(unsigned int *assembly_size);
+   int generate_code(const cfg_t *cfg, int dispatch_width,
+                     struct shader_stats shader_stats,
+                     const brw::performance &perf,
+                     struct brw_compile_stats *stats);
+   void add_const_data(void *data, unsigned size);
+   const unsigned *get_assembly();
 
 private:
    void fire_fb_write(fs_inst *inst,
                       struct brw_reg payload,
                       struct brw_reg implied_header,
                       GLuint nr);
+   void generate_send(fs_inst *inst,
+                      struct brw_reg dst,
+                      struct brw_reg desc,
+                      struct brw_reg ex_desc,
+                      struct brw_reg payload,
+                      struct brw_reg payload2);
    void generate_fb_write(fs_inst *inst, struct brw_reg payload);
    void generate_fb_read(fs_inst *inst, struct brw_reg dst,
                          struct brw_reg payload);
@@ -406,16 +499,18 @@ private:
    void generate_urb_write(fs_inst *inst, struct brw_reg payload);
    void generate_cs_terminate(fs_inst *inst, struct brw_reg payload);
    void generate_barrier(fs_inst *inst, struct brw_reg src);
-   void generate_linterp(fs_inst *inst, struct brw_reg dst,
+   bool generate_linterp(fs_inst *inst, struct brw_reg dst,
                         struct brw_reg *src);
-   void generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
+   void generate_tex(fs_inst *inst, struct brw_reg dst,
                      struct brw_reg surface_index,
                      struct brw_reg sampler_index);
    void generate_get_buffer_size(fs_inst *inst, struct brw_reg dst,
                                  struct brw_reg src,
                                  struct brw_reg surf_index);
-   void generate_ddx(enum opcode op, struct brw_reg dst, struct brw_reg src);
-   void generate_ddy(enum opcode op, struct brw_reg dst, struct brw_reg src);
+   void generate_ddx(const fs_inst *inst,
+                     struct brw_reg dst, struct brw_reg src);
+   void generate_ddy(const fs_inst *inst,
+                     struct brw_reg dst, struct brw_reg src);
    void generate_scratch_write(fs_inst *inst, struct brw_reg src);
    void generate_scratch_read(fs_inst *inst, struct brw_reg dst);
    void generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst);
@@ -429,10 +524,6 @@ private:
    void generate_varying_pull_constant_load_gen4(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index);
-   void generate_varying_pull_constant_load_gen7(fs_inst *inst,
-                                                 struct brw_reg dst,
-                                                 struct brw_reg index,
-                                                 struct brw_reg offset);
    void generate_mov_dispatch_to_flags(fs_inst *inst);
 
    void generate_pixel_interpolator_query(fs_inst *inst,
@@ -452,9 +543,6 @@ private:
                                       struct brw_reg dst,
                                       struct brw_reg x,
                                       struct brw_reg y);
-   void generate_unpack_half_2x16_split(fs_inst *inst,
-                                        struct brw_reg dst,
-                                        struct brw_reg src);
 
    void generate_shader_time_add(fs_inst *inst,
                                  struct brw_reg payload,
@@ -466,6 +554,15 @@ private:
                               struct brw_reg reg,
                               struct brw_reg indirect_byte_offset);
 
+   void generate_shuffle(fs_inst *inst,
+                         struct brw_reg dst,
+                         struct brw_reg src,
+                         struct brw_reg idx);
+
+   void generate_quad_swizzle(const fs_inst *inst,
+                              struct brw_reg dst, struct brw_reg src,
+                              unsigned swiz);
+
    bool patch_discard_jumps_to_fb_writes();
 
    const struct brw_compiler *compiler;
@@ -474,13 +571,11 @@ private:
    const struct gen_device_info *devinfo;
 
    struct brw_codegen *p;
-   const void * const key;
    struct brw_stage_prog_data * const prog_data;
 
    unsigned dispatch_width; /**< 8, 16 or 32 */
 
    exec_list discard_halt_patches;
-   unsigned promoted_constants;
    bool runtime_check_aads_emit;
    bool debug_flag;
    const char *shader_name;
@@ -488,18 +583,81 @@ private:
    void *mem_ctx;
 };
 
-void shuffle_32bit_load_result_to_64bit_data(const brw::fs_builder &bld,
-                                             const fs_reg &dst,
-                                             const fs_reg &src,
-                                             uint32_t components);
+namespace brw {
+   inline fs_reg
+   fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
+                     brw_reg_type type = BRW_REGISTER_TYPE_F)
+   {
+      if (!regs[0])
+         return fs_reg();
+
+      if (bld.dispatch_width() > 16) {
+         const fs_reg tmp = bld.vgrf(type);
+         const brw::fs_builder hbld = bld.exec_all().group(16, 0);
+         const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
+         fs_reg *const components = new fs_reg[m];
+
+         for (unsigned g = 0; g < m; g++)
+               components[g] = retype(brw_vec8_grf(regs[g], 0), type);
+
+         hbld.LOAD_PAYLOAD(tmp, components, m, 0);
+
+         delete[] components;
+         return tmp;
+
+      } else {
+         return fs_reg(retype(brw_vec8_grf(regs[0], 0), type));
+      }
+   }
+
+   inline fs_reg
+   fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2])
+   {
+      if (!regs[0])
+         return fs_reg();
+
+      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
+      const brw::fs_builder hbld = bld.exec_all().group(8, 0);
+      const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
+      fs_reg *const components = new fs_reg[2 * m];
+
+      for (unsigned c = 0; c < 2; c++) {
+         for (unsigned g = 0; g < m; g++)
+            components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0),
+                                           hbld, c + 2 * (g % 2));
+      }
+
+      hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
+
+      delete[] components;
+      return tmp;
+   }
+
+   bool
+   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i);
+}
+
+void shuffle_from_32bit_read(const brw::fs_builder &bld,
+                             const fs_reg &dst,
+                             const fs_reg &src,
+                             uint32_t first_component,
+                             uint32_t components);
 
-fs_reg shuffle_64bit_data_for_32bit_write(const brw::fs_builder &bld,
-                                          const fs_reg &src,
-                                          uint32_t components);
 fs_reg setup_imm_df(const brw::fs_builder &bld,
                     double v);
 
+fs_reg setup_imm_b(const brw::fs_builder &bld,
+                   int8_t v);
+
+fs_reg setup_imm_ub(const brw::fs_builder &bld,
+                   uint8_t v);
+
 enum brw_barycentric_mode brw_barycentric_mode(enum glsl_interp_mode mode,
                                                nir_intrinsic_op op);
 
+uint32_t brw_fb_write_msg_control(const fs_inst *inst,
+                                  const struct brw_wm_prog_data *prog_data);
+
+void brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data);
+
 #endif /* BRW_FS_H */