r600/sfn: Add lowering of arrays to scratch and the corresponding instructions
authorGert Wollny <gert.wollny@collabora.com>
Fri, 27 Dec 2019 16:49:26 +0000 (17:49 +0100)
committerMarge Bot <eric+marge@anholt.net>
Mon, 10 Feb 2020 19:09:08 +0000 (19:09 +0000)
Make use of the scratch space for arrays that are larger than 100 elements.
Since for IO r600 is vector based, there is a bit of a scratch space waste
here for arrays that use types smaller than vec4.

Signed-off-by: Gert Wollny <gert.wollny@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3225>

src/gallium/drivers/r600/sfn/sfn_instruction_export.cpp
src/gallium/drivers/r600/sfn/sfn_instruction_export.h
src/gallium/drivers/r600/sfn/sfn_instruction_fetch.cpp
src/gallium/drivers/r600/sfn/sfn_instruction_fetch.h
src/gallium/drivers/r600/sfn/sfn_ir_to_assembly.cpp
src/gallium/drivers/r600/sfn/sfn_nir.cpp
src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
src/gallium/drivers/r600/sfn/sfn_shader_base.h
src/gallium/drivers/r600/sfn/sfn_shader_fragment.cpp
src/gallium/drivers/r600/sfn/sfn_shader_vertex.cpp

index d3a07713fa7094c6f9b31a3d5f31e3cf3c1d95a3..fffcb09537cd4a8de479906dadac7b25b203f9e0 100644 (file)
@@ -100,6 +100,98 @@ void ExportInstruction::set_last()
    m_is_last = true;
 }
 
+/* Scratch write to a fixed location: no address value is supplied, so
+ * m_address stays empty and the array size is recorded as 0. */
+WriteScratchInstruction::WriteScratchInstruction(unsigned loc,
+                                                 const GPRVector& value,
+                                                 int align, int align_offset,
+                                                 int writemask):
+   WriteoutInstruction(Instruction::mem_wr_scratch, value),
+   m_loc(loc),
+   m_address(),
+   m_align(align),
+   m_align_offset(align_offset),
+   m_writemask(writemask),
+   m_array_size(0)
+{
+}
+
+/* Scratch write whose location is taken from an address value. The
+ * location field is set to 0, array_size is stored minus one, and the
+ * address is registered as a remappable source value. */
+WriteScratchInstruction::WriteScratchInstruction(const PValue& address,
+                                                 const GPRVector& value,
+                                                 int align, int align_offset,
+                                                 int writemask, int array_size):
+   WriteoutInstruction(Instruction::mem_wr_scratch, value),
+   m_loc(0),
+   m_address(address),
+   m_align(align),
+   m_align_offset(align_offset),
+   m_writemask(writemask),
+   m_array_size(array_size - 1)
+{
+   add_remappable_src_value(&m_address);
+}
+
+/* Compare with another instruction. The two are equal only if the other
+ * one is also a scratch write and address (both present and equal, or both
+ * absent), source GPR vector, location, alignment, alignment offset, write
+ * mask and array size all match. */
+bool WriteScratchInstruction::is_equal_to(const Instruction& lhs) const
+{
+   if (lhs.type() != Instruction::mem_wr_scratch)
+      return false;
+   const auto& other = dynamic_cast<const WriteScratchInstruction&>(lhs);
+
+   /* A write with an address never equals one without. */
+   if (indirect() != other.indirect())
+      return false;
+
+   /* Both have an address here, or neither does. */
+   if (m_address && *m_address != *other.m_address)
+      return false;
+
+   /* m_array_size is handed out through array_size() and so belongs to the
+    * instruction's identity just like the other fields. */
+   return gpr() == other.gpr() &&
+         m_loc == other.m_loc &&
+         m_align == other.m_align &&
+         m_align_offset == other.m_align_offset &&
+         m_writemask == other.m_writemask &&
+         m_array_size == other.m_array_size;
+}
+
+/* Render the low four bits of writemask as a swizzle string: position i
+ * holds 'x', 'y', 'z' or 'w' when bit i is set and '_' otherwise.
+ *
+ * buf must provide room for at least five chars (four channels plus the
+ * terminating NUL). Returns buf so the call can be used inline in a
+ * stream expression. */
+static char *writemask_to_swizzle(int writemask, char *buf)
+{
+   const char *swz = "xyzw";
+   for (int i = 0; i < 4; ++i) {
+      buf[i] = (writemask & (1 << i)) ? swz[i] : '_';
+   }
+   /* The result is printed as a C string, so it has to be terminated. */
+   buf[4] = '\0';
+   return buf;
+}
+
+/* Write a readable form of the instruction to os: the opcode name, an
+ * "@<address>+" prefix when an address value is present, then location,
+ * write mask swizzle, source GPR vector and the two alignment values. */
+void WriteScratchInstruction::do_print(std::ostream& os) const
+{
+   /* Zero-filled so that the four swizzle characters stored by
+    * writemask_to_swizzle() are always followed by a terminator when the
+    * buffer is streamed as a C string. */
+   char buf[5] = {0};
+
+   os << "MEM_SCRATCH_WRITE ";
+   if (m_address)
+      os << "@" << *m_address << "+";
+
+   os << m_loc  << "." << writemask_to_swizzle(m_writemask, buf)
+      << " " <<  gpr()  << " AL:" << m_align << " ALO:" << m_align_offset;
+}
+
+/* Replace the address value by new_value whenever it compares equal to one
+ * of the candidates. Instructions without an address are left untouched. */
+void WriteScratchInstruction::replace_values_child(const ValueSet& candiates, PValue new_value)
+{
+   if (m_address) {
+      for (const auto& candidate: candiates) {
+         if (*candidate == *m_address)
+            m_address = new_value;
+      }
+   }
+}
+
+/* Apply the register rename map to the address value, if there is one: when
+ * the map entry for the current register is valid the address is replaced
+ * by the value for the new register (same channel), and the map entry of
+ * the register that ends up being used is flagged as used. */
+void WriteScratchInstruction::remap_registers_child(std::vector<rename_reg_pair>& map,
+                           ValueMap& values)
+{
+   if (m_address) {
+      sfn_log << SfnLog::merge << "Remap " << *m_address <<  " of type " << m_address->type() << "\n";
+      assert(m_address->type() == Value::gpr);
+
+      const rename_reg_pair rename = map[m_address->sel()];
+      if (rename.valid)
+         m_address = values.get_or_inject(rename.new_reg, m_address->chan());
+
+      /* sel() is read again on purpose: after a rename it names the new
+       * register. */
+      map[m_address->sel()].used = true;
+   }
+}
+
 StreamOutIntruction::StreamOutIntruction(const GPRVector& value, int num_components,
                                          int array_base, int comp_mask, int out_buffer,
                                          int stream):
index 1971e3391353ee82eca1b438ed6b14c25b821dc1..f32800381c438e63e083b7220fe023bd3d740581 100644 (file)
@@ -73,6 +73,37 @@ private:
    bool m_is_last;
 };
 
+/* Write-out instruction of type mem_wr_scratch. It is created either with a
+ * fixed location or with an address value; indirect() tells the two forms
+ * apart. */
+class WriteScratchInstruction : public WriteoutInstruction {
+public:
+
+   /* Write to the fixed location loc. */
+   WriteScratchInstruction(unsigned loc, const GPRVector& value, int align,
+                           int align_offset, int writemask);
+   /* Write to the location given by address; array_size is stored minus one. */
+   WriteScratchInstruction(const PValue& address, const GPRVector& value,
+                           int align, int align_offset, int writemask, int array_size);
+   /* Fixed location; 0 when the instruction was created with an address. */
+   unsigned location() const {return m_loc;}
+
+   int write_mask() const { return m_writemask;}
+   /* Register selector of the address value; only valid when indirect(). */
+   int address() const { assert(m_address); return m_address->sel();}
+   /* True when an address value is set. */
+   bool indirect() const { return !!m_address;}
+   /* Value stored by the constructor: array_size - 1, or 0 for the
+    * fixed-location form. */
+   int array_size() const { return m_array_size;}
+
+private:
+   bool is_equal_to(const Instruction& lhs) const override;
+   void do_print(std::ostream& os) const override;
+
+   void replace_values_child(const ValueSet& candiates, PValue new_value) override;
+   void remap_registers_child(std::vector<rename_reg_pair>& map,
+                              ValueMap& values)override;
+
+   unsigned m_loc;
+   PValue m_address;
+   /* NOTE(review): m_align and m_align_offset are only stored, compared and
+    * printed in the code visible here - confirm whether anything else
+    * consumes them. */
+   unsigned m_align;
+   unsigned m_align_offset;
+   unsigned m_writemask;
+   int m_array_size;
+};
+
+
 class StreamOutIntruction: public WriteoutInstruction {
 public:
    StreamOutIntruction(const GPRVector& value, int num_components,
index c41692639b997283113dd555368acaa7712039a6..e51bb5889690539b62f25ac05513b01e645a5ef8 100644 (file)
@@ -296,6 +296,11 @@ const std::vector<PInstruction>& FetchInstruction::prelude() const
    return m_prelude;
 }
 
+/* Nothing is stored in the derived class: destination vector, source value
+ * and scratch size are passed straight on to FetchInstruction. */
+LoadFromScratch::LoadFromScratch(GPRVector dst, PValue src, int scratch_size)
+   : FetchInstruction(dst, src, scratch_size)
+{
+}
+
 static const char *fmt_descr[64] = {
    "INVALID",
    "8",
index 369094edfa7ca3751d350db3581ccda4489de258..465f604fed53fb17be037cff85da781d520a3cba 100644 (file)
@@ -164,6 +164,11 @@ private:
    std::vector<PInstruction> m_prelude;
 };
 
+/* Fetch instruction used for loads from scratch. It declares no members of
+ * its own, only a constructor taking the destination vector, the source
+ * value and the scratch size. */
+class LoadFromScratch: public FetchInstruction {
+public:
+   LoadFromScratch(GPRVector dst, PValue src, int scratch_size);
+};
+
 }
 
 #endif // SFN_INSTRUCTION_FETCH_H
index b8ceb7ff0c2ec46c5b1f00ded6cc903ed8d4fcdb..e9624c58b2ba843caac5e06c46ef571da81bbac4 100644 (file)
@@ -58,6 +58,7 @@ private:
    bool emit_loop_break(const LoopBreakInstruction& instr);
    bool emit_loop_continue(const LoopContInstruction& instr);
    bool emit_wait_ack(const WaitAck& instr);
+   bool emit_wr_scratch(const WriteScratchInstruction& instr);
 
    bool emit_load_addr(PValue addr);
    bool emit_fs_pixel_export(const ExportInstruction & exi);
@@ -165,6 +166,8 @@ bool AssemblyFromShaderLegacyImpl::emit(const Instruction::Pointer i)
       return emit_streamout(static_cast<const StreamOutIntruction&>(*i));
    case Instruction::wait_ack:
       return emit_wait_ack(static_cast<const WaitAck&>(*i));
+   case Instruction::mem_wr_scratch:
+      return emit_wr_scratch(static_cast<const WriteScratchInstruction&>(*i));
    default:
       return false;
    }
@@ -749,6 +752,45 @@ bool AssemblyFromShaderLegacyImpl::emit_wait_ack(const WaitAck& instr)
    return r == 0;
 }
 
+/* Translate a WriteScratchInstruction into a CF_OP_MEM_SCRATCH output
+ * record and add it to the bytecode.
+ * Returns true on success; false (after logging an error) when
+ * r600_bytecode_add_output() reports a failure. */
+bool AssemblyFromShaderLegacyImpl::emit_wr_scratch(const WriteScratchInstruction& instr)
+{
+   struct r600_bytecode_output cf;
+
+   /* Start from an all-zero record; only the fields below are set. */
+   memset(&cf, 0, sizeof(struct r600_bytecode_output));
+
+   cf.op = CF_OP_MEM_SCRATCH;
+   /* NOTE(review): presumably encodes a four-dword (vec4) element as
+    * "size - 1" - confirm against the ISA documentation. */
+   cf.elem_size = 3;
+   cf.gpr = instr.gpr().sel();
+   /* NOTE(review): presumably requests an acknowledge for the write - confirm. */
+   cf.mark = 1;
+   cf.comp_mask = instr.write_mask();
+   /* Identity swizzle. */
+   cf.swizzle_x = 0;
+   cf.swizzle_y = 1;
+   cf.swizzle_z = 2;
+   cf.swizzle_w = 3;
+   cf.burst_count = 1;
+
+   if (instr.indirect()) {
+      /* NOTE(review): type 3 vs. type 2 presumably selects the indirect vs.
+       * direct acknowledged write - confirm against the ISA documentation. */
+      cf.type = 3;
+      cf.index_gpr = instr.address();
+
+      /* The documentation seems to be wrong here: with indirect addressing
+       * the address_base seems to be the array_size */
+      cf.array_size = instr.array_size();
+   } else {
+      cf.type = 2;
+      cf.array_base = instr.location();
+   }
+   /* NOTE(review): the original comment at this spot read "This should be 0,
+    * but the address calculation is apparently wrong"; no statement follows
+    * it, so it is unclear which field it refers to. */
+
+
+   if (r600_bytecode_add_output(m_bc, &cf)){
+      R600_ERR("shader_from_nir: Error creating SCRATCH_WR assembly instruction\n");
+      return false;
+   }
+
+   return true;
+}
+
 extern const std::map<ESDOp, int> ds_opcode_map;
 
 bool AssemblyFromShaderLegacyImpl::copy_dst(r600_bytecode_alu_dst& dst,
index 3bf430438c918b286c42111e3a8984a8a831291d..7eb025f46a45b3e7a90e60066041da4c63ae8548 100644 (file)
@@ -330,10 +330,56 @@ bool r600_nir_lower_pack_unpack_2x16(nir_shader *shader)
                                         nullptr);
 };
 
+/* Rewrite the address source of one load_scratch/store_scratch intrinsic:
+ * a right shift of the old address is inserted in front of the intrinsic
+ * and the intrinsic is pointed at the shifted value.
+ *
+ * For a store the address is source 1 and the component count comes from
+ * the stored value (source 0); otherwise the address is source 0 and the
+ * component count comes from the destination. */
+static void
+r600_nir_lower_scratch_address_impl(nir_builder *b, nir_intrinsic_instr *instr)
+{
+   const bool is_store = instr->intrinsic == nir_intrinsic_store_scratch;
+   const int addr_src = is_store ? 1 : 0;
+   const int num_comp = is_store ? instr->src[0].ssa->num_components
+                                 : instr->dest.ssa.num_components;
+
+   b->cursor = nir_before_instr(&instr->instr);
+
+   /* NOTE(review): the shift count is 4 * component count, as in the
+    * original code - confirm that this matches the address unit used by
+    * the scratch lowering. */
+   nir_ssa_def *old_addr = instr->src[addr_src].ssa;
+   nir_ssa_def *shift = nir_imm_int(b, 4 * num_comp);
+   nir_ssa_def *shifted_addr = nir_ishr(b, old_addr, shift);
+
+   nir_instr_rewrite_src(&instr->instr, &instr->src[addr_src],
+                         nir_src_for_ssa(shifted_addr));
+}
+
+/* Run the scratch address lowering on every load_scratch and store_scratch
+ * intrinsic of the shader.
+ * Returns true if at least one intrinsic was rewritten. */
+bool r600_lower_scratch_addresses(nir_shader *shader)
+{
+   bool progress = false;
+   nir_foreach_function(function, shader) {
+      /* A function that is only declared has no implementation to walk. */
+      if (!function->impl)
+         continue;
+
+      nir_builder build;
+      nir_builder_init(&build, function->impl);
+
+      nir_foreach_block(block, function->impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+            nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
+            if (op->intrinsic != nir_intrinsic_load_scratch &&
+                op->intrinsic != nir_intrinsic_store_scratch)
+               continue;
+            /* New instructions are inserted in front of the current one. */
+            r600_nir_lower_scratch_address_impl(&build, op);
+            progress = true;
+         }
+      }
+   }
+   return progress;
+}
+
 }
 
 using r600::r600_nir_lower_int_tg4;
 using r600::r600_nir_lower_pack_unpack_2x16;
+using r600::r600_lower_scratch_addresses;
 using r600::r600_lower_fs_out_to_vector;
 
 int
@@ -463,6 +509,11 @@ int r600_shader_from_nir(struct r600_context *rctx,
    if (optimize)
       while(optimize_once(sel->nir));
 
+   NIR_PASS_V(sel->nir, nir_lower_vars_to_scratch,
+              nir_var_function_temp,
+              100,
+              r600_get_natural_size_align_bytes);
+
    while (optimize && optimize_once(sel->nir));
 
    NIR_PASS_V(sel->nir, nir_lower_locals_to_regs);
@@ -482,6 +533,7 @@ int r600_shader_from_nir(struct r600_context *rctx,
    }
 
    memset(&pipeshader->shader, 0, sizeof(r600_shader));
+   pipeshader->scratch_space_needed = sel->nir->scratch_size;
 
    if (sel->nir->info.stage == MESA_SHADER_TESS_EVAL ||
        sel->nir->info.stage == MESA_SHADER_VERTEX ||
index a0b0d8b7f87854992c87da2e60b6f4c7035f780c..d1f3f2feba03490457b449379b0535e406ee06f6 100644 (file)
@@ -55,12 +55,13 @@ using namespace std;
 
 ShaderFromNirProcessor::ShaderFromNirProcessor(pipe_shader_type ptype,
                                                r600_pipe_shader_selector& sel,
-                                               r600_shader &sh_info):
+                                               r600_shader &sh_info, int scratch_size):
    m_processor_type(ptype),
    m_sh_info(sh_info),
    m_tex_instr(*this),
    m_alu_instr(*this),
    m_pending_else(nullptr),
+   m_scratch_size(scratch_size),
    m_next_hwatomic_loc(0),
    m_sel(sel)
 {
@@ -433,6 +434,10 @@ bool ShaderFromNirProcessor::emit_intrinsic_instruction(nir_intrinsic_instr* ins
          return false;
       }
    }
+   case nir_intrinsic_store_scratch:
+      return emit_store_scratch(instr);
+   case nir_intrinsic_load_scratch:
+      return emit_load_scratch(instr);
    case nir_intrinsic_store_deref:
       return emit_store_deref(instr);
    case nir_intrinsic_load_uniform:
@@ -477,6 +482,47 @@ bool ShaderFromNirProcessor::load_preloaded_value(const nir_dest& dest, int chan
    return true;
 }
 
+/* Emit a WriteScratchInstruction for a store_scratch intrinsic. Source 0 is
+ * the value to store, source 1 the address. A literal address yields the
+ * fixed-location form; any other address is fetched again with
+ * from_nir_with_fetch_constant() and yields the address-based form, which
+ * also gets the scratch size. Flags the shader as needing scratch space and
+ * always returns true. */
+bool ShaderFromNirProcessor::emit_store_scratch(nir_intrinsic_instr* instr)
+{
+   PValue address = from_nir(instr->src[1], 0, 0);
+
+   const unsigned comp_mask = (1 << instr->num_components) - 1;
+   std::unique_ptr<GPRVector> vec(
+      vec_from_nir_with_fetch_constant(instr->src[0], comp_mask,
+                                       swizzle_from_mask(instr->num_components)));
+   GPRVector value(*vec);
+
+   const int writemask = nir_intrinsic_write_mask(instr);
+   const int align = nir_intrinsic_align_mul(instr);
+   const int align_offset = nir_intrinsic_align_offset(instr);
+
+   WriteScratchInstruction *ir = nullptr;
+   if (address->type() != Value::literal) {
+      address = from_nir_with_fetch_constant(instr->src[1], 0);
+      ir = new WriteScratchInstruction(address, value, align, align_offset,
+                                       writemask, m_scratch_size);
+   } else {
+      const auto& lv = dynamic_cast<const LiteralValue&>(*address);
+      ir = new WriteScratchInstruction(lv.value(), value, align, align_offset, writemask);
+   }
+
+   emit_instruction(ir);
+   sh_info().needs_scratch_space = 1;
+   return true;
+}
+
+/* Emit a LoadFromScratch for a load_scratch intrinsic. Source 0 is the
+ * address. Destination channels below the intrinsic's component count map
+ * to themselves, the remaining ones are requested with channel index 7.
+ * A WaitAck(0) is appended to the prelude of the load. Flags the shader as
+ * needing scratch space and always returns true. */
+bool ShaderFromNirProcessor::emit_load_scratch(nir_intrinsic_instr* instr)
+{
+   PValue address = from_nir_with_fetch_constant(instr->src[0], 0);
+
+   std::array<PValue, 4> dst_val;
+   int chan = 0;
+   for (auto& slot : dst_val) {
+      const int src_chan = chan < instr->num_components ? chan : 7;
+      slot = from_nir(instr->dest, src_chan);
+      ++chan;
+   }
+   GPRVector dst(dst_val);
+
+   auto ir = new LoadFromScratch(dst, address, m_scratch_size);
+   ir->prelude_append(new WaitAck(0));
+   emit_instruction(ir);
+
+   sh_info().needs_scratch_space = 1;
+   return true;
+}
+
 GPRVector *ShaderFromNirProcessor::vec_from_nir_with_fetch_constant(const nir_src& src,
                                                                     UNUSED unsigned mask,
                                                                     const GPRVector::Swizzle& swizzle)
index b9184fda2fbfb7ae78f4e50ffe50ebd48d92aac0..3cd9b971e4a4656e0ee3527ad0756448a4d47f91 100644 (file)
@@ -54,7 +54,7 @@ extern SfnLog sfn_log;
 class ShaderFromNirProcessor : public ValuePool {
 public:
    ShaderFromNirProcessor(pipe_shader_type ptype, r600_pipe_shader_selector& sel,
-                          r600_shader& sh_info);
+                          r600_shader& sh_info, int scratch_size);
    virtual ~ShaderFromNirProcessor();
 
    void emit_instruction(Instruction *ir);
@@ -147,6 +147,8 @@ private:
    virtual bool do_emit_load_deref(const nir_variable *in_var, nir_intrinsic_instr* instr) = 0;
    virtual bool do_emit_store_deref(const nir_variable *out_var, nir_intrinsic_instr* instr) = 0;
 
+   bool emit_store_scratch(nir_intrinsic_instr* instr);
+   bool emit_load_scratch(nir_intrinsic_instr* instr);
    virtual void do_finalize() = 0;
 
    void finalize();
@@ -175,6 +177,7 @@ private:
    OutputRegisterMap m_output_register_map;
 
    IfElseInstruction *m_pending_else;
+   int m_scratch_size;
    int m_next_hwatomic_loc;
 
    r600_pipe_shader_selector& m_sel;
index 7b0e4c998d2e4610fb4fd62f88449d54775ba8ac..270cb96d9a4416175a3a08561dd1dded6ad442cf 100644 (file)
@@ -35,7 +35,7 @@ FragmentShaderFromNir::FragmentShaderFromNir(const nir_shader& nir,
                                              r600_shader& sh,
                                              r600_pipe_shader_selector &sel,
                                              const r600_shader_key &key):
-   ShaderFromNirProcessor(PIPE_SHADER_FRAGMENT, sel, sh),
+   ShaderFromNirProcessor(PIPE_SHADER_FRAGMENT, sel, sh, nir.scratch_size),
    m_max_color_exports(MAX2(key.ps.nr_cbufs,1)),
    m_max_counted_color_exports(0),
    m_two_sided_color(key.ps.color_two_side),
index 7eb67f46c8329a6328f0aa7ba690a3c74de3a079..e67cd4638f3a61be96b233c653239eab12676c25 100644 (file)
@@ -39,7 +39,8 @@ using std::priority_queue;
 VertexShaderFromNir::VertexShaderFromNir(r600_pipe_shader *sh,
                                          r600_pipe_shader_selector& sel,
                                          const r600_shader_key& key):
-   ShaderFromNirProcessor (PIPE_SHADER_VERTEX, sel, sh->shader),
+   ShaderFromNirProcessor (PIPE_SHADER_VERTEX, sel, sh->shader,
+                           sh->scratch_space_needed),
    m_num_clip_dist(0),
    m_last_param_export(nullptr),
    m_last_pos_export(nullptr),