From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 7 Sep 2010 13:40:34 +0000 (+0200)
Subject: nv50: prepare for having multiple functions
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=d91b8865ec2bb41f9b58ad5ce2df7f6f48f98281;p=mesa.git

nv50: prepare for having multiple functions

At some point we'll want to support real subroutines instead of
just inlining them into the main shader.

Since recursive calls are forbidden, we can just save all used
registers to a fixed local memory region and restore them on
return; there is no need for a stack pointer.
---

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index e34c0553eb4..c54f16e4c53 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -304,7 +304,7 @@ nv_pc_pass_in_order(struct nv_basic_block *root, nv_pc_pass_func f, void *priv)
 }
 
 static void
-nv_do_print_program(void *priv, struct nv_basic_block *b)
+nv_do_print_function(void *priv, struct nv_basic_block *b)
 {
    struct nv_instruction *i = b->phi;
 
@@ -323,11 +323,23 @@ nv_do_print_program(void *priv, struct nv_basic_block *b)
 }
 
 void
-nv_print_program(struct nv_basic_block *root)
+nv_print_function(struct nv_basic_block *root)
 {
-   nv_pc_pass_in_order(root, nv_do_print_program, root);
+   if (root->subroutine)
+      debug_printf("SUBROUTINE %i\n", root->subroutine);
+   else
+      debug_printf("MAIN\n");
 
-   debug_printf("END\n\n");
+   nv_pc_pass_in_order(root, nv_do_print_function, root);
+}
+
+void
+nv_print_program(struct nv_pc *pc)
+{
+   int idx = 0, count = pc->num_subroutines + 1; /* number of root[] slots */
+   for (; idx < count; ++idx)
+      if (pc->root[idx])
+         nv_print_function(pc->root[idx]); /* unset slots are skipped */
 }
 
 static INLINE void
@@ -388,11 +400,18 @@ nv50_generate_code(struct nv50_translation_info *ti)
    if (!pc)
       return 1;
 
+   pc->root = CALLOC(ti->subr_nr + 1, sizeof(pc->root[0]));
+   if (!pc->root) {
+      FREE(pc);
+      return 1;
+   }
+   pc->num_subroutines = ti->subr_nr;
+
    ret = nv50_tgsi_to_nc(pc, ti);
    if (ret)
       goto out;
 #ifdef NV50PC_DEBUG
-   nv_print_program(pc->root);
+   nv_print_program(pc);
 #endif
 
    /* optimization */
@@ -400,7 +419,7 @@ nv50_generate_code(struct nv50_translation_info *ti)
    if (ret)
       goto out;
 #ifdef NV50PC_DEBUG
-   nv_print_program(pc->root);
+   nv_print_program(pc);
 #endif
 
    /* register allocation */
@@ -408,7 +427,7 @@ nv50_generate_code(struct nv50_translation_info *ti)
    if (ret)
       goto out;
 #ifdef NV50PC_DEBUG
-   nv_print_program(pc->root);
+   nv_print_program(pc);
 #endif
 
    /* prepare for emission */
@@ -441,16 +460,19 @@ nv50_generate_code(struct nv50_translation_info *ti)
 
 out:
    nv_pc_free_refs(pc);
-   if (ret) {
+
+   if (pc->bb_list)
+      FREE(pc->bb_list);
+   FREE(pc->root); /* CALLOC'ed on entry (non-NULL here), owned by pc only */
+   if (ret) { /* on success, these will be referenced by nv50_program */
       if (pc->emit)
-         free(pc->emit);
+         FREE(pc->emit);
       if (pc->immd_buf)
-         free(pc->immd_buf);
+         FREE(pc->immd_buf);
       if (pc->fixups)
-         free(pc->fixups);
+         FREE(pc->fixups);
    }
-   free(pc);
-
+   FREE(pc);
    return ret;
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index 703d32d334e..d9cc775572e 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -282,7 +282,7 @@ struct nv_basic_block {
    ubyte in_kind[8];
 
    int id;
-   struct nv_basic_block *last_visitor;
+   int subroutine;
    uint priv;
    uint pass_seq;
 
@@ -314,10 +314,10 @@ nv_fixup_apply(uint32_t *bin, struct nv_fixup *fixup, uint32_t data)
    bin[fixup->offset / 4] = val;
 }
 
-struct nv_pc {
-   struct nv50_translation_info *ti;
+struct nv50_translation_info;
 
-   struct nv_basic_block *root;
+struct nv_pc {
+   struct nv_basic_block **root;
    struct nv_basic_block *current_block;
    struct nv_basic_block *parent_block;
 
@@ -332,6 +332,7 @@ struct nv_pc {
    int num_instructions;
    int num_refs;
    int num_blocks;
+   int num_subroutines;
 
    int max_reg[4];
 
@@ -463,7 +464,8 @@ void nv_print_instruction(struct nv_instruction *);
 
 /* nv50_pc.c */
 
-void nv_print_program(struct nv_basic_block *b);
+void nv_print_function(struct nv_basic_block *root);
+void nv_print_program(struct nv_pc *);
 
 boolean nv_op_commutative(uint opcode);
 int nv50_indirect_opnd(struct nv_instruction *);
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 1ed50321754..4f5bdc1f9fb 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -213,23 +213,36 @@ nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b)
    pc->bin_size += b->bin_size *= 4;
 }
 
-int
-nv_pc_exec_pass2(struct nv_pc *pc)
+static int
+nv_pc_pass2(struct nv_pc *pc, struct nv_basic_block *root)
 {
    struct nv_pass pass;
 
    pass.pc = pc;
 
    pc->pass_seq++;
-   nv_pass_flatten(&pass, pc->root);
+
+   nv_pass_flatten(&pass, root);
+
+   nv_pc_pass_in_order(root, nv_pc_pass_pre_emission, pc);
+
+   return 0;
+}
+
+int
+nv_pc_exec_pass2(struct nv_pc *pc)
+{
+   int i, ret;
 
    NV50_DBGMSG("preparing %u blocks for emission\n", pc->num_blocks);
 
-   pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *));
-   pc->num_blocks = 0;
+   pc->bb_list = CALLOC(pc->num_blocks, sizeof(pc->bb_list[0]));
 
-   nv_pc_pass_in_order(pc->root, nv_pc_pass_pre_emission, pc);
+   pc->num_blocks = 0;
 
+   for (i = 0; i < pc->num_subroutines + 1; ++i)
+      if (pc->root[i] && (ret = nv_pc_pass2(pc, pc->root[i])))
+         return ret;
    return 0;
 }
 
@@ -1032,8 +1045,8 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b)
    return 0;
 }
 
-int
-nv_pc_exec_pass0(struct nv_pc *pc)
+static int
+nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
 {
    struct nv_pass_reld_elim *reldelim;
    struct nv_pass pass;
@@ -1047,35 +1060,35 @@ nv_pc_exec_pass0(struct nv_pc *pc)
     * to whether sources are supported memory loads.
     */
    pc->pass_seq++;
-   ret = nv_pass_lower_arith(&pass, pc->root);
+   ret = nv_pass_lower_arith(&pass, root);
    if (ret)
       return ret;
 
    pc->pass_seq++;
-   ret = nv_pass_fold_loads(&pass, pc->root);
+   ret = nv_pass_fold_loads(&pass, root);
    if (ret)
       return ret;
 
    pc->pass_seq++;
-   ret = nv_pass_fold_stores(&pass, pc->root);
+   ret = nv_pass_fold_stores(&pass, root);
    if (ret)
       return ret;
 
    reldelim = CALLOC_STRUCT(nv_pass_reld_elim);
    reldelim->pc = pc;
    pc->pass_seq++;
-   ret = nv_pass_reload_elim(reldelim, pc->root);
+   ret = nv_pass_reload_elim(reldelim, root);
    FREE(reldelim);
    if (ret)
       return ret;
 
    pc->pass_seq++;
-   ret = nv_pass_cse(&pass, pc->root);
+   ret = nv_pass_cse(&pass, root);
    if (ret)
       return ret;
 
    pc->pass_seq++;
-   ret = nv_pass_lower_mods(&pass, pc->root);
+   ret = nv_pass_lower_mods(&pass, root);
    if (ret)
       return ret;
 
@@ -1083,14 +1096,25 @@ nv_pc_exec_pass0(struct nv_pc *pc)
    do {
       dce.removed = 0;
       pc->pass_seq++;
-      ret = nv_pass_dce(&dce, pc->root);
+      ret = nv_pass_dce(&dce, root);
       if (ret)
          return ret;
    } while (dce.removed);
 
-   ret = nv_pass_tex_mask(&pass, pc->root);
+   ret = nv_pass_tex_mask(&pass, root);
    if (ret)
       return ret;
 
    return ret;
 }
+
+int
+nv_pc_exec_pass0(struct nv_pc *pc)
+{
+   int idx, err = 0; /* err: first non-zero result of a per-function run */
+   for (idx = 0; !err && idx <= pc->num_subroutines; ++idx) {
+      if (pc->root[idx])
+         err = nv_pc_pass0(pc, pc->root[idx]);
+   }
+   return err;
+}
diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
index d401706b5bc..2998343db52 100644
--- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c
+++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
@@ -874,8 +874,8 @@ pass_linear_scan(struct nv_pc_pass *ctx, int iter)
    return 0;
 }
 
-int
-nv_pc_exec_pass1(struct nv_pc *pc)
+static int
+nv_pc_pass1(struct nv_pc *pc, struct nv_basic_block *root)
 {
    struct nv_pc_pass *ctx;
    int i, ret;
@@ -890,12 +890,12 @@ nv_pc_exec_pass1(struct nv_pc *pc)
    ctx->insns = CALLOC(NV_PC_MAX_INSTRUCTIONS, sizeof(struct nv_instruction *));
 
    pc->pass_seq++;
-   ret = pass_generate_phi_movs(ctx, pc->root);
+   ret = pass_generate_phi_movs(ctx, root);
    assert(!ret);
 
    for (i = 0; i < pc->loop_nesting_bound; ++i) {
       pc->pass_seq++;
-      ret = pass_build_live_sets(ctx, pc->root);
+      ret = pass_build_live_sets(ctx, root);
       assert(!ret && "live sets");
       if (ret) {
          NOUVEAU_ERR("failed to build live sets (iteration %d)\n", i);
@@ -904,10 +904,10 @@ nv_pc_exec_pass1(struct nv_pc *pc)
    }
 
    pc->pass_seq++;
-   nv_pc_pass_in_order(pc->root, pass_order_instructions, ctx);
+   nv_pc_pass_in_order(root, pass_order_instructions, ctx);
 
    pc->pass_seq++;
-   ret = pass_build_intervals(ctx, pc->root);
+   ret = pass_build_intervals(ctx, root);
    assert(!ret && "build intervals");
    if (ret) {
       NOUVEAU_ERR("failed to build live intervals\n");
@@ -944,3 +944,14 @@ out:
    FREE(ctx);
    return ret;
 }
+
+int
+nv_pc_exec_pass1(struct nv_pc *pc)
+{
+   int fn = 0, status = 0;
+   /* run nv_pc_pass1 on every set root; bail out on the first failure */
+   for (; fn <= pc->num_subroutines && !status; ++fn)
+      if (pc->root[fn])
+         status = nv_pc_pass1(pc, pc->root[fn]);
+   return status;
+}
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index d7d3030e2f6..925028700cd 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -147,10 +147,17 @@ prog_inst(struct nv50_translation_info *ti,
    int s, c, k;
    unsigned mask;
 
+   if (inst->Instruction.Opcode == TGSI_OPCODE_BGNSUB) {
+      ti->subr[ti->subr_nr].pos = id - 1;
+      ti->subr[ti->subr_nr].id = ti->subr_nr + 1; /* id 0 is main program */
+      ++ti->subr_nr;
+   }
+
    if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
+      dst = &inst->Dst[0].Register;
+
       for (c = 0; c < 4; ++c) {
-         dst = &inst->Dst[0].Register;
-         if (inst->Dst[0].Register.Indirect)
+         if (dst->Indirect)
             nv50_indirect_outputs(ti, id);
          if (!(dst->WriteMask & (1 << c)))
             continue;
@@ -182,6 +189,44 @@ prog_inst(struct nv50_translation_info *ti,
    }
 }
 
+/* Guess a subroutine's TEMP inputs (argv: read before written) and outputs
+ * (retv: written); masks are indexed [Index / 32][component], bit Index % 32.
+ * A struct tgsi_function_declaration would make this guessing unnecessary. */
+static void
+prog_subroutine_inst(struct nv50_subroutine *subr,
+                     const struct tgsi_full_instruction *inst)
+{
+   const struct tgsi_dst_register *dst;
+   const struct tgsi_src_register *src;
+   int s, c, k;
+   unsigned mask;
+
+   for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) {
+      src = &inst->Src[s].Register;
+      if (src->File != TGSI_FILE_TEMPORARY)
+         continue;
+      mask = nv50_tgsi_src_mask(inst, s);
+
+      assert(!src->Indirect);
+
+      for (c = 0; c < 4; ++c) {
+         k = tgsi_util_get_full_src_register_swizzle(&inst->Src[s], c);
+
+         if ((mask & (1 << c)) && k <= TGSI_SWIZZLE_W) /* W counts too */
+            if (!(subr->retv[src->Index / 32][k] & (1u << (src->Index % 32))))
+               subr->argv[src->Index / 32][k] |= 1u << (src->Index % 32);
+      }
+   }
+
+   if (inst->Dst[0].Register.File == TGSI_FILE_TEMPORARY) {
+      dst = &inst->Dst[0].Register;
+
+      for (c = 0; c < 4; ++c)
+         if (dst->WriteMask & (1 << c))
+            subr->retv[dst->Index / 32][c] |= 1u << (dst->Index % 32);
+   }
+}
+
 static void
 prog_immediate(struct nv50_translation_info *ti,
                const struct tgsi_full_immediate *imm)
@@ -482,7 +527,7 @@ nv50_prog_scan(struct nv50_translation_info *ti)
 {
    struct nv50_program *p = ti->p;
    struct tgsi_parse_context parse;
-   int ret;
+   int ret, i;
 
    p->vp.edgeflag = 0x40;
    p->vp.psiz = 0x40;
@@ -496,6 +541,9 @@ nv50_prog_scan(struct nv50_translation_info *ti)
    tgsi_dump(p->pipe.tokens, 0);
 #endif
 
+   ti->subr =
+      CALLOC(ti->scan.opcode_count[TGSI_OPCODE_BGNSUB], sizeof(ti->subr[0]));
+
    ti->immd32 = (uint32_t *)MALLOC(ti->scan.immediate_count * 16);
    ti->immd32_ty = (ubyte *)MALLOC(ti->scan.immediate_count * sizeof(ubyte));
 
@@ -519,6 +567,13 @@ nv50_prog_scan(struct nv50_translation_info *ti)
       }
    }
 
+   /* Scan to determine which registers are inputs/outputs of a subroutine. */
+   for (i = 0; i < ti->subr_nr; ++i) {
+      int pc = ti->subr[i].pos; /* set at BGNSUB; .id is only the ordinal */
+      while (ti->insns[pc].Instruction.Opcode != TGSI_OPCODE_ENDSUB)
+         prog_subroutine_inst(&ti->subr[i], &ti->insns[pc++]);
+   }
+
    p->in_nr = ti->scan.file_max[TGSI_FILE_INPUT] + 1;
    p->out_nr = ti->scan.file_max[TGSI_FILE_OUTPUT] + 1;
 
@@ -572,6 +627,8 @@ out:
       FREE(ti->immd32_ty);
    if (ti->insns)
       FREE(ti->insns);
+   if (ti->subr)
+      FREE(ti->subr);
    FREE(ti);
    return ret ? FALSE : TRUE;
 }
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 3c3f1f7f970..918baf325f5 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -27,6 +27,8 @@
 #include "tgsi/tgsi_scan.h"
 #include "nouveau/nouveau_class.h"
 
+#define NV50_CAP_MAX_PROGRAM_TEMPS (128 / 4)
+
 struct nv50_varying {
    uint8_t id; /* tgsi index */
    uint8_t hw; /* hw index, nv50 wants flat FP inputs last */
@@ -92,13 +94,13 @@ struct nv50_program {
 #define NV50_INTERP_FLAT     (1 << 1)
 #define NV50_INTERP_CENTROID (1 << 2)
 
-#define NV50_PROG_MAX_SUBROUTINES 8
-
 /* analyze TGSI and see which TEMP[] are used as subroutine inputs/outputs */
 struct nv50_subroutine {
-   int id;
-   uint32_t argv[4][1]; /* 4 bitmasks, for each of xyzw, only allow 32 TEMPs */
-   uint32_t retv[4][1];
+   unsigned id;
+   unsigned pos;
+   /* function inputs and outputs */
+   uint32_t argv[NV50_CAP_MAX_PROGRAM_TEMPS][4];
+   uint32_t retv[NV50_CAP_MAX_PROGRAM_TEMPS][4];
 };
 
 struct nv50_translation_info {
@@ -119,8 +121,8 @@ struct nv50_translation_info {
    unsigned immd32_nr;
    ubyte *immd32_ty;
    ubyte edgeflag_out;
-   struct nv50_subroutine subr[NV50_PROG_MAX_SUBROUTINES];
-   int subr_nr;
+   struct nv50_subroutine *subr;
+   unsigned subr_nr;
 };
 
 int nv50_generate_code(struct nv50_translation_info *ti);
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index fc75d81d549..c1efa443daf 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -26,6 +26,7 @@
 #include "nv50_context.h"
 #include "nv50_screen.h"
 #include "nv50_resource.h"
+#include "nv50_program.h"
 
 #include "nouveau/nouveau_stateobj.h"
 
@@ -152,7 +153,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 		return 0;
 	case PIPE_CAP_MAX_VS_TEMPS:
 	case PIPE_CAP_MAX_FS_TEMPS: /* no spilling atm */
-		return 128 / 4;
+		return NV50_CAP_MAX_PROGRAM_TEMPS;
 	case PIPE_CAP_DEPTH_CLAMP:
 		return 1;
 	default:
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 386dbda423d..dea8fa0663e 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1850,7 +1850,7 @@ nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti)
    struct bld_context *bld = CALLOC_STRUCT(bld_context);
    int c;
 
-   pc->root = pc->current_block = new_basic_block(pc);
+   pc->root[0] = pc->current_block = new_basic_block(pc);
 
    bld->pc = pc;
    bld->ti = ti;