Merge remote branch 'origin/master' into pipe-video
author Christian König <deathsimple@vodafone.de>
Thu, 20 Jan 2011 21:10:37 +0000 (22:10 +0100)
committer Christian König <deathsimple@vodafone.de>
Thu, 20 Jan 2011 21:10:37 +0000 (22:10 +0100)
Conflicts:
src/gallium/drivers/r600/r600_asm.c
src/gallium/drivers/r600/r600_shader.c

1  2 
src/gallium/drivers/r600/eg_asm.c
src/gallium/drivers/r600/r600_asm.c
src/gallium/drivers/r600/r600_shader.c
src/gallium/drivers/r600/r600_state.c
src/gallium/drivers/r600/r600_state_inlines.h
src/gallium/drivers/softpipe/sp_screen.c
src/gallium/include/pipe/p_defines.h

index 1881e633d54655cf042c7bbd88e632b085e74b20,67d742b376098890b75c7a4a923fd77dba63197b..4f86e3b4c38d0fbf721c62a06ff0d1c060d379f7
@@@ -36,8 -35,9 +36,10 @@@ int eg_bc_cf_build(struct r600_bc *bc, 
  
        switch (cf->inst) {
        case (EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
+       case (EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
+       case (EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
        case (EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
 +              assert(!end_of_program);
                bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
                        S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
                        S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
index 31f386964a1057d7872a77a67d325045c3e110a5,a473fb29c4d570dba6e4bb06008d0224741ba3be..61de24b31ae699a05d348f71216f51c97fd5a44a
  #define NUM_OF_CYCLES 3
  #define NUM_OF_COMPONENTS 4
  
- static inline unsigned int r600_bc_get_num_operands(struct r600_bc_alu *alu)
 +#define PREV_ALU(alu) LIST_ENTRY(struct r600_bc_alu, alu->list.prev, list)
 +#define NEXT_ALU(alu) LIST_ENTRY(struct r600_bc_alu, alu->list.next, list)
 +
+ static inline unsigned int r600_bc_get_num_operands(struct r600_bc *bc, struct r600_bc_alu *alu)
  {
        if(alu->is_op3)
                return 3;
@@@ -240,74 -254,121 +297,141 @@@ int r600_bc_add_output(struct r600_bc *
        return 0;
  }
  
 -/* alu instructions that can ony exits once per group */
 -static int is_alu_once_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
 +/* alu predicate instructions */
- static int is_alu_pred_inst(struct r600_bc_alu *alu)
- {
-       return !alu->is_op3 && (
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
++static int is_alu_pred_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
+ {
+       switch (bc->chiprev) {
+       case CHIPREV_R600:
+       case CHIPREV_R700:
+               return !alu->is_op3 && (
 -                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
 -                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
 -                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
 -                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
 -                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
 -                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
 -                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
 -                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
 -                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
 -                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
+       case CHIPREV_EVERGREEN:
+       default:
+               return !alu->is_op3 && (
 -                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
 -                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
 -                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
 -                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
 -                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
 -                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
 -                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
 -                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
 -                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
 -                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
+       }
  }
  
- static int is_alu_kill_inst(struct r600_bc_alu *alu)
 +/* alu kill instructions */
-       return !alu->is_op3 && (
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT);
++static int is_alu_kill_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
 +{
- static int is_alu_once_inst(struct r600_bc_alu *alu)
++      switch (bc->chiprev) {
++      case CHIPREV_R600:
++      case CHIPREV_R700:
++              return !alu->is_op3 && (
++                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
++                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
++                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
++                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
++                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
++                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
++                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
++                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
++                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
++                      alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT);
++      case CHIPREV_EVERGREEN:
++      default:
++              return !alu->is_op3 && (
++                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
++                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
++                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
++                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
++                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
++                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
++                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
++                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
++                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
++                      alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT);
++      }
 +}
 +
 +/* alu instructions that can ony exits once per group */
-       return is_alu_kill_inst(alu) ||
-               is_alu_pred_inst(alu);
++static int is_alu_once_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
 +{
- static int is_alu_reduction_inst(struct r600_bc_alu *alu)
++      return is_alu_kill_inst(bc, alu) ||
++              is_alu_pred_inst(bc, alu);
 +}
 +
+ static int is_alu_reduction_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
  {
-       return !alu->is_op3 && (
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
+       switch (bc->chiprev) {
+       case CHIPREV_R600:
+       case CHIPREV_R700:
+               return !alu->is_op3 && (
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
+       case CHIPREV_EVERGREEN:
+       default:
+               return !alu->is_op3 && (
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
+       }
  }
  
- static int is_alu_mova_inst(struct r600_bc_alu *alu)
+ static int is_alu_mova_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
  {
-       return !alu->is_op3 && (
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
-               alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
+       switch (bc->chiprev) {
+       case CHIPREV_R600:
+       case CHIPREV_R700:
+               return !alu->is_op3 && (
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
+       case CHIPREV_EVERGREEN:
+       default:
+               return !alu->is_op3 && (
+                       alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
+       }
  }
  
  /* alu instructions that can only execute on the vector unit */
@@@ -474,31 -569,26 +632,26 @@@ static int is_gpr(unsigned sel
  /* CB constants start at 512, and get translated to a kcache index when ALU
   * clauses are constructed. Note that we handle kcache constants the same way
   * as (the now gone) cfile constants, is that really required? */
- static int is_cb_const(int sel)
+ static int is_cfile(unsigned sel)
  {
-       if (sel > 511 && sel < 4607)
-               return 1;
-       return 0;
+       return (sel > 255 && sel < 512) ||
+               (sel > 511 && sel < 4607) || // Kcache before translate
+               (sel > 127 && sel < 192); // Kcache after translate
  }
 - 
 +
  static int is_const(int sel)
  {
        return is_cfile(sel) ||
-               is_cb_const(sel) ||
 -              (sel >= V_SQ_ALU_SRC_0 && 
 +              (sel >= V_SQ_ALU_SRC_0 &&
                sel <= V_SQ_ALU_SRC_LITERAL);
  }
 - 
 +
- static int check_vector(struct r600_bc_alu *alu, struct alu_bank_swizzle *bs, int bank_swizzle)
+ static int check_vector(struct r600_bc *bc, struct r600_bc_alu *alu,
+                       struct alu_bank_swizzle *bs, int bank_swizzle)
  {
        int r, src, num_src, sel, elem, cycle;
 - 
 +
-       num_src = r600_bc_get_num_operands(alu);
+       num_src = r600_bc_get_num_operands(bc, alu);
        for (src = 0; src < num_src; src++) {
                sel = alu->src[src].sel;
                elem = alu->src[src].chan;
        }
        return 0;
  }
 - 
 +
- static int check_scalar(struct r600_bc_alu *alu, struct alu_bank_swizzle *bs, int bank_swizzle)
+ static int check_scalar(struct r600_bc *bc, struct r600_bc_alu *alu,
+                       struct alu_bank_swizzle *bs, int bank_swizzle)
  {
        int r, src, num_src, const_count, sel, elem, cycle;
 - 
 +
-       num_src = r600_bc_get_num_operands(alu);
+       num_src = r600_bc_get_num_operands(bc, alu);
        for (const_count = 0, src = 0; src < num_src; ++src) {
                sel = alu->src[src].sel;
                elem = alu->src[src].chan;
@@@ -749,9 -846,9 +909,9 @@@ static int merge_inst_groups(struct r60
  {
        struct r600_bc_alu *prev[5];
        struct r600_bc_alu *result[5] = { NULL };
 -      
 +
-       uint32_t literal[4];
-       unsigned nliteral = 0;
+       uint32_t literal[4], prev_literal[4];
+       unsigned nliteral = 0, prev_nliteral = 0;
  
        int i, j, r, src, num_src;
        int num_once_inst = 0;
@@@ -1009,21 -1115,15 +1178,16 @@@ int r600_bc_add_alu_type(struct r600_b
        if (!bc->cf_last->curr_bs_head) {
                bc->cf_last->curr_bs_head = nalu;
        }
-       /* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots)
-        * worst case */
-       if (nalu->last && (bc->cf_last->ndw >> 1) >= 120) {
-               bc->force_add_cf = 1;
-       }
 -      /* number of gpr == the last gpr used in any alu */
 +      /* replace special constants */
        for (i = 0; i < 3; i++) {
 -              if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
 -                      bc->ngpr = nalu->src[i].sel + 1;
 -              }
                if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
                        r600_bc_special_constants(
 -                              nalu->src[i].value[nalu->src[i].chan], 
 +                              nalu->src[i].value[nalu->src[i].chan],
                                &nalu->src[i].sel, &nalu->src[i].neg);
 +
 +              if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
 +                      bc->ngpr = nalu->src[i].sel + 1;
 +              }
        }
        if (nalu->dst.sel >= bc->ngpr) {
                bc->ngpr = nalu->dst.sel + 1;
@@@ -1374,765 -1448,6 +1561,766 @@@ static int r600_bc_cf_build(struct r600
        return 0;
  }
  
- static void notice_alu_src_gprs(struct r600_bc_alu *alu, struct gpr_usage usage[128], int32_t id)
 +struct gpr_usage_range {
 +      int     replacement;
 +      int32_t start;
 +      int32_t end;
 +};
 +
 +struct gpr_usage {
 +      unsigned                channels:4;
 +      int32_t                 first_write;
 +      int32_t                 last_write[4];
 +      unsigned                nranges;
 +      struct gpr_usage_range  *ranges;
 +};
 +
 +static struct gpr_usage_range* add_gpr_usage_range(struct gpr_usage *usage)
 +{
 +      usage->nranges++;
 +      usage->ranges = realloc(usage->ranges, usage->nranges * sizeof(struct gpr_usage_range));
 +      if (!usage->ranges)
 +              return NULL;
 +      return &usage->ranges[usage->nranges-1];
 +}
 +
 +static void notice_gpr_read(struct gpr_usage *usage, int32_t id, unsigned chan)
 +{
 +        usage->channels |= 1 << chan;
 +        usage->first_write = -1;
 +        if (!usage->nranges) {
 +              struct gpr_usage_range* range = add_gpr_usage_range(usage);
 +              range->replacement = -1;
 +                range->start = -1;
 +                range->end = -1;
 +        }
 +        if (usage->ranges[usage->nranges-1].end < id)
 +              usage->ranges[usage->nranges-1].end = id;
 +}
 +
 +static void notice_gpr_rel_read(struct gpr_usage usage[128], int32_t id, unsigned chan)
 +{
 +      unsigned i;
 +      for (i = 0; i < 128; ++i)
 +              notice_gpr_read(&usage[i], id, chan);
 +}
 +
 +static void notice_gpr_last_write(struct gpr_usage *usage, int32_t id, unsigned chan)
 +{
 +        usage->last_write[chan] = id;
 +}
 +
 +static void notice_gpr_write(struct gpr_usage *usage, int32_t id, unsigned chan,
 +                              int predicate, int prefered_replacement)
 +{
 +      int32_t start = usage->first_write != -1 ? usage->first_write : id;
 +      usage->channels &= ~(1 << chan);
 +      if (usage->channels) {
 +              if (usage->first_write == -1)
 +                      usage->first_write = id;
 +      } else if (!usage->nranges || (usage->ranges[usage->nranges-1].start != start && !predicate)) {
 +              usage->first_write = start;
 +              struct gpr_usage_range* range = add_gpr_usage_range(usage);
 +              range->replacement = prefered_replacement;
 +                range->start = start;
 +                range->end = -1;
 +        } else if (usage->ranges[usage->nranges-1].start == start && prefered_replacement != -1) {
 +              usage->ranges[usage->nranges-1].replacement = prefered_replacement;
 +        }
 +        notice_gpr_last_write(usage, id, chan);
 +}
 +
 +static void notice_gpr_rel_last_write(struct gpr_usage usage[128], int32_t id, unsigned chan)
 +{
 +      unsigned i;
 +      for (i = 0; i < 128; ++i)
 +              notice_gpr_last_write(&usage[i], id, chan);
 +}
 +
 +static void notice_gpr_rel_write(struct gpr_usage usage[128], int32_t id, unsigned chan)
 +{
 +      unsigned i;
 +      for (i = 0; i < 128; ++i)
 +              notice_gpr_write(&usage[i], id, chan, 1, -1);
 +}
 +
-       num_src = r600_bc_get_num_operands(alu);
++static void notice_alu_src_gprs(struct r600_bc *bc, struct r600_bc_alu *alu,
++                                struct gpr_usage usage[128], int32_t id)
 +{
 +      unsigned src, num_src;
 +
- static void replace_alu_gprs(struct r600_bc_alu *alu, struct gpr_usage usage[128],
++      num_src = r600_bc_get_num_operands(bc, alu);
 +      for (src = 0; src < num_src; ++src) {
 +              // constants doesn't matter
 +              if (!is_gpr(alu->src[src].sel))
 +                      continue;
 +
 +              if (alu->src[src].rel)
 +                      notice_gpr_rel_read(usage, id, alu->src[src].chan);
 +              else
 +                      notice_gpr_read(&usage[alu->src[src].sel], id, alu->src[src].chan);
 +      }
 +}
 +
 +static void notice_alu_dst_gprs(struct r600_bc_alu *alu_first, struct gpr_usage usage[128],
 +                              int32_t id, int predicate)
 +{
 +      struct r600_bc_alu *alu;
 +      for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bc_alu, alu->list.next, list)) {
 +              if (alu->dst.write) {
 +                      if (alu->dst.rel)
 +                              notice_gpr_rel_write(usage, id, alu->dst.chan);
 +                      else if (alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV && is_gpr(alu->src[0].sel))
 +                              notice_gpr_write(&usage[alu->dst.sel], id, alu->dst.chan,
 +                                              predicate, alu->src[0].sel);
 +                      else
 +                              notice_gpr_write(&usage[alu->dst.sel], id, alu->dst.chan, predicate, -1);
 +              }
 +
 +              if (alu->last)
 +                      break;
 +      }
 +}
 +
 +static void notice_tex_gprs(struct r600_bc_tex *tex, struct gpr_usage usage[128],
 +                              int32_t id, int predicate)
 +{
 +      if (tex->src_rel) {
 +                if (tex->src_sel_x < 4)
 +                      notice_gpr_rel_read(usage, id, tex->src_sel_x);
 +              if (tex->src_sel_y < 4)
 +                      notice_gpr_rel_read(usage, id, tex->src_sel_y);
 +              if (tex->src_sel_z < 4)
 +                      notice_gpr_rel_read(usage, id, tex->src_sel_z);
 +              if (tex->src_sel_w < 4)
 +                      notice_gpr_rel_read(usage, id, tex->src_sel_w);
 +        } else {
 +              if (tex->src_sel_x < 4)
 +                      notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_x);
 +              if (tex->src_sel_y < 4)
 +                      notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_y);
 +              if (tex->src_sel_z < 4)
 +                      notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_z);
 +              if (tex->src_sel_w < 4)
 +                      notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_w);
 +      }
 +      if (tex->dst_rel) {
 +              if (tex->dst_sel_x != 7)
 +                      notice_gpr_rel_write(usage, id, 0);
 +              if (tex->dst_sel_y != 7)
 +                      notice_gpr_rel_write(usage, id, 1);
 +              if (tex->dst_sel_z != 7)
 +                      notice_gpr_rel_write(usage, id, 2);
 +              if (tex->dst_sel_w != 7)
 +                      notice_gpr_rel_write(usage, id, 3);
 +      } else {
 +              if (tex->dst_sel_x != 7)
 +                      notice_gpr_write(&usage[tex->dst_gpr], id, 0, predicate, -1);
 +              if (tex->dst_sel_y != 7)
 +                      notice_gpr_write(&usage[tex->dst_gpr], id, 1, predicate, -1);
 +              if (tex->dst_sel_z != 7)
 +                      notice_gpr_write(&usage[tex->dst_gpr], id, 2, predicate, -1);
 +              if (tex->dst_sel_w != 7)
 +                      notice_gpr_write(&usage[tex->dst_gpr], id, 3, predicate, -1);
 +      }
 +}
 +
 +static void notice_vtx_gprs(struct r600_bc_vtx *vtx, struct gpr_usage usage[128],
 +                              int32_t id, int predicate)
 +{
 +      notice_gpr_read(&usage[vtx->src_gpr], id, vtx->src_sel_x);
 +
 +      if (vtx->dst_sel_x != 7)
 +              notice_gpr_write(&usage[vtx->dst_gpr], id, 0, predicate, -1);
 +      if (vtx->dst_sel_y != 7)
 +              notice_gpr_write(&usage[vtx->dst_gpr], id, 1, predicate, -1);
 +      if (vtx->dst_sel_z != 7)
 +              notice_gpr_write(&usage[vtx->dst_gpr], id, 2, predicate, -1);
 +      if (vtx->dst_sel_w != 7)
 +              notice_gpr_write(&usage[vtx->dst_gpr], id, 3, predicate, -1);
 +}
 +
 +static void notice_export_gprs(struct r600_bc_cf *cf, struct gpr_usage usage[128],
 +                              struct r600_bc_cf *export_cf[128], int32_t export_remap[128])
 +{
 +      //TODO handle other memory operations
 +      struct gpr_usage *output = &usage[cf->output.gpr];
 +      int32_t id = (output->last_write[0] + 0x100) & ~0xFF;
 +
 +      export_cf[cf->output.gpr] = cf;
 +      export_remap[cf->output.gpr] = id;
 +      if (cf->output.swizzle_x < 4)
 +              notice_gpr_read(output, id, cf->output.swizzle_x);
 +      if (cf->output.swizzle_y < 4)
 +              notice_gpr_read(output, id, cf->output.swizzle_y);
 +      if (cf->output.swizzle_z < 4)
 +              notice_gpr_read(output, id, cf->output.swizzle_z);
 +      if (cf->output.swizzle_w < 4)
 +              notice_gpr_read(output, id, cf->output.swizzle_w);
 +}
 +
 +static struct gpr_usage_range *find_src_range(struct gpr_usage *usage, int32_t id)
 +{
 +      unsigned i;
 +      for (i = 0; i < usage->nranges; ++i) {
 +              struct gpr_usage_range* range = &usage->ranges[i];
 +
 +              if (range->start < id && id <= range->end)
 +                      return range;
 +      }
 +      return NULL;
 +}
 +
 +static struct gpr_usage_range *find_dst_range(struct gpr_usage *usage, int32_t id)
 +{
 +      unsigned i;
 +      for (i = 0; i < usage->nranges; ++i) {
 +              struct gpr_usage_range* range = &usage->ranges[i];
 +              int32_t end = range->end;
 +
 +              if (range->start <= id && (id < end || end == -1))
 +                      return range;
 +      }
 +      assert(0); /* should not happen */
 +      return NULL;
 +}
 +
 +static int is_barrier_needed(struct gpr_usage *usage, int32_t id, unsigned chan, int32_t last_barrier)
 +{
 +      if (usage->last_write[chan] != (id & ~0xFF))
 +              return usage->last_write[chan] >= last_barrier;
 +      else
 +              return 0;
 +}
 +
 +static int is_intersection(struct gpr_usage_range* a, struct gpr_usage_range* b)
 +{
 +      return a->start <= b->end && b->start < a->end;
 +}
 +
 +static int rate_replacement(struct gpr_usage *usage, struct gpr_usage_range* range)
 +{
 +      unsigned i;
 +      int32_t best_start = 0x3FFFFFFF, best_end = 0x3FFFFFFF;
 +
 +      for (i = 0; i < usage->nranges; ++i) {
 +              if (usage->ranges[i].replacement != -1)
 +                      continue; /* ignore already remapped ranges */
 +
 +              if (is_intersection(&usage->ranges[i], range))
 +                      return -1; /* forget it if usages overlap */
 +
 +              if (range->start >= usage->ranges[i].end)
 +                      best_start = MIN2(best_start, range->start - usage->ranges[i].end);
 +
 +              if (range->end != -1 && range->end <= usage->ranges[i].start)
 +                      best_end = MIN2(best_end, usage->ranges[i].start - range->end);
 +      }
 +      return best_start + best_end;
 +}
 +
 +static void find_replacement(struct gpr_usage usage[128], unsigned current,
 +                              struct gpr_usage_range *range, int is_export)
 +{     /*
 +       * Pick the register that `range` is renamed to and store it in
 +       * range->replacement (-1 when no candidate is accepted).  Candidates
 +       * are scored by rate_replacement(): -1 rejects a candidate, otherwise
 +       * the lowest score wins and a score of 0 ends the search early.
 +       * Search order: the register already suggested in range->replacement,
 +       * then registers 127..124 if start and end differ only in their low
 +       * 8 bits, then registers 0..current-1.  The winning register gets a
 +       * new usage range with the same start and end as a reservation.
 +       * r600_bc_optimize() passes the register index owning `range` as
 +       * `current`.
 +       */
 +      unsigned i;
 +      int best_gpr = -1, best_rate = 0x7FFFFFFF;
 +      /* a suggested register may itself have been renamed: follow that rename */
 +      if (range->replacement != -1 && range->replacement <= current) {
 +              struct gpr_usage_range *other = find_src_range(&usage[range->replacement], range->start);
 +              if (other && other->replacement != -1)
 +                      range->replacement = other->replacement;
 +      }
 +
 +      if (range->replacement != -1 && range->replacement < current) {
 +              int rate = rate_replacement(&usage[range->replacement], range);
 +
 +              /* check if preferred replacement can be used */
 +              if (rate != -1) {
 +                      best_rate = rate;
 +                      best_gpr = range->replacement;
 +              }
 +      }
 +
 +      if (best_gpr == -1 && (range->start & ~0xFF) == (range->end & ~0xFF)) {
 +              /* register is just used inside one ALU clause */
 +              /* try to use clause temporaries for it */
 +              for (i = 127; i > 123; --i) {
 +                      int rate = rate_replacement(&usage[i], range);
 +
 +                      if (rate == -1) /* can't be used because ranges overlap */
 +                              continue;
 +
 +                      if (rate < best_rate) {
 +                              best_rate = rate;
 +                              best_gpr = i;
 +
 +                              /* can't get better than this; with is_export set the first accepted candidate is kept */
 +                              if (rate == 0 || is_export)
 +                                      break;
 +                      }
 +              }
 +      }
 +      /* otherwise search the registers below `current`, lowest index first */
 +      if (best_gpr == -1) {
 +              for (i = 0; i < current; ++i) {
 +                      int rate = rate_replacement(&usage[i], range);
 +
 +                      if (rate == -1) /* can't be used because ranges overlap */
 +                              continue;
 +
 +                      if (rate < best_rate) {
 +                              best_rate = rate;
 +                              best_gpr = i;
 +
 +                              /* can't get better than this */
 +                              if (rate == 0)
 +                                      break;
 +                      }
 +              }
 +      }
 +      /* store the result (possibly -1) and reserve the range on the chosen register */
 +      range->replacement = best_gpr;
 +      if (best_gpr != -1) {
 +              /* NOTE(review): the result of add_gpr_usage_range() is used unchecked -- confirm it cannot fail */
 +              struct gpr_usage_range *reservation = add_gpr_usage_range(&usage[best_gpr]);
 +              reservation->replacement = -1;
 +              reservation->start = range->start;
 +              reservation->end = range->end;
 +      }
 +}
 +
 +static void find_export_replacement(struct gpr_usage usage[128],
 +                              struct gpr_usage_range *range, struct r600_bc_cf *current,
 +                              struct r600_bc_cf *next, int32_t next_id)
 +{     /*
 +       * Suggest register range->replacement + 1 for the export `next`.
 +       * Nothing happens unless `next` is non-NULL, next_id lies in
 +       * (range->start, range->end], both exports have the same output type
 +       * and next's array_base equals current's array_base + 1.  The
 +       * suggestion is written to the `replacement` field of the usage range
 +       * that find_src_range() returns for next's output register at next_id.
 +       * NOTE(review): that result is dereferenced unchecked, although
 +       * find_replacement() guards the same call against NULL -- confirm it
 +       * cannot be NULL here.
 +       */
 +      if (!next || next_id <= range->start || next_id > range->end)
 +              return;
 +
 +      if (current->output.type != next->output.type)
 +              return;
 +
 +      if ((current->output.array_base + 1) != next->output.array_base)
 +              return;
 +
 +      find_src_range(&usage[next->output.gpr], next_id)->replacement = range->replacement + 1;
 +}
 +
-       num_src = r600_bc_get_num_operands(alu);
++static void replace_alu_gprs(struct r600_bc *bc, struct r600_bc_alu *alu, struct gpr_usage usage[128],
 +                              int32_t id, int32_t last_barrier, unsigned *barrier)
 +{     /*
 +       * Apply the chosen register renames to one ALU instruction.
 +       * Every source operand accepted by is_gpr() is rewritten to the
 +       * replacement of its usage range at `id`, and the result of
 +       * is_barrier_needed() is OR-ed into *barrier.  A written destination
 +       * is renamed as well, and the write is then reported through
 +       * notice_gpr_last_write() (notice_gpr_rel_last_write() for a relative
 +       * destination).  Usage data is always looked up under the register
 +       * number the operand had before renaming.
 +       */
 +      struct gpr_usage *cur_usage;
 +      struct gpr_usage_range *range;
 +      unsigned src, num_src;
 +
- static void optimize_alu_inst(struct r600_bc_cf *cf, struct r600_bc_alu *alu)
++      num_src = r600_bc_get_num_operands(bc, alu);
 +      for (src = 0; src < num_src; ++src) {
 +              // constants don't matter
 +              if (!is_gpr(alu->src[src].sel))
 +                      continue;
 +
 +              cur_usage = &usage[alu->src[src].sel];
 +              range = find_src_range(cur_usage, id);
 +              if (range->replacement != -1)
 +                      alu->src[src].sel = range->replacement;
 +
 +              *barrier |= is_barrier_needed(cur_usage, id, alu->src[src].chan, last_barrier);
 +      }
 +
 +      if (alu->dst.write) {
 +              cur_usage = &usage[alu->dst.sel];
 +              range = find_dst_range(cur_usage, id);
 +              if (range->replacement == alu->dst.sel) { /* r600_bc_optimize() sets this for ranges whose end is -1 */
 +                      if (!alu->is_op3)
 +                              alu->dst.write = 0;
 +                      else
 +                              /*TODO: really check that register 123 is usable */
 +                              alu->dst.sel = 123;
 +              } else if (range->replacement != -1) {
 +                      alu->dst.sel = range->replacement;
 +              }
 +              if (alu->dst.rel)
 +                      notice_gpr_rel_last_write(usage, id, alu->dst.chan);
 +              else
 +                      notice_gpr_last_write(cur_usage, id, alu->dst.chan);
 +      }
 +}
 +
 +static void replace_tex_gprs(struct r600_bc_tex *tex, struct gpr_usage usage[128],
 +                              int32_t id, int32_t last_barrier, unsigned *barrier)
 +{     /*
 +       * Apply the chosen register renames to one texture instruction.
 +       * *barrier is set to 1 for a relative source; otherwise the result of
 +       * is_barrier_needed() is OR-ed in for each source select below 4.
 +       * Source and destination registers are rewritten to the replacement
 +       * of their usage range at `id`.  Every destination select other than
 +       * 7 is reported through notice_gpr_last_write() (or the _rel_ variant
 +       * when dst_rel is set), using the usage entry of the destination
 +       * register as it was before renaming.
 +       * NOTE(review): selects 0..3 presumably name the channels x..w and 7
 +       * a masked component -- confirm against the hardware documentation.
 +       */
 +      struct gpr_usage *cur_usage = &usage[tex->src_gpr];
 +      struct gpr_usage_range *range = find_src_range(cur_usage, id);
 +
 +      if (tex->src_rel) {
 +              *barrier = 1;
 +        } else {
 +              if (tex->src_sel_x < 4)
 +                      *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_x, last_barrier);
 +              if (tex->src_sel_y < 4)
 +                      *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_y, last_barrier);
 +              if (tex->src_sel_z < 4)
 +                      *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_z, last_barrier);
 +              if (tex->src_sel_w < 4)
 +                      *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_w, last_barrier);
 +      }
 +
 +      if (range->replacement != -1)
 +              tex->src_gpr = range->replacement;
 +      /* from here on cur_usage and range describe the destination register */
 +      cur_usage = &usage[tex->dst_gpr];
 +      range = find_dst_range(cur_usage, id);
 +      if (range->replacement != -1)
 +              tex->dst_gpr = range->replacement;
 +
 +      if (tex->dst_rel) {
 +              if (tex->dst_sel_x != 7)
 +                      notice_gpr_rel_last_write(usage, id, tex->dst_sel_x);
 +              if (tex->dst_sel_y != 7)
 +                      notice_gpr_rel_last_write(usage, id, tex->dst_sel_y);
 +              if (tex->dst_sel_z != 7)
 +                      notice_gpr_rel_last_write(usage, id, tex->dst_sel_z);
 +              if (tex->dst_sel_w != 7)
 +                      notice_gpr_rel_last_write(usage, id, tex->dst_sel_w);
 +      } else {
 +              if (tex->dst_sel_x != 7)
 +                      notice_gpr_last_write(cur_usage, id, tex->dst_sel_x);
 +              if (tex->dst_sel_y != 7)
 +                      notice_gpr_last_write(cur_usage, id, tex->dst_sel_y);
 +              if (tex->dst_sel_z != 7)
 +                      notice_gpr_last_write(cur_usage, id, tex->dst_sel_z);
 +              if (tex->dst_sel_w != 7)
 +                      notice_gpr_last_write(cur_usage, id, tex->dst_sel_w);
 +      }
 +}
 +
 +static void replace_vtx_gprs(struct r600_bc_vtx *vtx, struct gpr_usage usage[128],
 +                              int32_t id, int32_t last_barrier, unsigned *barrier)
 +{     /*
 +       * Apply the chosen register renames to one vertex fetch instruction.
 +       * The result of is_barrier_needed() for src_sel_x is OR-ed into
 +       * *barrier; no other source select is checked.  Source and
 +       * destination registers are rewritten to the replacement of their
 +       * usage range at `id`, and every destination select other than 7 is
 +       * reported through notice_gpr_last_write() using the usage entry of
 +       * the destination register as it was before renaming.
 +       * NOTE(review): 7 presumably marks a masked component -- confirm.
 +       */
 +      struct gpr_usage *cur_usage = &usage[vtx->src_gpr];
 +      struct gpr_usage_range *range = find_src_range(cur_usage, id);
 +
 +      *barrier |= is_barrier_needed(cur_usage, id, vtx->src_sel_x, last_barrier);
 +
 +      if (range->replacement != -1)
 +              vtx->src_gpr = range->replacement;
 +      /* from here on cur_usage and range describe the destination register */
 +      cur_usage = &usage[vtx->dst_gpr];
 +      range = find_dst_range(cur_usage, id);
 +      if (range->replacement != -1)
 +              vtx->dst_gpr = range->replacement;
 +
 +      if (vtx->dst_sel_x != 7)
 +              notice_gpr_last_write(cur_usage, id, vtx->dst_sel_x);
 +      if (vtx->dst_sel_y != 7)
 +              notice_gpr_last_write(cur_usage, id, vtx->dst_sel_y);
 +      if (vtx->dst_sel_z != 7)
 +              notice_gpr_last_write(cur_usage, id, vtx->dst_sel_z);
 +      if (vtx->dst_sel_w != 7)
 +              notice_gpr_last_write(cur_usage, id, vtx->dst_sel_w);
 +}
 +
 +static void replace_export_gprs(struct r600_bc_cf *cf, struct gpr_usage usage[128],
 +                              int32_t id, int32_t last_barrier)
 +{     /*
 +       * Apply the chosen register rename to the export `cf` and recompute
 +       * its barrier flag.  cf->barrier is reset to 0 and then OR-ed with
 +       * is_barrier_needed() for every output swizzle below 4; those calls
 +       * receive -1 in place of an instruction id.  `id` is only used to
 +       * look up the usage range whose replacement becomes cf->output.gpr.
 +       * NOTE(review): swizzles 0..3 presumably select the channels x..w --
 +       * confirm against the hardware documentation.
 +       */
 +      //TODO handle other memory operations
 +      struct gpr_usage *cur_usage = &usage[cf->output.gpr];
 +      struct gpr_usage_range *range = find_src_range(cur_usage, id);
 +
 +      cf->barrier = 0;
 +      if (cf->output.swizzle_x < 4)
 +              cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_x, last_barrier);
 +      if (cf->output.swizzle_y < 4)
 +              cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_y, last_barrier);
 +      if (cf->output.swizzle_z < 4)
 +              cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_z, last_barrier);
 +      if (cf->output.swizzle_w < 4)
 +              cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_w, last_barrier);
 +
 +      if (range->replacement != -1)
 +              cf->output.gpr = range->replacement;
 +}
 +
-                       chan = is_alu_reduction_inst(alu) ? 0 : alu->dst.chan;
++static void optimize_alu_inst(struct r600_bc *bc, struct r600_bc_cf *cf, struct r600_bc_alu *alu)
 +{     /*
 +       * Remove `alu` from `cf` when it is a MOV of a register channel onto
 +       * itself without rel/neg/abs modifiers on the source and without
 +       * rel/clamp on the destination.  The MOV is kept when an instruction
 +       * of the following instruction group reads PS, or reads PV in the
 +       * affected channel (channel 0 when is_alu_reduction_inst() is true,
 +       * else the MOV's destination channel).  Instructions other than MOV
 +       * are left untouched.
 +       */
 +      struct r600_bc_alu *alu_next;
 +      unsigned chan;
 +      unsigned src, num_src;
 +
 +      /* check if a MOV could be optimized away */
 +      if (alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV) {
 +
 +              /* destination equals source? */
 +              if (alu->dst.sel != alu->src[0].sel ||
 +                      alu->dst.chan != alu->src[0].chan)
 +                      return;
 +
 +              /* any special handling for the source? */
 +              if (alu->src[0].rel || alu->src[0].neg || alu->src[0].abs)
 +                      return;
 +
 +              /* any special handling for destination? */
 +              if (alu->dst.rel || alu->dst.clamp)
 +                      return;
 +
 +              /* ok find next instruction group and check if ps/pv is used */
 +              for (alu_next = alu; !alu_next->last; alu_next = NEXT_ALU(alu_next));
 +              /* alu_next is now the last instruction of alu's group; scan only if another entry follows in cf->alu */
 +              if (alu_next->list.next != &cf->alu) {
-                               num_src = r600_bc_get_num_operands(alu_next);
++                      chan = is_alu_reduction_inst(bc, alu) ? 0 : alu->dst.chan;
 +                      for (alu_next = NEXT_ALU(alu_next); alu_next; alu_next = NEXT_ALU(alu_next)) {
-                               notice_alu_src_gprs(alu, usage, id);
++                              num_src = r600_bc_get_num_operands(bc, alu_next);
 +                              for (src = 0; src < num_src; ++src) {
 +                                      if (alu_next->src[src].sel == V_SQ_ALU_SRC_PV &&
 +                                              alu_next->src[src].chan == chan)
 +                                              return;
 +
 +                                      if (alu_next->src[src].sel == V_SQ_ALU_SRC_PS)
 +                                              return;
 +                              }
 +                              /* the scan stops at the end of the following group */
 +                              if (alu_next->last)
 +                                      break;
 +                      }
 +              }
 +              /* no PV/PS reader was found (or there is no following group): drop the MOV */
 +              r600_bc_remove_alu(cf, alu);
 +      }
 +}
 +
 +static void optimize_export_inst(struct r600_bc *bc, struct r600_bc_cf *cf)
 +{     /*
 +       * Try to merge the export `cf` with the entry preceding it in bc->cf.
 +       * Nothing happens when `cf` is the first entry, when the two entries
 +       * differ in inst, output type, elem_size or any of the four swizzles,
 +       * or when their summed burst_count exceeds 16.  If prev's gpr and
 +       * array_base plus prev's burst_count equal cf's gpr and array_base,
 +       * cf's burst_count is added to prev and cf is removed; in the mirrored
 +       * case prev's burst_count is added to cf and prev is removed.
 +       */
 +      struct r600_bc_cf *prev = LIST_ENTRY(struct r600_bc_cf, cf->list.prev, list);
 +      if (&prev->list == &bc->cf ||
 +              prev->inst != cf->inst ||
 +              prev->output.type != cf->output.type ||
 +              prev->output.elem_size != cf->output.elem_size ||
 +              prev->output.swizzle_x != cf->output.swizzle_x ||
 +              prev->output.swizzle_y != cf->output.swizzle_y ||
 +              prev->output.swizzle_z != cf->output.swizzle_z ||
 +              prev->output.swizzle_w != cf->output.swizzle_w)
 +              return;
 +
 +      if ((prev->output.burst_count + cf->output.burst_count) > 16)
 +              return;
 +
 +      if ((prev->output.gpr + prev->output.burst_count) == cf->output.gpr &&
 +              (prev->output.array_base + prev->output.burst_count) == cf->output.array_base) {
 +
 +              prev->output.burst_count += cf->output.burst_count;
 +              r600_bc_remove_cf(bc, cf);
 +
 +      } else if (prev->output.gpr == (cf->output.gpr + cf->output.burst_count) &&
 +              prev->output.array_base == (cf->output.array_base + cf->output.burst_count)) {
 +
 +              cf->output.burst_count += prev->output.burst_count;
 +              r600_bc_remove_cf(bc, prev);
 +      }
 +}
 +
 +static void r600_bc_optimize(struct r600_bc *bc)
 +{     /*
 +       * Register renaming and cleanup over the CF list of `bc`; called from
 +       * r600_bc_build() before the CF addresses are computed.  Three passes:
 +       *  1. walk all CF entries and record GPR usage through the notice_*()
 +       *     helpers; any CF_CLASS_OTHER instruction besides JUMP, ELSE,
 +       *     CALL_FS and POP aborts the whole optimization (goto out);
 +       *  2. for registers 0..123 pick a replacement for every usage range
 +       *     and update bc->ngpr;
 +       *  3. walk the CF entries again, rewrite the registers, recompute
 +       *     cf->barrier, remove ALU clauses that became empty and move the
 +       *     exports recorded in export_cf[].
 +       * Instruction ids: an ALU clause advances id once per instruction
 +       * group (alu->last), TEX/VTX clauses once per instruction, and after
 +       * every non-export CF entry id is raised to the next multiple of
 +       * 0x100; export entries do not advance id.
 +       * NOTE(review): barrier[] has bc->nstack elements and is indexed by
 +       * `stack`; this presumably relies on bc->nstack >= 1 and on `stack`
 +       * staying below it -- confirm.
 +       */
 +      struct r600_bc_cf *cf, *next_cf;
 +      struct r600_bc_alu *first, *next_alu;
 +      struct r600_bc_alu *alu;
 +      struct r600_bc_vtx *vtx;
 +      struct r600_bc_tex *tex;
 +      struct gpr_usage usage[128];
 +
 +      /* assume that each gpr is exported only once */
 +      struct r600_bc_cf *export_cf[128] = { NULL };
 +      int32_t export_remap[128];
 +
 +      int32_t id, barrier[bc->nstack];
 +      unsigned i, j, stack, predicate, old_stack;
 +
 +      memset(&usage, 0, sizeof(usage));
 +      for (i = 0; i < 128; ++i) {
 +              usage[i].first_write = -1;
 +              usage[i].last_write[0] = -1;
 +              usage[i].last_write[1] = -1;
 +              usage[i].last_write[2] = -1;
 +              usage[i].last_write[3] = -1;
 +      }
 +
 +      /* first gather some information about the gpr usage */
 +      id = 0; stack = 0;
 +      LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
 +              switch (get_cf_class(cf)) {
 +              case CF_CLASS_ALU:
 +                      predicate = 0;
 +                      first = NULL;
 +                      LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
 +                              if (!first)
 +                                      first = alu;
-                               if (is_alu_pred_inst(alu))
++                              notice_alu_src_gprs(bc, alu, usage, id);
 +                              if (alu->last) {
 +                                      notice_alu_dst_gprs(first, usage, id, predicate || stack > 0);
 +                                      first = NULL;
 +                                      ++id;
 +                              }
-                               replace_alu_gprs(alu, usage, id, barrier[stack], &cf->barrier);
++                              if (is_alu_pred_inst(bc, alu))
 +                                      predicate++;
 +                      }
 +                      if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3)
 +                              stack += predicate;
 +                      else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3)
 +                              stack -= 1;
 +                      else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3)
 +                              stack -= 2;
 +                      break;
 +              case CF_CLASS_TEXTURE:
 +                      LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
 +                              notice_tex_gprs(tex, usage, id++, stack > 0);
 +                      }
 +                      break;
 +              case CF_CLASS_VERTEX:
 +                      LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
 +                              notice_vtx_gprs(vtx, usage, id++, stack > 0);
 +                      }
 +                      break;
 +              case CF_CLASS_EXPORT:
 +                      notice_export_gprs(cf, usage, export_cf, export_remap);
 +                      continue; // don't increment id
 +              case CF_CLASS_OTHER:
 +                      switch (cf->inst) {
 +                      case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
 +                      case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
 +                      case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
 +                              break;
 +
 +                      case V_SQ_CF_WORD1_SQ_CF_INST_POP:
 +                              stack -= cf->pop_count;
 +                              break;
 +
 +                      default:
 +                              // TODO implement loop handling
 +                              goto out;
 +                      }
 +              }
 +              id += 0x100; /* together with the mask below: next multiple of 0x100 */
 +              id &= ~0xFF;
 +      }
 +      assert(stack == 0);
 +
 +      /* try to optimize gpr usage */
 +      for (i = 0; i < 124; ++i) {
 +              for (j = 0; j < usage[i].nranges; ++j) {
 +                      struct gpr_usage_range *range = &usage[i].ranges[j];
 +                      int is_export = export_cf[i] && export_cf[i + 1] &&
 +                              range->start < export_remap[i] &&
 +                              export_remap[i] <= range->end;
 +                      /* start == -1: no rename; end == -1: mark with own number; otherwise search */
 +                      if (range->start == -1)
 +                              range->replacement = -1;
 +                      else if (range->end == -1)
 +                              range->replacement = i;
 +                      else
 +                              find_replacement(usage, i, range, is_export);
 +                      /* presumably tracks the highest register number that stays in use */
 +                      if (range->replacement == -1)
 +                              bc->ngpr = i;
 +                      else if (range->replacement < i && range->replacement > bc->ngpr)
 +                              bc->ngpr = range->replacement;
 +
 +                      if (is_export && range->replacement != -1) {
 +                              find_export_replacement(usage, range, export_cf[i],
 +                                                      export_cf[i + 1], export_remap[i + 1]);
 +                      }
 +              }
 +      }
 +      bc->ngpr++;
 +
 +      /* apply the changes */
 +      for (i = 0; i < 128; ++i) {
 +              usage[i].last_write[0] = -1;
 +              usage[i].last_write[1] = -1;
 +              usage[i].last_write[2] = -1;
 +              usage[i].last_write[3] = -1;
 +      }
 +      barrier[0] = 0;
 +      id = 0; stack = 0;
 +      LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
 +              old_stack = stack;
 +              switch (get_cf_class(cf)) {
 +              case CF_CLASS_ALU:
 +                      predicate = 0;
 +                      first = NULL;
 +                      cf->barrier = 0;
 +                      LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
-                               if (is_alu_pred_inst(alu))
++                              replace_alu_gprs(bc, alu, usage, id, barrier[stack], &cf->barrier);
 +                              if (alu->last)
 +                                      ++id;
 +
-                                       optimize_alu_inst(cf, alu);
++                              if (is_alu_pred_inst(bc, alu))
 +                                      predicate++;
 +
 +                              if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3)
++                                      optimize_alu_inst(bc, cf, alu);
 +                      }
 +                      if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3)
 +                              stack += predicate;
 +                      else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3)
 +                              stack -= 1;
 +                      else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3)
 +                              stack -= 2;
 +                      if (LIST_IS_EMPTY(&cf->alu)) {
 +                              r600_bc_remove_cf(bc, cf);
 +                              cf = NULL; /* marks cf as removed for the barrier check below */
 +                      }
 +                      break;
 +              case CF_CLASS_TEXTURE:
 +                      cf->barrier = 0;
 +                      LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
 +                              replace_tex_gprs(tex, usage, id++, barrier[stack], &cf->barrier);
 +                      }
 +                      break;
 +              case CF_CLASS_VERTEX:
 +                      cf->barrier = 0;
 +                      LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
 +                              replace_vtx_gprs(vtx, usage, id++, barrier[stack], &cf->barrier);
 +                      }
 +                      break;
 +              case CF_CLASS_EXPORT:
 +                      continue; // don't increment id
 +              case CF_CLASS_OTHER:
 +                      if (cf->inst == V_SQ_CF_WORD1_SQ_CF_INST_POP) {
 +                              cf->barrier = 0;
 +                              stack -= cf->pop_count;
 +                      }
 +                      break;
 +              }
 +              /* a barrier on this CF entry becomes the last barrier id of the stack level it started on */
 +              id &= ~0xFF;
 +              if (cf && cf->barrier)
 +                      barrier[old_stack] = id;
 +              /* newly entered stack levels start with the enclosing level's barrier id */
 +              for (i = old_stack + 1; i <= stack; ++i)
 +                      barrier[i] = barrier[old_stack];
 +
 +              id += 0x100;
 +              if (stack != 0) /* ensure exports are placed outside of conditional blocks */
 +                      continue;
 +              /* handle every pending export whose export_remap id is not above the advanced id */
 +              for (i = 0; i < 128; ++i) {
 +                      if (!export_cf[i] || id < export_remap[i])
 +                              continue;
 +                      /* NOTE(review): r600_bc_move_cf() presumably re-inserts the export before next_cf -- confirm */
 +                      r600_bc_move_cf(bc, export_cf[i], next_cf);
 +                      replace_export_gprs(export_cf[i], usage, export_remap[i], barrier[stack]);
 +                      if (export_cf[i]->barrier)
 +                              barrier[stack] = id - 1;
 +                      next_cf = LIST_ENTRY(struct r600_bc_cf, export_cf[i]->list.next, list);
 +                      optimize_export_inst(bc, export_cf[i]);
 +                      export_cf[i] = NULL;
 +              }
 +      }
 +      assert(stack == 0);
 +
 +out:
 +      for (i = 0; i < 128; ++i) {
 +              free(usage[i].ranges);
 +      }
 +}
 +
  int r600_bc_build(struct r600_bc *bc)
  {
        struct r600_bc_cf *cf;
                bc->nstack = 1;
        }
  
 +      r600_bc_optimize(bc);
 +
        /* first path compute addr of each CF block */
        /* addr start after all the CF instructions */
 -      addr = bc->cf_last->id + 2;
 +      addr = LIST_ENTRY(struct r600_bc_cf, bc->cf.prev, list)->id + 2;
        LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
 -              switch (cf->inst) {
 -              case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
 -              case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
 -              case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
 -              case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
 +              switch (get_cf_class(cf)) {
 +              case CF_CLASS_ALU:
-                       nliteral = 0;
-                       LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
-                               r = r600_bc_alu_nliterals(alu, literal, &nliteral);
-                               if (r)
-                                       return r;
-                               if (alu->last) {
-                                       cf->ndw += align(nliteral, 2);
-                                       nliteral = 0;
-                               }
-                       }
                        break;
 -              case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
 -              case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
 -              case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
 +              case CF_CLASS_TEXTURE:
 +              case CF_CLASS_VERTEX:
                        /* fetch node need to be 16 bytes aligned*/
                        addr += 3;
                        addr &= 0xFFFFFFFCUL;
                        r = r600_bc_cf_build(bc, cf);
                if (r)
                        return r;
 -              switch (cf->inst) {
 -              case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
 -              case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
 -              case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
 -              case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
 +              switch (get_cf_class(cf)) {
 +              case CF_CLASS_ALU:
                        nliteral = 0;
+                       memset(literal, 0, sizeof(literal));
                        LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
-                               r = r600_bc_alu_nliterals(alu, literal, &nliteral);
+                               r = r600_bc_alu_nliterals(bc, alu, literal, &nliteral);
                                if (r)
                                        return r;
-                               r600_bc_alu_adjust_literals(alu, literal, nliteral);
+                               r600_bc_alu_adjust_literals(bc, alu, literal, nliteral);
                                switch(bc->chiprev) {
                                case CHIPREV_R600:
                                        r = r600_bc_alu_build(bc, alu, addr);
Simple merge
Simple merge