src/gallium/drivers/nvfx/nvfx_shader.h

   1 #ifndef __NVFX_SHADER_H__
   2 #define __NVFX_SHADER_H__
   3
   4 /* this will resolve to either the NV30 or the NV40 version
   5  * depending on the current hardware */
   6 /* unusual, but very fast and compact method */
     /* NVFX_VP(c) is a branch-free select between NV30_VP_<c> and NV40_VP_<c>:
      *   NV30_VP_<c> + (nvfx->is_nv4x & (NV40_VP_<c> - NV30_VP_<c>))
      * It yields NV30_VP_<c> when is_nv4x is 0, and NV40_VP_<c> only when every
      * bit of the difference is also set in is_nv4x.
      * NOTE(review): this presumably relies on is_nv4x being stored as 0 or ~0
      * (not 0 or 1) -- confirm where the field is assigned.
      * A variable named `nvfx` must be in scope wherever the macro is used; the
      * NV30_VP_* / NV40_VP_* constants are not defined in this part of the file.
      */
   7 #define NVFX_VP(c) ((NV30_VP_##c) + (nvfx->is_nv4x & ((NV40_VP_##c) - (NV30_VP_##c))))
   8
   /* Vertex program instruction slot selectors. The VEC / SCA names match the
    * NVFX_VP_INST_VEC_OP_* and NVFX_VP_INST_SCA_OP_* opcode tables below. */
   9 #define NVFX_VP_INST_SLOT_VEC 0
  10 #define NVFX_VP_INST_SLOT_SCA 1
  11
     /* Vertex program condition codes, 0..7. Numerically LT=1, EQ=2 and GT=4 act
      * as single bits, and LE (3), NE (5), GE (6) are the ORs of those bits.
      * FL (0) and TR (7) are marked as guesses by the original author. */
  12 #define NVFX_VP_INST_COND_FL  0 /* guess */
  13 #define NVFX_VP_INST_COND_LT  1
  14 #define NVFX_VP_INST_COND_EQ  2
  15 #define NVFX_VP_INST_COND_LE  3
  16 #define NVFX_VP_INST_COND_GT  4
  17 #define NVFX_VP_INST_COND_NE  5
  18 #define NVFX_VP_INST_COND_GE  6
  19 #define NVFX_VP_INST_COND_TR  7 /* guess */
  20
  /* Vertex program input attribute indices. Indices 6 and 7 are not assigned
   * here; texture coordinate sets start at index 8.
   * "ARB_v_p" below presumably abbreviates the ARB_vertex_program extension. */
  21 #define NVFX_VP_INST_IN_POS  0    /* These seem to match the bindings specified in */
  22 #define NVFX_VP_INST_IN_WEIGHT  1    /* the ARB_v_p spec (2.14.3.1) */
  23 #define NVFX_VP_INST_IN_NORMAL  2
  24 #define NVFX_VP_INST_IN_COL0  3    /* Should probably confirm them all though */
  25 #define NVFX_VP_INST_IN_COL1  4
  26 #define NVFX_VP_INST_IN_FOGC  5
  27 #define NVFX_VP_INST_IN_TC0  8
  /* Input attribute index of texture coordinate set n (set 0 is index 8).
   * The argument is parenthesized so that expression arguments such as
   * `i << 1` or `a ? b : c` are evaluated as a unit before the addition. */
  #define NVFX_VP_INST_IN_TC(n)  (8 + (n))
  29
  /* Vertex program opcodes for the scalar (SCA) slot.
   * Values 0x08, 0x0A, 0x11 and 0x12 are not assigned in this table. */
  30 #define NVFX_VP_INST_SCA_OP_NOP 0x00
  31 #define NVFX_VP_INST_SCA_OP_MOV 0x01
  32 #define NVFX_VP_INST_SCA_OP_RCP 0x02
  33 #define NVFX_VP_INST_SCA_OP_RCC 0x03
  34 #define NVFX_VP_INST_SCA_OP_RSQ 0x04
  35 #define NVFX_VP_INST_SCA_OP_EXP 0x05
  36 #define NVFX_VP_INST_SCA_OP_LOG 0x06
  37 #define NVFX_VP_INST_SCA_OP_LIT 0x07
  38 #define NVFX_VP_INST_SCA_OP_BRA 0x09
  39 #define NVFX_VP_INST_SCA_OP_CAL 0x0B
  40 #define NVFX_VP_INST_SCA_OP_RET 0x0C
  41 #define NVFX_VP_INST_SCA_OP_LG2 0x0D
  42 #define NVFX_VP_INST_SCA_OP_EX2 0x0E
  43 #define NVFX_VP_INST_SCA_OP_SIN 0x0F
  44 #define NVFX_VP_INST_SCA_OP_COS 0x10
  45
     /* Scalar opcodes carrying the NV40_ prefix instead of NVFX_
      * (presumably NV40-only -- confirm). */
  46 #define NV40_VP_INST_SCA_OP_PUSHA 0x13
  47 #define NV40_VP_INST_SCA_OP_POPA 0x14
  48
  /* Vertex program opcodes for the vector (VEC) slot; 0x00..0x18 are assigned
   * without gaps. */
  49 #define NVFX_VP_INST_VEC_OP_NOP 0x00
  50 #define NVFX_VP_INST_VEC_OP_MOV 0x01
  51 #define NVFX_VP_INST_VEC_OP_MUL 0x02
  52 #define NVFX_VP_INST_VEC_OP_ADD 0x03
  53 #define NVFX_VP_INST_VEC_OP_MAD 0x04
  54 #define NVFX_VP_INST_VEC_OP_DP3 0x05
  55 #define NVFX_VP_INST_VEC_OP_DPH 0x06
  56 #define NVFX_VP_INST_VEC_OP_DP4 0x07
  57 #define NVFX_VP_INST_VEC_OP_DST 0x08
  58 #define NVFX_VP_INST_VEC_OP_MIN 0x09
  59 #define NVFX_VP_INST_VEC_OP_MAX 0x0A
  60 #define NVFX_VP_INST_VEC_OP_SLT 0x0B
  61 #define NVFX_VP_INST_VEC_OP_SGE 0x0C
  62 #define NVFX_VP_INST_VEC_OP_ARL 0x0D
  63 #define NVFX_VP_INST_VEC_OP_FRC 0x0E
  64 #define NVFX_VP_INST_VEC_OP_FLR 0x0F
  65 #define NVFX_VP_INST_VEC_OP_SEQ 0x10
  66 #define NVFX_VP_INST_VEC_OP_SFL 0x11
  67 #define NVFX_VP_INST_VEC_OP_SGT 0x12
  68 #define NVFX_VP_INST_VEC_OP_SLE 0x13
  69 #define NVFX_VP_INST_VEC_OP_SNE 0x14
  70 #define NVFX_VP_INST_VEC_OP_STR 0x15
  71 #define NVFX_VP_INST_VEC_OP_SSG 0x16
  72 #define NVFX_VP_INST_VEC_OP_ARR 0x17
  73 #define NVFX_VP_INST_VEC_OP_ARA 0x18
  74
     /* Vector opcode carrying the NV40_ prefix (presumably NV40-only -- confirm). */
  75 #define NV40_VP_INST_VEC_OP_TXL 0x19
  76
  77 /* DWORD 3 */
     /* Bit 0 of vertex program instruction dword 3; the name suggests it flags
      * the final instruction of a program -- confirm against the emitter. */
  78 #define NVFX_VP_INST_LAST                           (1 << 0)
  79
  80 /*
  81  * Each fragment program instruction appears to consist of four 32-bit words:
  82  *
  83  *   0 - Opcode, output reg/mask, ATTRIB source
  84  *   1 - Source 0
  85  *   2 - Source 1
  86  *   3 - Source 2
  87  *
  88  * There appears to be no special difference between result regs and temp regs.
  89  *     result.color == R0.xyzw
  90  *     result.depth == R1.z
  91  * When the fragprog contains instructions to write depth, NV30_TCL_PRIMITIVE_3D_UNK1D78=0
  92  * otherwise it is set to 1.
  93  *
  94  * Constants are inserted directly after the instruction that uses them.
  95  *
  96  * It appears that it's not possible to use two input registers in one
  97  * instruction as the input sourcing is done in the instruction dword
  98  * and not the source selection dwords.  As such instructions such as:
  99  *
 100  *     ADD result.color, fragment.color, fragment.texcoord[0];
 101  *
 102  * must be split into two MOV's and then an ADD (nvidia does this) but
 103  * I'm not sure why it's not just one MOV and then source the second input
 104  * in the ADD instruction..
 105  *
 106  * Negation of the full source is done with NVFX_FP_REG_NEGATE; negating
 107  * individual components requires multiplication with a constant.
 108  *
 109  * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO/SWIZZLE_ONE
 110  * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as SWIZZLE_ZERO
 111  * is implemented simply by not writing to the relevant components of the destination.
 112  *
 113  * Conditional execution
 114  *   TODO
 115  *
 116  * Non-native instructions:
 117  *   LIT
 118  *   LRP - MAD+MAD
 119  *   SUB - ADD, negate second source
 120  *   RSQ - LG2 + EX2
 121  *   POW - LG2 + MUL + EX2
 122  *   SCS - COS + SIN
 123  *   XPD
 124  *
 125  * NV40 Looping
 126  *   Loops appear to be fairly expensive on NV40 at least, the proprietary
 127  *   driver goes to a lot of effort to avoid using the native looping
 128  *   instructions.  If the total number of *executed* instructions between
 129  *   REP/ENDREP or LOOP/ENDLOOP is <=500, the driver will unroll the loop.
 130  *   The maximum loop count is 255.
 131  *
 132  */
 133
 134 //== Opcode / Destination selection ==
     /* Bit layout of fragment program instruction word 0, as defined below:
      *   bit  0       PROGRAM_END
      *   bits 1..5/6  output register (5-bit mask for NV30, 6-bit for NV40)
      *   bit  7       OUT_REG_HALF
      *   bit  8       COND_WRITE_ENABLE
      *   bits 9..12   output write mask (X, Y, Z, W)
      *   bits 13..16  input source selector
      * The NVFX_FP_OP_OUT_X..W values are already shifted into place, whereas
      * the NVFX_FP_OP_INPUT_SRC_* values are raw field values that still need
      * to be shifted by NVFX_FP_OP_INPUT_SRC_SHIFT.
      */
 135 #define NVFX_FP_OP_PROGRAM_END          (1 << 0)
 136 #define NVFX_FP_OP_OUT_REG_SHIFT        1
 137 #define NV30_FP_OP_OUT_REG_MASK          (31 << 1)  /* uncertain */
 138 #define NV40_FP_OP_OUT_REG_MASK          (63 << 1)
 139 /* Needs to be set when writing outputs to get expected result.. */
 140 #define NVFX_FP_OP_OUT_REG_HALF          (1 << 7)
 141 #define NVFX_FP_OP_COND_WRITE_ENABLE        (1 << 8)
 142 #define NVFX_FP_OP_OUTMASK_SHIFT        9
 143 #define NVFX_FP_OP_OUTMASK_MASK          (0xF << 9)
 144 #  define NVFX_FP_OP_OUT_X  (1<<9)
 145 #  define NVFX_FP_OP_OUT_Y  (1<<10)
 146 #  define NVFX_FP_OP_OUT_Z  (1<<11)
 147 #  define NVFX_FP_OP_OUT_W  (1<<12)
 148 /* Uncertain about these, especially the input_src values.. it's possible that
 149  * they can be dynamically changed.
 150  */
 151 #define NVFX_FP_OP_INPUT_SRC_SHIFT        13
 152 #define NVFX_FP_OP_INPUT_SRC_MASK        (15 << 13)
 153 #  define NVFX_FP_OP_INPUT_SRC_POSITION  0x0
 154 #  define NVFX_FP_OP_INPUT_SRC_COL0  0x1
 155 #  define NVFX_FP_OP_INPUT_SRC_COL1  0x2
 156 #  define NVFX_FP_OP_INPUT_SRC_FOGC  0x3
 157 #  define NVFX_FP_OP_INPUT_SRC_TC0    0x4
 /* Input source selector for texture coordinate set n (set 0 is 0x4).
  * The argument is parenthesized so that expression arguments such as
  * `i << 1` or `a ? b : c` are evaluated as a unit before the addition. */
 #  define NVFX_FP_OP_INPUT_SRC_TC(n)  (0x4 + (n))
 /* Input source selector value with the NV40_ prefix (raw field value, to be
  * shifted by NVFX_FP_OP_INPUT_SRC_SHIFT like the other INPUT_SRC values). */
 159 #  define NV40_FP_OP_INPUT_SRC_FACING  0xE
     /* Remaining fields of instruction word 0:
      *   bits 17..20  texture unit (mask marked as a guess)
      *   bits 22..23  precision, one of NVFX_FP_PRECISION_* (raw field values)
      *   bits 24..29  opcode, one of NVFX_FP_OP_OPCODE_* (raw field values)
      */
 160 #define NVFX_FP_OP_TEX_UNIT_SHIFT        17
 161 #define NVFX_FP_OP_TEX_UNIT_MASK        (0xF << 17) /* guess */
 162 #define NVFX_FP_OP_PRECISION_SHIFT        22
 163 #define NVFX_FP_OP_PRECISION_MASK        (3 << 22)
 164 #   define NVFX_FP_PRECISION_FP32  0
 165 #   define NVFX_FP_PRECISION_FP16  1
 166 #   define NVFX_FP_PRECISION_FX12  2
 167 #define NVFX_FP_OP_OPCODE_SHIFT          24
 168 #define NVFX_FP_OP_OPCODE_MASK          (0x3F << 24)
 169 /* NV30/NV40 fragment program opcodes */
     /* Raw opcode field values (6-bit field, see NVFX_FP_OP_OPCODE_MASK); they
      * are not pre-shifted. Values missing from this table are either listed in
      * the NV30-only / NV40-only tables that follow or are not defined here. */
 170 #define NVFX_FP_OP_OPCODE_NOP 0x00
 171 #define NVFX_FP_OP_OPCODE_MOV 0x01
 172 #define NVFX_FP_OP_OPCODE_MUL 0x02
 173 #define NVFX_FP_OP_OPCODE_ADD 0x03
 174 #define NVFX_FP_OP_OPCODE_MAD 0x04
 175 #define NVFX_FP_OP_OPCODE_DP3 0x05
 176 #define NVFX_FP_OP_OPCODE_DP4 0x06
 177 #define NVFX_FP_OP_OPCODE_DST 0x07
 178 #define NVFX_FP_OP_OPCODE_MIN 0x08
 179 #define NVFX_FP_OP_OPCODE_MAX 0x09
 180 #define NVFX_FP_OP_OPCODE_SLT 0x0A
 181 #define NVFX_FP_OP_OPCODE_SGE 0x0B
 182 #define NVFX_FP_OP_OPCODE_SLE 0x0C
 183 #define NVFX_FP_OP_OPCODE_SGT 0x0D
 184 #define NVFX_FP_OP_OPCODE_SNE 0x0E
 185 #define NVFX_FP_OP_OPCODE_SEQ 0x0F
 186 #define NVFX_FP_OP_OPCODE_FRC 0x10
 187 #define NVFX_FP_OP_OPCODE_FLR 0x11
 188 #define NVFX_FP_OP_OPCODE_KIL 0x12
 189 #define NVFX_FP_OP_OPCODE_PK4B 0x13
 190 #define NVFX_FP_OP_OPCODE_UP4B 0x14
 191 #define NVFX_FP_OP_OPCODE_DDX 0x15 /* can only write XY */
 192 #define NVFX_FP_OP_OPCODE_DDY 0x16 /* can only write XY */
 193 #define NVFX_FP_OP_OPCODE_TEX 0x17
 194 #define NVFX_FP_OP_OPCODE_TXP 0x18
 195 #define NVFX_FP_OP_OPCODE_TXD 0x19
 196 #define NVFX_FP_OP_OPCODE_RCP 0x1A
 197 #define NVFX_FP_OP_OPCODE_EX2 0x1C
 198 #define NVFX_FP_OP_OPCODE_LG2 0x1D
 199 #define NVFX_FP_OP_OPCODE_STR 0x20
 200 #define NVFX_FP_OP_OPCODE_SFL 0x21
 201 #define NVFX_FP_OP_OPCODE_COS 0x22
 202 #define NVFX_FP_OP_OPCODE_SIN 0x23
 203 #define NVFX_FP_OP_OPCODE_PK2H 0x24
 204 #define NVFX_FP_OP_OPCODE_UP2H 0x25
 205 #define NVFX_FP_OP_OPCODE_PK4UB 0x27
 206 #define NVFX_FP_OP_OPCODE_UP4UB 0x28
 207 #define NVFX_FP_OP_OPCODE_PK2US 0x29
 208 #define NVFX_FP_OP_OPCODE_UP2US 0x2A
 209 #define NVFX_FP_OP_OPCODE_DP2A 0x2E
 210 #define NVFX_FP_OP_OPCODE_TXB 0x31
 211 #define NVFX_FP_OP_OPCODE_DIV 0x3A
 212
 213 /* NV30 only fragment program opcodes */
 214 #define NVFX_FP_OP_OPCODE_RSQ_NV30 0x1B
 215 #define NVFX_FP_OP_OPCODE_LIT_NV30 0x1E
 216 #define NVFX_FP_OP_OPCODE_LRP_NV30 0x1F
 217 #define NVFX_FP_OP_OPCODE_POW_NV30 0x26
 218 #define NVFX_FP_OP_OPCODE_RFL_NV30 0x36
 219
 220 /* NV40 only fragment program opcodes */
     /* NOTE(review): TXL_NV40 has the same value (0x31) as
      * NVFX_FP_OP_OPCODE_TXB in the common opcode table -- confirm that the
      * duplicate encoding is intended. */
 221 #define NVFX_FP_OP_OPCODE_TXL_NV40 0x31
 222 /* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/
     /* Branch-type opcode values (NV40_ prefix). They reuse small numbers that
      * also appear in the regular opcode table, so they are only meaningful
      * together with the branch indicator bit described above. */
 223 #define NV40_FP_OP_BRA_OPCODE_BRK                                    0x0
 224 #define NV40_FP_OP_BRA_OPCODE_CAL                                    0x1
 225 #define NV40_FP_OP_BRA_OPCODE_IF                                     0x2
 226 #define NV40_FP_OP_BRA_OPCODE_LOOP                                   0x3
 227 #define NV40_FP_OP_BRA_OPCODE_REP                                    0x4
 228 #define NV40_FP_OP_BRA_OPCODE_RET                                    0x5
 229
 /* Bit 31 flag (the name suggests output saturation -- confirm against the
  * emitter). Written as an unsigned constant: `1 << 31` shifts into the sign
  * bit of a 32-bit int, which is undefined behaviour in C, and the resulting
  * negative value would sign-extend when widened to 64 bits. */
 #define NVFX_FP_OP_OUT_SAT          (1u << 31)
 231
 232 /* high order bits of SRC0 */
     /* Layout defined below:
      *   bit  29      OUT_ABS
      *   bits 21..28  condition swizzle, four 2-bit selectors (X at 21, Y at 23,
      *                Z at 25, W at 27); SWZ_ALL_MASK covers all four
      *   bits 18..20  condition code, one of NVFX_FP_OP_COND_* (raw values)
      */
 233 #define NVFX_FP_OP_OUT_ABS          (1 << 29)
 234 #define NVFX_FP_OP_COND_SWZ_W_SHIFT        27
 235 #define NVFX_FP_OP_COND_SWZ_W_MASK        (3 << 27)
 236 #define NVFX_FP_OP_COND_SWZ_Z_SHIFT        25
 237 #define NVFX_FP_OP_COND_SWZ_Z_MASK        (3 << 25)
 238 #define NVFX_FP_OP_COND_SWZ_Y_SHIFT        23
 239 #define NVFX_FP_OP_COND_SWZ_Y_MASK        (3 << 23)
 240 #define NVFX_FP_OP_COND_SWZ_X_SHIFT        21
 241 #define NVFX_FP_OP_COND_SWZ_X_MASK        (3 << 21)
 242 #define NVFX_FP_OP_COND_SWZ_ALL_SHIFT        21
 243 #define NVFX_FP_OP_COND_SWZ_ALL_MASK        (0xFF << 21)
 244 #define NVFX_FP_OP_COND_SHIFT          18
 245 #define NVFX_FP_OP_COND_MASK          (0x07 << 18)
     /* Same numeric encoding as the NVFX_VP_INST_COND_* vertex program codes. */
 246 #  define NVFX_FP_OP_COND_FL  0
 247 #  define NVFX_FP_OP_COND_LT  1
 248 #  define NVFX_FP_OP_COND_EQ  2
 249 #  define NVFX_FP_OP_COND_LE  3
 250 #  define NVFX_FP_OP_COND_GT  4
 251 #  define NVFX_FP_OP_COND_NE  5
 252 #  define NVFX_FP_OP_COND_GE  6
 253 #  define NVFX_FP_OP_COND_TR  7
 254
 255 /* high order bits of SRC1 */
 /* Bit 31 of the SRC1 word; per the note on the NV40_FP_OP_BRA_OPCODE_* values
  * it appears to mark the instruction as a branch-type instruction.
  * Written as an unsigned constant: `1 << 31` shifts into the sign bit of a
  * 32-bit int, which is undefined behaviour in C. */
 #define NV40_FP_OP_OPCODE_IS_BRANCH                                      (1u << 31)
 /* Destination scale field of the SRC1 word, starting at bit 28. The
  * NVFX_FP_OP_DST_SCALE_* values are raw field values (not pre-shifted);
  * value 4 is not assigned.
  * NOTE(review): the mask (3 << 28) is only two bits wide, but the INV_2X,
  * INV_4X and INV_8X values (5..7) need three bits -- confirm whether the
  * mask should be (7 << 28). */
 257 #define NVFX_FP_OP_DST_SCALE_SHIFT        28
 258 #define NVFX_FP_OP_DST_SCALE_MASK        (3 << 28)
 259 #define NVFX_FP_OP_DST_SCALE_1X                                                0
 260 #define NVFX_FP_OP_DST_SCALE_2X                                                1
 261 #define NVFX_FP_OP_DST_SCALE_4X                                                2
 262 #define NVFX_FP_OP_DST_SCALE_8X                                                3
 263 #define NVFX_FP_OP_DST_SCALE_INV_2X                                            5
 264 #define NVFX_FP_OP_DST_SCALE_INV_4X                                            6
 265 #define NVFX_FP_OP_DST_SCALE_INV_8X                                            7
 266
 267 /* SRC1 LOOP */
     /* Three 8-bit fields: count at bits 2..9, index at bits 10..17 and
      * increment at bits 19..26 (bit 18 is not covered by any of the masks). */
 268 #define NV40_FP_OP_LOOP_INCR_SHIFT                                            19
 269 #define NV40_FP_OP_LOOP_INCR_MASK                                   (0xFF << 19)
 270 #define NV40_FP_OP_LOOP_INDEX_SHIFT                                           10
 271 #define NV40_FP_OP_LOOP_INDEX_MASK                                  (0xFF << 10)
 272 #define NV40_FP_OP_LOOP_COUNT_SHIFT                                            2
 273 #define NV40_FP_OP_LOOP_COUNT_MASK                                   (0xFF << 2)
 274
 275 /* SRC1 IF */
     /* 8-bit field at bits 2..9. */
 276 #define NV40_FP_OP_ELSE_ID_SHIFT                                               2
 277 #define NV40_FP_OP_ELSE_ID_MASK                                      (0xFF << 2)
 278
 279 /* SRC1 CAL */
     /* 8-bit field at bits 2..9. */
 280 #define NV40_FP_OP_IADDR_SHIFT                                                 2
 281 #define NV40_FP_OP_IADDR_MASK                                        (0xFF << 2)
 282
 283 /* SRC1 REP
 284  *   I have no idea why there are 3 count values here..  but they
 285  *   have always been filled with the same value in my tests so
 286  *   far..
 287  */
     /* The three count fields occupy the same bit positions as the LOOP
      * count / index / increment fields above. */
 288 #define NV40_FP_OP_REP_COUNT1_SHIFT                                            2
 289 #define NV40_FP_OP_REP_COUNT1_MASK                                   (0xFF << 2)
 290 #define NV40_FP_OP_REP_COUNT2_SHIFT                                           10
 291 #define NV40_FP_OP_REP_COUNT2_MASK                                  (0xFF << 10)
 292 #define NV40_FP_OP_REP_COUNT3_SHIFT                                           19
 293 #define NV40_FP_OP_REP_COUNT3_MASK                                  (0xFF << 19)
 294
 295 /* SRC2 REP/IF */
     /* 8-bit field at bits 2..9 of the SRC2 word. */
 296 #define NV40_FP_OP_END_ID_SHIFT                                                2
 297 #define NV40_FP_OP_END_ID_MASK                                       (0xFF << 2)
 298
 299 /* high order bits of SRC2 */
     /* INDEX_INPUT is bit 30; the NV40 address index is a 4-bit field at
      * bits 19..22. */
 300 #define NVFX_FP_OP_INDEX_INPUT          (1 << 30)
 301 #define NV40_FP_OP_ADDR_INDEX_SHIFT        19
 302 #define NV40_FP_OP_ADDR_INDEX_MASK        (0xF << 19)
 303
 304 //== Register selection ==
     /* Low-order layout of a fragment program source word, as defined below:
      *   bits 0..1    register type, one of NVFX_FP_REG_TYPE_* (raw values)
      *   bits 2..6/7  register index (5-bit mask for NV30, 6-bit for NV40)
      *   bit  8       SRC_HALF
      *   bits 9..16   swizzle, four 2-bit selectors (X at 9, Y at 11, Z at 13,
      *                W at 15), each one of NVFX_FP_SWIZZLE_* (raw values)
      *   bit  17      NEGATE
      */
 305 #define NVFX_FP_REG_TYPE_SHIFT           0
 306 #define NVFX_FP_REG_TYPE_MASK           (3 << 0)
 307 #  define NVFX_FP_REG_TYPE_TEMP   0
 308 #  define NVFX_FP_REG_TYPE_INPUT  1
 309 #  define NVFX_FP_REG_TYPE_CONST  2
 310 #define NVFX_FP_REG_SRC_SHIFT            2
 311 #define NV30_FP_REG_SRC_MASK              (31 << 2)
 312 #define NV40_FP_REG_SRC_MASK              (63 << 2)
 313 #define NVFX_FP_REG_SRC_HALF            (1 << 8)
 314 #define NVFX_FP_REG_SWZ_ALL_SHIFT        9
 315 #define NVFX_FP_REG_SWZ_ALL_MASK        (255 << 9)
 316 #define NVFX_FP_REG_SWZ_X_SHIFT          9
 317 #define NVFX_FP_REG_SWZ_X_MASK          (3 << 9)
 318 #define NVFX_FP_REG_SWZ_Y_SHIFT          11
 319 #define NVFX_FP_REG_SWZ_Y_MASK          (3 << 11)
 320 #define NVFX_FP_REG_SWZ_Z_SHIFT          13
 321 #define NVFX_FP_REG_SWZ_Z_MASK          (3 << 13)
 322 #define NVFX_FP_REG_SWZ_W_SHIFT          15
 323 #define NVFX_FP_REG_SWZ_W_MASK          (3 << 15)
 324 #  define NVFX_FP_SWIZZLE_X  0
 325 #  define NVFX_FP_SWIZZLE_Y  1
 326 #  define NVFX_FP_SWIZZLE_Z  2
 327 #  define NVFX_FP_SWIZZLE_W  3
 328 #define NVFX_FP_REG_NEGATE          (1 << 17)
 329
 330 #ifndef NVFX_SHADER_NO_FUCKEDNESS
 /* Register file tags for the driver-side register descriptor; presumably the
  * values stored in struct nvfx_sreg's `type` field -- confirm against the
  * callers of nvfx_sr(). These are software tags and differ numerically from
  * the hardware NVFX_FP_REG_TYPE_* field values. */
 331 #define NVFXSR_NONE     0
 332 #define NVFXSR_OUTPUT   1
 333 #define NVFXSR_INPUT    2
 334 #define NVFXSR_TEMP     3
 335 #define NVFXSR_CONST    4
 336
 /* Driver-side description of one shader register operand plus its modifiers.
  * Instances are created by nvfx_sr() and adjusted by value through the
  * nvfx_sr_*() helpers in this header. */
 337 struct nvfx_sreg {
 338         int type;       /* presumably one of NVFXSR_* -- confirm against callers */
 339         int index;      /* register index, as passed to nvfx_sr() */
 340
 341         int dst_scale;  /* nvfx_sr() default is DEF_SCALE; set by nvfx_sr_scale() */
 342
 343         int negate;     /* 0 by default; toggled by nvfx_sr_neg() */
 344         int abs;        /* 0 by default; set to 1 by nvfx_sr_abs() */
 345         int swz[4];     /* per-component selection; nvfx_sr() default is { 0, 1, 2, 3 } */
 346
 347         int cc_update;      /* condition-code fields; nvfx_sr() defaults are 0, 0, */
 348         int cc_update_reg;  /* DEF_CTEST and 0 for the four scalars, and */
 349         int cc_test;        /* { 0, 1, 2, 3 } for cc_swz. Their use is not */
 350         int cc_test_reg;    /* visible in this header. */
 351         int cc_swz[4];
 352 };
 353
 354 static INLINE struct nvfx_sreg
 355 nvfx_sr(int type, int index)
 356 {
 357         struct nvfx_sreg temp = {
 358                 .type = type,
 359                 .index = index,
 360                 .dst_scale = DEF_SCALE,
 361                 .abs = 0,
 362                 .negate = 0,
 363                 .swz = { 0, 1, 2, 3 },
 364                 .cc_update = 0,
 365                 .cc_update_reg = 0,
 366                 .cc_test = DEF_CTEST,
 367                 .cc_test_reg = 0,
 368                 .cc_swz = { 0, 1, 2, 3 },
 369         };
 370         return temp;
 371 }
 372
 373 static INLINE struct nvfx_sreg
 374 nvfx_sr_swz(struct nvfx_sreg src, int x, int y, int z, int w)
 375 {
 376         struct nvfx_sreg dst = src;
 377
 378         dst.swz[SWZ_X] = src.swz[x];
 379         dst.swz[SWZ_Y] = src.swz[y];
 380         dst.swz[SWZ_Z] = src.swz[z];
 381         dst.swz[SWZ_W] = src.swz[w];
 382         return dst;
 383 }
 384
 385 static INLINE struct nvfx_sreg
 386 nvfx_sr_neg(struct nvfx_sreg src)
 387 {
 388         src.negate = !src.negate;
 389         return src;
 390 }
 391
 392 static INLINE struct nvfx_sreg
 393 nvfx_sr_abs(struct nvfx_sreg src)
 394 {
 395         src.abs = 1;
 396         return src;
 397 }
 398
 399 static INLINE struct nvfx_sreg
 400 nvfx_sr_scale(struct nvfx_sreg src, int scale)
 401 {
 402         src.dst_scale = scale;
 403         return src;
 404 }
 405 #endif
 406
 407 #endif