191131a40a1738bd72ee1db53d368ca38deef77f
[mesa.git] / src / gallium / drivers / nvfx / nvfx_shader.h
1 #ifndef __NVFX_SHADER_H__
2 #define __NVFX_SHADER_H__
3
/*
 * NVFX_VP(c) resolves to either the NV30 or the NV40 version of the vertex
 * program constant <c> (NV30_VP_<c> / NV40_VP_<c>), depending on the current
 * hardware.
 *
 * This is an unusual, but very fast and compact, branch-free method.  It only
 * yields the NV40 value when every bit of (NV40_VP_<c> - NV30_VP_<c>) is also
 * set in nvfx->is_nv4x (e.g. an all-ones mask); with is_nv4x == 0 it yields
 * the NV30 value.  A plain 0/1 flag is therefore not sufficient in general.
 *
 * A pointer named `nvfx` with an `is_nv4x` member must be in scope wherever
 * the macro is expanded.
 */
#define NVFX_VP(c) ((NV30_VP_##c) + (nvfx->is_nv4x & ((NV40_VP_##c) - (NV30_VP_##c))))

/* Vertex program instruction slot identifiers (vector / scalar). */
#define NVFX_VP_INST_SLOT_VEC 0
#define NVFX_VP_INST_SLOT_SCA 1

/* Vertex program condition codes.  FL and TR are guesses. */
#define NVFX_VP_INST_COND_FL 0 /* guess */
#define NVFX_VP_INST_COND_LT 1
#define NVFX_VP_INST_COND_EQ 2
#define NVFX_VP_INST_COND_LE 3
#define NVFX_VP_INST_COND_GT 4
#define NVFX_VP_INST_COND_NE 5
#define NVFX_VP_INST_COND_GE 6
#define NVFX_VP_INST_COND_TR 7 /* guess */

/*
 * Vertex program input attribute indices.  These seem to match the bindings
 * specified in the ARB_v_p spec (2.14.3.1).  Should probably confirm them all
 * though.
 */
#define NVFX_VP_INST_IN_POS    0
#define NVFX_VP_INST_IN_WEIGHT 1
#define NVFX_VP_INST_IN_NORMAL 2
#define NVFX_VP_INST_IN_COL0   3
#define NVFX_VP_INST_IN_COL1   4
#define NVFX_VP_INST_IN_FOGC   5
#define NVFX_VP_INST_IN_TC0    8
/*
 * Index of texture coordinate set n (NVFX_VP_INST_IN_TC0 + n).  The argument
 * is parenthesised so that an arbitrary expression can be passed safely.
 */
#define NVFX_VP_INST_IN_TC(n)  (8 + (n))

/*
 * Vertex program scalar opcodes.  Values 0x08 and 0x0A are not named in this
 * list.
 */
#define NVFX_VP_INST_SCA_OP_NOP 0x00
#define NVFX_VP_INST_SCA_OP_MOV 0x01
#define NVFX_VP_INST_SCA_OP_RCP 0x02
#define NVFX_VP_INST_SCA_OP_RCC 0x03
#define NVFX_VP_INST_SCA_OP_RSQ 0x04
#define NVFX_VP_INST_SCA_OP_EXP 0x05
#define NVFX_VP_INST_SCA_OP_LOG 0x06
#define NVFX_VP_INST_SCA_OP_LIT 0x07
#define NVFX_VP_INST_SCA_OP_BRA 0x09
#define NVFX_VP_INST_SCA_OP_CAL 0x0B
#define NVFX_VP_INST_SCA_OP_RET 0x0C
#define NVFX_VP_INST_SCA_OP_LG2 0x0D
#define NVFX_VP_INST_SCA_OP_EX2 0x0E
#define NVFX_VP_INST_SCA_OP_SIN 0x0F
#define NVFX_VP_INST_SCA_OP_COS 0x10

/* Scalar opcodes carrying the NV40 prefix. */
#define NV40_VP_INST_SCA_OP_PUSHA 0x13
#define NV40_VP_INST_SCA_OP_POPA  0x14

/* Vertex program vector opcodes. */
#define NVFX_VP_INST_VEC_OP_NOP 0x00
#define NVFX_VP_INST_VEC_OP_MOV 0x01
#define NVFX_VP_INST_VEC_OP_MUL 0x02
#define NVFX_VP_INST_VEC_OP_ADD 0x03
#define NVFX_VP_INST_VEC_OP_MAD 0x04
#define NVFX_VP_INST_VEC_OP_DP3 0x05
#define NVFX_VP_INST_VEC_OP_DPH 0x06
#define NVFX_VP_INST_VEC_OP_DP4 0x07
#define NVFX_VP_INST_VEC_OP_DST 0x08
#define NVFX_VP_INST_VEC_OP_MIN 0x09
#define NVFX_VP_INST_VEC_OP_MAX 0x0A
#define NVFX_VP_INST_VEC_OP_SLT 0x0B
#define NVFX_VP_INST_VEC_OP_SGE 0x0C
#define NVFX_VP_INST_VEC_OP_ARL 0x0D
#define NVFX_VP_INST_VEC_OP_FRC 0x0E
#define NVFX_VP_INST_VEC_OP_FLR 0x0F
#define NVFX_VP_INST_VEC_OP_SEQ 0x10
#define NVFX_VP_INST_VEC_OP_SFL 0x11
#define NVFX_VP_INST_VEC_OP_SGT 0x12
#define NVFX_VP_INST_VEC_OP_SLE 0x13
#define NVFX_VP_INST_VEC_OP_SNE 0x14
#define NVFX_VP_INST_VEC_OP_STR 0x15
#define NVFX_VP_INST_VEC_OP_SSG 0x16
#define NVFX_VP_INST_VEC_OP_ARR 0x17
#define NVFX_VP_INST_VEC_OP_ARA 0x18

/* Vector opcode carrying the NV40 prefix. */
#define NV40_VP_INST_VEC_OP_TXL 0x19

/* DWORD 3 */
#define NVFX_VP_INST_LAST (1 << 0)

/*
 * Each fragment program opcode appears to be composed of 4 32-bit values.
 *
 *     0 - Opcode, output reg/mask, ATTRIB source
 *     1 - Source 0
 *     2 - Source 1
 *     3 - Source 2
 *
 * There appears to be no special difference between result regs and temp regs.
 *     result.color == R0.xyzw
 *     result.depth == R1.z
 * When the fragprog contains instructions to write depth,
 * NV30_TCL_PRIMITIVE_3D_UNK1D78=0, otherwise it is set to 1.
 *
 * Constants are inserted directly after the instruction that uses them.
 *
 * It appears that it's not possible to use two input registers in one
 * instruction, as the input sourcing is done in the instruction dword
 * and not the source selection dwords.  As a result, instructions such as:
 *
 *     ADD result.color, fragment.color, fragment.texcoord[0];
 *
 * must be split into two MOVs and then an ADD (nvidia does this), but
 * I'm not sure why it's not just one MOV and then source the second input
 * in the ADD instruction.
 *
 * Negation of the full source is done with NV30_FP_REG_NEGATE; arbitrary
 * negation requires multiplication with a const.
 *
 * Arbitrary swizzling is supported with the exception of
 * SWIZZLE_ZERO/SWIZZLE_ONE.  The temp/result regs appear to be initialised
 * to (0.0, 0.0, 0.0, 0.0), as SWIZZLE_ZERO is implemented simply by not
 * writing to the relevant components of the destination.
 *
 * Conditional execution
 *     TODO
 *
 * Non-native instructions:
 *     LIT
 *     LRP - MAD+MAD
 *     SUB - ADD, negate second source
 *     RSQ - LG2 + EX2
 *     POW - LG2 + MUL + EX2
 *     SCS - COS + SIN
 *     XPD
 *
 * NV40 Looping
 *     Loops appear to be fairly expensive on NV40 at least; the proprietary
 *     driver goes to a lot of effort to avoid using the native looping
 *     instructions.  If the total number of *executed* instructions between
 *     REP/ENDREP or LOOP/ENDLOOP is <=500, the driver will unroll the loop.
 *     The maximum loop count is 255.
 *
 */

//== Opcode / Destination selection ==
#define NVFX_FP_OP_PROGRAM_END          (1 << 0)
#define NVFX_FP_OP_OUT_REG_SHIFT        1
#define NV30_FP_OP_OUT_REG_MASK         (31 << 1) /* uncertain */
#define NV40_FP_OP_OUT_REG_MASK         (63 << 1)
/* Needs to be set when writing outputs to get expected result.. */
#define NVFX_FP_OP_OUT_REG_HALF         (1 << 7)
#define NVFX_FP_OP_COND_WRITE_ENABLE    (1 << 8)
/* Per-component write mask: one bit each for X, Y, Z and W (bits 9..12). */
#define NVFX_FP_OP_OUTMASK_SHIFT        9
#define NVFX_FP_OP_OUTMASK_MASK         (0xF << 9)
#  define NVFX_FP_OP_OUT_X              (1 << 9)
#  define NVFX_FP_OP_OUT_Y              (1 << 10)
#  define NVFX_FP_OP_OUT_Z              (1 << 11)
#  define NVFX_FP_OP_OUT_W              (1 << 12)
/* Uncertain about these, especially the input_src values.. it's possible that
 * they can be dynamically changed.
 *
 * The NVFX_FP_OP_INPUT_SRC_* selector values below are unshifted; they fit
 * the 4-bit field described by NVFX_FP_OP_INPUT_SRC_SHIFT/_MASK.
 */
#define NVFX_FP_OP_INPUT_SRC_SHIFT          13
#define NVFX_FP_OP_INPUT_SRC_MASK           (15 << 13)
#  define NVFX_FP_OP_INPUT_SRC_POSITION     0x0
#  define NVFX_FP_OP_INPUT_SRC_COL0         0x1
#  define NVFX_FP_OP_INPUT_SRC_COL1         0x2
#  define NVFX_FP_OP_INPUT_SRC_FOGC         0x3
#  define NVFX_FP_OP_INPUT_SRC_TC0          0x4
/* Selector for texture coordinate set n; the argument is parenthesised so
 * that an arbitrary expression can be passed safely. */
#  define NVFX_FP_OP_INPUT_SRC_TC(n)        (0x4 + (n))
#  define NV40_FP_OP_INPUT_SRC_FACING       0xE
/* Texture unit, precision and opcode fields of the opcode dword. */
#define NVFX_FP_OP_TEX_UNIT_SHIFT       17
#define NVFX_FP_OP_TEX_UNIT_MASK        (0xF << 17) /* guess */
#define NVFX_FP_OP_PRECISION_SHIFT      22
#define NVFX_FP_OP_PRECISION_MASK       (3 << 22)
/* Unshifted values for the precision field. */
#  define NVFX_FP_PRECISION_FP32        0
#  define NVFX_FP_PRECISION_FP16        1
#  define NVFX_FP_PRECISION_FX12        2
#define NVFX_FP_OP_OPCODE_SHIFT         24
#define NVFX_FP_OP_OPCODE_MASK          (0x3F << 24)
/* NV30/NV40 fragment program opcodes (unshifted values for the opcode field) */
#define NVFX_FP_OP_OPCODE_NOP   0x00
#define NVFX_FP_OP_OPCODE_MOV   0x01
#define NVFX_FP_OP_OPCODE_MUL   0x02
#define NVFX_FP_OP_OPCODE_ADD   0x03
#define NVFX_FP_OP_OPCODE_MAD   0x04
#define NVFX_FP_OP_OPCODE_DP3   0x05
#define NVFX_FP_OP_OPCODE_DP4   0x06
#define NVFX_FP_OP_OPCODE_DST   0x07
#define NVFX_FP_OP_OPCODE_MIN   0x08
#define NVFX_FP_OP_OPCODE_MAX   0x09
#define NVFX_FP_OP_OPCODE_SLT   0x0A
#define NVFX_FP_OP_OPCODE_SGE   0x0B
#define NVFX_FP_OP_OPCODE_SLE   0x0C
#define NVFX_FP_OP_OPCODE_SGT   0x0D
#define NVFX_FP_OP_OPCODE_SNE   0x0E
#define NVFX_FP_OP_OPCODE_SEQ   0x0F
#define NVFX_FP_OP_OPCODE_FRC   0x10
#define NVFX_FP_OP_OPCODE_FLR   0x11
#define NVFX_FP_OP_OPCODE_KIL   0x12
#define NVFX_FP_OP_OPCODE_PK4B  0x13
#define NVFX_FP_OP_OPCODE_UP4B  0x14
#define NVFX_FP_OP_OPCODE_DDX   0x15 /* can only write XY */
#define NVFX_FP_OP_OPCODE_DDY   0x16 /* can only write XY */
#define NVFX_FP_OP_OPCODE_TEX   0x17
#define NVFX_FP_OP_OPCODE_TXP   0x18
#define NVFX_FP_OP_OPCODE_TXD   0x19
#define NVFX_FP_OP_OPCODE_RCP   0x1A
#define NVFX_FP_OP_OPCODE_EX2   0x1C
#define NVFX_FP_OP_OPCODE_LG2   0x1D
#define NVFX_FP_OP_OPCODE_STR   0x20
#define NVFX_FP_OP_OPCODE_SFL   0x21
#define NVFX_FP_OP_OPCODE_COS   0x22
#define NVFX_FP_OP_OPCODE_SIN   0x23
#define NVFX_FP_OP_OPCODE_PK2H  0x24
#define NVFX_FP_OP_OPCODE_UP2H  0x25
#define NVFX_FP_OP_OPCODE_PK4UB 0x27
#define NVFX_FP_OP_OPCODE_UP4UB 0x28
#define NVFX_FP_OP_OPCODE_PK2US 0x29
#define NVFX_FP_OP_OPCODE_UP2US 0x2A
#define NVFX_FP_OP_OPCODE_DP2A  0x2E
#define NVFX_FP_OP_OPCODE_TXB   0x31
#define NVFX_FP_OP_OPCODE_DIV   0x3A

/* NV30 only fragment program opcodes */
#define NVFX_FP_OP_OPCODE_RSQ_NV30 0x1B
#define NVFX_FP_OP_OPCODE_LIT_NV30 0x1E
#define NVFX_FP_OP_OPCODE_LRP_NV30 0x1F
#define NVFX_FP_OP_OPCODE_POW_NV30 0x26
#define NVFX_FP_OP_OPCODE_RFL_NV30 0x36

/* NV40 only fragment program opcodes */
/* NOTE(review): TXL_NV40 has the same value (0x31) as NVFX_FP_OP_OPCODE_TXB;
 * confirm the encoding against hardware documentation. */
#define NVFX_FP_OP_OPCODE_TXL_NV40  0x31
/* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/
#define NV40_FP_OP_BRA_OPCODE_BRK   0x0
#define NV40_FP_OP_BRA_OPCODE_CAL   0x1
#define NV40_FP_OP_BRA_OPCODE_IF    0x2
#define NV40_FP_OP_BRA_OPCODE_LOOP  0x3
#define NV40_FP_OP_BRA_OPCODE_REP   0x4
#define NV40_FP_OP_BRA_OPCODE_RET   0x5

/*
 * Bit 31 flag.  Written with an unsigned constant because left-shifting a
 * signed 1 into the sign bit of a 32-bit int is undefined behaviour in C.
 */
#define NVFX_FP_OP_OUT_SAT (1u << 31)

/* high order bits of SRC0 */
#define NVFX_FP_OP_OUT_ABS              (1 << 29)
/* Condition swizzle: one 2-bit selector per component, X lowest (bits 21..28). */
#define NVFX_FP_OP_COND_SWZ_W_SHIFT     27
#define NVFX_FP_OP_COND_SWZ_W_MASK      (3 << 27)
#define NVFX_FP_OP_COND_SWZ_Z_SHIFT     25
#define NVFX_FP_OP_COND_SWZ_Z_MASK      (3 << 25)
#define NVFX_FP_OP_COND_SWZ_Y_SHIFT     23
#define NVFX_FP_OP_COND_SWZ_Y_MASK      (3 << 23)
#define NVFX_FP_OP_COND_SWZ_X_SHIFT     21
#define NVFX_FP_OP_COND_SWZ_X_MASK      (3 << 21)
#define NVFX_FP_OP_COND_SWZ_ALL_SHIFT   21
#define NVFX_FP_OP_COND_SWZ_ALL_MASK    (0xFF << 21)
/* Condition code field and its unshifted values. */
#define NVFX_FP_OP_COND_SHIFT           18
#define NVFX_FP_OP_COND_MASK            (0x07 << 18)
#  define NVFX_FP_OP_COND_FL            0
#  define NVFX_FP_OP_COND_LT            1
#  define NVFX_FP_OP_COND_EQ            2
#  define NVFX_FP_OP_COND_LE            3
#  define NVFX_FP_OP_COND_GT            4
#  define NVFX_FP_OP_COND_NE            5
#  define NVFX_FP_OP_COND_GE            6
#  define NVFX_FP_OP_COND_TR            7

/* high order bits of SRC1 */
/* Unsigned constant: left-shifting a signed 1 into bit 31 is undefined in C. */
#define NV40_FP_OP_OPCODE_IS_BRANCH     (1u << 31)
#define NVFX_FP_OP_DST_SCALE_SHIFT      28
#define NVFX_FP_OP_DST_SCALE_MASK       (3 << 28)
/*
 * Unshifted destination scale values.
 * NOTE(review): the INV_* values (5..7) need three bits, while
 * NVFX_FP_OP_DST_SCALE_MASK only covers two (bits 28..29) - confirm which
 * one is right.
 */
#define NVFX_FP_OP_DST_SCALE_1X         0
#define NVFX_FP_OP_DST_SCALE_2X         1
#define NVFX_FP_OP_DST_SCALE_4X         2
#define NVFX_FP_OP_DST_SCALE_8X         3
#define NVFX_FP_OP_DST_SCALE_INV_2X     5
#define NVFX_FP_OP_DST_SCALE_INV_4X     6
#define NVFX_FP_OP_DST_SCALE_INV_8X     7

/* SRC1 LOOP */
#define NV40_FP_OP_LOOP_INCR_SHIFT      19
#define NV40_FP_OP_LOOP_INCR_MASK       (0xFF << 19)
#define NV40_FP_OP_LOOP_INDEX_SHIFT     10
#define NV40_FP_OP_LOOP_INDEX_MASK      (0xFF << 10)
#define NV40_FP_OP_LOOP_COUNT_SHIFT     2
#define NV40_FP_OP_LOOP_COUNT_MASK      (0xFF << 2)

/* SRC1 IF */
#define NV40_FP_OP_ELSE_ID_SHIFT        2
#define NV40_FP_OP_ELSE_ID_MASK         (0xFF << 2)

/* SRC1 CAL */
#define NV40_FP_OP_IADDR_SHIFT          2
#define NV40_FP_OP_IADDR_MASK           (0xFF << 2)

/* SRC1 REP
 * I have no idea why there are 3 count values here.. but they
 * have always been filled with the same value in my tests so
 * far..
 */
#define NV40_FP_OP_REP_COUNT1_SHIFT     2
#define NV40_FP_OP_REP_COUNT1_MASK      (0xFF << 2)
#define NV40_FP_OP_REP_COUNT2_SHIFT     10
#define NV40_FP_OP_REP_COUNT2_MASK      (0xFF << 10)
#define NV40_FP_OP_REP_COUNT3_SHIFT     19
#define NV40_FP_OP_REP_COUNT3_MASK      (0xFF << 19)

/* SRC2 REP/IF */
#define NV40_FP_OP_END_ID_SHIFT         2
#define NV40_FP_OP_END_ID_MASK          (0xFF << 2)

/* high order bits of SRC2 */
#define NVFX_FP_OP_INDEX_INPUT          (1 << 30)
#define NV40_FP_OP_ADDR_INDEX_SHIFT     19
#define NV40_FP_OP_ADDR_INDEX_MASK      (0xF << 19)

//== Register selection ==
/* Register type field and its unshifted values. */
#define NVFX_FP_REG_TYPE_SHIFT          0
#define NVFX_FP_REG_TYPE_MASK           (3 << 0)
#  define NVFX_FP_REG_TYPE_TEMP         0
#  define NVFX_FP_REG_TYPE_INPUT        1
#  define NVFX_FP_REG_TYPE_CONST        2
#define NVFX_FP_REG_SRC_SHIFT           2
#define NV30_FP_REG_SRC_MASK            (31 << 2)
#define NV40_FP_REG_SRC_MASK            (63 << 2)
#define NVFX_FP_REG_SRC_HALF            (1 << 8)
/* Source swizzle: one 2-bit selector per component, X lowest (bits 9..16). */
#define NVFX_FP_REG_SWZ_ALL_SHIFT       9
#define NVFX_FP_REG_SWZ_ALL_MASK        (255 << 9)
#define NVFX_FP_REG_SWZ_X_SHIFT         9
#define NVFX_FP_REG_SWZ_X_MASK          (3 << 9)
#define NVFX_FP_REG_SWZ_Y_SHIFT         11
#define NVFX_FP_REG_SWZ_Y_MASK          (3 << 11)
#define NVFX_FP_REG_SWZ_Z_SHIFT         13
#define NVFX_FP_REG_SWZ_Z_MASK          (3 << 13)
#define NVFX_FP_REG_SWZ_W_SHIFT         15
#define NVFX_FP_REG_SWZ_W_MASK          (3 << 15)
#  define NVFX_FP_SWIZZLE_X             0
#  define NVFX_FP_SWIZZLE_Y             1
#  define NVFX_FP_SWIZZLE_Z             2
#  define NVFX_FP_SWIZZLE_W             3
#define NVFX_FP_REG_NEGATE              (1 << 17)

330 #ifndef NVFX_SHADER_NO_FUCKEDNESS
/* Register kinds; presumably the values stored in nvfx_sreg.type. */
#define NVFXSR_NONE   0
#define NVFXSR_OUTPUT 1
#define NVFXSR_INPUT  2
#define NVFXSR_TEMP   3
#define NVFXSR_CONST  4

/*
 * Description of one shader register operand together with its modifiers.
 * Instances are built with nvfx_sr() and adjusted with the nvfx_sr_*()
 * helpers.
 */
struct nvfx_sreg {
    int type;           /* register kind; presumably one of NVFXSR_* */
    int index;          /* register index within that kind */

    int dst_scale;      /* destination scale; nvfx_sr() sets DEF_SCALE */

    int negate;         /* non-zero: operand is negated */
    int abs;            /* non-zero: absolute value of the operand */
    int swz[4];         /* per-component swizzle; { 0, 1, 2, 3 } is identity */

    int cc_update;      /* condition-code update flag; 0 by default */
    int cc_update_reg;  /* 0 by default */
    int cc_test;        /* condition test; nvfx_sr() sets DEF_CTEST */
    int cc_test_reg;    /* 0 by default */
    int cc_swz[4];      /* condition-code swizzle; { 0, 1, 2, 3 } by default */
};

354 static INLINE struct nvfx_sreg
355 nvfx_sr(int type, int index)
356 {
357 struct nvfx_sreg temp = {
358 .type = type,
359 .index = index,
360 .dst_scale = DEF_SCALE,
361 .abs = 0,
362 .negate = 0,
363 .swz = { 0, 1, 2, 3 },
364 .cc_update = 0,
365 .cc_update_reg = 0,
366 .cc_test = DEF_CTEST,
367 .cc_test_reg = 0,
368 .cc_swz = { 0, 1, 2, 3 },
369 };
370 return temp;
371 }
372
373 static INLINE struct nvfx_sreg
374 nvfx_sr_swz(struct nvfx_sreg src, int x, int y, int z, int w)
375 {
376 struct nvfx_sreg dst = src;
377
378 dst.swz[SWZ_X] = src.swz[x];
379 dst.swz[SWZ_Y] = src.swz[y];
380 dst.swz[SWZ_Z] = src.swz[z];
381 dst.swz[SWZ_W] = src.swz[w];
382 return dst;
383 }
384
385 static INLINE struct nvfx_sreg
386 nvfx_sr_neg(struct nvfx_sreg src)
387 {
388 src.negate = !src.negate;
389 return src;
390 }
391
392 static INLINE struct nvfx_sreg
393 nvfx_sr_abs(struct nvfx_sreg src)
394 {
395 src.abs = 1;
396 return src;
397 }
398
399 static INLINE struct nvfx_sreg
400 nvfx_sr_scale(struct nvfx_sreg src, int scale)
401 {
402 src.dst_scale = scale;
403 return src;
404 }
405 #endif
406
407 #endif