Merge remote branch 'origin/gallium-0.1' into nouveau-gallium-0.1
[mesa.git] / src / gallium / drivers / i965simple / brw_wm_decl.c
1
2 #include "brw_context.h"
3 #include "brw_eu.h"
4 #include "brw_wm.h"
5 #include "pipe/p_util.h"
6 #include "pipe/p_shader_tokens.h"
7 #include "tgsi/tgsi_parse.h"
8
9 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
10 {
11 c->tmp_index++;
12 c->reg_index = MAX2(c->reg_index, c->tmp_start + c->tmp_index);
13 return brw_vec8_grf(c->tmp_start + c->tmp_index, 0);
14 }
15
16 static void release_tmps(struct brw_wm_compile *c)
17 {
18 c->tmp_index = 0;
19 }
20
21
22
23 static int is_null( struct brw_reg reg )
24 {
25 return (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
26 reg.nr == BRW_ARF_NULL);
27 }
28
29 static void emit_pixel_xy( struct brw_wm_compile *c )
30 {
31 if (is_null(c->pixel_xy[0])) {
32
33 struct brw_compile *p = &c->func;
34 struct brw_reg r1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
35
36 c->pixel_xy[0] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW));
37 c->pixel_xy[1] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW));
38
39 /* Calculate pixel centers by adding 1 or 0 to each of the
40 * micro-tile coordinates passed in r1.
41 */
42 brw_ADD(p,
43 c->pixel_xy[0],
44 stride(suboffset(r1_uw, 4), 2, 4, 0),
45 brw_imm_v(0x10101010));
46
47 brw_ADD(p,
48 c->pixel_xy[1],
49 stride(suboffset(r1_uw, 5), 2, 4, 0),
50 brw_imm_v(0x11001100));
51 }
52 }
53
54
55
56
57
58
59 static void emit_delta_xy( struct brw_wm_compile *c )
60 {
61 if (is_null(c->delta_xy[0])) {
62 struct brw_compile *p = &c->func;
63 struct brw_reg r1 = brw_vec1_grf(1, 0);
64
65 emit_pixel_xy(c);
66
67 c->delta_xy[0] = alloc_tmp(c);
68 c->delta_xy[1] = alloc_tmp(c);
69
70 /* Calc delta X,Y by subtracting origin in r1 from the pixel
71 * centers.
72 */
73 brw_ADD(p,
74 c->delta_xy[0],
75 retype(c->pixel_xy[0], BRW_REGISTER_TYPE_UW),
76 negate(r1));
77
78 brw_ADD(p,
79 c->delta_xy[1],
80 retype(c->pixel_xy[1], BRW_REGISTER_TYPE_UW),
81 negate(suboffset(r1,1)));
82 }
83 }
84
85
86
/* Disabled (#if 0) helper: would emit, at most once, the code that
 * computes a per-pixel W value into c->pixel_w.
 *
 * It interpolates a 1/w term from the position coefficients
 * (c->coef_wpos) using the X/Y deltas, writes it to message register 2,
 * and then runs an INV math operation with c->pixel_w as destination.
 *
 * NOTE(review): c->coef_wpos is not referenced anywhere else in the
 * visible file; confirm it exists before re-enabling this block.
 */
87 #if 0
88 static void emit_pixel_w( struct brw_wm_compile *c )
89 {
/* A null c->pixel_w means the value has not been emitted yet. */
90 if (is_null(c->pixel_w)) {
91 struct brw_compile *p = &c->func;
92
93 struct brw_reg interp_wpos = c->coef_wpos;
94
95 c->pixel_w = alloc_tmp(c);
96
/* The interpolation below reads c->delta_xy[], so emit those first. */
97 emit_delta_xy(c);
98
99 /* Calc 1/w - just linterp wpos[3] optimized by putting the
100 * result straight into a message reg.
101 */
/* Second register of the wpos coefficient pair, sub-register 4: the
 * same slot emit_linterp uses for channel 3 of an input.
 */
102 struct brw_reg interp3 = brw_vec1_grf(interp_wpos.nr+1, 4);
103 brw_LINE(p, brw_null_reg(), interp3, c->delta_xy[0]);
104 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), c->delta_xy[1]);
105
106 /* Calc w */
/* The literal 2 presumably names message register 2, which the MAC
 * above just wrote -- TODO confirm against brw_math_16().
 */
107 brw_math_16( p,
108 c->pixel_w,
109 BRW_MATH_FUNCTION_INV,
110 BRW_MATH_SATURATE_NONE,
111 2,
112 brw_null_reg(),
113 BRW_MATH_PRECISION_FULL);
114 }
115 }
116 #endif
117
118
119 static void emit_cinterp(struct brw_wm_compile *c,
120 int idx,
121 int mask )
122 {
123 struct brw_compile *p = &c->func;
124 struct brw_reg interp[4];
125 struct brw_reg coef = c->payload_coef[idx];
126 int i;
127
128 interp[0] = brw_vec1_grf(coef.nr, 0);
129 interp[1] = brw_vec1_grf(coef.nr, 4);
130 interp[2] = brw_vec1_grf(coef.nr+1, 0);
131 interp[3] = brw_vec1_grf(coef.nr+1, 4);
132
133 for(i = 0; i < 4; i++ ) {
134 if (mask & (1<<i)) {
135 struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i];
136 brw_MOV(p, dst, suboffset(interp[i],3));
137 }
138 }
139 }
140
141 static void emit_linterp(struct brw_wm_compile *c,
142 int idx,
143 int mask )
144 {
145 struct brw_compile *p = &c->func;
146 struct brw_reg interp[4];
147 struct brw_reg coef = c->payload_coef[idx];
148 int i;
149
150 emit_delta_xy(c);
151
152 interp[0] = brw_vec1_grf(coef.nr, 0);
153 interp[1] = brw_vec1_grf(coef.nr, 4);
154 interp[2] = brw_vec1_grf(coef.nr+1, 0);
155 interp[3] = brw_vec1_grf(coef.nr+1, 4);
156
157 for(i = 0; i < 4; i++ ) {
158 if (mask & (1<<i)) {
159 struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i];
160 brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]);
161 brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]);
162 }
163 }
164 }
165
/* Disabled (#if 0) perspective-correct variant of emit_linterp: the
 * same LINE/MAC sequence per enabled channel, followed by a multiply by
 * c->pixel_w.
 *
 * NOTE(review): this block calls get_delta_xy(), get_pixel_w() and
 * allocate_reg(), none of which are defined in the visible part of this
 * file (the live helpers here are named emit_delta_xy() and
 * emit_pixel_w(), and destinations come from c->wm_regs[]).  It will
 * need porting before it can be re-enabled; brw_wm_emit_decls()
 * currently falls back to emit_linterp() for perspective inputs.
 */
166 #if 0
167 static void emit_pinterp(struct brw_wm_compile *c,
168 int idx,
169 int mask )
170 {
171 struct brw_compile *p = &c->func;
172 struct brw_reg interp[4];
173 struct brw_reg coef = c->payload_coef[idx];
174 int i;
175
176 get_delta_xy(c);
177 get_pixel_w(c);
178
/* Same coefficient layout as emit_cinterp/emit_linterp: two GRFs, two
 * channels per GRF at sub-registers 0 and 4.
 */
179 interp[0] = brw_vec1_grf(coef.nr, 0);
180 interp[1] = brw_vec1_grf(coef.nr, 4);
181 interp[2] = brw_vec1_grf(coef.nr+1, 0);
182 interp[3] = brw_vec1_grf(coef.nr+1, 4);
183
184 for(i = 0; i < 4; i++ ) {
185 if (mask & (1<<i)) {
186 struct brw_reg dst = allocate_reg(c, TGSI_FILE_INPUT, idx, i);
187 brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]);
188 brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]);
/* Scale the linearly interpolated value by the per-pixel W. */
189 brw_MUL(p, dst, dst, c->pixel_w);
190 }
191 }
192 }
193 #endif
194
195
196
/* Disabled (#if 0) sketch of window-position handling: a WM_WPOSXY op
 * writing the XY channels from the pixel coordinates, then a WM_LINTERP
 * op for the ZW channels.
 *
 * NOTE(review): as written this cannot compile.  The function takes no
 * parameters yet uses 'idx' and 'c', and it relies on emit_op(),
 * dst_reg(), src_reg(), dst_mask(), src_undef(), get_delta_xy() and
 * get_pixel_xy(), none of which are defined in the visible part of
 * this file.
 */
197 #if 0
198 static void emit_wpos( )
199 {
200 struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
201 struct tgsi_full_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
202 struct tgsi_full_src_register deltas = get_delta_xy(c);
203 struct tgsi_full_src_register arg2;
204 unsigned opcode;
205
/* 'opcode' is assigned here but never read: the emit_op() call below
 * passes WM_LINTERP directly.
 */
206 opcode = WM_LINTERP;
207 arg2 = src_undef();
208
209 /* Have to treat wpos.xy specially:
210 */
211 emit_op(c,
212 WM_WPOSXY,
213 dst_mask(dst, WRITEMASK_XY),
214 0, 0, 0,
215 get_pixel_xy(c),
216 src_undef(),
217 src_undef());
218
/* Restrict the remaining write to the Z and W channels. */
219 dst = dst_mask(dst, WRITEMASK_ZW);
220
221 /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
222 */
223 emit_op(c,
224 WM_LINTERP,
225 dst,
226 0, 0, 0,
227 interp,
228 deltas,
229 arg2);
230 }
231 #endif
232
233
234
235
236 /* Perform register allocation:
237 *
238 * -- r0???
239 * -- passthrough depth regs (and stencil/aa??)
240 * -- curbe ??
241 * -- inputs (coefficients)
242 *
243 * Use a totally static register allocation. This will perform poorly
244 * but is an easy way to get started (again).
245 */
246 static void prealloc_reg(struct brw_wm_compile *c)
247 {
248 int i, j;
249 int nr_curbe_regs = 0;
250
251 /* R0, then some depth related regs:
252 */
253 for (i = 0; i < c->key.nr_depth_regs; i++) {
254 c->payload_depth[i] = brw_vec8_grf(i*2, 0);
255 c->reg_index += 2;
256 }
257
258
259 /* Then a copy of our part of the CURBE entry:
260 */
261 {
262 int nr_constants = c->fp->info.file_max[TGSI_FILE_CONSTANT] + 1;
263 int index = 0;
264
265 /* XXX number of constants, or highest numbered constant? */
266 assert(nr_constants == c->fp->info.file_count[TGSI_FILE_CONSTANT]);
267
268 c->prog_data.max_const = 4*nr_constants;
269 for (i = 0; i < nr_constants; i++) {
270 for (j = 0; j < 4; j++, index++)
271 c->wm_regs[TGSI_FILE_CONSTANT][i][j] = brw_vec1_grf(c->reg_index + index/8,
272 index%8);
273 }
274
275 nr_curbe_regs = 2*((4*nr_constants+15)/16);
276 c->reg_index += nr_curbe_regs;
277 }
278
279 /* Adjust for parameter coefficients for position, which are
280 * currently always provided.
281 */
282 // c->position_coef[i] = brw_vec8_grf(c->reg_index, 0);
283 c->reg_index += 2;
284
285 /* Next we receive the plane coefficients for parameter
286 * interpolation:
287 */
288 assert(c->fp->info.file_max[TGSI_FILE_INPUT] == c->fp->info.num_inputs);
289 for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++) {
290 c->payload_coef[i] = brw_vec8_grf(c->reg_index, 0);
291 c->reg_index += 2;
292 }
293
294 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
295 c->prog_data.urb_read_length = (c->fp->info.num_inputs + 1) * 2;
296 c->prog_data.curb_read_length = nr_curbe_regs;
297
298 /* That's the end of the payload, now we can start allocating registers.
299 */
300 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
301 c->reg_index++;
302
303 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
304 c->reg_index += 2;
305
306 /* Now allocate room for the interpolated inputs and staging
307 * registers for the outputs:
308 */
309 /* XXX do we want to loop over the _number_ of inputs/outputs or loop
310 * to the highest input/output index that's used?
311 * Probably the same, actually.
312 */
313 assert(c->fp->info.file_max[TGSI_FILE_INPUT] + 1 == c->fp->info.num_inputs);
314 assert(c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1 == c->fp->info.num_outputs);
315 for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++)
316 for (j = 0; j < 4; j++)
317 c->wm_regs[TGSI_FILE_INPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 );
318
319 for (i = 0; i < c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1; i++)
320 for (j = 0; j < 4; j++)
321 c->wm_regs[TGSI_FILE_OUTPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 );
322
323 /* Beyond this we should only need registers for internal temporaries:
324 */
325 c->tmp_start = c->reg_index;
326 }
327
328
329
330
331
332 /* Need to interpolate fragment program inputs in as a preamble to the
333 * shader. A more sophisticated compiler would do this on demand, but
334 * we'll do it up front:
335 */
336 void brw_wm_emit_decls(struct brw_wm_compile *c)
337 {
338 struct tgsi_parse_context parse;
339 int done = 0;
340
341 prealloc_reg(c);
342
343 tgsi_parse_init( &parse, c->fp->program.tokens );
344
345 while( !done &&
346 !tgsi_parse_end_of_tokens( &parse ) )
347 {
348 tgsi_parse_token( &parse );
349
350 switch( parse.FullToken.Token.Type ) {
351 case TGSI_TOKEN_TYPE_DECLARATION:
352 {
353 const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
354 unsigned first = decl->DeclarationRange.First;
355 unsigned last = decl->DeclarationRange.Last;
356 unsigned mask = decl->Declaration.UsageMask; /* ? */
357 unsigned i;
358
359 if (decl->Declaration.File != TGSI_FILE_INPUT)
360 break;
361
362 for( i = first; i <= last; i++ ) {
363 switch (decl->Declaration.Interpolate) {
364 case TGSI_INTERPOLATE_CONSTANT:
365 emit_cinterp(c, i, mask);
366 break;
367
368 case TGSI_INTERPOLATE_LINEAR:
369 emit_linterp(c, i, mask);
370 break;
371
372 case TGSI_INTERPOLATE_PERSPECTIVE:
373 //emit_pinterp(c, i, mask);
374 emit_linterp(c, i, mask);
375 break;
376 }
377 }
378 break;
379 }
380 case TGSI_TOKEN_TYPE_IMMEDIATE:
381 case TGSI_TOKEN_TYPE_INSTRUCTION:
382 default:
383 done = 1;
384 break;
385 }
386 }
387
388 tgsi_parse_free (&parse);
389
390 release_tmps(c);
391 }