Merge commit 'origin/gallium-master-merge'
[mesa.git] / src / gallium / drivers / i965simple / brw_wm_decl.c
1
2 #include "brw_context.h"
3 #include "brw_eu.h"
4 #include "brw_wm.h"
5 #include "util/u_math.h"
6 #include "util/u_memory.h"
7 #include "pipe/p_shader_tokens.h"
8 #include "tgsi/tgsi_parse.h"
9
10 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
11 {
12 c->tmp_index++;
13 c->reg_index = MAX2(c->reg_index, c->tmp_start + c->tmp_index);
14 return brw_vec8_grf(c->tmp_start + c->tmp_index, 0);
15 }
16
17 static void release_tmps(struct brw_wm_compile *c)
18 {
19 c->tmp_index = 0;
20 }
21
22
23
24 static int is_null( struct brw_reg reg )
25 {
26 return (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
27 reg.nr == BRW_ARF_NULL);
28 }
29
30 static void emit_pixel_xy( struct brw_wm_compile *c )
31 {
32 if (is_null(c->pixel_xy[0])) {
33
34 struct brw_compile *p = &c->func;
35 struct brw_reg r1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
36
37 c->pixel_xy[0] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW));
38 c->pixel_xy[1] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW));
39
40 /* Calculate pixel centers by adding 1 or 0 to each of the
41 * micro-tile coordinates passed in r1.
42 */
43 brw_ADD(p,
44 c->pixel_xy[0],
45 stride(suboffset(r1_uw, 4), 2, 4, 0),
46 brw_imm_v(0x10101010));
47
48 brw_ADD(p,
49 c->pixel_xy[1],
50 stride(suboffset(r1_uw, 5), 2, 4, 0),
51 brw_imm_v(0x11001100));
52 }
53 }
54
55
56
57
58
59
60 static void emit_delta_xy( struct brw_wm_compile *c )
61 {
62 if (is_null(c->delta_xy[0])) {
63 struct brw_compile *p = &c->func;
64 struct brw_reg r1 = brw_vec1_grf(1, 0);
65
66 emit_pixel_xy(c);
67
68 c->delta_xy[0] = alloc_tmp(c);
69 c->delta_xy[1] = alloc_tmp(c);
70
71 /* Calc delta X,Y by subtracting origin in r1 from the pixel
72 * centers.
73 */
74 brw_ADD(p,
75 c->delta_xy[0],
76 retype(c->pixel_xy[0], BRW_REGISTER_TYPE_UW),
77 negate(r1));
78
79 brw_ADD(p,
80 c->delta_xy[1],
81 retype(c->pixel_xy[1], BRW_REGISTER_TYPE_UW),
82 negate(suboffset(r1,1)));
83 }
84 }
85
86
87
#if 0
/* DISABLED: compiled out by the surrounding #if 0.
 *
 * Would emit, at most once, the computation of c->pixel_w: interpolate
 * a value from the wpos coefficients into message register 2, then
 * take its reciprocal with the math unit.
 */
static void emit_pixel_w( struct brw_wm_compile *c )
{
   if (is_null(c->pixel_w)) {
      struct brw_compile *p = &c->func;
      struct brw_reg interp_wpos = c->coef_wpos;

      c->pixel_w = alloc_tmp(c);

      emit_delta_xy(c);

      /* Calc 1/w - just linterp wpos[3] optimized by putting the
       * result straight into a message reg.
       */
      struct brw_reg interp3 = brw_vec1_grf(interp_wpos.nr+1, 4);
      brw_LINE(p, brw_null_reg(), interp3, c->delta_xy[0]);
      brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), c->delta_xy[1]);

      /* Calc w */
      brw_math_16( p,
                   c->pixel_w,
                   BRW_MATH_FUNCTION_INV,
                   BRW_MATH_SATURATE_NONE,
                   2,
                   brw_null_reg(),
                   BRW_MATH_PRECISION_FULL);
   }
}
#endif
118
119
120 static void emit_cinterp(struct brw_wm_compile *c,
121 int idx,
122 int mask )
123 {
124 struct brw_compile *p = &c->func;
125 struct brw_reg interp[4];
126 struct brw_reg coef = c->payload_coef[idx];
127 int i;
128
129 interp[0] = brw_vec1_grf(coef.nr, 0);
130 interp[1] = brw_vec1_grf(coef.nr, 4);
131 interp[2] = brw_vec1_grf(coef.nr+1, 0);
132 interp[3] = brw_vec1_grf(coef.nr+1, 4);
133
134 for(i = 0; i < 4; i++ ) {
135 if (mask & (1<<i)) {
136 struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i];
137 brw_MOV(p, dst, suboffset(interp[i],3));
138 }
139 }
140 }
141
142 static void emit_linterp(struct brw_wm_compile *c,
143 int idx,
144 int mask )
145 {
146 struct brw_compile *p = &c->func;
147 struct brw_reg interp[4];
148 struct brw_reg coef = c->payload_coef[idx];
149 int i;
150
151 emit_delta_xy(c);
152
153 interp[0] = brw_vec1_grf(coef.nr, 0);
154 interp[1] = brw_vec1_grf(coef.nr, 4);
155 interp[2] = brw_vec1_grf(coef.nr+1, 0);
156 interp[3] = brw_vec1_grf(coef.nr+1, 4);
157
158 for(i = 0; i < 4; i++ ) {
159 if (mask & (1<<i)) {
160 struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i];
161 brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]);
162 brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]);
163 }
164 }
165 }
166
#if 0
/* DISABLED: compiled out by the surrounding #if 0.
 *
 * Perspective-correct variant of emit_linterp(): the same LINE/MAC
 * pair per enabled channel, followed by a multiply by c->pixel_w.
 *
 * NOTE(review): get_delta_xy(), get_pixel_w() and allocate_reg() are
 * not defined in the visible part of this file; this would need
 * porting before it could be re-enabled.
 */
static void emit_pinterp(struct brw_wm_compile *c,
                         int idx,
                         int mask )
{
   struct brw_compile *p = &c->func;
   struct brw_reg interp[4];
   struct brw_reg coef = c->payload_coef[idx];
   int i;

   get_delta_xy(c);
   get_pixel_w(c);

   interp[0] = brw_vec1_grf(coef.nr, 0);
   interp[1] = brw_vec1_grf(coef.nr, 4);
   interp[2] = brw_vec1_grf(coef.nr+1, 0);
   interp[3] = brw_vec1_grf(coef.nr+1, 4);

   for (i = 0; i < 4; i++) {
      if (mask & (1<<i)) {
         struct brw_reg dst = allocate_reg(c, TGSI_FILE_INPUT, idx, i);
         brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]);
         brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]);
         brw_MUL(p, dst, dst, c->pixel_w);
      }
   }
}
#endif
195
196
197
#if 0
/* DISABLED: compiled out by the surrounding #if 0.
 *
 * Sketch of window-position handling: wpos.xy comes from a dedicated
 * WM_WPOSXY op fed by the pixel X/Y values, wpos.zw from a regular
 * linear interpolation.
 *
 * NOTE(review): the body uses 'c' and 'idx', which this function does
 * not declare, and helpers (dst_reg, src_reg, emit_op, ...) that are
 * not defined in the visible part of this file.
 */
static void emit_wpos( )
{
   struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
   struct tgsi_full_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
   struct tgsi_full_src_register deltas = get_delta_xy(c);
   struct tgsi_full_src_register arg2;
   unsigned opcode;

   opcode = WM_LINTERP;
   arg2 = src_undef();

   /* Have to treat wpos.xy specially:
    */
   emit_op(c,
           WM_WPOSXY,
           dst_mask(dst, WRITEMASK_XY),
           0, 0, 0,
           get_pixel_xy(c),
           src_undef(),
           src_undef());

   dst = dst_mask(dst, WRITEMASK_ZW);

   /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
    */
   emit_op(c,
           WM_LINTERP,
           dst,
           0, 0, 0,
           interp,
           deltas,
           arg2);
}
#endif
233
234
235
236
237 /* Perform register allocation:
238 *
239 * -- r0???
240 * -- passthrough depth regs (and stencil/aa??)
241 * -- curbe ??
242 * -- inputs (coefficients)
243 *
244 * Use a totally static register allocation. This will perform poorly
245 * but is an easy way to get started (again).
246 */
247 static void prealloc_reg(struct brw_wm_compile *c)
248 {
249 int i, j;
250 int nr_curbe_regs = 0;
251
252 /* R0, then some depth related regs:
253 */
254 for (i = 0; i < c->key.nr_depth_regs; i++) {
255 c->payload_depth[i] = brw_vec8_grf(i*2, 0);
256 c->reg_index += 2;
257 }
258
259
260 /* Then a copy of our part of the CURBE entry:
261 */
262 {
263 int nr_constants = c->fp->info.file_max[TGSI_FILE_CONSTANT] + 1;
264 int index = 0;
265
266 /* XXX number of constants, or highest numbered constant? */
267 assert(nr_constants == c->fp->info.file_count[TGSI_FILE_CONSTANT]);
268
269 c->prog_data.max_const = 4*nr_constants;
270 for (i = 0; i < nr_constants; i++) {
271 for (j = 0; j < 4; j++, index++)
272 c->wm_regs[TGSI_FILE_CONSTANT][i][j] = brw_vec1_grf(c->reg_index + index/8,
273 index%8);
274 }
275
276 nr_curbe_regs = 2*((4*nr_constants+15)/16);
277 c->reg_index += nr_curbe_regs;
278 }
279
280 /* Adjust for parameter coefficients for position, which are
281 * currently always provided.
282 */
283 // c->position_coef[i] = brw_vec8_grf(c->reg_index, 0);
284 c->reg_index += 2;
285
286 /* Next we receive the plane coefficients for parameter
287 * interpolation:
288 */
289 assert(c->fp->info.file_max[TGSI_FILE_INPUT] == c->fp->info.num_inputs);
290 for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++) {
291 c->payload_coef[i] = brw_vec8_grf(c->reg_index, 0);
292 c->reg_index += 2;
293 }
294
295 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
296 c->prog_data.urb_read_length = (c->fp->info.num_inputs + 1) * 2;
297 c->prog_data.curb_read_length = nr_curbe_regs;
298
299 /* That's the end of the payload, now we can start allocating registers.
300 */
301 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
302 c->reg_index++;
303
304 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
305 c->reg_index += 2;
306
307 /* Now allocate room for the interpolated inputs and staging
308 * registers for the outputs:
309 */
310 /* XXX do we want to loop over the _number_ of inputs/outputs or loop
311 * to the highest input/output index that's used?
312 * Probably the same, actually.
313 */
314 assert(c->fp->info.file_max[TGSI_FILE_INPUT] + 1 == c->fp->info.num_inputs);
315 assert(c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1 == c->fp->info.num_outputs);
316 for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++)
317 for (j = 0; j < 4; j++)
318 c->wm_regs[TGSI_FILE_INPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 );
319
320 for (i = 0; i < c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1; i++)
321 for (j = 0; j < 4; j++)
322 c->wm_regs[TGSI_FILE_OUTPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 );
323
324 /* Beyond this we should only need registers for internal temporaries:
325 */
326 c->tmp_start = c->reg_index;
327 }
328
329
330
331
332
333 /* Need to interpolate fragment program inputs in as a preamble to the
334 * shader. A more sophisticated compiler would do this on demand, but
335 * we'll do it up front:
336 */
337 void brw_wm_emit_decls(struct brw_wm_compile *c)
338 {
339 struct tgsi_parse_context parse;
340 int done = 0;
341
342 prealloc_reg(c);
343
344 tgsi_parse_init( &parse, c->fp->program.tokens );
345
346 while( !done &&
347 !tgsi_parse_end_of_tokens( &parse ) )
348 {
349 tgsi_parse_token( &parse );
350
351 switch( parse.FullToken.Token.Type ) {
352 case TGSI_TOKEN_TYPE_DECLARATION:
353 {
354 const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
355 unsigned first = decl->DeclarationRange.First;
356 unsigned last = decl->DeclarationRange.Last;
357 unsigned mask = decl->Declaration.UsageMask; /* ? */
358 unsigned i;
359
360 if (decl->Declaration.File != TGSI_FILE_INPUT)
361 break;
362
363 for( i = first; i <= last; i++ ) {
364 switch (decl->Declaration.Interpolate) {
365 case TGSI_INTERPOLATE_CONSTANT:
366 emit_cinterp(c, i, mask);
367 break;
368
369 case TGSI_INTERPOLATE_LINEAR:
370 emit_linterp(c, i, mask);
371 break;
372
373 case TGSI_INTERPOLATE_PERSPECTIVE:
374 //emit_pinterp(c, i, mask);
375 emit_linterp(c, i, mask);
376 break;
377 }
378 }
379 break;
380 }
381 case TGSI_TOKEN_TYPE_IMMEDIATE:
382 case TGSI_TOKEN_TYPE_INSTRUCTION:
383 default:
384 done = 1;
385 break;
386 }
387 }
388
389 tgsi_parse_free (&parse);
390
391 release_tmps(c);
392 }