i965: remove 'nr' param from get_src/dst_reg() functions
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "brw_context.h"
4 #include "brw_eu.h"
5 #include "brw_wm.h"
6
/**
 * Identifiers for the reusable instruction sequences handled by
 * invoke_subroutine(); the value is used as an index into c->subroutines[].
 */
enum _subroutine {
    SUB_NOISE1,
    SUB_NOISE2,
    SUB_NOISE3,
    SUB_NOISE4
};
10
11
12 /**
13 * Determine if the given fragment program uses GLSL features such
14 * as flow conditionals, loops, subroutines.
15 * Some GLSL shaders may use these features, others might not.
16 */
17 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
18 {
19 int i;
20 for (i = 0; i < fp->Base.NumInstructions; i++) {
21 const struct prog_instruction *inst = &fp->Base.Instructions[i];
22 switch (inst->Opcode) {
23 case OPCODE_IF:
24 case OPCODE_TRUNC:
25 case OPCODE_ENDIF:
26 case OPCODE_CAL:
27 case OPCODE_BRK:
28 case OPCODE_RET:
29 case OPCODE_DDX:
30 case OPCODE_DDY:
31 case OPCODE_NOISE1:
32 case OPCODE_NOISE2:
33 case OPCODE_NOISE3:
34 case OPCODE_NOISE4:
35 case OPCODE_BGNLOOP:
36 return GL_TRUE;
37 default:
38 break;
39 }
40 }
41 return GL_FALSE;
42 }
43
44
45 /**
46 * Record the mapping of a Mesa register to a hardware register.
47 */
48 static void set_reg(struct brw_wm_compile *c, int file, int index,
49 int component, struct brw_reg reg)
50 {
51 c->wm_regs[file][index][component].reg = reg;
52 c->wm_regs[file][index][component].inited = GL_TRUE;
53 }
54
55 /**
56 * Examine instruction's write mask to find index of first component
57 * enabled for writing.
58 */
59 static int get_scalar_dst_index(struct prog_instruction *inst)
60 {
61 int i;
62 for (i = 0; i < 4; i++)
63 if (inst->DstReg.WriteMask & (1<<i))
64 break;
65 return i;
66 }
67
68 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
69 {
70 struct brw_reg reg;
71 if(c->tmp_index == c->tmp_max)
72 c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
73
74 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
75 return reg;
76 }
77
78 /**
79 * Save current temp register info.
80 * There must be a matching call to release_tmps().
81 */
82 static int mark_tmps(struct brw_wm_compile *c)
83 {
84 return c->tmp_index;
85 }
86
87 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
88 {
89 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
90 }
91
92 static void release_tmps(struct brw_wm_compile *c, int mark)
93 {
94 c->tmp_index = mark;
95 }
96
97 /**
98 * Convert Mesa src register to brw register.
99 *
100 * Since we're running in SOA mode each Mesa register corresponds to four
101 * hardware registers. We allocate the hardware registers as needed here.
102 *
103 * \param file register file, one of PROGRAM_x
104 * \param index register number
105 * \param component src component (X=0, Y=1, Z=2, W=3)
106 * \param nr not used?!?
107 * \param neg negate value?
108 * \param abs take absolute value?
109 */
110 static struct brw_reg
111 get_reg(struct brw_wm_compile *c, int file, int index, int component,
112 int nr, GLuint neg, GLuint abs)
113 {
114 struct brw_reg reg;
115 switch (file) {
116 case PROGRAM_STATE_VAR:
117 case PROGRAM_CONSTANT:
118 case PROGRAM_UNIFORM:
119 file = PROGRAM_STATE_VAR;
120 break;
121 case PROGRAM_UNDEFINED:
122 return brw_null_reg();
123 case PROGRAM_TEMPORARY:
124 case PROGRAM_INPUT:
125 case PROGRAM_OUTPUT:
126 case PROGRAM_PAYLOAD:
127 break;
128 default:
129 _mesa_problem(NULL, "Unexpected file in get_reg()");
130 return brw_null_reg();
131 }
132
133 /* see if we've already allocated a HW register for this Mesa register */
134 if (c->wm_regs[file][index][component].inited) {
135 /* yes, re-use */
136 reg = c->wm_regs[file][index][component].reg;
137 }
138 else {
139 /* no, allocate new register */
140 reg = brw_vec8_grf(c->reg_index, 0);
141 }
142
143 /* if this is a new register allocation, record it in the table */
144 if (!c->wm_regs[file][index][component].inited) {
145 set_reg(c, file, index, component, reg);
146 c->reg_index++;
147 }
148
149 if (c->reg_index >= BRW_WM_MAX_GRF - 12) {
150 /* ran out of temporary registers! */
151 #if 1
152 /* This is a big hack for now.
153 * Return bad register index, just don't hang the GPU.
154 */
155 _mesa_fprintf(stderr, "out of regs %d\n", c->reg_index);
156 c->reg_index = BRW_WM_MAX_GRF - 13;
157 #else
158 return brw_null_reg();
159 #endif
160 }
161
162 if (neg & (1 << component)) {
163 reg = negate(reg);
164 }
165 if (abs)
166 reg = brw_abs(reg);
167 return reg;
168 }
169
170
171 /**
172 * Preallocate registers. This sets up the Mesa to hardware register
173 * mapping for certain registers, such as constants (uniforms/state vars)
174 * and shader inputs.
175 */
176 static void prealloc_reg(struct brw_wm_compile *c)
177 {
178 int i, j;
179 struct brw_reg reg;
180 int nr_interp_regs = 0;
181 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
182
183 for (i = 0; i < 4; i++) {
184 if (i < c->key.nr_depth_regs)
185 reg = brw_vec8_grf(i * 2, 0);
186 else
187 reg = brw_vec8_grf(0, 0);
188 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
189 }
190 c->reg_index += 2 * c->key.nr_depth_regs;
191
192 /* constants */
193 {
194 const int nr_params = c->fp->program.Base.Parameters->NumParameters;
195 const struct gl_program_parameter_list *plist =
196 c->fp->program.Base.Parameters;
197 int index = 0;
198
199 /* number of float constants */
200 c->prog_data.nr_params = 4 * nr_params;
201
202 /* loop over program constants (float[4]) */
203 for (i = 0; i < nr_params; i++) {
204 /* loop over XYZW channels */
205 for (j = 0; j < 4; j++, index++) {
206 reg = brw_vec1_grf(c->reg_index + index / 8, index % 8);
207 /* Save pointer to parameter/constant value.
208 * Constants will be copied in prepare_constant_buffer()
209 */
210 c->prog_data.param[index] = &plist->ParameterValues[i][j];
211 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
212 }
213 }
214 /* number of constant regs used (each reg is float[8]) */
215 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
216 c->reg_index += c->nr_creg;
217 }
218
219 /* fragment shader inputs */
220 for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
221 if (inputs & (1<<i)) {
222 nr_interp_regs++;
223 reg = brw_vec8_grf(c->reg_index, 0);
224 for (j = 0; j < 4; j++)
225 set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
226 c->reg_index += 2;
227 }
228 }
229
230 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
231 c->prog_data.urb_read_length = nr_interp_regs * 2;
232 c->prog_data.curb_read_length = c->nr_creg;
233 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
234 c->reg_index++;
235 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
236 c->reg_index += 2;
237 }
238
239
240 /**
241 * Convert Mesa dst register to brw register.
242 */
243 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
244 struct prog_instruction *inst, int component)
245 {
246 const int nr = 1;
247 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
248 0, 0);
249 }
250
251
252 /**
253 * Convert Mesa src register to brw register.
254 */
255 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
256 struct prog_src_register *src, int index)
257 {
258 const int nr = 1;
259 int component = GET_SWZ(src->Swizzle, index);
260 return get_reg(c, src->File, src->Index, component, nr,
261 src->NegateBase, src->Abs);
262 }
263
264 /**
265 * Subroutines are minimal support for resusable instruction sequences.
266 * They are implemented as simply as possible to minimise overhead: there
267 * is no explicit support for communication between the caller and callee
268 * other than saving the return address in a temporary register, nor is
269 * there any automatic local storage. This implies that great care is
270 * required before attempting reentrancy or any kind of nested
271 * subroutine invocations.
272 */
273 static void invoke_subroutine( struct brw_wm_compile *c,
274 enum _subroutine subroutine,
275 void (*emit)( struct brw_wm_compile * ) )
276 {
277 struct brw_compile *p = &c->func;
278
279 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
280
281 if( c->subroutines[ subroutine ] ) {
282 /* subroutine previously emitted: reuse existing instructions */
283
284 int mark = mark_tmps( c );
285 struct brw_reg return_address = retype( alloc_tmp( c ),
286 BRW_REGISTER_TYPE_UD );
287 int here = p->nr_insn;
288
289 brw_push_insn_state(p);
290 brw_set_mask_control(p, BRW_MASK_DISABLE);
291 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
292
293 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
294 brw_imm_d( ( c->subroutines[ subroutine ] -
295 here - 1 ) << 4 ) );
296 brw_pop_insn_state(p);
297
298 release_tmps( c, mark );
299 } else {
300 /* previously unused subroutine: emit, and mark for later reuse */
301
302 int mark = mark_tmps( c );
303 struct brw_reg return_address = retype( alloc_tmp( c ),
304 BRW_REGISTER_TYPE_UD );
305 struct brw_instruction *calc;
306 int base = p->nr_insn;
307
308 brw_push_insn_state(p);
309 brw_set_mask_control(p, BRW_MASK_DISABLE);
310 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
311 brw_pop_insn_state(p);
312
313 c->subroutines[ subroutine ] = p->nr_insn;
314
315 emit( c );
316
317 brw_push_insn_state(p);
318 brw_set_mask_control(p, BRW_MASK_DISABLE);
319 brw_MOV( p, brw_ip_reg(), return_address );
320 brw_pop_insn_state(p);
321
322 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
323
324 release_tmps( c, mark );
325 }
326 }
327
328 static void emit_abs( struct brw_wm_compile *c,
329 struct prog_instruction *inst)
330 {
331 int i;
332 struct brw_compile *p = &c->func;
333 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
334 for (i = 0; i < 4; i++) {
335 if (inst->DstReg.WriteMask & (1<<i)) {
336 struct brw_reg src, dst;
337 dst = get_dst_reg(c, inst, i);
338 src = get_src_reg(c, &inst->SrcReg[0], i);
339 brw_MOV(p, dst, brw_abs(src));
340 }
341 }
342 brw_set_saturate(p, 0);
343 }
344
345 static void emit_trunc( struct brw_wm_compile *c,
346 struct prog_instruction *inst)
347 {
348 int i;
349 struct brw_compile *p = &c->func;
350 GLuint mask = inst->DstReg.WriteMask;
351 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
352 for (i = 0; i < 4; i++) {
353 if (mask & (1<<i)) {
354 struct brw_reg src, dst;
355 dst = get_dst_reg(c, inst, i);
356 src = get_src_reg(c, &inst->SrcReg[0], i);
357 brw_RNDZ(p, dst, src);
358 }
359 }
360 brw_set_saturate(p, 0);
361 }
362
363 static void emit_mov( struct brw_wm_compile *c,
364 struct prog_instruction *inst)
365 {
366 int i;
367 struct brw_compile *p = &c->func;
368 GLuint mask = inst->DstReg.WriteMask;
369 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
370 for (i = 0; i < 4; i++) {
371 if (mask & (1<<i)) {
372 struct brw_reg src, dst;
373 dst = get_dst_reg(c, inst, i);
374 src = get_src_reg(c, &inst->SrcReg[0], i);
375 brw_MOV(p, dst, src);
376 }
377 }
378 brw_set_saturate(p, 0);
379 }
380
381 static void emit_pixel_xy(struct brw_wm_compile *c,
382 struct prog_instruction *inst)
383 {
384 struct brw_reg r1 = brw_vec1_grf(1, 0);
385 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
386
387 struct brw_reg dst0, dst1;
388 struct brw_compile *p = &c->func;
389 GLuint mask = inst->DstReg.WriteMask;
390
391 dst0 = get_dst_reg(c, inst, 0);
392 dst1 = get_dst_reg(c, inst, 1);
393 /* Calculate pixel centers by adding 1 or 0 to each of the
394 * micro-tile coordinates passed in r1.
395 */
396 if (mask & WRITEMASK_X) {
397 brw_ADD(p,
398 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
399 stride(suboffset(r1_uw, 4), 2, 4, 0),
400 brw_imm_v(0x10101010));
401 }
402
403 if (mask & WRITEMASK_Y) {
404 brw_ADD(p,
405 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
406 stride(suboffset(r1_uw, 5), 2, 4, 0),
407 brw_imm_v(0x11001100));
408 }
409 }
410
411 static void emit_delta_xy(struct brw_wm_compile *c,
412 struct prog_instruction *inst)
413 {
414 struct brw_reg r1 = brw_vec1_grf(1, 0);
415 struct brw_reg dst0, dst1, src0, src1;
416 struct brw_compile *p = &c->func;
417 GLuint mask = inst->DstReg.WriteMask;
418
419 dst0 = get_dst_reg(c, inst, 0);
420 dst1 = get_dst_reg(c, inst, 1);
421 src0 = get_src_reg(c, &inst->SrcReg[0], 0);
422 src1 = get_src_reg(c, &inst->SrcReg[0], 1);
423 /* Calc delta X,Y by subtracting origin in r1 from the pixel
424 * centers.
425 */
426 if (mask & WRITEMASK_X) {
427 brw_ADD(p,
428 dst0,
429 retype(src0, BRW_REGISTER_TYPE_UW),
430 negate(r1));
431 }
432
433 if (mask & WRITEMASK_Y) {
434 brw_ADD(p,
435 dst1,
436 retype(src1, BRW_REGISTER_TYPE_UW),
437 negate(suboffset(r1,1)));
438
439 }
440 }
441
442 static void fire_fb_write( struct brw_wm_compile *c,
443 GLuint base_reg,
444 GLuint nr,
445 GLuint target,
446 GLuint eot)
447 {
448 struct brw_compile *p = &c->func;
449 /* Pass through control information:
450 */
451 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
452 {
453 brw_push_insn_state(p);
454 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
455 brw_MOV(p,
456 brw_message_reg(base_reg + 1),
457 brw_vec8_grf(1, 0));
458 brw_pop_insn_state(p);
459 }
460 /* Send framebuffer write message: */
461 brw_fb_WRITE(p,
462 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
463 base_reg,
464 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
465 target,
466 nr,
467 0,
468 eot);
469 }
470
471 static void emit_fb_write(struct brw_wm_compile *c,
472 struct prog_instruction *inst)
473 {
474 struct brw_compile *p = &c->func;
475 int nr = 2;
476 int channel;
477 GLuint target, eot;
478 struct brw_reg src0;
479
480 /* Reserve a space for AA - may not be needed:
481 */
482 if (c->key.aa_dest_stencil_reg)
483 nr += 1;
484
485 brw_push_insn_state(p);
486 for (channel = 0; channel < 4; channel++) {
487 src0 = get_src_reg(c, &inst->SrcReg[0], channel);
488 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
489 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
490 brw_MOV(p, brw_message_reg(nr + channel), src0);
491 }
492 /* skip over the regs populated above: */
493 nr += 8;
494 brw_pop_insn_state(p);
495
496 if (c->key.source_depth_to_render_target) {
497 if (c->key.computes_depth) {
498 src0 = get_src_reg(c, &inst->SrcReg[2], 2);
499 brw_MOV(p, brw_message_reg(nr), src0);
500 }
501 else {
502 src0 = get_src_reg(c, &inst->SrcReg[1], 1);
503 brw_MOV(p, brw_message_reg(nr), src0);
504 }
505
506 nr += 2;
507 }
508
509 if (c->key.dest_depth_reg) {
510 GLuint comp = c->key.dest_depth_reg / 2;
511 GLuint off = c->key.dest_depth_reg % 2;
512
513 assert(comp == 1);
514 assert(off == 0);
515 #if 0
516 /* XXX do we need this code? comp always 1, off always 0, it seems */
517 if (off != 0) {
518 brw_push_insn_state(p);
519 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
520
521 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
522 /* 2nd half? */
523 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
524 brw_pop_insn_state(p);
525 }
526 else
527 #endif
528 {
529 struct brw_reg src = get_src_reg(c, &inst->SrcReg[1], 1);
530 brw_MOV(p, brw_message_reg(nr), src);
531 }
532 nr += 2;
533 }
534
535 target = inst->Aux >> 1;
536 eot = inst->Aux & 1;
537 fire_fb_write(c, 0, nr, target, eot);
538 }
539
540 static void emit_pixel_w( struct brw_wm_compile *c,
541 struct prog_instruction *inst)
542 {
543 struct brw_compile *p = &c->func;
544 GLuint mask = inst->DstReg.WriteMask;
545 if (mask & WRITEMASK_W) {
546 struct brw_reg dst, src0, delta0, delta1;
547 struct brw_reg interp3;
548
549 dst = get_dst_reg(c, inst, 3);
550 src0 = get_src_reg(c, &inst->SrcReg[0], 0);
551 delta0 = get_src_reg(c, &inst->SrcReg[1], 0);
552 delta1 = get_src_reg(c, &inst->SrcReg[1], 1);
553
554 interp3 = brw_vec1_grf(src0.nr+1, 4);
555 /* Calc 1/w - just linterp wpos[3] optimized by putting the
556 * result straight into a message reg.
557 */
558 brw_LINE(p, brw_null_reg(), interp3, delta0);
559 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
560
561 /* Calc w */
562 brw_math_16( p, dst,
563 BRW_MATH_FUNCTION_INV,
564 BRW_MATH_SATURATE_NONE,
565 2, brw_null_reg(),
566 BRW_MATH_PRECISION_FULL);
567 }
568 }
569
570 static void emit_linterp(struct brw_wm_compile *c,
571 struct prog_instruction *inst)
572 {
573 struct brw_compile *p = &c->func;
574 GLuint mask = inst->DstReg.WriteMask;
575 struct brw_reg interp[4];
576 struct brw_reg dst, delta0, delta1;
577 struct brw_reg src0;
578 GLuint nr, i;
579
580 src0 = get_src_reg(c, &inst->SrcReg[0], 0);
581 delta0 = get_src_reg(c, &inst->SrcReg[1], 0);
582 delta1 = get_src_reg(c, &inst->SrcReg[1], 1);
583 nr = src0.nr;
584
585 interp[0] = brw_vec1_grf(nr, 0);
586 interp[1] = brw_vec1_grf(nr, 4);
587 interp[2] = brw_vec1_grf(nr+1, 0);
588 interp[3] = brw_vec1_grf(nr+1, 4);
589
590 for(i = 0; i < 4; i++ ) {
591 if (mask & (1<<i)) {
592 dst = get_dst_reg(c, inst, i);
593 brw_LINE(p, brw_null_reg(), interp[i], delta0);
594 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
595 }
596 }
597 }
598
599 static void emit_cinterp(struct brw_wm_compile *c,
600 struct prog_instruction *inst)
601 {
602 struct brw_compile *p = &c->func;
603 GLuint mask = inst->DstReg.WriteMask;
604
605 struct brw_reg interp[4];
606 struct brw_reg dst, src0;
607 GLuint nr, i;
608
609 src0 = get_src_reg(c, &inst->SrcReg[0], 0);
610 nr = src0.nr;
611
612 interp[0] = brw_vec1_grf(nr, 0);
613 interp[1] = brw_vec1_grf(nr, 4);
614 interp[2] = brw_vec1_grf(nr+1, 0);
615 interp[3] = brw_vec1_grf(nr+1, 4);
616
617 for(i = 0; i < 4; i++ ) {
618 if (mask & (1<<i)) {
619 dst = get_dst_reg(c, inst, i);
620 brw_MOV(p, dst, suboffset(interp[i],3));
621 }
622 }
623 }
624
625 static void emit_pinterp(struct brw_wm_compile *c,
626 struct prog_instruction *inst)
627 {
628 struct brw_compile *p = &c->func;
629 GLuint mask = inst->DstReg.WriteMask;
630
631 struct brw_reg interp[4];
632 struct brw_reg dst, delta0, delta1;
633 struct brw_reg src0, w;
634 GLuint nr, i;
635
636 src0 = get_src_reg(c, &inst->SrcReg[0], 0);
637 delta0 = get_src_reg(c, &inst->SrcReg[1], 0);
638 delta1 = get_src_reg(c, &inst->SrcReg[1], 1);
639 w = get_src_reg(c, &inst->SrcReg[2], 3);
640 nr = src0.nr;
641
642 interp[0] = brw_vec1_grf(nr, 0);
643 interp[1] = brw_vec1_grf(nr, 4);
644 interp[2] = brw_vec1_grf(nr+1, 0);
645 interp[3] = brw_vec1_grf(nr+1, 4);
646
647 for(i = 0; i < 4; i++ ) {
648 if (mask & (1<<i)) {
649 dst = get_dst_reg(c, inst, i);
650 brw_LINE(p, brw_null_reg(), interp[i], delta0);
651 brw_MAC(p, dst, suboffset(interp[i],1),
652 delta1);
653 brw_MUL(p, dst, dst, w);
654 }
655 }
656 }
657
658 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
659 static void emit_frontfacing(struct brw_wm_compile *c,
660 struct prog_instruction *inst)
661 {
662 struct brw_compile *p = &c->func;
663 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
664 struct brw_reg dst;
665 GLuint mask = inst->DstReg.WriteMask;
666 int i;
667
668 for (i = 0; i < 4; i++) {
669 if (mask & (1<<i)) {
670 dst = get_dst_reg(c, inst, i);
671 brw_MOV(p, dst, brw_imm_f(0.0));
672 }
673 }
674
675 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
676 * us front face
677 */
678 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
679 for (i = 0; i < 4; i++) {
680 if (mask & (1<<i)) {
681 dst = get_dst_reg(c, inst, i);
682 brw_MOV(p, dst, brw_imm_f(1.0));
683 }
684 }
685 brw_set_predicate_control_flag_value(p, 0xff);
686 }
687
688 static void emit_xpd(struct brw_wm_compile *c,
689 struct prog_instruction *inst)
690 {
691 int i;
692 struct brw_compile *p = &c->func;
693 GLuint mask = inst->DstReg.WriteMask;
694 for (i = 0; i < 4; i++) {
695 GLuint i2 = (i+2)%3;
696 GLuint i1 = (i+1)%3;
697 if (mask & (1<<i)) {
698 struct brw_reg src0, src1, dst;
699 dst = get_dst_reg(c, inst, i);
700 src0 = negate(get_src_reg(c, &inst->SrcReg[0], i2));
701 src1 = get_src_reg(c, &inst->SrcReg[1], i1);
702 brw_MUL(p, brw_null_reg(), src0, src1);
703 src0 = get_src_reg(c, &inst->SrcReg[0], i1);
704 src1 = get_src_reg(c, &inst->SrcReg[1], i2);
705 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
706 brw_MAC(p, dst, src0, src1);
707 brw_set_saturate(p, 0);
708 }
709 }
710 brw_set_saturate(p, 0);
711 }
712
713 static void emit_dp3(struct brw_wm_compile *c,
714 struct prog_instruction *inst)
715 {
716 struct brw_reg src0[3], src1[3], dst;
717 int i;
718 struct brw_compile *p = &c->func;
719 for (i = 0; i < 3; i++) {
720 src0[i] = get_src_reg(c, &inst->SrcReg[0], i);
721 src1[i] = get_src_reg(c, &inst->SrcReg[1], i);
722 }
723
724 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
725 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
726 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
727 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
728 brw_MAC(p, dst, src0[2], src1[2]);
729 brw_set_saturate(p, 0);
730 }
731
732 static void emit_dp4(struct brw_wm_compile *c,
733 struct prog_instruction *inst)
734 {
735 struct brw_reg src0[4], src1[4], dst;
736 int i;
737 struct brw_compile *p = &c->func;
738 for (i = 0; i < 4; i++) {
739 src0[i] = get_src_reg(c, &inst->SrcReg[0], i);
740 src1[i] = get_src_reg(c, &inst->SrcReg[1], i);
741 }
742 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
743 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
744 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
745 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
746 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
747 brw_MAC(p, dst, src0[3], src1[3]);
748 brw_set_saturate(p, 0);
749 }
750
751 static void emit_dph(struct brw_wm_compile *c,
752 struct prog_instruction *inst)
753 {
754 struct brw_reg src0[4], src1[4], dst;
755 int i;
756 struct brw_compile *p = &c->func;
757 for (i = 0; i < 4; i++) {
758 src0[i] = get_src_reg(c, &inst->SrcReg[0], i);
759 src1[i] = get_src_reg(c, &inst->SrcReg[1], i);
760 }
761 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
762 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
763 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
764 brw_MAC(p, dst, src0[2], src1[2]);
765 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
766 brw_ADD(p, dst, dst, src1[3]);
767 brw_set_saturate(p, 0);
768 }
769
770 /**
771 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
772 * Note that the result of the function is smeared across the dest
773 * register's X, Y, Z and W channels (subject to writemasking of course).
774 */
775 static void emit_math1(struct brw_wm_compile *c,
776 struct prog_instruction *inst, GLuint func)
777 {
778 struct brw_compile *p = &c->func;
779 struct brw_reg src0, dst, tmp;
780 const int mark = mark_tmps( c );
781 int i;
782
783 tmp = alloc_tmp(c);
784
785 /* Get first component of source register */
786 src0 = get_src_reg(c, &inst->SrcReg[0], 0);
787
788 /* tmp = func(src0) */
789 brw_MOV(p, brw_message_reg(2), src0);
790 brw_math(p,
791 tmp,
792 func,
793 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
794 2,
795 brw_null_reg(),
796 BRW_MATH_DATA_VECTOR,
797 BRW_MATH_PRECISION_FULL);
798
799 /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
800
801 /* replicate tmp value across enabled dest channels */
802 for (i = 0; i < 4; i++) {
803 if (inst->DstReg.WriteMask & (1 << i)) {
804 dst = get_dst_reg(c, inst, i);
805 brw_MOV(p, dst, tmp);
806 }
807 }
808
809 release_tmps(c, mark);
810 }
811
812 static void emit_rcp(struct brw_wm_compile *c,
813 struct prog_instruction *inst)
814 {
815 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
816 }
817
818 static void emit_rsq(struct brw_wm_compile *c,
819 struct prog_instruction *inst)
820 {
821 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
822 }
823
824 static void emit_sin(struct brw_wm_compile *c,
825 struct prog_instruction *inst)
826 {
827 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
828 }
829
830 static void emit_cos(struct brw_wm_compile *c,
831 struct prog_instruction *inst)
832 {
833 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
834 }
835
836 static void emit_ex2(struct brw_wm_compile *c,
837 struct prog_instruction *inst)
838 {
839 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
840 }
841
842 static void emit_lg2(struct brw_wm_compile *c,
843 struct prog_instruction *inst)
844 {
845 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
846 }
847
848 static void emit_add(struct brw_wm_compile *c,
849 struct prog_instruction *inst)
850 {
851 struct brw_compile *p = &c->func;
852 struct brw_reg src0, src1, dst;
853 GLuint mask = inst->DstReg.WriteMask;
854 int i;
855 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
856 for (i = 0 ; i < 4; i++) {
857 if (mask & (1<<i)) {
858 dst = get_dst_reg(c, inst, i);
859 src0 = get_src_reg(c, &inst->SrcReg[0], i);
860 src1 = get_src_reg(c, &inst->SrcReg[1], i);
861 brw_ADD(p, dst, src0, src1);
862 }
863 }
864 brw_set_saturate(p, 0);
865 }
866
867 static void emit_sub(struct brw_wm_compile *c,
868 struct prog_instruction *inst)
869 {
870 struct brw_compile *p = &c->func;
871 struct brw_reg src0, src1, dst;
872 GLuint mask = inst->DstReg.WriteMask;
873 int i;
874 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
875 for (i = 0 ; i < 4; i++) {
876 if (mask & (1<<i)) {
877 dst = get_dst_reg(c, inst, i);
878 src0 = get_src_reg(c, &inst->SrcReg[0], i);
879 src1 = get_src_reg(c, &inst->SrcReg[1], i);
880 brw_ADD(p, dst, src0, negate(src1));
881 }
882 }
883 brw_set_saturate(p, 0);
884 }
885
886 static void emit_mul(struct brw_wm_compile *c,
887 struct prog_instruction *inst)
888 {
889 struct brw_compile *p = &c->func;
890 struct brw_reg src0, src1, dst;
891 GLuint mask = inst->DstReg.WriteMask;
892 int i;
893 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
894 for (i = 0 ; i < 4; i++) {
895 if (mask & (1<<i)) {
896 dst = get_dst_reg(c, inst, i);
897 src0 = get_src_reg(c, &inst->SrcReg[0], i);
898 src1 = get_src_reg(c, &inst->SrcReg[1], i);
899 brw_MUL(p, dst, src0, src1);
900 }
901 }
902 brw_set_saturate(p, 0);
903 }
904
905 static void emit_frc(struct brw_wm_compile *c,
906 struct prog_instruction *inst)
907 {
908 struct brw_compile *p = &c->func;
909 struct brw_reg src0, dst;
910 GLuint mask = inst->DstReg.WriteMask;
911 int i;
912 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
913 for (i = 0 ; i < 4; i++) {
914 if (mask & (1<<i)) {
915 dst = get_dst_reg(c, inst, i);
916 src0 = get_src_reg(c, &inst->SrcReg[0], i);
917 brw_FRC(p, dst, src0);
918 }
919 }
920 if (inst->SaturateMode != SATURATE_OFF)
921 brw_set_saturate(p, 0);
922 }
923
924 static void emit_flr(struct brw_wm_compile *c,
925 struct prog_instruction *inst)
926 {
927 struct brw_compile *p = &c->func;
928 struct brw_reg src0, dst;
929 GLuint mask = inst->DstReg.WriteMask;
930 int i;
931 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
932 for (i = 0 ; i < 4; i++) {
933 if (mask & (1<<i)) {
934 dst = get_dst_reg(c, inst, i);
935 src0 = get_src_reg(c, &inst->SrcReg[0], i);
936 brw_RNDD(p, dst, src0);
937 }
938 }
939 brw_set_saturate(p, 0);
940 }
941
942 static void emit_max(struct brw_wm_compile *c,
943 struct prog_instruction *inst)
944 {
945 struct brw_compile *p = &c->func;
946 GLuint mask = inst->DstReg.WriteMask;
947 struct brw_reg src0, src1, dst;
948 int i;
949 brw_push_insn_state(p);
950 for (i = 0; i < 4; i++) {
951 if (mask & (1<<i)) {
952 dst = get_dst_reg(c, inst, i);
953 src0 = get_src_reg(c, &inst->SrcReg[0], i);
954 src1 = get_src_reg(c, &inst->SrcReg[1], i);
955 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
956 brw_MOV(p, dst, src0);
957 brw_set_saturate(p, 0);
958
959 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
960 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
961 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
962 brw_MOV(p, dst, src1);
963 brw_set_saturate(p, 0);
964 brw_set_predicate_control_flag_value(p, 0xff);
965 }
966 }
967 brw_pop_insn_state(p);
968 }
969
970 static void emit_min(struct brw_wm_compile *c,
971 struct prog_instruction *inst)
972 {
973 struct brw_compile *p = &c->func;
974 GLuint mask = inst->DstReg.WriteMask;
975 struct brw_reg src0, src1, dst;
976 int i;
977 brw_push_insn_state(p);
978 for (i = 0; i < 4; i++) {
979 if (mask & (1<<i)) {
980 dst = get_dst_reg(c, inst, i);
981 src0 = get_src_reg(c, &inst->SrcReg[0], i);
982 src1 = get_src_reg(c, &inst->SrcReg[1], i);
983 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
984 brw_MOV(p, dst, src0);
985 brw_set_saturate(p, 0);
986
987 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
988 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
989 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
990 brw_MOV(p, dst, src1);
991 brw_set_saturate(p, 0);
992 brw_set_predicate_control_flag_value(p, 0xff);
993 }
994 }
995 brw_pop_insn_state(p);
996 }
997
998 static void emit_pow(struct brw_wm_compile *c,
999 struct prog_instruction *inst)
1000 {
1001 struct brw_compile *p = &c->func;
1002 struct brw_reg dst, src0, src1;
1003 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1004 src0 = get_src_reg(c, &inst->SrcReg[0], 0);
1005 src1 = get_src_reg(c, &inst->SrcReg[1], 0);
1006
1007 brw_MOV(p, brw_message_reg(2), src0);
1008 brw_MOV(p, brw_message_reg(3), src1);
1009
1010 brw_math(p,
1011 dst,
1012 BRW_MATH_FUNCTION_POW,
1013 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1014 2,
1015 brw_null_reg(),
1016 BRW_MATH_DATA_VECTOR,
1017 BRW_MATH_PRECISION_FULL);
1018 }
1019
1020 static void emit_lrp(struct brw_wm_compile *c,
1021 struct prog_instruction *inst)
1022 {
1023 struct brw_compile *p = &c->func;
1024 GLuint mask = inst->DstReg.WriteMask;
1025 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1026 int i;
1027 int mark = mark_tmps(c);
1028 for (i = 0; i < 4; i++) {
1029 if (mask & (1<<i)) {
1030 dst = get_dst_reg(c, inst, i);
1031 src0 = get_src_reg(c, &inst->SrcReg[0], i);
1032
1033 src1 = get_src_reg(c, &inst->SrcReg[1], i);
1034
1035 if (src1.nr == dst.nr) {
1036 tmp1 = alloc_tmp(c);
1037 brw_MOV(p, tmp1, src1);
1038 } else
1039 tmp1 = src1;
1040
1041 src2 = get_src_reg(c, &inst->SrcReg[2], i);
1042 if (src2.nr == dst.nr) {
1043 tmp2 = alloc_tmp(c);
1044 brw_MOV(p, tmp2, src2);
1045 } else
1046 tmp2 = src2;
1047
1048 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1049 brw_MUL(p, brw_null_reg(), dst, tmp2);
1050 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1051 brw_MAC(p, dst, src0, tmp1);
1052 brw_set_saturate(p, 0);
1053 }
1054 release_tmps(c, mark);
1055 }
1056 }
1057
1058 /**
1059 * For GLSL shaders, this KIL will be unconditional.
1060 * It may be contained inside an IF/ENDIF structure of course.
1061 */
1062 static void emit_kil(struct brw_wm_compile *c)
1063 {
1064 struct brw_compile *p = &c->func;
1065 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1066 brw_push_insn_state(p);
1067 brw_set_mask_control(p, BRW_MASK_DISABLE);
1068 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1069 brw_AND(p, depth, c->emit_mask_reg, depth);
1070 brw_pop_insn_state(p);
1071 }
1072
1073 static void emit_mad(struct brw_wm_compile *c,
1074 struct prog_instruction *inst)
1075 {
1076 struct brw_compile *p = &c->func;
1077 GLuint mask = inst->DstReg.WriteMask;
1078 struct brw_reg dst, src0, src1, src2;
1079 int i;
1080
1081 for (i = 0; i < 4; i++) {
1082 if (mask & (1<<i)) {
1083 dst = get_dst_reg(c, inst, i);
1084 src0 = get_src_reg(c, &inst->SrcReg[0], i);
1085 src1 = get_src_reg(c, &inst->SrcReg[1], i);
1086 src2 = get_src_reg(c, &inst->SrcReg[2], i);
1087 brw_MUL(p, dst, src0, src1);
1088
1089 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1090 brw_ADD(p, dst, dst, src2);
1091 brw_set_saturate(p, 0);
1092 }
1093 }
1094 }
1095
1096 static void emit_sop(struct brw_wm_compile *c,
1097 struct prog_instruction *inst, GLuint cond)
1098 {
1099 struct brw_compile *p = &c->func;
1100 GLuint mask = inst->DstReg.WriteMask;
1101 struct brw_reg dst, src0, src1;
1102 int i;
1103
1104 for (i = 0; i < 4; i++) {
1105 if (mask & (1<<i)) {
1106 dst = get_dst_reg(c, inst, i);
1107 src0 = get_src_reg(c, &inst->SrcReg[0], i);
1108 src1 = get_src_reg(c, &inst->SrcReg[1], i);
1109 brw_push_insn_state(p);
1110 brw_CMP(p, brw_null_reg(), cond, src0, src1);
1111 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1112 brw_MOV(p, dst, brw_imm_f(0.0));
1113 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1114 brw_MOV(p, dst, brw_imm_f(1.0));
1115 brw_pop_insn_state(p);
1116 }
1117 }
1118 }
1119
1120 static void emit_slt(struct brw_wm_compile *c,
1121 struct prog_instruction *inst)
1122 {
1123 emit_sop(c, inst, BRW_CONDITIONAL_L);
1124 }
1125
1126 static void emit_sle(struct brw_wm_compile *c,
1127 struct prog_instruction *inst)
1128 {
1129 emit_sop(c, inst, BRW_CONDITIONAL_LE);
1130 }
1131
1132 static void emit_sgt(struct brw_wm_compile *c,
1133 struct prog_instruction *inst)
1134 {
1135 emit_sop(c, inst, BRW_CONDITIONAL_G);
1136 }
1137
1138 static void emit_sge(struct brw_wm_compile *c,
1139 struct prog_instruction *inst)
1140 {
1141 emit_sop(c, inst, BRW_CONDITIONAL_GE);
1142 }
1143
1144 static void emit_seq(struct brw_wm_compile *c,
1145 struct prog_instruction *inst)
1146 {
1147 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1148 }
1149
1150 static void emit_sne(struct brw_wm_compile *c,
1151 struct prog_instruction *inst)
1152 {
1153 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1154 }
1155
1156 static void emit_ddx(struct brw_wm_compile *c,
1157 struct prog_instruction *inst)
1158 {
1159 struct brw_compile *p = &c->func;
1160 GLuint mask = inst->DstReg.WriteMask;
1161 struct brw_reg interp[4];
1162 struct brw_reg dst;
1163 struct brw_reg src0, w;
1164 GLuint nr, i;
1165 src0 = get_src_reg(c, &inst->SrcReg[0], 0);
1166 w = get_src_reg(c, &inst->SrcReg[1], 3);
1167 nr = src0.nr;
1168 interp[0] = brw_vec1_grf(nr, 0);
1169 interp[1] = brw_vec1_grf(nr, 4);
1170 interp[2] = brw_vec1_grf(nr+1, 0);
1171 interp[3] = brw_vec1_grf(nr+1, 4);
1172 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1173 for(i = 0; i < 4; i++ ) {
1174 if (mask & (1<<i)) {
1175 dst = get_dst_reg(c, inst, i);
1176 brw_MOV(p, dst, interp[i]);
1177 brw_MUL(p, dst, dst, w);
1178 }
1179 }
1180 brw_set_saturate(p, 0);
1181 }
1182
1183 static void emit_ddy(struct brw_wm_compile *c,
1184 struct prog_instruction *inst)
1185 {
1186 struct brw_compile *p = &c->func;
1187 GLuint mask = inst->DstReg.WriteMask;
1188 struct brw_reg interp[4];
1189 struct brw_reg dst;
1190 struct brw_reg src0, w;
1191 GLuint nr, i;
1192
1193 src0 = get_src_reg(c, &inst->SrcReg[0], 0);
1194 nr = src0.nr;
1195 w = get_src_reg(c, &inst->SrcReg[1], 3);
1196 interp[0] = brw_vec1_grf(nr, 0);
1197 interp[1] = brw_vec1_grf(nr, 4);
1198 interp[2] = brw_vec1_grf(nr+1, 0);
1199 interp[3] = brw_vec1_grf(nr+1, 4);
1200 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1201 for(i = 0; i < 4; i++ ) {
1202 if (mask & (1<<i)) {
1203 dst = get_dst_reg(c, inst, i);
1204 brw_MOV(p, dst, suboffset(interp[i], 1));
1205 brw_MUL(p, dst, dst, w);
1206 }
1207 }
1208 brw_set_saturate(p, 0);
1209 }
1210
1211 static INLINE struct brw_reg high_words( struct brw_reg reg )
1212 {
1213 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1214 0, 8, 2 );
1215 }
1216
1217 static INLINE struct brw_reg low_words( struct brw_reg reg )
1218 {
1219 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1220 }
1221
1222 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1223 {
1224 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1225 }
1226
1227 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1228 {
1229 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1230 0, 16, 2 );
1231 }
1232
1233 /* One-, two- and three-dimensional Perlin noise, similar to the description
1234 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1235 static void noise1_sub( struct brw_wm_compile *c ) {
1236
1237 struct brw_compile *p = &c->func;
1238 struct brw_reg param,
1239 x0, x1, /* gradients at each end */
1240 t, tmp[ 2 ], /* float temporaries */
1241 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1242 int i;
1243 int mark = mark_tmps( c );
1244
1245 x0 = alloc_tmp( c );
1246 x1 = alloc_tmp( c );
1247 t = alloc_tmp( c );
1248 tmp[ 0 ] = alloc_tmp( c );
1249 tmp[ 1 ] = alloc_tmp( c );
1250 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1251 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1252 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1253 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1254 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1255
1256 param = lookup_tmp( c, mark - 2 );
1257
1258 brw_set_access_mode( p, BRW_ALIGN_1 );
1259
1260 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1261
1262 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1263 be hashed. Also compute the remainder (offset within the unit
1264 length), interleaved to reduce register dependency penalties. */
1265 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1266 brw_FRC( p, param, param );
1267 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1268 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1269 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1270
1271 /* We're now ready to perform the hashing. The two hashes are
1272 interleaved for performance. The hash function used is
1273 designed to rapidly achieve avalanche and require only 32x16
1274 bit multiplication, and 16-bit swizzles (which we get for
1275 free). We can't use immediate operands in the multiplies,
1276 because immediates are permitted only in src1 and the 16-bit
1277 factor is permitted only in src0. */
1278 for( i = 0; i < 2; i++ )
1279 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1280 for( i = 0; i < 2; i++ )
1281 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1282 high_words( itmp[ i ] ) );
1283 for( i = 0; i < 2; i++ )
1284 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1285 for( i = 0; i < 2; i++ )
1286 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1287 high_words( itmp[ i ] ) );
1288 for( i = 0; i < 2; i++ )
1289 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1290 for( i = 0; i < 2; i++ )
1291 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1292 high_words( itmp[ i ] ) );
1293
1294 /* Now we want to initialise the two gradients based on the
1295 hashes. Format conversion from signed integer to float leaves
1296 everything scaled too high by a factor of pow( 2, 31 ), but
1297 we correct for that right at the end. */
1298 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1299 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1300 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1301
1302 brw_MUL( p, x0, x0, param );
1303 brw_MUL( p, x1, x1, t );
1304
1305 /* We interpolate between the gradients using the polynomial
1306 6t^5 - 15t^4 + 10t^3 (Perlin). */
1307 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1308 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1309 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1310 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1311 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1312 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1313 pipeline */
1314 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1315 brw_MUL( p, param, tmp[ 0 ], param );
1316 brw_MUL( p, x1, x1, param );
1317 brw_ADD( p, x0, x0, x1 );
1318 /* scale by pow( 2, -30 ), to compensate for the format conversion
1319 above and an extra factor of 2 so that a single gradient covers
1320 the [-1,1] range */
1321 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1322
1323 release_tmps( c, mark );
1324 }
1325
1326 static void emit_noise1( struct brw_wm_compile *c,
1327 struct prog_instruction *inst )
1328 {
1329 struct brw_compile *p = &c->func;
1330 struct brw_reg src, param, dst;
1331 GLuint mask = inst->DstReg.WriteMask;
1332 int i;
1333 int mark = mark_tmps( c );
1334
1335 assert( mark == 0 );
1336
1337 src = get_src_reg( c, inst->SrcReg, 0 );
1338
1339 param = alloc_tmp( c );
1340
1341 brw_MOV( p, param, src );
1342
1343 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1344
1345 /* Fill in the result: */
1346 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1347 for (i = 0 ; i < 4; i++) {
1348 if (mask & (1<<i)) {
1349 dst = get_dst_reg(c, inst, i);
1350 brw_MOV( p, dst, param );
1351 }
1352 }
1353 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1354 brw_set_saturate( p, 0 );
1355
1356 release_tmps( c, mark );
1357 }
1358
1359 static void noise2_sub( struct brw_wm_compile *c ) {
1360
1361 struct brw_compile *p = &c->func;
1362 struct brw_reg param0, param1,
1363 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1364 t, tmp[ 4 ], /* float temporaries */
1365 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1366 int i;
1367 int mark = mark_tmps( c );
1368
1369 x0y0 = alloc_tmp( c );
1370 x0y1 = alloc_tmp( c );
1371 x1y0 = alloc_tmp( c );
1372 x1y1 = alloc_tmp( c );
1373 t = alloc_tmp( c );
1374 for( i = 0; i < 4; i++ ) {
1375 tmp[ i ] = alloc_tmp( c );
1376 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1377 }
1378 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1379 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1380 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1381
1382 param0 = lookup_tmp( c, mark - 3 );
1383 param1 = lookup_tmp( c, mark - 2 );
1384
1385 brw_set_access_mode( p, BRW_ALIGN_1 );
1386
1387 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1388 be hashed. Also compute the remainders (offsets within the unit
1389 square), interleaved to reduce register dependency penalties. */
1390 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1391 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1392 brw_FRC( p, param0, param0 );
1393 brw_FRC( p, param1, param1 );
1394 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1395 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1396 low_words( itmp[ 1 ] ) );
1397 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1398 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1399 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1400 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1401 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1402
1403 /* We're now ready to perform the hashing. The four hashes are
1404 interleaved for performance. The hash function used is
1405 designed to rapidly achieve avalanche and require only 32x16
1406 bit multiplication, and 16-bit swizzles (which we get for
1407 free). We can't use immediate operands in the multiplies,
1408 because immediates are permitted only in src1 and the 16-bit
1409 factor is permitted only in src0. */
1410 for( i = 0; i < 4; i++ )
1411 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1412 for( i = 0; i < 4; i++ )
1413 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1414 high_words( itmp[ i ] ) );
1415 for( i = 0; i < 4; i++ )
1416 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1417 for( i = 0; i < 4; i++ )
1418 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1419 high_words( itmp[ i ] ) );
1420 for( i = 0; i < 4; i++ )
1421 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1422 for( i = 0; i < 4; i++ )
1423 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1424 high_words( itmp[ i ] ) );
1425
1426 /* Now we want to initialise the four gradients based on the
1427 hashes. Format conversion from signed integer to float leaves
1428 everything scaled too high by a factor of pow( 2, 15 ), but
1429 we correct for that right at the end. */
1430 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1431 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1432 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1433 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1434 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1435
1436 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1437 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1438 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1439 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1440
1441 brw_MUL( p, x1y0, x1y0, t );
1442 brw_MUL( p, x1y1, x1y1, t );
1443 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1444 brw_MUL( p, x0y0, x0y0, param0 );
1445 brw_MUL( p, x0y1, x0y1, param0 );
1446
1447 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1448 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1449 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1450 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1451
1452 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1453 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1454 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1455 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1456
1457 /* We interpolate between the gradients using the polynomial
1458 6t^5 - 15t^4 + 10t^3 (Perlin). */
1459 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1460 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1461 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1462 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1463 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1464 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1465 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1466 pipeline */
1467 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1468 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1469 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1470 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1471 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1472 pipeline */
1473 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1474 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1475 brw_MUL( p, param0, tmp[ 0 ], param0 );
1476 brw_MUL( p, param1, tmp[ 1 ], param1 );
1477
1478 /* Here we interpolate in the y dimension... */
1479 brw_MUL( p, x0y1, x0y1, param1 );
1480 brw_MUL( p, x1y1, x1y1, param1 );
1481 brw_ADD( p, x0y0, x0y0, x0y1 );
1482 brw_ADD( p, x1y0, x1y0, x1y1 );
1483
1484 /* And now in x. There are horrible register dependencies here,
1485 but we have nothing else to do. */
1486 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1487 brw_MUL( p, x1y0, x1y0, param0 );
1488 brw_ADD( p, x0y0, x0y0, x1y0 );
1489
1490 /* scale by pow( 2, -15 ), as described above */
1491 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1492
1493 release_tmps( c, mark );
1494 }
1495
1496 static void emit_noise2( struct brw_wm_compile *c,
1497 struct prog_instruction *inst )
1498 {
1499 struct brw_compile *p = &c->func;
1500 struct brw_reg src0, src1, param0, param1, dst;
1501 GLuint mask = inst->DstReg.WriteMask;
1502 int i;
1503 int mark = mark_tmps( c );
1504
1505 assert( mark == 0 );
1506
1507 src0 = get_src_reg( c, inst->SrcReg, 0 );
1508 src1 = get_src_reg( c, inst->SrcReg, 1 );
1509
1510 param0 = alloc_tmp( c );
1511 param1 = alloc_tmp( c );
1512
1513 brw_MOV( p, param0, src0 );
1514 brw_MOV( p, param1, src1 );
1515
1516 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1517
1518 /* Fill in the result: */
1519 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1520 for (i = 0 ; i < 4; i++) {
1521 if (mask & (1<<i)) {
1522 dst = get_dst_reg(c, inst, i);
1523 brw_MOV( p, dst, param0 );
1524 }
1525 }
1526 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1527 brw_set_saturate( p, 0 );
1528
1529 release_tmps( c, mark );
1530 }
1531
1532 /**
1533 * The three-dimensional case is much like the one- and two- versions above,
1534 * but since the number of corners is rapidly growing we now pack 16 16-bit
1535 * hashes into each register to extract more parallelism from the EUs.
1536 */
1537 static void noise3_sub( struct brw_wm_compile *c ) {
1538
1539 struct brw_compile *p = &c->func;
1540 struct brw_reg param0, param1, param2,
1541 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1542 xi, yi, zi, /* interpolation coefficients */
1543 t, tmp[ 8 ], /* float temporaries */
1544 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1545 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1546 int i;
1547 int mark = mark_tmps( c );
1548
1549 x0y0 = alloc_tmp( c );
1550 x0y1 = alloc_tmp( c );
1551 x1y0 = alloc_tmp( c );
1552 x1y1 = alloc_tmp( c );
1553 xi = alloc_tmp( c );
1554 yi = alloc_tmp( c );
1555 zi = alloc_tmp( c );
1556 t = alloc_tmp( c );
1557 for( i = 0; i < 8; i++ ) {
1558 tmp[ i ] = alloc_tmp( c );
1559 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1560 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1561 }
1562
1563 param0 = lookup_tmp( c, mark - 4 );
1564 param1 = lookup_tmp( c, mark - 3 );
1565 param2 = lookup_tmp( c, mark - 2 );
1566
1567 brw_set_access_mode( p, BRW_ALIGN_1 );
1568
1569 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1570 be hashed. Also compute the remainders (offsets within the unit
1571 cube), interleaved to reduce register dependency penalties. */
1572 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1573 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1574 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1575 brw_FRC( p, param0, param0 );
1576 brw_FRC( p, param1, param1 );
1577 brw_FRC( p, param2, param2 );
1578 /* Since we now have only 16 bits of precision in the hash, we must
1579 be more careful about thorough mixing to maintain entropy as we
1580 squash the input vector into a small scalar. */
1581 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1582 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1583 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1584 brw_imm_uw( 0x9B93 ) );
1585 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1586 brw_imm_uw( 0xBC8F ) );
1587
1588 /* Temporarily disable the execution mask while we work with ExecSize=16
1589 channels (the mask is set for ExecSize=8 and is probably incorrect).
1590 Although this might cause execution of unwanted channels, the code
1591 writes only to temporary registers and has no side effects, so
1592 disabling the mask is harmless. */
1593 brw_push_insn_state( p );
1594 brw_set_mask_control( p, BRW_MASK_DISABLE );
1595 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1596 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1597 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1598
1599 /* We're now ready to perform the hashing. The eight hashes are
1600 interleaved for performance. The hash function used is
1601 designed to rapidly achieve avalanche and require only 16x16
1602 bit multiplication, and 8-bit swizzles (which we get for
1603 free). */
1604 for( i = 0; i < 4; i++ )
1605 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1606 for( i = 0; i < 4; i++ )
1607 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1608 odd_bytes( wtmp[ i ] ) );
1609 for( i = 0; i < 4; i++ )
1610 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1611 for( i = 0; i < 4; i++ )
1612 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1613 odd_bytes( wtmp[ i ] ) );
1614 brw_pop_insn_state( p );
1615
1616 /* Now we want to initialise the four rear gradients based on the
1617 hashes. Format conversion from signed integer to float leaves
1618 everything scaled too high by a factor of pow( 2, 15 ), but
1619 we correct for that right at the end. */
1620 /* x component */
1621 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1622 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1623 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1624 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1625 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1626
1627 brw_push_insn_state( p );
1628 brw_set_mask_control( p, BRW_MASK_DISABLE );
1629 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1630 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1631 brw_pop_insn_state( p );
1632
1633 brw_MUL( p, x1y0, x1y0, t );
1634 brw_MUL( p, x1y1, x1y1, t );
1635 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1636 brw_MUL( p, x0y0, x0y0, param0 );
1637 brw_MUL( p, x0y1, x0y1, param0 );
1638
1639 /* y component */
1640 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1641 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1642 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1643 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1644
1645 brw_push_insn_state( p );
1646 brw_set_mask_control( p, BRW_MASK_DISABLE );
1647 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1648 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1649 brw_pop_insn_state( p );
1650
1651 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1652 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1653 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1654 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1655 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1656
1657 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1658 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1659 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1660 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1661
1662 /* z component */
1663 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1664 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1665 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1666 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1667
1668 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1669 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1670 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1671 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1672
1673 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1674 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1675 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1676 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1677
1678 /* We interpolate between the gradients using the polynomial
1679 6t^5 - 15t^4 + 10t^3 (Perlin). */
1680 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1681 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1682 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1683 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1684 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1685 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1686 brw_MUL( p, xi, xi, param0 );
1687 brw_MUL( p, yi, yi, param1 );
1688 brw_MUL( p, zi, zi, param2 );
1689 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1690 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1691 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
1692 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1693 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1694 brw_MUL( p, xi, xi, param0 );
1695 brw_MUL( p, yi, yi, param1 );
1696 brw_MUL( p, zi, zi, param2 );
1697 brw_MUL( p, xi, xi, param0 );
1698 brw_MUL( p, yi, yi, param1 );
1699 brw_MUL( p, zi, zi, param2 );
1700 brw_MUL( p, xi, xi, param0 );
1701 brw_MUL( p, yi, yi, param1 );
1702 brw_MUL( p, zi, zi, param2 );
1703
1704 /* Here we interpolate in the y dimension... */
1705 brw_MUL( p, x0y1, x0y1, yi );
1706 brw_MUL( p, x1y1, x1y1, yi );
1707 brw_ADD( p, x0y0, x0y0, x0y1 );
1708 brw_ADD( p, x1y0, x1y0, x1y1 );
1709
1710 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1711 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1712 brw_MUL( p, x1y0, x1y0, xi );
1713 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1714
1715 /* Now do the same thing for the front four gradients... */
1716 /* x component */
1717 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1718 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1719 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1720 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1721
1722 brw_push_insn_state( p );
1723 brw_set_mask_control( p, BRW_MASK_DISABLE );
1724 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1725 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1726 brw_pop_insn_state( p );
1727
1728 brw_MUL( p, x1y0, x1y0, t );
1729 brw_MUL( p, x1y1, x1y1, t );
1730 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1731 brw_MUL( p, x0y0, x0y0, param0 );
1732 brw_MUL( p, x0y1, x0y1, param0 );
1733
1734 /* y component */
1735 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1736 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1737 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1738 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1739
1740 brw_push_insn_state( p );
1741 brw_set_mask_control( p, BRW_MASK_DISABLE );
1742 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1743 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1744 brw_pop_insn_state( p );
1745
1746 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1747 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1748 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
1749 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1750 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1751
1752 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1753 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1754 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1755 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1756
1757 /* z component */
1758 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1759 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1760 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1761 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1762
1763 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1764 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1765 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1766 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1767
1768 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1769 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1770 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1771 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1772
1773 /* The interpolation coefficients are still around from last time, so
1774 again interpolate in the y dimension... */
1775 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1776 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1777 brw_MUL( p, x0y1, x0y1, yi );
1778 brw_MUL( p, x1y1, x1y1, yi );
1779 brw_ADD( p, x0y0, x0y0, x0y1 );
1780 brw_ADD( p, x1y0, x1y0, x1y1 );
1781
1782 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1783 time put the front face in tmp[ 1 ] and we're nearly there... */
1784 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1785 brw_MUL( p, x1y0, x1y0, xi );
1786 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1787
1788 /* The final interpolation, in the z dimension: */
1789 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1790 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
1791 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1792
1793 /* scale by pow( 2, -15 ), as described above */
1794 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1795
1796 release_tmps( c, mark );
1797 }
1798
1799 static void emit_noise3( struct brw_wm_compile *c,
1800 struct prog_instruction *inst )
1801 {
1802 struct brw_compile *p = &c->func;
1803 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1804 GLuint mask = inst->DstReg.WriteMask;
1805 int i;
1806 int mark = mark_tmps( c );
1807
1808 assert( mark == 0 );
1809
1810 src0 = get_src_reg( c, inst->SrcReg, 0 );
1811 src1 = get_src_reg( c, inst->SrcReg, 1 );
1812 src2 = get_src_reg( c, inst->SrcReg, 2 );
1813
1814 param0 = alloc_tmp( c );
1815 param1 = alloc_tmp( c );
1816 param2 = alloc_tmp( c );
1817
1818 brw_MOV( p, param0, src0 );
1819 brw_MOV( p, param1, src1 );
1820 brw_MOV( p, param2, src2 );
1821
1822 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1823
1824 /* Fill in the result: */
1825 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1826 for (i = 0 ; i < 4; i++) {
1827 if (mask & (1<<i)) {
1828 dst = get_dst_reg(c, inst, i);
1829 brw_MOV( p, dst, param0 );
1830 }
1831 }
1832 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1833 brw_set_saturate( p, 0 );
1834
1835 release_tmps( c, mark );
1836 }
1837
1838 /**
1839 * For the four-dimensional case, the little micro-optimisation benefits
1840 * we obtain by unrolling all the loops aren't worth the massive bloat it
1841 * now causes. Instead, we loop twice around performing a similar operation
1842 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
1843 * code to glue it all together.
1844 */
1845 static void noise4_sub( struct brw_wm_compile *c )
1846 {
1847 struct brw_compile *p = &c->func;
1848 struct brw_reg param[ 4 ],
1849 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1850 w0, /* noise for the w=0 cube */
1851 floors[ 2 ], /* integer coordinates of base corner of hypercube */
1852 interp[ 4 ], /* interpolation coefficients */
1853 t, tmp[ 8 ], /* float temporaries */
1854 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1855 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1856 int i, j;
1857 int mark = mark_tmps( c );
1858 GLuint loop, origin;
1859
1860 x0y0 = alloc_tmp( c );
1861 x0y1 = alloc_tmp( c );
1862 x1y0 = alloc_tmp( c );
1863 x1y1 = alloc_tmp( c );
1864 t = alloc_tmp( c );
1865 w0 = alloc_tmp( c );
1866 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1867 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1868
1869 for( i = 0; i < 4; i++ ) {
1870 param[ i ] = lookup_tmp( c, mark - 5 + i );
1871 interp[ i ] = alloc_tmp( c );
1872 }
1873
1874 for( i = 0; i < 8; i++ ) {
1875 tmp[ i ] = alloc_tmp( c );
1876 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1877 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1878 }
1879
1880 brw_set_access_mode( p, BRW_ALIGN_1 );
1881
1882 /* We only want 16 bits of precision from the integral part of each
1883 co-ordinate, but unfortunately the RNDD semantics would saturate
1884 at 16 bits if we performed the operation directly to a 16-bit
1885 destination. Therefore, we round to 32-bit temporaries where
1886 appropriate, and then store only the lower 16 bits. */
1887 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
1888 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
1889 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
1890 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
1891 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
1892 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
1893
1894 /* Modify the flag register here, because the side effect is useful
1895 later (see below). We know for certain that all flags will be
1896 cleared, since the FRC instruction cannot possibly generate
1897 negative results. Even for exceptional inputs (infinities, denormals,
1898 NaNs), the architecture guarantees that the L conditional is false. */
1899 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
1900 brw_FRC( p, param[ 0 ], param[ 0 ] );
1901 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1902 for( i = 1; i < 4; i++ )
1903 brw_FRC( p, param[ i ], param[ i ] );
1904
1905 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
1906 of all. */
1907 for( i = 0; i < 4; i++ )
1908 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
1909 for( i = 0; i < 4; i++ )
1910 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
1911 for( i = 0; i < 4; i++ )
1912 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1913 for( i = 0; i < 4; i++ )
1914 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
1915 for( j = 0; j < 3; j++ )
1916 for( i = 0; i < 4; i++ )
1917 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1918
1919 /* Mark the current address, as it will be a jump destination. The
1920 following code will be executed twice: first, with the flag
1921 register clear indicating the w=0 case, and second with flags
1922 set for w=1. */
1923 loop = p->nr_insn;
1924
1925 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1926 be hashed. Since we have only 16 bits of precision in the hash, we
1927 must be careful about thorough mixing to maintain entropy as we
1928 squash the input vector into a small scalar. */
1929 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
1930 brw_imm_uw( 0xBC8F ) );
1931 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
1932 brw_imm_uw( 0xD0BD ) );
1933 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
1934 brw_imm_uw( 0x9B93 ) );
1935 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
1936 brw_imm_uw( 0xA359 ) );
1937 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1938 brw_imm_uw( 0xBC8F ) );
1939
1940 /* Temporarily disable the execution mask while we work with ExecSize=16
1941 channels (the mask is set for ExecSize=8 and is probably incorrect).
1942 Although this might cause execution of unwanted channels, the code
1943 writes only to temporary registers and has no side effects, so
1944 disabling the mask is harmless. */
1945 brw_push_insn_state( p );
1946 brw_set_mask_control( p, BRW_MASK_DISABLE );
1947 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1948 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1949 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1950
1951 /* We're now ready to perform the hashing. The eight hashes are
1952 interleaved for performance. The hash function used is
1953 designed to rapidly achieve avalanche and require only 16x16
1954 bit multiplication, and 8-bit swizzles (which we get for
1955 free). */
1956 for( i = 0; i < 4; i++ )
1957 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1958 for( i = 0; i < 4; i++ )
1959 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1960 odd_bytes( wtmp[ i ] ) );
1961 for( i = 0; i < 4; i++ )
1962 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1963 for( i = 0; i < 4; i++ )
1964 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1965 odd_bytes( wtmp[ i ] ) );
1966 brw_pop_insn_state( p );
1967
1968 /* Now we want to initialise the four rear gradients based on the
1969 hashes. Format conversion from signed integer to float leaves
1970 everything scaled too high by a factor of pow( 2, 15 ), but
1971 we correct for that right at the end. */
1972 /* x component */
1973 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1974 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1975 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1976 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1977 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1978
1979 brw_push_insn_state( p );
1980 brw_set_mask_control( p, BRW_MASK_DISABLE );
1981 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1982 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1983 brw_pop_insn_state( p );
1984
1985 brw_MUL( p, x1y0, x1y0, t );
1986 brw_MUL( p, x1y1, x1y1, t );
1987 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1988 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1989 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1990
1991 /* y component */
1992 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1993 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1994 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1995 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1996
1997 brw_push_insn_state( p );
1998 brw_set_mask_control( p, BRW_MASK_DISABLE );
1999 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2000 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2001 brw_pop_insn_state( p );
2002
2003 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2004 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2005 /* prepare t for the w component (used below): w the first time through
2006 the loop; w - 1 the second time) */
2007 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2008 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2009 p->current->header.predicate_inverse = 1;
2010 brw_MOV( p, t, param[ 3 ] );
2011 p->current->header.predicate_inverse = 0;
2012 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2013 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2014 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2015
2016 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2017 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2018 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2019 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2020
2021 /* z component */
2022 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2023 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2024 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2025 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2026
2027 brw_push_insn_state( p );
2028 brw_set_mask_control( p, BRW_MASK_DISABLE );
2029 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2030 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2031 brw_pop_insn_state( p );
2032
2033 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2034 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2035 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2036 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2037
2038 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2039 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2040 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2041 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2042
2043 /* w component */
2044 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2045 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2046 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2047 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2048
2049 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2050 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2051 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2052 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2053 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2054
2055 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2056 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2057 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2058 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2059
2060 /* Here we interpolate in the y dimension... */
2061 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2062 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2063 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2064 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2065 brw_ADD( p, x0y0, x0y0, x0y1 );
2066 brw_ADD( p, x1y0, x1y0, x1y1 );
2067
2068 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2069 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2070 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2071 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2072
2073 /* Now do the same thing for the front four gradients... */
2074 /* x component */
2075 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2076 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2077 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2078 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2079
2080 brw_push_insn_state( p );
2081 brw_set_mask_control( p, BRW_MASK_DISABLE );
2082 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2083 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2084 brw_pop_insn_state( p );
2085
2086 brw_MUL( p, x1y0, x1y0, t );
2087 brw_MUL( p, x1y1, x1y1, t );
2088 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2089 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2090 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2091
2092 /* y component */
2093 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2094 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2095 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2096 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2097
2098 brw_push_insn_state( p );
2099 brw_set_mask_control( p, BRW_MASK_DISABLE );
2100 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2101 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2102 brw_pop_insn_state( p );
2103
2104 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2105 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2106 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2107 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2108 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2109
2110 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2111 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2112 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2113 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2114
2115 /* z component */
2116 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2117 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2118 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2119 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2120
2121 brw_push_insn_state( p );
2122 brw_set_mask_control( p, BRW_MASK_DISABLE );
2123 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2124 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2125 brw_pop_insn_state( p );
2126
2127 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2128 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2129 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2130 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2131 /* prepare t for the w component (used below): w the first time through
2132 the loop; w - 1 the second time) */
2133 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2134 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2135 p->current->header.predicate_inverse = 1;
2136 brw_MOV( p, t, param[ 3 ] );
2137 p->current->header.predicate_inverse = 0;
2138 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2139
2140 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2141 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2142 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2143 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2144
2145 /* w component */
2146 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2147 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2148 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2149 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2150
2151 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2152 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2153 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2154 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2155
2156 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2157 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2158 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2159 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2160
2161 /* Interpolate in the y dimension: */
2162 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2163 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2164 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2165 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2166 brw_ADD( p, x0y0, x0y0, x0y1 );
2167 brw_ADD( p, x1y0, x1y0, x1y1 );
2168
2169 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2170 time put the front face in tmp[ 1 ] and we're nearly there... */
2171 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2172 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2173 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2174
2175 /* Another interpolation, in the z dimension: */
2176 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2177 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2178 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2179
2180 /* Exit the loop if we've computed both cubes... */
2181 origin = p->nr_insn;
2182 brw_push_insn_state( p );
2183 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2184 brw_set_mask_control( p, BRW_MASK_DISABLE );
2185 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2186 brw_pop_insn_state( p );
2187
2188 /* Save the result for the w=0 case, and increment the w coordinate: */
2189 brw_MOV( p, w0, tmp[ 0 ] );
2190 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2191 brw_imm_uw( 1 ) );
2192
2193 /* Loop around for the other cube. Explicitly set the flag register
2194 (unfortunately we must spend an extra instruction to do this: we
2195 can't rely on a side effect of the previous MOV or ADD because
2196 conditional modifiers which are normally true might be false in
2197 exceptional circumstances, e.g. given a NaN input; the add to
2198 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2199 brw_push_insn_state( p );
2200 brw_set_mask_control( p, BRW_MASK_DISABLE );
2201 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2202 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2203 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2204 brw_pop_insn_state( p );
2205
2206 /* Patch the previous conditional branch now that we know the
2207 destination address. */
2208 brw_set_src1( p->store + origin,
2209 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2210
2211 /* The very last interpolation. */
2212 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2213 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2214 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2215
2216 /* scale by pow( 2, -15 ), as described above */
2217 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2218
2219 release_tmps( c, mark );
2220 }
2221
2222 static void emit_noise4( struct brw_wm_compile *c,
2223 struct prog_instruction *inst )
2224 {
2225 struct brw_compile *p = &c->func;
2226 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2227 GLuint mask = inst->DstReg.WriteMask;
2228 int i;
2229 int mark = mark_tmps( c );
2230
2231 assert( mark == 0 );
2232
2233 src0 = get_src_reg( c, inst->SrcReg, 0 );
2234 src1 = get_src_reg( c, inst->SrcReg, 1 );
2235 src2 = get_src_reg( c, inst->SrcReg, 2 );
2236 src3 = get_src_reg( c, inst->SrcReg, 3 );
2237
2238 param0 = alloc_tmp( c );
2239 param1 = alloc_tmp( c );
2240 param2 = alloc_tmp( c );
2241 param3 = alloc_tmp( c );
2242
2243 brw_MOV( p, param0, src0 );
2244 brw_MOV( p, param1, src1 );
2245 brw_MOV( p, param2, src2 );
2246 brw_MOV( p, param3, src3 );
2247
2248 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2249
2250 /* Fill in the result: */
2251 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2252 for (i = 0 ; i < 4; i++) {
2253 if (mask & (1<<i)) {
2254 dst = get_dst_reg(c, inst, i);
2255 brw_MOV( p, dst, param0 );
2256 }
2257 }
2258 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2259 brw_set_saturate( p, 0 );
2260
2261 release_tmps( c, mark );
2262 }
2263
2264 static void emit_wpos_xy(struct brw_wm_compile *c,
2265 struct prog_instruction *inst)
2266 {
2267 struct brw_compile *p = &c->func;
2268 GLuint mask = inst->DstReg.WriteMask;
2269 struct brw_reg src0[2], dst[2];
2270
2271 dst[0] = get_dst_reg(c, inst, 0);
2272 dst[1] = get_dst_reg(c, inst, 1);
2273
2274 src0[0] = get_src_reg(c, &inst->SrcReg[0], 0);
2275 src0[1] = get_src_reg(c, &inst->SrcReg[0], 1);
2276
2277 /* Calculate the pixel offset from window bottom left into destination
2278 * X and Y channels.
2279 */
2280 if (mask & WRITEMASK_X) {
2281 /* X' = X - origin_x */
2282 brw_ADD(p,
2283 dst[0],
2284 retype(src0[0], BRW_REGISTER_TYPE_W),
2285 brw_imm_d(0 - c->key.origin_x));
2286 }
2287
2288 if (mask & WRITEMASK_Y) {
2289 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2290 brw_ADD(p,
2291 dst[1],
2292 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2293 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2294 }
2295 }
2296
2297 /* TODO
2298 BIAS on SIMD8 not working yet...
2299 */
2300 static void emit_txb(struct brw_wm_compile *c,
2301 struct prog_instruction *inst)
2302 {
2303 struct brw_compile *p = &c->func;
2304 struct brw_reg dst[4], src[4], payload_reg;
2305 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2306
2307 GLuint i;
2308 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2309 for (i = 0; i < 4; i++)
2310 dst[i] = get_dst_reg(c, inst, i);
2311 for (i = 0; i < 4; i++)
2312 src[i] = get_src_reg(c, &inst->SrcReg[0], i);
2313
2314 switch (inst->TexSrcTarget) {
2315 case TEXTURE_1D_INDEX:
2316 brw_MOV(p, brw_message_reg(2), src[0]);
2317 brw_MOV(p, brw_message_reg(3), brw_imm_f(0));
2318 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2319 break;
2320 case TEXTURE_2D_INDEX:
2321 case TEXTURE_RECT_INDEX:
2322 brw_MOV(p, brw_message_reg(2), src[0]);
2323 brw_MOV(p, brw_message_reg(3), src[1]);
2324 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2325 break;
2326 default:
2327 brw_MOV(p, brw_message_reg(2), src[0]);
2328 brw_MOV(p, brw_message_reg(3), src[1]);
2329 brw_MOV(p, brw_message_reg(4), src[2]);
2330 break;
2331 }
2332 brw_MOV(p, brw_message_reg(5), src[3]);
2333 brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
2334 brw_SAMPLE(p,
2335 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
2336 1,
2337 retype(payload_reg, BRW_REGISTER_TYPE_UW),
2338 unit + MAX_DRAW_BUFFERS, /* surface */
2339 unit, /* sampler */
2340 inst->DstReg.WriteMask,
2341 BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
2342 4,
2343 4,
2344 0);
2345 }
2346
2347 static void emit_tex(struct brw_wm_compile *c,
2348 struct prog_instruction *inst)
2349 {
2350 struct brw_compile *p = &c->func;
2351 struct brw_reg dst[4], src[4], payload_reg;
2352 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2353
2354 GLuint msg_len;
2355 GLuint i, nr;
2356 GLuint emit;
2357 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2358
2359 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2360
2361 for (i = 0; i < 4; i++)
2362 dst[i] = get_dst_reg(c, inst, i);
2363 for (i = 0; i < 4; i++)
2364 src[i] = get_src_reg(c, &inst->SrcReg[0], i);
2365
2366
2367 switch (inst->TexSrcTarget) {
2368 case TEXTURE_1D_INDEX:
2369 emit = WRITEMASK_X;
2370 nr = 1;
2371 break;
2372 case TEXTURE_2D_INDEX:
2373 case TEXTURE_RECT_INDEX:
2374 emit = WRITEMASK_XY;
2375 nr = 2;
2376 break;
2377 default:
2378 emit = WRITEMASK_XYZ;
2379 nr = 3;
2380 break;
2381 }
2382 msg_len = 1;
2383
2384 for (i = 0; i < nr; i++) {
2385 static const GLuint swz[4] = {0,1,2,2};
2386 if (emit & (1<<i))
2387 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2388 else
2389 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2390 msg_len += 1;
2391 }
2392
2393 if (shadow) {
2394 brw_MOV(p, brw_message_reg(5), brw_imm_f(0));
2395 brw_MOV(p, brw_message_reg(6), src[2]);
2396 }
2397
2398 brw_SAMPLE(p,
2399 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
2400 1,
2401 retype(payload_reg, BRW_REGISTER_TYPE_UW),
2402 unit + MAX_DRAW_BUFFERS, /* surface */
2403 unit, /* sampler */
2404 inst->DstReg.WriteMask,
2405 BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
2406 4,
2407 shadow ? 6 : 4,
2408 0);
2409
2410 if (shadow)
2411 brw_MOV(p, dst[3], brw_imm_f(1.0));
2412 }
2413
2414 /**
2415 * Resolve subroutine calls after code emit is done.
2416 */
2417 static void post_wm_emit( struct brw_wm_compile *c )
2418 {
2419 brw_resolve_cals(&c->func);
2420 }
2421
2422 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2423 {
2424 #define MAX_IFSN 32
2425 #define MAX_LOOP_DEPTH 32
2426 struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2427 struct brw_instruction *inst0, *inst1;
2428 int i, if_insn = 0, loop_insn = 0;
2429 struct brw_compile *p = &c->func;
2430 struct brw_indirect stack_index = brw_indirect(0, 0);
2431
2432 c->reg_index = 0;
2433 prealloc_reg(c);
2434 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2435 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2436
2437 for (i = 0; i < c->nr_fp_insns; i++) {
2438 struct prog_instruction *inst = &c->prog_instructions[i];
2439
2440 if (inst->CondUpdate)
2441 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2442 else
2443 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2444
2445 switch (inst->Opcode) {
2446 case WM_PIXELXY:
2447 emit_pixel_xy(c, inst);
2448 break;
2449 case WM_DELTAXY:
2450 emit_delta_xy(c, inst);
2451 break;
2452 case WM_PIXELW:
2453 emit_pixel_w(c, inst);
2454 break;
2455 case WM_LINTERP:
2456 emit_linterp(c, inst);
2457 break;
2458 case WM_PINTERP:
2459 emit_pinterp(c, inst);
2460 break;
2461 case WM_CINTERP:
2462 emit_cinterp(c, inst);
2463 break;
2464 case WM_WPOSXY:
2465 emit_wpos_xy(c, inst);
2466 break;
2467 case WM_FB_WRITE:
2468 emit_fb_write(c, inst);
2469 break;
2470 case WM_FRONTFACING:
2471 emit_frontfacing(c, inst);
2472 break;
2473 case OPCODE_ABS:
2474 emit_abs(c, inst);
2475 break;
2476 case OPCODE_ADD:
2477 emit_add(c, inst);
2478 break;
2479 case OPCODE_SUB:
2480 emit_sub(c, inst);
2481 break;
2482 case OPCODE_FRC:
2483 emit_frc(c, inst);
2484 break;
2485 case OPCODE_FLR:
2486 emit_flr(c, inst);
2487 break;
2488 case OPCODE_LRP:
2489 emit_lrp(c, inst);
2490 break;
2491 case OPCODE_TRUNC:
2492 emit_trunc(c, inst);
2493 break;
2494 case OPCODE_MOV:
2495 emit_mov(c, inst);
2496 break;
2497 case OPCODE_DP3:
2498 emit_dp3(c, inst);
2499 break;
2500 case OPCODE_DP4:
2501 emit_dp4(c, inst);
2502 break;
2503 case OPCODE_XPD:
2504 emit_xpd(c, inst);
2505 break;
2506 case OPCODE_DPH:
2507 emit_dph(c, inst);
2508 break;
2509 case OPCODE_RCP:
2510 emit_rcp(c, inst);
2511 break;
2512 case OPCODE_RSQ:
2513 emit_rsq(c, inst);
2514 break;
2515 case OPCODE_SIN:
2516 emit_sin(c, inst);
2517 break;
2518 case OPCODE_COS:
2519 emit_cos(c, inst);
2520 break;
2521 case OPCODE_EX2:
2522 emit_ex2(c, inst);
2523 break;
2524 case OPCODE_LG2:
2525 emit_lg2(c, inst);
2526 break;
2527 case OPCODE_MAX:
2528 emit_max(c, inst);
2529 break;
2530 case OPCODE_MIN:
2531 emit_min(c, inst);
2532 break;
2533 case OPCODE_DDX:
2534 emit_ddx(c, inst);
2535 break;
2536 case OPCODE_DDY:
2537 emit_ddy(c, inst);
2538 break;
2539 case OPCODE_SLT:
2540 emit_slt(c, inst);
2541 break;
2542 case OPCODE_SLE:
2543 emit_sle(c, inst);
2544 break;
2545 case OPCODE_SGT:
2546 emit_sgt(c, inst);
2547 break;
2548 case OPCODE_SGE:
2549 emit_sge(c, inst);
2550 break;
2551 case OPCODE_SEQ:
2552 emit_seq(c, inst);
2553 break;
2554 case OPCODE_SNE:
2555 emit_sne(c, inst);
2556 break;
2557 case OPCODE_MUL:
2558 emit_mul(c, inst);
2559 break;
2560 case OPCODE_POW:
2561 emit_pow(c, inst);
2562 break;
2563 case OPCODE_MAD:
2564 emit_mad(c, inst);
2565 break;
2566 case OPCODE_NOISE1:
2567 emit_noise1(c, inst);
2568 break;
2569 case OPCODE_NOISE2:
2570 emit_noise2(c, inst);
2571 break;
2572 case OPCODE_NOISE3:
2573 emit_noise3(c, inst);
2574 break;
2575 case OPCODE_NOISE4:
2576 emit_noise4(c, inst);
2577 break;
2578 case OPCODE_TEX:
2579 emit_tex(c, inst);
2580 break;
2581 case OPCODE_TXB:
2582 emit_txb(c, inst);
2583 break;
2584 case OPCODE_KIL_NV:
2585 emit_kil(c);
2586 break;
2587 case OPCODE_IF:
2588 assert(if_insn < MAX_IFSN);
2589 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2590 break;
2591 case OPCODE_ELSE:
2592 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
2593 break;
2594 case OPCODE_ENDIF:
2595 assert(if_insn > 0);
2596 brw_ENDIF(p, if_inst[--if_insn]);
2597 break;
2598 case OPCODE_BGNSUB:
2599 brw_save_label(p, inst->Comment, p->nr_insn);
2600 break;
2601 case OPCODE_ENDSUB:
2602 /* no-op */
2603 break;
2604 case OPCODE_CAL:
2605 brw_push_insn_state(p);
2606 brw_set_mask_control(p, BRW_MASK_DISABLE);
2607 brw_set_access_mode(p, BRW_ALIGN_1);
2608 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2609 brw_set_access_mode(p, BRW_ALIGN_16);
2610 brw_ADD(p, get_addr_reg(stack_index),
2611 get_addr_reg(stack_index), brw_imm_d(4));
2612 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2613 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2614 brw_pop_insn_state(p);
2615 break;
2616
2617 case OPCODE_RET:
2618 brw_push_insn_state(p);
2619 brw_set_mask_control(p, BRW_MASK_DISABLE);
2620 brw_ADD(p, get_addr_reg(stack_index),
2621 get_addr_reg(stack_index), brw_imm_d(-4));
2622 brw_set_access_mode(p, BRW_ALIGN_1);
2623 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2624 brw_set_access_mode(p, BRW_ALIGN_16);
2625 brw_pop_insn_state(p);
2626
2627 break;
2628 case OPCODE_BGNLOOP:
2629 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2630 break;
2631 case OPCODE_BRK:
2632 brw_BREAK(p);
2633 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2634 break;
2635 case OPCODE_CONT:
2636 brw_CONT(p);
2637 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2638 break;
2639 case OPCODE_ENDLOOP:
2640 loop_insn--;
2641 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2642 /* patch all the BREAK instructions from
2643 last BEGINLOOP */
2644 while (inst0 > loop_inst[loop_insn]) {
2645 inst0--;
2646 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2647 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2648 inst0->bits3.if_else.pop_count = 0;
2649 } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2650 inst0->bits3.if_else.jump_count = inst1 - inst0;
2651 inst0->bits3.if_else.pop_count = 0;
2652 }
2653 }
2654 break;
2655 default:
2656 _mesa_printf("unsupported IR in fragment shader %d\n",
2657 inst->Opcode);
2658 }
2659 if (inst->CondUpdate)
2660 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2661 else
2662 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2663 }
2664 post_wm_emit(c);
2665
2666 if (c->reg_index >= BRW_WM_MAX_GRF) {
2667 _mesa_problem(NULL, "Ran out of registers in brw_wm_emit_glsl()");
2668 /* XXX we need to do some proper error recovery here */
2669 }
2670 }
2671
2672
2673 /**
2674 * Do GPU code generation for shaders that use GLSL features such as
2675 * flow control. Other shaders will be compiled with the
2676 */
2677 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2678 {
2679 if (INTEL_DEBUG & DEBUG_WM) {
2680 _mesa_printf("brw_wm_glsl_emit:\n");
2681 }
2682
2683 /* initial instruction translation/simplification */
2684 brw_wm_pass_fp(c);
2685
2686 /* actual code generation */
2687 brw_wm_emit_glsl(brw, c);
2688
2689 if (INTEL_DEBUG & DEBUG_WM) {
2690 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2691 }
2692
2693 c->prog_data.total_grf = c->reg_index;
2694 c->prog_data.total_scratch = 0;
2695 }