i965: Clean up double initialization of dst_flags from a rebase resolve.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/** IDs of the emit-once shared subroutines (index into c->subroutines[]). */
enum _subroutine {
    SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
};
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 for (i = 0; i < fp->Base.NumInstructions; i++) {
27 const struct prog_instruction *inst = &fp->Base.Instructions[i];
28 switch (inst->Opcode) {
29 case OPCODE_ARL:
30 case OPCODE_IF:
31 case OPCODE_ENDIF:
32 case OPCODE_CAL:
33 case OPCODE_BRK:
34 case OPCODE_RET:
35 case OPCODE_NOISE1:
36 case OPCODE_NOISE2:
37 case OPCODE_NOISE3:
38 case OPCODE_NOISE4:
39 case OPCODE_BGNLOOP:
40 return GL_TRUE;
41 default:
42 break;
43 }
44 }
45 return GL_FALSE;
46 }
47
48
49
50 static void
51 reclaim_temps(struct brw_wm_compile *c);
52
53
/** Mark GRF register as used (reserved; won't be handed out by alloc_grf). */
static void
prealloc_grf(struct brw_wm_compile *c, int r)
{
    c->used_grf[r] = GL_TRUE;
}
60
61
62 /** Mark given GRF register as not in use. */
63 static void
64 release_grf(struct brw_wm_compile *c, int r)
65 {
66 /*assert(c->used_grf[r]);*/
67 c->used_grf[r] = GL_FALSE;
68 c->first_free_grf = MIN2(c->first_free_grf, r);
69 }
70
71
72 /** Return index of a free GRF, mark it as used. */
73 static int
74 alloc_grf(struct brw_wm_compile *c)
75 {
76 GLuint r;
77 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
78 if (!c->used_grf[r]) {
79 c->used_grf[r] = GL_TRUE;
80 c->first_free_grf = r + 1; /* a guess */
81 return r;
82 }
83 }
84
85 /* no free temps, try to reclaim some */
86 reclaim_temps(c);
87 c->first_free_grf = 0;
88
89 /* try alloc again */
90 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
91 if (!c->used_grf[r]) {
92 c->used_grf[r] = GL_TRUE;
93 c->first_free_grf = r + 1; /* a guess */
94 return r;
95 }
96 }
97
98 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
99 assert(c->used_grf[r]);
100 }
101
102 /* really, no free GRF regs found */
103 if (!c->out_of_regs) {
104 /* print warning once per compilation */
105 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
106 c->out_of_regs = GL_TRUE;
107 }
108
109 return -1;
110 }
111
112
113 /** Return number of GRF registers used */
114 static int
115 num_grf_used(const struct brw_wm_compile *c)
116 {
117 int r;
118 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
119 if (c->used_grf[r])
120 return r + 1;
121 return 0;
122 }
123
124
125
126 /**
127 * Record the mapping of a Mesa register to a hardware register.
128 */
129 static void set_reg(struct brw_wm_compile *c, int file, int index,
130 int component, struct brw_reg reg)
131 {
132 c->wm_regs[file][index][component].reg = reg;
133 c->wm_regs[file][index][component].inited = GL_TRUE;
134 }
135
/**
 * Allocate a temporary GRF.  Temps are handed out stack-style from the
 * tmp_regs[] pool (see mark_tmps()/release_tmps()); the pool grows on
 * demand via alloc_grf().
 */
static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
{
    struct brw_reg reg;

    /* if we need to allocate another temp, grow the tmp_regs[] array */
    if (c->tmp_index == c->tmp_max) {
       int r = alloc_grf(c);
       if (r < 0) {
          /*printf("Out of temps in %s\n", __FUNCTION__);*/
          /* out of registers: fall back to an arbitrary reg rather than crash */
          r = 50; /* XXX random register! */
       }
       c->tmp_regs[ c->tmp_max++ ] = r;
    }

    /* form the GRF register */
    reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
    /*printf("alloc_temp %d\n", reg.nr);*/
    assert(reg.nr < BRW_WM_MAX_GRF);
    return reg;

}
157
158 /**
159 * Save current temp register info.
160 * There must be a matching call to release_tmps().
161 */
162 static int mark_tmps(struct brw_wm_compile *c)
163 {
164 return c->tmp_index;
165 }
166
/** Return the GRF vec8 register for the temp at the given stack position. */
static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
{
    return brw_vec8_grf( c->tmp_regs[ index ], 0 );
}
171
/** Release all temps allocated since the matching mark_tmps() call. */
static void release_tmps(struct brw_wm_compile *c, int mark)
{
    c->tmp_index = mark;
}
176
177 /**
178 * Convert Mesa src register to brw register.
179 *
180 * Since we're running in SOA mode each Mesa register corresponds to four
181 * hardware registers. We allocate the hardware registers as needed here.
182 *
183 * \param file register file, one of PROGRAM_x
184 * \param index register number
185 * \param component src component (X=0, Y=1, Z=2, W=3)
186 * \param nr not used?!?
187 * \param neg negate value?
188 * \param abs take absolute value?
189 */
190 static struct brw_reg
191 get_reg(struct brw_wm_compile *c, int file, int index, int component,
192 int nr, GLuint neg, GLuint abs)
193 {
194 struct brw_reg reg;
195 switch (file) {
196 case PROGRAM_STATE_VAR:
197 case PROGRAM_CONSTANT:
198 case PROGRAM_UNIFORM:
199 file = PROGRAM_STATE_VAR;
200 break;
201 case PROGRAM_UNDEFINED:
202 return brw_null_reg();
203 case PROGRAM_TEMPORARY:
204 case PROGRAM_INPUT:
205 case PROGRAM_OUTPUT:
206 case PROGRAM_PAYLOAD:
207 break;
208 default:
209 _mesa_problem(NULL, "Unexpected file in get_reg()");
210 return brw_null_reg();
211 }
212
213 assert(index < 256);
214 assert(component < 4);
215
216 /* see if we've already allocated a HW register for this Mesa register */
217 if (c->wm_regs[file][index][component].inited) {
218 /* yes, re-use */
219 reg = c->wm_regs[file][index][component].reg;
220 }
221 else {
222 /* no, allocate new register */
223 int grf = alloc_grf(c);
224 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
225 if (grf < 0) {
226 /* totally out of temps */
227 grf = 51; /* XXX random register! */
228 }
229
230 reg = brw_vec8_grf(grf, 0);
231 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
232
233 set_reg(c, file, index, component, reg);
234 }
235
236 if (neg & (1 << component)) {
237 reg = negate(reg);
238 }
239 if (abs)
240 reg = brw_abs(reg);
241 return reg;
242 }
243
244
245
246 /**
247 * This is called if we run out of GRF registers. Examine the live intervals
248 * of temp regs in the program and free those which won't be used again.
249 */
250 static void
251 reclaim_temps(struct brw_wm_compile *c)
252 {
253 GLint intBegin[MAX_PROGRAM_TEMPS];
254 GLint intEnd[MAX_PROGRAM_TEMPS];
255 int index;
256
257 /*printf("Reclaim temps:\n");*/
258
259 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
260 intBegin, intEnd);
261
262 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
263 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
264 /* program temp[i] can be freed */
265 int component;
266 /*printf(" temp[%d] is dead\n", index);*/
267 for (component = 0; component < 4; component++) {
268 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
269 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
270 release_grf(c, r);
271 /*
272 printf(" Reclaim temp %d, reg %d at inst %d\n",
273 index, r, c->cur_inst);
274 */
275 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
276 }
277 }
278 }
279 }
280 }
281
282
283
284
285 /**
286 * Preallocate registers. This sets up the Mesa to hardware register
287 * mapping for certain registers, such as constants (uniforms/state vars)
288 * and shader inputs.
289 */
290 static void prealloc_reg(struct brw_wm_compile *c)
291 {
292 int i, j;
293 struct brw_reg reg;
294 int urb_read_length = 0;
295 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
296 GLuint reg_index = 0;
297
298 memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
299 c->first_free_grf = 0;
300
301 for (i = 0; i < 4; i++) {
302 if (i < c->key.nr_depth_regs)
303 reg = brw_vec8_grf(i * 2, 0);
304 else
305 reg = brw_vec8_grf(0, 0);
306 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
307 }
308 reg_index += 2 * c->key.nr_depth_regs;
309
310 /* constants */
311 {
312 const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
313 const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
314
315 /* use a real constant buffer, or just use a section of the GRF? */
316 /* XXX this heuristic may need adjustment... */
317 if ((nr_params + nr_temps) * 4 + reg_index > 80)
318 c->fp->use_const_buffer = GL_TRUE;
319 else
320 c->fp->use_const_buffer = GL_FALSE;
321 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
322
323 if (c->fp->use_const_buffer) {
324 /* We'll use a real constant buffer and fetch constants from
325 * it with a dataport read message.
326 */
327
328 /* number of float constants in CURBE */
329 c->prog_data.nr_params = 0;
330 }
331 else {
332 const struct gl_program_parameter_list *plist =
333 c->fp->program.Base.Parameters;
334 int index = 0;
335
336 /* number of float constants in CURBE */
337 c->prog_data.nr_params = 4 * nr_params;
338
339 /* loop over program constants (float[4]) */
340 for (i = 0; i < nr_params; i++) {
341 /* loop over XYZW channels */
342 for (j = 0; j < 4; j++, index++) {
343 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
344 /* Save pointer to parameter/constant value.
345 * Constants will be copied in prepare_constant_buffer()
346 */
347 c->prog_data.param[index] = &plist->ParameterValues[i][j];
348 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
349 }
350 }
351 /* number of constant regs used (each reg is float[8]) */
352 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
353 reg_index += c->nr_creg;
354 }
355 }
356
357 /* fragment shader inputs */
358 for (i = 0; i < VERT_RESULT_MAX; i++) {
359 int fp_input;
360
361 if (i >= VERT_RESULT_VAR0)
362 fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
363 else if (i <= VERT_RESULT_TEX7)
364 fp_input = i;
365 else
366 fp_input = -1;
367
368 if (fp_input >= 0 && inputs & (1 << fp_input)) {
369 urb_read_length = reg_index;
370 reg = brw_vec8_grf(reg_index, 0);
371 for (j = 0; j < 4; j++)
372 set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
373 }
374 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
375 reg_index += 2;
376 }
377 }
378
379 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
380 c->prog_data.urb_read_length = urb_read_length;
381 c->prog_data.curb_read_length = c->nr_creg;
382 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
383 reg_index++;
384 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
385 reg_index += 2;
386
387 /* mark GRF regs [0..reg_index-1] as in-use */
388 for (i = 0; i < reg_index; i++)
389 prealloc_grf(c, i);
390
391 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
392 prealloc_grf(c, 126);
393 prealloc_grf(c, 127);
394
395 for (i = 0; i < c->nr_fp_insns; i++) {
396 const struct prog_instruction *inst = &c->prog_instructions[i];
397 struct brw_reg dst[4];
398
399 switch (inst->Opcode) {
400 case OPCODE_TEX:
401 case OPCODE_TXB:
402 /* Allocate the channels of texture results contiguously,
403 * since they are written out that way by the sampler unit.
404 */
405 for (j = 0; j < 4; j++) {
406 dst[j] = get_dst_reg(c, inst, j);
407 if (j != 0)
408 assert(dst[j].nr == dst[j - 1].nr + 1);
409 }
410 break;
411 default:
412 break;
413 }
414 }
415
416 /* An instruction may reference up to three constants.
417 * They'll be found in these registers.
418 * XXX alloc these on demand!
419 */
420 if (c->fp->use_const_buffer) {
421 for (i = 0; i < 3; i++) {
422 c->current_const[i].index = -1;
423 c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
424 }
425 }
426 #if 0
427 printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
428 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
429 #endif
430 }
431
432
433 /**
434 * Check if any of the instruction's src registers are constants, uniforms,
435 * or statevars. If so, fetch any constants that we don't already have in
436 * the three GRF slots.
437 */
438 static void fetch_constants(struct brw_wm_compile *c,
439 const struct prog_instruction *inst)
440 {
441 struct brw_compile *p = &c->func;
442 GLuint i;
443
444 /* loop over instruction src regs */
445 for (i = 0; i < 3; i++) {
446 const struct prog_src_register *src = &inst->SrcReg[i];
447 if (src->File == PROGRAM_STATE_VAR ||
448 src->File == PROGRAM_CONSTANT ||
449 src->File == PROGRAM_UNIFORM) {
450 c->current_const[i].index = src->Index;
451
452 #if 0
453 printf(" fetch const[%d] for arg %d into reg %d\n",
454 src->Index, i, c->current_const[i].reg.nr);
455 #endif
456
457 /* need to fetch the constant now */
458 brw_dp_READ_4(p,
459 c->current_const[i].reg, /* writeback dest */
460 src->RelAddr, /* relative indexing? */
461 16 * src->Index, /* byte offset */
462 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
463 );
464 }
465 }
466 }
467
468
469 /**
470 * Convert Mesa dst register to brw register.
471 */
472 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
473 const struct prog_instruction *inst,
474 GLuint component)
475 {
476 const int nr = 1;
477 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
478 0, 0);
479 }
480
481
/**
 * Return a register description for a constant that fetch_constants()
 * already read into the GRF (const-buffer mode only).
 */
static struct brw_reg
get_src_reg_const(struct brw_wm_compile *c,
                  const struct prog_instruction *inst,
                  GLuint srcRegIndex, GLuint component)
{
    /* We should have already fetched the constant from the constant
     * buffer in fetch_constants().  Now we just have to return a
     * register description that extracts the needed component and
     * smears it across all eight vector components.
     */
    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
    struct brw_reg const_reg;

    assert(component < 4);
    assert(srcRegIndex < 3);
    assert(c->current_const[srcRegIndex].index != -1);
    const_reg = c->current_const[srcRegIndex].reg;

    /* extract desired float from the const_reg, and smear */
    const_reg = stride(const_reg, 0, 1, 0);
    const_reg.subnr = component * 4;  /* byte offset: 4 bytes per float */

    if (src->Negate & (1 << component))
       const_reg = negate(const_reg);
    if (src->Abs)
       const_reg = brw_abs(const_reg);

#if 0
    printf("  form const[%d].%d for arg %d, reg %d\n",
           c->current_const[srcRegIndex].index,
           component,
           srcRegIndex,
           const_reg.nr);
#endif

    return const_reg;
}
519
520
521 /**
522 * Convert Mesa src register to brw register.
523 */
524 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
525 const struct prog_instruction *inst,
526 GLuint srcRegIndex, GLuint channel)
527 {
528 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
529 const GLuint nr = 1;
530 const GLuint component = GET_SWZ(src->Swizzle, channel);
531
532 /* Extended swizzle terms */
533 if (component == SWIZZLE_ZERO) {
534 return brw_imm_f(0.0F);
535 }
536 else if (component == SWIZZLE_ONE) {
537 return brw_imm_f(1.0F);
538 }
539
540 if (c->fp->use_const_buffer &&
541 (src->File == PROGRAM_STATE_VAR ||
542 src->File == PROGRAM_CONSTANT ||
543 src->File == PROGRAM_UNIFORM)) {
544 return get_src_reg_const(c, inst, srcRegIndex, component);
545 }
546 else {
547 /* other type of source register */
548 return get_reg(c, src->File, src->Index, component, nr,
549 src->Negate, src->Abs);
550 }
551 }
552
553 /**
554 * Subroutines are minimal support for resusable instruction sequences.
555 * They are implemented as simply as possible to minimise overhead: there
556 * is no explicit support for communication between the caller and callee
557 * other than saving the return address in a temporary register, nor is
558 * there any automatic local storage. This implies that great care is
559 * required before attempting reentrancy or any kind of nested
560 * subroutine invocations.
561 */
562 static void invoke_subroutine( struct brw_wm_compile *c,
563 enum _subroutine subroutine,
564 void (*emit)( struct brw_wm_compile * ) )
565 {
566 struct brw_compile *p = &c->func;
567
568 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
569
570 if( c->subroutines[ subroutine ] ) {
571 /* subroutine previously emitted: reuse existing instructions */
572
573 int mark = mark_tmps( c );
574 struct brw_reg return_address = retype( alloc_tmp( c ),
575 BRW_REGISTER_TYPE_UD );
576 int here = p->nr_insn;
577
578 brw_push_insn_state(p);
579 brw_set_mask_control(p, BRW_MASK_DISABLE);
580 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
581
582 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
583 brw_imm_d( ( c->subroutines[ subroutine ] -
584 here - 1 ) << 4 ) );
585 brw_pop_insn_state(p);
586
587 release_tmps( c, mark );
588 } else {
589 /* previously unused subroutine: emit, and mark for later reuse */
590
591 int mark = mark_tmps( c );
592 struct brw_reg return_address = retype( alloc_tmp( c ),
593 BRW_REGISTER_TYPE_UD );
594 struct brw_instruction *calc;
595 int base = p->nr_insn;
596
597 brw_push_insn_state(p);
598 brw_set_mask_control(p, BRW_MASK_DISABLE);
599 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
600 brw_pop_insn_state(p);
601
602 c->subroutines[ subroutine ] = p->nr_insn;
603
604 emit( c );
605
606 brw_push_insn_state(p);
607 brw_set_mask_control(p, BRW_MASK_DISABLE);
608 brw_MOV( p, brw_ip_reg(), return_address );
609 brw_pop_insn_state(p);
610
611 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
612
613 release_tmps( c, mark );
614 }
615 }
616
617 /* Workaround for using brw_wm_emit.c's emit functions, which expect
618 * destination regs to be uniquely written. Moves arguments out to
619 * temporaries as necessary for instructions which use their destination as
620 * a temporary.
621 */
622 static void
623 unalias3(struct brw_wm_compile *c,
624 void (*func)(struct brw_compile *c,
625 const struct brw_reg *dst,
626 GLuint mask,
627 const struct brw_reg *arg0,
628 const struct brw_reg *arg1,
629 const struct brw_reg *arg2),
630 const struct brw_reg *dst,
631 GLuint mask,
632 const struct brw_reg *arg0,
633 const struct brw_reg *arg1,
634 const struct brw_reg *arg2)
635 {
636 struct brw_compile *p = &c->func;
637 struct brw_reg tmp_arg0[4], tmp_arg1[4], tmp_arg2[4];
638 int i, j;
639 int mark = mark_tmps(c);
640
641 for (j = 0; j < 4; j++) {
642 tmp_arg0[j] = arg0[j];
643 tmp_arg1[j] = arg1[j];
644 tmp_arg2[j] = arg2[j];
645 }
646
647 for (i = 0; i < 4; i++) {
648 if (mask & (1<<i)) {
649 for (j = 0; j < 4; j++) {
650 if (arg0[j].file == dst[i].file &&
651 dst[i].nr == arg0[j].nr) {
652 tmp_arg0[j] = alloc_tmp(c);
653 brw_MOV(p, tmp_arg0[j], arg0[j]);
654 }
655 if (arg1[j].file == dst[i].file &&
656 dst[i].nr == arg1[j].nr) {
657 tmp_arg1[j] = alloc_tmp(c);
658 brw_MOV(p, tmp_arg1[j], arg1[j]);
659 }
660 if (arg2[j].file == dst[i].file &&
661 dst[i].nr == arg2[j].nr) {
662 tmp_arg2[j] = alloc_tmp(c);
663 brw_MOV(p, tmp_arg2[j], arg2[j]);
664 }
665 }
666 }
667 }
668
669 func(p, dst, mask, tmp_arg0, tmp_arg1, tmp_arg2);
670
671 release_tmps(c, mark);
672 }
673
674 /* Workaround for using brw_wm_emit.c's emit functions, which expect
675 * destination regs to be uniquely written. Moves arguments out to
676 * temporaries as necessary for instructions which use their destination as
677 * a temporary.
678 */
679 static void
680 unalias2(struct brw_wm_compile *c,
681 void (*func)(struct brw_compile *c,
682 const struct brw_reg *dst,
683 GLuint mask,
684 const struct brw_reg *arg0,
685 const struct brw_reg *arg1),
686 const struct brw_reg *dst,
687 GLuint mask,
688 const struct brw_reg *arg0,
689 const struct brw_reg *arg1)
690 {
691 struct brw_compile *p = &c->func;
692 struct brw_reg tmp_arg0[4], tmp_arg1[4];
693 int i, j;
694 int mark = mark_tmps(c);
695
696 for (j = 0; j < 4; j++) {
697 tmp_arg0[j] = arg0[j];
698 tmp_arg1[j] = arg1[j];
699 }
700
701 for (i = 0; i < 4; i++) {
702 if (mask & (1<<i)) {
703 for (j = 0; j < 4; j++) {
704 if (arg0[j].file == dst[i].file &&
705 dst[i].nr == arg0[j].nr) {
706 tmp_arg0[j] = alloc_tmp(c);
707 brw_MOV(p, tmp_arg0[j], arg0[j]);
708 }
709 if (arg1[j].file == dst[i].file &&
710 dst[i].nr == arg1[j].nr) {
711 tmp_arg1[j] = alloc_tmp(c);
712 brw_MOV(p, tmp_arg1[j], arg1[j]);
713 }
714 }
715 }
716 }
717
718 func(p, dst, mask, tmp_arg0, tmp_arg1);
719
720 release_tmps(c, mark);
721 }
722
723 static void emit_arl(struct brw_wm_compile *c,
724 const struct prog_instruction *inst)
725 {
726 struct brw_compile *p = &c->func;
727 struct brw_reg src0, addr_reg;
728 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
729 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
730 BRW_ARF_ADDRESS, 0);
731 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
732 brw_MOV(p, addr_reg, src0);
733 brw_set_saturate(p, 0);
734 }
735
736 /**
737 * For GLSL shaders, this KIL will be unconditional.
738 * It may be contained inside an IF/ENDIF structure of course.
739 */
740 static void emit_kil(struct brw_wm_compile *c)
741 {
742 struct brw_compile *p = &c->func;
743 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
744 brw_push_insn_state(p);
745 brw_set_mask_control(p, BRW_MASK_DISABLE);
746 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
747 brw_AND(p, depth, c->emit_mask_reg, depth);
748 brw_pop_insn_state(p);
749 }
750
/** View 'reg' as a W vector of the upper 16 bits of each 32-bit dword. */
static INLINE struct brw_reg high_words( struct brw_reg reg )
{
    return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
                   0, 8, 2 );
}
756
/** View 'reg' as a W vector of the lower 16 bits of each 32-bit dword. */
static INLINE struct brw_reg low_words( struct brw_reg reg )
{
    return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
}
761
/** View 'reg' as a B vector of the even-offset bytes. */
static INLINE struct brw_reg even_bytes( struct brw_reg reg )
{
    return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
}
766
/** View 'reg' as a B vector of the odd-offset bytes. */
static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
{
    return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
                   0, 16, 2 );
}
772
/* One-, two- and three-dimensional Perlin noise, similar to the description
   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */

/* Calling convention (see emit_noise1/invoke_subroutine): the caller's
   coordinate sits in the temp at stack position mark-2; the scalar result
   is written back to that same temp. */
static void noise1_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param,
       x0, x1, /* gradients at each end */
       t, tmp[ 2 ], /* float temporaries */
       itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0 = alloc_tmp( c );
    x1 = alloc_tmp( c );
    t = alloc_tmp( c );
    tmp[ 0 ] = alloc_tmp( c );
    tmp[ 1 ] = alloc_tmp( c );
    itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
    itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
    itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
    itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
    itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );

    /* the caller's input/output temp */
    param = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

    /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
       be hashed.  Also compute the remainder (offset within the unit
       length), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
    brw_FRC( p, param, param );
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
    brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

    /* We're now ready to perform the hashing.  The two hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free).  We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    for( i = 0; i < 2; i++ )
       brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
       brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
       brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                high_words( itmp[ i ] ) );

    /* Now we want to initialise the two gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 31 ), but
       we correct for that right at the end. */
    brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
    brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

    brw_MUL( p, x0, x0, param );
    brw_MUL( p, x1, x1, t );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
                                           pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_MUL( p, param, tmp[ 0 ], param );
    brw_MUL( p, x1, x1, param );
    brw_ADD( p, x0, x0, x1 );
    /* scale by pow( 2, -30 ), to compensate for the format conversion
       above and an extra factor of 2 so that a single gradient covers
       the [-1,1] range */
    brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

    release_tmps( c, mark );
}
865
/**
 * Emit OPCODE_NOISE1: invoke the shared noise1_sub subroutine and smear
 * its scalar result across all write-enabled dst channels.
 */
static void emit_noise1( struct brw_wm_compile *c,
                         const struct prog_instruction *inst )
{
    struct brw_compile *p = &c->func;
    struct brw_reg src, param, dst;
    GLuint mask = inst->DstReg.WriteMask;
    int i;
    int mark = mark_tmps( c );

    /* noise1_sub locates param via lookup_tmp(mark - 2), which requires
       the temp stack to be empty here */
    assert( mark == 0 );

    src = get_src_reg( c, inst, 0, 0 );

    param = alloc_tmp( c );

    brw_MOV( p, param, src );

    invoke_subroutine( c, SUB_NOISE1, noise1_sub );

    /* Fill in the result: noise1_sub left it in param */
    brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
    for (i = 0 ; i < 4; i++) {
       if (mask & (1<<i)) {
          dst = get_dst_reg(c, inst, i);
          brw_MOV( p, dst, param );
       }
    }
    if( inst->SaturateMode == SATURATE_ZERO_ONE )
       brw_set_saturate( p, 0 );

    release_tmps( c, mark );
}
898
/* Two-dimensional Perlin-style noise; see the comment above noise1_sub.
   Calling convention (see emit_noise2): the x/y coordinates sit in the
   temps at stack positions mark-3 and mark-2; the scalar result is left
   in the mark-3 temp (param0). */
static void noise2_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1,
       x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
       t, tmp[ 4 ], /* float temporaries */
       itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    t = alloc_tmp( c );
    for( i = 0; i < 4; i++ ) {
       tmp[ i ] = alloc_tmp( c );
       itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
    }
    itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
    itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
    itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );

    /* the caller's input/output temps */
    param0 = lookup_tmp( c, mark - 3 );
    param1 = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Also compute the remainders (offsets within the unit
       square), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
    brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
             low_words( itmp[ 1 ] ) );
    brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
    brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
    brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

    /* We're now ready to perform the hashing.  The four hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free).  We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    for( i = 0; i < 4; i++ )
       brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
       brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
       brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
       brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                high_words( itmp[ i ] ) );

    /* Now we want to initialise the four gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );

    brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
    brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
    brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
                                                 pipeline */
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
                                                 pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_MUL( p, param0, tmp[ 0 ], param0 );
    brw_MUL( p, param1, tmp[ 1 ], param1 );

    /* Here we interpolate in the y dimension... */
    brw_MUL( p, x0y1, x0y1, param1 );
    brw_MUL( p, x1y1, x1y1, param1 );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  There are horrible register dependencies here,
       but we have nothing else to do. */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, param0 );
    brw_ADD( p, x0y0, x0y0, x1y0 );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
1035
1036 static void emit_noise2( struct brw_wm_compile *c,
1037 const struct prog_instruction *inst )
1038 {
1039 struct brw_compile *p = &c->func;
1040 struct brw_reg src0, src1, param0, param1, dst;
1041 GLuint mask = inst->DstReg.WriteMask;
1042 int i;
1043 int mark = mark_tmps( c );
1044
1045 assert( mark == 0 );
1046
1047 src0 = get_src_reg( c, inst, 0, 0 );
1048 src1 = get_src_reg( c, inst, 0, 1 );
1049
1050 param0 = alloc_tmp( c );
1051 param1 = alloc_tmp( c );
1052
1053 brw_MOV( p, param0, src0 );
1054 brw_MOV( p, param1, src1 );
1055
1056 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1057
1058 /* Fill in the result: */
1059 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1060 for (i = 0 ; i < 4; i++) {
1061 if (mask & (1<<i)) {
1062 dst = get_dst_reg(c, inst, i);
1063 brw_MOV( p, dst, param0 );
1064 }
1065 }
1066 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1067 brw_set_saturate( p, 0 );
1068
1069 release_tmps( c, mark );
1070 }
1071
/**
 * The three-dimensional case is much like the one- and two- versions above,
 * but since the number of corners is rapidly growing we now pack 16 16-bit
 * hashes into each register to extract more parallelism from the EUs.
 *
 * On entry the three noise coordinates sit in the temporaries allocated by
 * emit_noise3() just before the subroutine call; the result is returned in
 * the first of those slots.
 */
static void noise3_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1, param2,
	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
	xi, yi, zi, /* interpolation coefficients */
	t, tmp[ 8 ], /* float temporaries */
	itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
	wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    xi = alloc_tmp( c );
    yi = alloc_tmp( c );
    zi = alloc_tmp( c );
    t = alloc_tmp( c );
    for( i = 0; i < 8; i++ ) {
	/* Each tmp register is viewed three ways: as floats, as unsigned
	   dwords, and as a 16-wide vector of unsigned words. */
	tmp[ i ] = alloc_tmp( c );
	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
	wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
    }

    /* The coordinates were placed in the temps just below our mark by
       emit_noise3(); the extra slot at mark - 1 is presumably consumed by
       the subroutine-call mechanism — TODO confirm against
       invoke_subroutine(). */
    param0 = lookup_tmp( c, mark - 4 );
    param1 = lookup_tmp( c, mark - 3 );
    param2 = lookup_tmp( c, mark - 2 );

    /* Align1 mode so we can address individual words within registers
       (see the low_words()/high_words() accesses below). */
    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Also compute the remainders (offsets within the unit
       cube), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_FRC( p, param2, param2 );
    /* Since we now have only 16 bits of precision in the hash, we must
       be more careful about thorough mixing to maintain entropy as we
       squash the input vector into a small scalar. */
    brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
    brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
    brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
	     brw_imm_uw( 0x9B93 ) );
    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	     brw_imm_uw( 0xBC8F ) );

    /* Temporarily disable the execution mask while we work with ExecSize=16
       channels (the mask is set for ExecSize=8 and is probably incorrect).
       Although this might cause execution of unwanted channels, the code
       writes only to temporary registers and has no side effects, so
       disabling the mask is harmless. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

    /* We're now ready to perform the hashing.  The eight hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 16x16
       bit multiplication, and 8-bit swizzles (which we get for
       free). */
    for( i = 0; i < 4; i++ )
	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
		 odd_bytes( wtmp[ i ] ) );
    for( i = 0; i < 4; i++ )
	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
		 odd_bytes( wtmp[ i ] ) );
    brw_pop_insn_state( p );

    /* Now we want to initialise the four rear gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    /* x component */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

    /* Shift each hash so a fresh field of bits will determine the next
       gradient component. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* Preload t = param0 - 1.0 here; it is not consumed until the
       front-face x component far below (no intervening writes to t). */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component (rear face: z offset is param2 itself) */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
    brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
    brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
    brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
    brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
    brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
    brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
    /* Three more multiplies by t complete the t^3 factor of
       ((6t - 15)t + 10) * t^3. */
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );

    /* Here we interpolate in the y dimension... */
    brw_MUL( p, x0y1, x0y1, yi );
    brw_MUL( p, x1y1, x1y1, yi );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  There are horrible register dependencies here,
       but we have nothing else to do.  Leave the result in tmp[ 0 ]
       (see below)... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, xi );
    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

    /* Now do the same thing for the front four gradients... */
    /* x component */
    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    /* t still holds param0 - 1.0, preloaded in the y-component section
       above. */
    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* Preload t = param2 - 1.0 for the front-face z component below. */
    brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component (front face: z offset is param2 - 1) */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* The interpolation coefficients are still around from last time, so
       again interpolate in the y dimension... */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, yi );
    brw_MUL( p, x1y1, x1y1, yi );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
       time put the front face in tmp[ 1 ] and we're nearly there... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, xi );
    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

    /* The final interpolation, in the z dimension: */
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
1338
1339 static void emit_noise3( struct brw_wm_compile *c,
1340 const struct prog_instruction *inst )
1341 {
1342 struct brw_compile *p = &c->func;
1343 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1344 GLuint mask = inst->DstReg.WriteMask;
1345 int i;
1346 int mark = mark_tmps( c );
1347
1348 assert( mark == 0 );
1349
1350 src0 = get_src_reg( c, inst, 0, 0 );
1351 src1 = get_src_reg( c, inst, 0, 1 );
1352 src2 = get_src_reg( c, inst, 0, 2 );
1353
1354 param0 = alloc_tmp( c );
1355 param1 = alloc_tmp( c );
1356 param2 = alloc_tmp( c );
1357
1358 brw_MOV( p, param0, src0 );
1359 brw_MOV( p, param1, src1 );
1360 brw_MOV( p, param2, src2 );
1361
1362 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1363
1364 /* Fill in the result: */
1365 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1366 for (i = 0 ; i < 4; i++) {
1367 if (mask & (1<<i)) {
1368 dst = get_dst_reg(c, inst, i);
1369 brw_MOV( p, dst, param0 );
1370 }
1371 }
1372 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1373 brw_set_saturate( p, 0 );
1374
1375 release_tmps( c, mark );
1376 }
1377
/**
 * For the four-dimensional case, the little micro-optimisation benefits
 * we obtain by unrolling all the loops aren't worth the massive bloat it
 * now causes.  Instead, we loop twice around performing a similar operation
 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
 * code to glue it all together.
 */
static void noise4_sub( struct brw_wm_compile *c )
{
    struct brw_compile *p = &c->func;
    struct brw_reg param[ 4 ],
	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
	w0, /* noise for the w=0 cube */
	floors[ 2 ], /* integer coordinates of base corner of hypercube */
	interp[ 4 ], /* interpolation coefficients */
	t, tmp[ 8 ], /* float temporaries */
	itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
	wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
    int i, j;
    int mark = mark_tmps( c );
    GLuint loop, origin;

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    t = alloc_tmp( c );
    w0 = alloc_tmp( c );
    floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
    floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

    /* The four coordinates were placed in the temps just below our mark
       by emit_noise4(); the extra slot at mark - 1 is presumably consumed
       by the subroutine-call mechanism — TODO confirm against
       invoke_subroutine(). */
    for( i = 0; i < 4; i++ ) {
	param[ i ] = lookup_tmp( c, mark - 5 + i );
	interp[ i ] = alloc_tmp( c );
    }

    for( i = 0; i < 8; i++ ) {
	/* Each tmp register is viewed three ways: as floats, as unsigned
	   dwords, and as a 16-wide vector of unsigned words. */
	tmp[ i ] = alloc_tmp( c );
	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
	wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
    }

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* We only want 16 bits of precision from the integral part of each
       co-ordinate, but unfortunately the RNDD semantics would saturate
       at 16 bits if we performed the operation directly to a 16-bit
       destination.  Therefore, we round to 32-bit temporaries where
       appropriate, and then store only the lower 16 bits. */
    brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
    brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
    brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
    brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

    /* Modify the flag register here, because the side effect is useful
       later (see below).  We know for certain that all flags will be
       cleared, since the FRC instruction cannot possibly generate
       negative results.  Even for exceptional inputs (infinities, denormals,
       NaNs), the architecture guarantees that the L conditional is false. */
    brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
    brw_FRC( p, param[ 0 ], param[ 0 ] );
    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
    for( i = 1; i < 4; i++ )
	brw_FRC( p, param[ i ], param[ i ] );

    /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
       of all. */
    for( i = 0; i < 4; i++ )
	brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
    for( i = 0; i < 4; i++ )
	brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
    for( i = 0; i < 4; i++ )
	brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
    for( i = 0; i < 4; i++ )
	brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
    /* ...and three more multiplies by t complete the t^3 factor of
       ((6t - 15)t + 10) * t^3. */
    for( j = 0; j < 3; j++ )
	for( i = 0; i < 4; i++ )
	    brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

    /* Mark the current address, as it will be a jump destination.  The
       following code will be executed twice: first, with the flag
       register clear indicating the w=0 case, and second with flags
       set for w=1. */
    loop = p->nr_insn;

    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Since we have only 16 bits of precision in the hash, we
       must be careful about thorough mixing to maintain entropy as we
       squash the input vector into a small scalar. */
    brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
	     brw_imm_uw( 0xBC8F ) );
    brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
	     brw_imm_uw( 0xD0BD ) );
    brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
	     brw_imm_uw( 0x9B93 ) );
    brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
	     brw_imm_uw( 0xA359 ) );
    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	     brw_imm_uw( 0xBC8F ) );

    /* Temporarily disable the execution mask while we work with ExecSize=16
       channels (the mask is set for ExecSize=8 and is probably incorrect).
       Although this might cause execution of unwanted channels, the code
       writes only to temporary registers and has no side effects, so
       disabling the mask is harmless. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

    /* We're now ready to perform the hashing.  The eight hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 16x16
       bit multiplication, and 8-bit swizzles (which we get for
       free). */
    for( i = 0; i < 4; i++ )
	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
		 odd_bytes( wtmp[ i ] ) );
    for( i = 0; i < 4; i++ )
	brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
		 odd_bytes( wtmp[ i ] ) );
    brw_pop_insn_state( p );

    /* Now we want to initialise the four rear gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    /* x component */
    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

    /* Shift each hash so a fresh field of bits will determine the next
       gradient component. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
    brw_MUL( p, x0y1, x0y1, param[ 0 ] );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* prepare t for the w component (used below): w the first time through
       the loop; w - 1 the second time.  The flags were cleared by the FRC
       above for the first pass and set explicitly before the second pass,
       so the predicated pair below selects the correct value. */
    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
    p->current->header.predicate_inverse = 1;
    brw_MOV( p, t, param[ 3 ] );
    p->current->header.predicate_inverse = 0;
    brw_set_predicate_control( p, BRW_PREDICATE_NONE );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* w component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* Preload t = param[ 0 ] - 1.0 for the front-face x component below. */
    brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* Here we interpolate in the y dimension... */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

    /* Now do the same thing for the front four gradients... */
    /* x component */
    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param[ 0 ] );
    brw_MUL( p, x0y1, x0y1, param[ 0 ] );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* prepare t for the w component (used below): w the first time through
       the loop; w - 1 the second time) */
    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
    brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
    p->current->header.predicate_inverse = 1;
    brw_MOV( p, t, param[ 3 ] );
    p->current->header.predicate_inverse = 0;
    brw_set_predicate_control( p, BRW_PREDICATE_NONE );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* w component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* Interpolate in the y dimension: */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
    brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
       time put the front face in tmp[ 1 ] and we're nearly there... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

    /* Another interpolation, in the z dimension: */
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

    /* Exit the loop if we've computed both cubes...  The branch offset is
       a placeholder (0) here; it is patched below once the loop-exit
       address is known. */
    origin = p->nr_insn;
    brw_push_insn_state( p );
    brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
    brw_pop_insn_state( p );

    /* Save the result for the w=0 case, and increment the w coordinate: */
    brw_MOV( p, w0, tmp[ 0 ] );
    brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
	     brw_imm_uw( 1 ) );

    /* Loop around for the other cube.  Explicitly set the flag register
       (unfortunately we must spend an extra instruction to do this: we
       can't rely on a side effect of the previous MOV or ADD because
       conditional modifiers which are normally true might be false in
       exceptional circumstances, e.g. given a NaN input; the add to
       brw_ip_reg() is not suitable because the IP is not an 8-vector).
       Branch offsets are in bytes and each instruction is 16 bytes,
       hence the << 4. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
    brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
	     brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
    brw_pop_insn_state( p );

    /* Patch the previous conditional branch now that we know the
       destination address. */
    brw_set_src1( p->store + origin,
		  brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

    /* The very last interpolation. */
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
1761
1762 static void emit_noise4( struct brw_wm_compile *c,
1763 const struct prog_instruction *inst )
1764 {
1765 struct brw_compile *p = &c->func;
1766 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
1767 GLuint mask = inst->DstReg.WriteMask;
1768 int i;
1769 int mark = mark_tmps( c );
1770
1771 assert( mark == 0 );
1772
1773 src0 = get_src_reg( c, inst, 0, 0 );
1774 src1 = get_src_reg( c, inst, 0, 1 );
1775 src2 = get_src_reg( c, inst, 0, 2 );
1776 src3 = get_src_reg( c, inst, 0, 3 );
1777
1778 param0 = alloc_tmp( c );
1779 param1 = alloc_tmp( c );
1780 param2 = alloc_tmp( c );
1781 param3 = alloc_tmp( c );
1782
1783 brw_MOV( p, param0, src0 );
1784 brw_MOV( p, param1, src1 );
1785 brw_MOV( p, param2, src2 );
1786 brw_MOV( p, param3, src3 );
1787
1788 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
1789
1790 /* Fill in the result: */
1791 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1792 for (i = 0 ; i < 4; i++) {
1793 if (mask & (1<<i)) {
1794 dst = get_dst_reg(c, inst, i);
1795 brw_MOV( p, dst, param0 );
1796 }
1797 }
1798 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1799 brw_set_saturate( p, 0 );
1800
1801 release_tmps( c, mark );
1802 }
1803
/**
 * Resolve subroutine calls after code emit is done.
 *
 * CAL instructions are emitted before the addresses of their targets are
 * known; brw_resolve_cals() walks the finished instruction store and
 * patches in the real offsets (see brw_eu.h).
 */
static void post_wm_emit( struct brw_wm_compile *c )
{
   brw_resolve_cals(&c->func);
}
1811
1812 static void
1813 get_argument_regs(struct brw_wm_compile *c,
1814 const struct prog_instruction *inst,
1815 int index,
1816 struct brw_reg *regs,
1817 int mask)
1818 {
1819 int i;
1820
1821 for (i = 0; i < 4; i++) {
1822 if (mask & (1 << i))
1823 regs[i] = get_src_reg(c, inst, index, i);
1824 }
1825 }
1826
1827 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
1828 {
1829 struct intel_context *intel = &brw->intel;
1830 #define MAX_IF_DEPTH 32
1831 #define MAX_LOOP_DEPTH 32
1832 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1833 GLuint i, if_depth = 0, loop_depth = 0;
1834 struct brw_compile *p = &c->func;
1835 struct brw_indirect stack_index = brw_indirect(0, 0);
1836
1837 c->out_of_regs = GL_FALSE;
1838
1839 prealloc_reg(c);
1840 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1841 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1842
1843 for (i = 0; i < c->nr_fp_insns; i++) {
1844 const struct prog_instruction *inst = &c->prog_instructions[i];
1845 int dst_flags;
1846 struct brw_reg args[3][4], dst[4];
1847 int j;
1848
1849 c->cur_inst = i;
1850
1851 #if 0
1852 _mesa_printf("Inst %d: ", i);
1853 _mesa_print_instruction(inst);
1854 #endif
1855
1856 /* fetch any constants that this instruction needs */
1857 if (c->fp->use_const_buffer)
1858 fetch_constants(c, inst);
1859
1860 if (inst->Opcode != OPCODE_ARL) {
1861 for (j = 0; j < 4; j++) {
1862 if (inst->DstReg.WriteMask & (1 << j))
1863 dst[j] = get_dst_reg(c, inst, j);
1864 else
1865 dst[j] = brw_null_reg();
1866 }
1867 }
1868 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
1869 get_argument_regs(c, inst, j, args[j], WRITEMASK_XYZW);
1870
1871 dst_flags = inst->DstReg.WriteMask;
1872 if (inst->SaturateMode == SATURATE_ZERO_ONE)
1873 dst_flags |= SATURATE;
1874
1875 if (inst->CondUpdate)
1876 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
1877 else
1878 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
1879
1880 switch (inst->Opcode) {
1881 case WM_PIXELXY:
1882 emit_pixel_xy(c, dst, dst_flags);
1883 break;
1884 case WM_DELTAXY:
1885 emit_delta_xy(p, dst, dst_flags, args[0]);
1886 break;
1887 case WM_PIXELW:
1888 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1889 break;
1890 case WM_LINTERP:
1891 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1892 break;
1893 case WM_PINTERP:
1894 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1895 break;
1896 case WM_CINTERP:
1897 emit_cinterp(p, dst, dst_flags, args[0]);
1898 break;
1899 case WM_WPOSXY:
1900 emit_wpos_xy(c, dst, dst_flags, args[0]);
1901 break;
1902 case WM_FB_WRITE:
1903 emit_fb_write(c, args[0], args[1], args[2],
1904 INST_AUX_GET_TARGET(inst->Aux),
1905 inst->Aux & INST_AUX_EOT);
1906 break;
1907 case WM_FRONTFACING:
1908 emit_frontfacing(p, dst, dst_flags);
1909 break;
1910 case OPCODE_ADD:
1911 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1912 break;
1913 case OPCODE_ARL:
1914 emit_arl(c, inst);
1915 break;
1916 case OPCODE_FRC:
1917 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1918 break;
1919 case OPCODE_FLR:
1920 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1921 break;
1922 case OPCODE_LRP:
1923 unalias3(c, emit_lrp,
1924 dst, dst_flags, args[0], args[1], args[2]);
1925 break;
1926 case OPCODE_TRUNC:
1927 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1928 break;
1929 case OPCODE_MOV:
1930 case OPCODE_SWZ:
1931 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1932 break;
1933 case OPCODE_DP3:
1934 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1935 break;
1936 case OPCODE_DP4:
1937 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1938 break;
1939 case OPCODE_XPD:
1940 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1941 break;
1942 case OPCODE_DPH:
1943 emit_dph(p, dst, dst_flags, args[0], args[1]);
1944 break;
1945 case OPCODE_RCP:
1946 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1947 break;
1948 case OPCODE_RSQ:
1949 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1950 break;
1951 case OPCODE_SIN:
1952 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1953 break;
1954 case OPCODE_COS:
1955 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1956 break;
1957 case OPCODE_EX2:
1958 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1959 break;
1960 case OPCODE_LG2:
1961 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1962 break;
1963 case OPCODE_MIN:
1964 unalias2(c, emit_min, dst, dst_flags, args[0], args[1]);
1965 break;
1966 case OPCODE_MAX:
1967 unalias2(c, emit_max, dst, dst_flags, args[0], args[1]);
1968 break;
1969 case OPCODE_DDX:
1970 case OPCODE_DDY:
1971 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
1972 args[0]);
1973 break;
1974 case OPCODE_SLT:
1975 emit_sop(p, dst, dst_flags,
1976 BRW_CONDITIONAL_L, args[0], args[1]);
1977 break;
1978 case OPCODE_SLE:
1979 emit_sop(p, dst, dst_flags,
1980 BRW_CONDITIONAL_LE, args[0], args[1]);
1981 break;
1982 case OPCODE_SGT:
1983 emit_sop(p, dst, dst_flags,
1984 BRW_CONDITIONAL_G, args[0], args[1]);
1985 break;
1986 case OPCODE_SGE:
1987 emit_sop(p, dst, dst_flags,
1988 BRW_CONDITIONAL_GE, args[0], args[1]);
1989 break;
1990 case OPCODE_SEQ:
1991 emit_sop(p, dst, dst_flags,
1992 BRW_CONDITIONAL_EQ, args[0], args[1]);
1993 break;
1994 case OPCODE_SNE:
1995 emit_sop(p, dst, dst_flags,
1996 BRW_CONDITIONAL_NEQ, args[0], args[1]);
1997 break;
1998 case OPCODE_MUL:
1999 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
2000 break;
2001 case OPCODE_POW:
2002 emit_math2(c, BRW_MATH_FUNCTION_POW,
2003 dst, dst_flags, args[0], args[1]);
2004 break;
2005 case OPCODE_MAD:
2006 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
2007 break;
2008 case OPCODE_NOISE1:
2009 emit_noise1(c, inst);
2010 break;
2011 case OPCODE_NOISE2:
2012 emit_noise2(c, inst);
2013 break;
2014 case OPCODE_NOISE3:
2015 emit_noise3(c, inst);
2016 break;
2017 case OPCODE_NOISE4:
2018 emit_noise4(c, inst);
2019 break;
2020 case OPCODE_TEX:
2021 emit_tex(c, dst, dst_flags, args[0],
2022 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
2023 0, 1, 0, 0),
2024 inst->TexSrcTarget,
2025 inst->TexSrcUnit,
2026 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
2027 break;
2028 case OPCODE_TXB:
2029 emit_txb(c, dst, dst_flags, args[0],
2030 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
2031 0, 1, 0, 0),
2032 inst->TexSrcTarget,
2033 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
2034 break;
2035 case OPCODE_KIL_NV:
2036 emit_kil(c);
2037 break;
2038 case OPCODE_IF:
2039 assert(if_depth < MAX_IF_DEPTH);
2040 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2041 break;
2042 case OPCODE_ELSE:
2043 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
2044 break;
2045 case OPCODE_ENDIF:
2046 assert(if_depth > 0);
2047 brw_ENDIF(p, if_inst[--if_depth]);
2048 break;
2049 case OPCODE_BGNSUB:
2050 brw_save_label(p, inst->Comment, p->nr_insn);
2051 break;
2052 case OPCODE_ENDSUB:
2053 /* no-op */
2054 break;
2055 case OPCODE_CAL:
2056 brw_push_insn_state(p);
2057 brw_set_mask_control(p, BRW_MASK_DISABLE);
2058 brw_set_access_mode(p, BRW_ALIGN_1);
2059 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2060 brw_set_access_mode(p, BRW_ALIGN_16);
2061 brw_ADD(p, get_addr_reg(stack_index),
2062 get_addr_reg(stack_index), brw_imm_d(4));
2063 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2064 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2065 brw_pop_insn_state(p);
2066 break;
2067
2068 case OPCODE_RET:
2069 brw_push_insn_state(p);
2070 brw_set_mask_control(p, BRW_MASK_DISABLE);
2071 brw_ADD(p, get_addr_reg(stack_index),
2072 get_addr_reg(stack_index), brw_imm_d(-4));
2073 brw_set_access_mode(p, BRW_ALIGN_1);
2074 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2075 brw_set_access_mode(p, BRW_ALIGN_16);
2076 brw_pop_insn_state(p);
2077
2078 break;
2079 case OPCODE_BGNLOOP:
2080 /* XXX may need to invalidate the current_constant regs */
2081 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2082 break;
2083 case OPCODE_BRK:
2084 brw_BREAK(p);
2085 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2086 break;
2087 case OPCODE_CONT:
2088 brw_CONT(p);
2089 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2090 break;
2091 case OPCODE_ENDLOOP:
2092 {
2093 struct brw_instruction *inst0, *inst1;
2094 GLuint br = 1;
2095
2096 if (intel->is_ironlake)
2097 br = 2;
2098
2099 loop_depth--;
2100 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2101 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2102 while (inst0 > loop_inst[loop_depth]) {
2103 inst0--;
2104 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2105 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2106 inst0->bits3.if_else.pop_count = 0;
2107 }
2108 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2109 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2110 inst0->bits3.if_else.pop_count = 0;
2111 }
2112 }
2113 }
2114 break;
2115 default:
2116 _mesa_printf("unsupported IR in fragment shader %d\n",
2117 inst->Opcode);
2118 }
2119
2120 if (inst->CondUpdate)
2121 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2122 else
2123 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2124 }
2125 post_wm_emit(c);
2126
2127 if (INTEL_DEBUG & DEBUG_WM) {
2128 _mesa_printf("wm-native:\n");
2129 for (i = 0; i < p->nr_insn; i++)
2130 brw_disasm(stderr, &p->store[i]);
2131 _mesa_printf("\n");
2132 }
2133 }
2134
2135 /**
2136 * Do GPU code generation for shaders that use GLSL features such as
2137 * flow control. Other shaders will be compiled with the
2138 */
2139 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2140 {
2141 if (INTEL_DEBUG & DEBUG_WM) {
2142 _mesa_printf("brw_wm_glsl_emit:\n");
2143 }
2144
2145 /* initial instruction translation/simplification */
2146 brw_wm_pass_fp(c);
2147
2148 /* actual code generation */
2149 brw_wm_emit_glsl(brw, c);
2150
2151 if (INTEL_DEBUG & DEBUG_WM) {
2152 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2153 }
2154
2155 c->prog_data.total_grf = num_grf_used(c);
2156 c->prog_data.total_scratch = 0;
2157 }