Merge commit 'origin/master' into gallium-sw-api-2
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
9 enum _subroutine {
10 SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
11 };
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 for (i = 0; i < fp->Base.NumInstructions; i++) {
27 const struct prog_instruction *inst = &fp->Base.Instructions[i];
28 switch (inst->Opcode) {
29 case OPCODE_ARL:
30 case OPCODE_IF:
31 case OPCODE_ENDIF:
32 case OPCODE_CAL:
33 case OPCODE_BRK:
34 case OPCODE_RET:
35 case OPCODE_NOISE1:
36 case OPCODE_NOISE2:
37 case OPCODE_NOISE3:
38 case OPCODE_NOISE4:
39 case OPCODE_BGNLOOP:
40 return GL_TRUE;
41 default:
42 break;
43 }
44 }
45 return GL_FALSE;
46 }
47
48
49
50 static void
51 reclaim_temps(struct brw_wm_compile *c);
52
53
54 /** Mark GRF register as used. */
55 static void
56 prealloc_grf(struct brw_wm_compile *c, int r)
57 {
58 c->used_grf[r] = GL_TRUE;
59 }
60
61
62 /** Mark given GRF register as not in use. */
63 static void
64 release_grf(struct brw_wm_compile *c, int r)
65 {
66 /*assert(c->used_grf[r]);*/
67 c->used_grf[r] = GL_FALSE;
68 c->first_free_grf = MIN2(c->first_free_grf, r);
69 }
70
71
72 /** Return index of a free GRF, mark it as used. */
73 static int
74 alloc_grf(struct brw_wm_compile *c)
75 {
76 GLuint r;
77 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
78 if (!c->used_grf[r]) {
79 c->used_grf[r] = GL_TRUE;
80 c->first_free_grf = r + 1; /* a guess */
81 return r;
82 }
83 }
84
85 /* no free temps, try to reclaim some */
86 reclaim_temps(c);
87 c->first_free_grf = 0;
88
89 /* try alloc again */
90 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
91 if (!c->used_grf[r]) {
92 c->used_grf[r] = GL_TRUE;
93 c->first_free_grf = r + 1; /* a guess */
94 return r;
95 }
96 }
97
98 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
99 assert(c->used_grf[r]);
100 }
101
102 /* really, no free GRF regs found */
103 if (!c->out_of_regs) {
104 /* print warning once per compilation */
105 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
106 c->out_of_regs = GL_TRUE;
107 }
108
109 return -1;
110 }
111
112
113 /** Return number of GRF registers used */
114 static int
115 num_grf_used(const struct brw_wm_compile *c)
116 {
117 int r;
118 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
119 if (c->used_grf[r])
120 return r + 1;
121 return 0;
122 }
123
124
125
/**
 * Record the mapping of a Mesa register to a hardware register.
 * \param file       Mesa register file (one of PROGRAM_x)
 * \param index      register index within that file
 * \param component  channel (X=0, Y=1, Z=2, W=3)
 * \param reg        the hardware register now backing that channel
 */
static void set_reg(struct brw_wm_compile *c, int file, int index,
                    int component, struct brw_reg reg)
{
   c->wm_regs[file][index][component].reg = reg;
   c->wm_regs[file][index][component].inited = GL_TRUE;
}
135
136 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
137 {
138 struct brw_reg reg;
139
140 /* if we need to allocate another temp, grow the tmp_regs[] array */
141 if (c->tmp_index == c->tmp_max) {
142 int r = alloc_grf(c);
143 if (r < 0) {
144 /*printf("Out of temps in %s\n", __FUNCTION__);*/
145 r = 50; /* XXX random register! */
146 }
147 c->tmp_regs[ c->tmp_max++ ] = r;
148 }
149
150 /* form the GRF register */
151 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
152 /*printf("alloc_temp %d\n", reg.nr);*/
153 assert(reg.nr < BRW_WM_MAX_GRF);
154 return reg;
155
156 }
157
/**
 * Save current temp register info.
 * There must be a matching call to release_tmps().
 * \return the current position in the temp-register stack; pass it to
 *         release_tmps() to free everything allocated after this point.
 */
static int mark_tmps(struct brw_wm_compile *c)
{
   return c->tmp_index;
}
166
/**
 * Return the vec8 GRF backing the temp at position \p index in the
 * temp-register stack (see mark_tmps()/alloc_tmp()).
 */
static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
{
    return brw_vec8_grf( c->tmp_regs[ index ], 0 );
}
171
/** Release all temps allocated since the matching mark_tmps() call. */
static void release_tmps(struct brw_wm_compile *c, int mark)
{
    c->tmp_index = mark;
}
176
177 /**
178 * Convert Mesa src register to brw register.
179 *
180 * Since we're running in SOA mode each Mesa register corresponds to four
181 * hardware registers. We allocate the hardware registers as needed here.
182 *
183 * \param file register file, one of PROGRAM_x
184 * \param index register number
185 * \param component src component (X=0, Y=1, Z=2, W=3)
186 * \param nr not used?!?
187 * \param neg negate value?
188 * \param abs take absolute value?
189 */
190 static struct brw_reg
191 get_reg(struct brw_wm_compile *c, int file, int index, int component,
192 int nr, GLuint neg, GLuint abs)
193 {
194 struct brw_reg reg;
195 switch (file) {
196 case PROGRAM_STATE_VAR:
197 case PROGRAM_CONSTANT:
198 case PROGRAM_UNIFORM:
199 file = PROGRAM_STATE_VAR;
200 break;
201 case PROGRAM_UNDEFINED:
202 return brw_null_reg();
203 case PROGRAM_TEMPORARY:
204 case PROGRAM_INPUT:
205 case PROGRAM_OUTPUT:
206 case PROGRAM_PAYLOAD:
207 break;
208 default:
209 _mesa_problem(NULL, "Unexpected file in get_reg()");
210 return brw_null_reg();
211 }
212
213 assert(index < 256);
214 assert(component < 4);
215
216 /* see if we've already allocated a HW register for this Mesa register */
217 if (c->wm_regs[file][index][component].inited) {
218 /* yes, re-use */
219 reg = c->wm_regs[file][index][component].reg;
220 }
221 else {
222 /* no, allocate new register */
223 int grf = alloc_grf(c);
224 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
225 if (grf < 0) {
226 /* totally out of temps */
227 grf = 51; /* XXX random register! */
228 }
229
230 reg = brw_vec8_grf(grf, 0);
231 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
232
233 set_reg(c, file, index, component, reg);
234 }
235
236 if (neg & (1 << component)) {
237 reg = negate(reg);
238 }
239 if (abs)
240 reg = brw_abs(reg);
241 return reg;
242 }
243
244
245
246 /**
247 * This is called if we run out of GRF registers. Examine the live intervals
248 * of temp regs in the program and free those which won't be used again.
249 */
250 static void
251 reclaim_temps(struct brw_wm_compile *c)
252 {
253 GLint intBegin[MAX_PROGRAM_TEMPS];
254 GLint intEnd[MAX_PROGRAM_TEMPS];
255 int index;
256
257 /*printf("Reclaim temps:\n");*/
258
259 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
260 intBegin, intEnd);
261
262 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
263 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
264 /* program temp[i] can be freed */
265 int component;
266 /*printf(" temp[%d] is dead\n", index);*/
267 for (component = 0; component < 4; component++) {
268 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
269 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
270 release_grf(c, r);
271 /*
272 printf(" Reclaim temp %d, reg %d at inst %d\n",
273 index, r, c->cur_inst);
274 */
275 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
276 }
277 }
278 }
279 }
280 }
281
282
283
284
/**
 * Preallocate registers.  This sets up the Mesa to hardware register
 * mapping for certain registers, such as constants (uniforms/state vars)
 * and shader inputs.
 *
 * Also decides whether constants live in the CURBE (a GRF section) or in
 * a real constant buffer fetched via dataport reads, fills in
 * c->prog_data layout fields, and reserves the payload/mask/stack regs.
 */
static void prealloc_reg(struct brw_wm_compile *c)
{
    int i, j;
    struct brw_reg reg;
    int urb_read_length = 0;
    GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
    GLuint reg_index = 0;

    memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
    c->first_free_grf = 0;

    /* Map the four channels of the depth payload.  Each depth reg is a
     * register pair; channels beyond nr_depth_regs alias GRF 0.
     */
    for (i = 0; i < 4; i++) {
        if (i < c->key.nr_depth_regs)
            reg = brw_vec8_grf(i * 2, 0);
        else
            reg = brw_vec8_grf(0, 0);
        set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
    }
    reg_index += 2 * c->key.nr_depth_regs;

    /* constants */
    {
        const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
        const GLuint nr_temps = c->fp->program.Base.NumTemporaries;

        /* use a real constant buffer, or just use a section of the GRF? */
        /* XXX this heuristic may need adjustment... */
        if ((nr_params + nr_temps) * 4 + reg_index > 80)
            c->fp->use_const_buffer = GL_TRUE;
        else
            c->fp->use_const_buffer = GL_FALSE;
        /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/

        if (c->fp->use_const_buffer) {
            /* We'll use a real constant buffer and fetch constants from
             * it with a dataport read message.
             */

            /* number of float constants in CURBE */
            c->prog_data.nr_params = 0;
        }
        else {
            const struct gl_program_parameter_list *plist =
                c->fp->program.Base.Parameters;
            int index = 0;

            /* number of float constants in CURBE */
            c->prog_data.nr_params = 4 * nr_params;

            /* loop over program constants (float[4]) */
            for (i = 0; i < nr_params; i++) {
                /* loop over XYZW channels */
                for (j = 0; j < 4; j++, index++) {
                    /* 8 floats fit in one GRF; pick reg and subreg */
                    reg = brw_vec1_grf(reg_index + index / 8, index % 8);
                    /* Save pointer to parameter/constant value.
                     * Constants will be copied in prepare_constant_buffer()
                     */
                    c->prog_data.param[index] = &plist->ParameterValues[i][j];
                    set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
                }
            }
            /* number of constant regs used (each reg is float[8]) */
            c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
            reg_index += c->nr_creg;
        }
    }

    /* fragment shader inputs: map vertex results onto FP attributes */
    for (i = 0; i < VERT_RESULT_MAX; i++) {
        int fp_input;

        if (i >= VERT_RESULT_VAR0)
            fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
        else if (i <= VERT_RESULT_TEX7)
            fp_input = i;
        else
            fp_input = -1;  /* vertex result with no FP counterpart */

        if (fp_input >= 0 && inputs & (1 << fp_input)) {
            urb_read_length = reg_index;
            reg = brw_vec8_grf(reg_index, 0);
            for (j = 0; j < 4; j++)
                set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
        }
        /* each written vertex output occupies a register pair in the URB,
         * whether or not the FP reads it
         */
        if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            reg_index += 2;
        }
    }

    c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
    c->prog_data.urb_read_length = urb_read_length;
    c->prog_data.curb_read_length = c->nr_creg;
    /* one reg for the execution-mask scratch, two for the call stack */
    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
    reg_index++;
    c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
    reg_index += 2;

    /* mark GRF regs [0..reg_index-1] as in-use */
    for (i = 0; i < reg_index; i++)
        prealloc_grf(c, i);

    /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
    prealloc_grf(c, 126);
    prealloc_grf(c, 127);

    for (i = 0; i < c->nr_fp_insns; i++) {
        const struct prog_instruction *inst = &c->prog_instructions[i];
        struct brw_reg dst[4];

        switch (inst->Opcode) {
        case OPCODE_TEX:
        case OPCODE_TXB:
            /* Allocate the channels of texture results contiguously,
             * since they are written out that way by the sampler unit.
             */
            for (j = 0; j < 4; j++) {
                dst[j] = get_dst_reg(c, inst, j);
                if (j != 0)
                    assert(dst[j].nr == dst[j - 1].nr + 1);
            }
            break;
        default:
            break;
        }
    }

    /* An instruction may reference up to three constants.
     * They'll be found in these registers.
     * XXX alloc these on demand!
     */
    if (c->fp->use_const_buffer) {
        for (i = 0; i < 3; i++) {
            c->current_const[i].index = -1;
            c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
        }
    }
#if 0
    printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
    printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
#endif
}
431
432
433 /**
434 * Check if any of the instruction's src registers are constants, uniforms,
435 * or statevars. If so, fetch any constants that we don't already have in
436 * the three GRF slots.
437 */
438 static void fetch_constants(struct brw_wm_compile *c,
439 const struct prog_instruction *inst)
440 {
441 struct brw_compile *p = &c->func;
442 GLuint i;
443
444 /* loop over instruction src regs */
445 for (i = 0; i < 3; i++) {
446 const struct prog_src_register *src = &inst->SrcReg[i];
447 if (src->File == PROGRAM_STATE_VAR ||
448 src->File == PROGRAM_CONSTANT ||
449 src->File == PROGRAM_UNIFORM) {
450 c->current_const[i].index = src->Index;
451
452 #if 0
453 printf(" fetch const[%d] for arg %d into reg %d\n",
454 src->Index, i, c->current_const[i].reg.nr);
455 #endif
456
457 /* need to fetch the constant now */
458 brw_dp_READ_4(p,
459 c->current_const[i].reg, /* writeback dest */
460 src->RelAddr, /* relative indexing? */
461 16 * src->Index, /* byte offset */
462 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
463 );
464 }
465 }
466 }
467
468
469 /**
470 * Convert Mesa dst register to brw register.
471 */
472 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
473 const struct prog_instruction *inst,
474 GLuint component)
475 {
476 const int nr = 1;
477 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
478 0, 0);
479 }
480
481
482 static struct brw_reg
483 get_src_reg_const(struct brw_wm_compile *c,
484 const struct prog_instruction *inst,
485 GLuint srcRegIndex, GLuint component)
486 {
487 /* We should have already fetched the constant from the constant
488 * buffer in fetch_constants(). Now we just have to return a
489 * register description that extracts the needed component and
490 * smears it across all eight vector components.
491 */
492 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
493 struct brw_reg const_reg;
494
495 assert(component < 4);
496 assert(srcRegIndex < 3);
497 assert(c->current_const[srcRegIndex].index != -1);
498 const_reg = c->current_const[srcRegIndex].reg;
499
500 /* extract desired float from the const_reg, and smear */
501 const_reg = stride(const_reg, 0, 1, 0);
502 const_reg.subnr = component * 4;
503
504 if (src->Negate & (1 << component))
505 const_reg = negate(const_reg);
506 if (src->Abs)
507 const_reg = brw_abs(const_reg);
508
509 #if 0
510 printf(" form const[%d].%d for arg %d, reg %d\n",
511 c->current_const[srcRegIndex].index,
512 component,
513 srcRegIndex,
514 const_reg.nr);
515 #endif
516
517 return const_reg;
518 }
519
520
521 /**
522 * Convert Mesa src register to brw register.
523 */
524 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
525 const struct prog_instruction *inst,
526 GLuint srcRegIndex, GLuint channel)
527 {
528 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
529 const GLuint nr = 1;
530 const GLuint component = GET_SWZ(src->Swizzle, channel);
531
532 /* Extended swizzle terms */
533 if (component == SWIZZLE_ZERO) {
534 return brw_imm_f(0.0F);
535 }
536 else if (component == SWIZZLE_ONE) {
537 return brw_imm_f(1.0F);
538 }
539
540 if (c->fp->use_const_buffer &&
541 (src->File == PROGRAM_STATE_VAR ||
542 src->File == PROGRAM_CONSTANT ||
543 src->File == PROGRAM_UNIFORM)) {
544 return get_src_reg_const(c, inst, srcRegIndex, component);
545 }
546 else {
547 /* other type of source register */
548 return get_reg(c, src->File, src->Index, component, nr,
549 src->Negate, src->Abs);
550 }
551 }
552
/**
 * Subroutines are minimal support for resusable instruction sequences.
 * They are implemented as simply as possible to minimise overhead: there
 * is no explicit support for communication between the caller and callee
 * other than saving the return address in a temporary register, nor is
 * there any automatic local storage.  This implies that great care is
 * required before attempting reentrancy or any kind of nested
 * subroutine invocations.
 *
 * \param subroutine  which subroutine slot to use (one body per slot)
 * \param emit        callback that emits the subroutine body on first use
 */
static void invoke_subroutine( struct brw_wm_compile *c,
			       enum _subroutine subroutine,
			       void (*emit)( struct brw_wm_compile * ) )
{
    struct brw_compile *p = &c->func;

    assert( subroutine < BRW_WM_MAX_SUBROUTINE );

    if( c->subroutines[ subroutine ] ) {
	/* subroutine previously emitted: reuse existing instructions */

	int mark = mark_tmps( c );
	struct brw_reg return_address = retype( alloc_tmp( c ),
						BRW_REGISTER_TYPE_UD );
	int here = p->nr_insn;

	brw_push_insn_state(p);
	brw_set_mask_control(p, BRW_MASK_DISABLE);
	/* return address = IP + 2 instructions (the "<< 4" converts an
	 * instruction count to bytes: each instruction is 16 bytes),
	 * i.e. the instruction right after the jump below
	 */
	brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );

	/* jump to the previously-emitted body by adjusting IP */
	brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
		 brw_imm_d( ( c->subroutines[ subroutine ] -
			      here - 1 ) << 4 ) );
	brw_pop_insn_state(p);

	release_tmps( c, mark );
    } else {
	/* previously unused subroutine: emit, and mark for later reuse */

	int mark = mark_tmps( c );
	struct brw_reg return_address = retype( alloc_tmp( c ),
						BRW_REGISTER_TYPE_UD );
	struct brw_instruction *calc;
	int base = p->nr_insn;

	brw_push_insn_state(p);
	brw_set_mask_control(p, BRW_MASK_DISABLE);
	/* placeholder: the immediate 0 is patched below, once we know
	 * how long the subroutine body turned out to be
	 */
	calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
	brw_pop_insn_state(p);

	/* record where the body starts so later calls can jump to it */
	c->subroutines[ subroutine ] = p->nr_insn;

	emit( c );

	/* return: restore IP from the saved return address */
	brw_push_insn_state(p);
	brw_set_mask_control(p, BRW_MASK_DISABLE);
	brw_MOV( p, brw_ip_reg(), return_address );
	brw_pop_insn_state(p);

	/* back-patch the placeholder so return_address = IP + body length,
	 * i.e. the instruction following the inline body
	 */
	brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );

	release_tmps( c, mark );
    }
}
616
617 static void emit_arl(struct brw_wm_compile *c,
618 const struct prog_instruction *inst)
619 {
620 struct brw_compile *p = &c->func;
621 struct brw_reg src0, addr_reg;
622 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
623 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
624 BRW_ARF_ADDRESS, 0);
625 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
626 brw_MOV(p, addr_reg, src0);
627 brw_set_saturate(p, 0);
628 }
629
630 /**
631 * For GLSL shaders, this KIL will be unconditional.
632 * It may be contained inside an IF/ENDIF structure of course.
633 */
634 static void emit_kil(struct brw_wm_compile *c)
635 {
636 struct brw_compile *p = &c->func;
637 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
638 brw_push_insn_state(p);
639 brw_set_mask_control(p, BRW_MASK_DISABLE);
640 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
641 brw_AND(p, depth, c->emit_mask_reg, depth);
642 brw_pop_insn_state(p);
643 }
644
645 static INLINE struct brw_reg high_words( struct brw_reg reg )
646 {
647 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
648 0, 8, 2 );
649 }
650
651 static INLINE struct brw_reg low_words( struct brw_reg reg )
652 {
653 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
654 }
655
656 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
657 {
658 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
659 }
660
661 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
662 {
663 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
664 0, 16, 2 );
665 }
666
/* One-, two- and three-dimensional Perlin noise, similar to the description
   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */

/**
 * Emit the body of the 1D noise subroutine.  The input coordinate is
 * found in the caller's temp (see emit_noise1); the result is written
 * back into that same temp for the caller to read.
 */
static void noise1_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param,
	x0, x1, /* gradients at each end */
	t, tmp[ 2 ], /* float temporaries */
	itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0 = alloc_tmp( c );
    x1 = alloc_tmp( c );
    t = alloc_tmp( c );
    tmp[ 0 ] = alloc_tmp( c );
    tmp[ 1 ] = alloc_tmp( c );
    itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
    itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
    itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
    itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
    itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );

    /* mark - 2 is the caller's input temp: the caller allocated it just
       before invoke_subroutine() allocated the return-address temp */
    param = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

    /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
       be hashed.  Also compute the remainder (offset within the unit
       length), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
    brw_FRC( p, param, param );
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
    brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

    /* We're now ready to perform the hashing.  The two hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free).  We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    for( i = 0; i < 2; i++ )
	brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
		 high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
	brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
		 high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
	brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
		 high_words( itmp[ i ] ) );

    /* Now we want to initialise the two gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 31 ), but
       we correct for that right at the end. */
    brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
    brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

    brw_MUL( p, x0, x0, param );
    brw_MUL( p, x1, x1, t );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
					   pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_MUL( p, param, tmp[ 0 ], param );
    brw_MUL( p, x1, x1, param );
    brw_ADD( p, x0, x0, x1 );
    /* scale by pow( 2, -30 ), to compensate for the format conversion
       above and an extra factor of 2 so that a single gradient covers
       the [-1,1] range */
    brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

    release_tmps( c, mark );
}
759
760 static void emit_noise1( struct brw_wm_compile *c,
761 const struct prog_instruction *inst )
762 {
763 struct brw_compile *p = &c->func;
764 struct brw_reg src, param, dst;
765 GLuint mask = inst->DstReg.WriteMask;
766 int i;
767 int mark = mark_tmps( c );
768
769 assert( mark == 0 );
770
771 src = get_src_reg( c, inst, 0, 0 );
772
773 param = alloc_tmp( c );
774
775 brw_MOV( p, param, src );
776
777 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
778
779 /* Fill in the result: */
780 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
781 for (i = 0 ; i < 4; i++) {
782 if (mask & (1<<i)) {
783 dst = get_dst_reg(c, inst, i);
784 brw_MOV( p, dst, param );
785 }
786 }
787 if( inst->SaturateMode == SATURATE_ZERO_ONE )
788 brw_set_saturate( p, 0 );
789
790 release_tmps( c, mark );
791 }
792
/**
 * Emit the body of the 2D noise subroutine.  The two input coordinates
 * are found in the caller's temps (see emit_noise2); the result is
 * written back into the first of those temps for the caller to read.
 */
static void noise2_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1,
	x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
	t, tmp[ 4 ], /* float temporaries */
	itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    t = alloc_tmp( c );
    for( i = 0; i < 4; i++ ) {
	tmp[ i ] = alloc_tmp( c );
	itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
    }
    itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
    itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
    itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );

    /* mark - 3 / mark - 2 are the caller's two input temps: the caller
       allocated them just before invoke_subroutine() allocated the
       return-address temp */
    param0 = lookup_tmp( c, mark - 3 );
    param1 = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Also compute the remainders (offsets within the unit
       square), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
    brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
	     low_words( itmp[ 1 ] ) );
    brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
    brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
    brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

    /* We're now ready to perform the hashing.  The four hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free).  We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    for( i = 0; i < 4; i++ )
	brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
		 high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
	brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
		 high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
	brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
	brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
		 high_words( itmp[ i ] ) );

    /* Now we want to initialise the four gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );

    brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
    brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
    brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
						 pipeline */
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
						 pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_MUL( p, param0, tmp[ 0 ], param0 );
    brw_MUL( p, param1, tmp[ 1 ], param1 );

    /* Here we interpolate in the y dimension... */
    brw_MUL( p, x0y1, x0y1, param1 );
    brw_MUL( p, x1y1, x1y1, param1 );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  There are horrible register dependencies here,
       but we have nothing else to do. */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, param0 );
    brw_ADD( p, x0y0, x0y0, x1y0 );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
929
930 static void emit_noise2( struct brw_wm_compile *c,
931 const struct prog_instruction *inst )
932 {
933 struct brw_compile *p = &c->func;
934 struct brw_reg src0, src1, param0, param1, dst;
935 GLuint mask = inst->DstReg.WriteMask;
936 int i;
937 int mark = mark_tmps( c );
938
939 assert( mark == 0 );
940
941 src0 = get_src_reg( c, inst, 0, 0 );
942 src1 = get_src_reg( c, inst, 0, 1 );
943
944 param0 = alloc_tmp( c );
945 param1 = alloc_tmp( c );
946
947 brw_MOV( p, param0, src0 );
948 brw_MOV( p, param1, src1 );
949
950 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
951
952 /* Fill in the result: */
953 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
954 for (i = 0 ; i < 4; i++) {
955 if (mask & (1<<i)) {
956 dst = get_dst_reg(c, inst, i);
957 brw_MOV( p, dst, param0 );
958 }
959 }
960 if( inst->SaturateMode == SATURATE_ZERO_ONE )
961 brw_set_saturate( p, 0 );
962
963 release_tmps( c, mark );
964 }
965
966 /**
967 * The three-dimensional case is much like the one- and two- versions above,
968 * but since the number of corners is rapidly growing we now pack 16 16-bit
969 * hashes into each register to extract more parallelism from the EUs.
970 */
static void noise3_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1, param2,
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      xi, yi, zi, /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   xi = alloc_tmp( c );
   yi = alloc_tmp( c );
   zi = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   /* The caller (emit_noise3) allocated the three input coordinates
      immediately before invoking this subroutine, so they sit at fixed
      offsets below our temporary mark.  The result is returned in param0
      (see the final MUL at the end of this function). */
   param0 = lookup_tmp( c, mark - 4 );
   param1 = lookup_tmp( c, mark - 3 );
   param2 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      cube), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_FRC( p, param2, param2 );
   /* Since we now have only 16 bits of precision in the hash, we must
      be more careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
            brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
            brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
               odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
               odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift each hash along by 5 bits so the y component below sees a
      different slice of the hash than the x component did. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); /* prepare t = y-1 for the y component below */
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* NOTE: not a stray assignment -- this prepares t = x-1 ahead of time
      for the front-face x component further below (interleaved to hide
      latency). */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   /* These are the rear (z=0) corners, so the z offset is simply z. */
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
   brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
   brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
   /* Three more multiplies by t supply the t^3 factor:
      6t^5 - 15t^4 + 10t^3 = ( 6t^2 - 15t + 10 ) * t^3. */
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); /* prepare t = y-1 for the y component below */
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param2, brw_imm_f( -1.0 ) ); /* prepare t = z-1 for the front z component */
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   /* These are the front (z=1) corners, so the z offset is z-1 (in t). */
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* The interpolation coefficients are still around from last time, so
      again interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* The final interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1232
1233 static void emit_noise3( struct brw_wm_compile *c,
1234 const struct prog_instruction *inst )
1235 {
1236 struct brw_compile *p = &c->func;
1237 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1238 GLuint mask = inst->DstReg.WriteMask;
1239 int i;
1240 int mark = mark_tmps( c );
1241
1242 assert( mark == 0 );
1243
1244 src0 = get_src_reg( c, inst, 0, 0 );
1245 src1 = get_src_reg( c, inst, 0, 1 );
1246 src2 = get_src_reg( c, inst, 0, 2 );
1247
1248 param0 = alloc_tmp( c );
1249 param1 = alloc_tmp( c );
1250 param2 = alloc_tmp( c );
1251
1252 brw_MOV( p, param0, src0 );
1253 brw_MOV( p, param1, src1 );
1254 brw_MOV( p, param2, src2 );
1255
1256 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1257
1258 /* Fill in the result: */
1259 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1260 for (i = 0 ; i < 4; i++) {
1261 if (mask & (1<<i)) {
1262 dst = get_dst_reg(c, inst, i);
1263 brw_MOV( p, dst, param0 );
1264 }
1265 }
1266 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1267 brw_set_saturate( p, 0 );
1268
1269 release_tmps( c, mark );
1270 }
1271
1272 /**
1273 * For the four-dimensional case, the little micro-optimisation benefits
1274 * we obtain by unrolling all the loops aren't worth the massive bloat it
1275 * now causes. Instead, we loop twice around performing a similar operation
1276 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
1277 * code to glue it all together.
1278 */
static void noise4_sub( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_reg param[ 4 ],
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      w0, /* noise for the w=0 cube */
      floors[ 2 ], /* integer coordinates of base corner of hypercube */
      interp[ 4 ], /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i, j;
   int mark = mark_tmps( c );
   GLuint loop, origin;

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   w0 = alloc_tmp( c );
   floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
   floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

   /* The caller (emit_noise4) allocated the four input coordinates
      immediately before invoking this subroutine, so they sit at fixed
      offsets below our temporary mark.  The result is returned in
      param[ 0 ] (see the final MUL at the end). */
   for( i = 0; i < 4; i++ ) {
      param[ i ] = lookup_tmp( c, mark - 5 + i );
      interp[ i ] = alloc_tmp( c );
   }

   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* We only want 16 bits of precision from the integral part of each
      co-ordinate, but unfortunately the RNDD semantics would saturate
      at 16 bits if we performed the operation directly to a 16-bit
      destination.  Therefore, we round to 32-bit temporaries where
      appropriate, and then store only the lower 16 bits. */
   brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
   brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
   /* Pack the x/y floors into floors[ 0 ] and the z/w floors into
      floors[ 1 ] as low/high 16-bit word pairs. */
   brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
   brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

   /* Modify the flag register here, because the side effect is useful
      later (see below).  We know for certain that all flags will be
      cleared, since the FRC instruction cannot possibly generate
      negative results.  Even for exceptional inputs (infinities, denormals,
      NaNs), the architecture guarantees that the L conditional is false. */
   brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
   brw_FRC( p, param[ 0 ], param[ 0 ] );
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   for( i = 1; i < 4; i++ )
      brw_FRC( p, param[ i ], param[ i ] );

   /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
      of all. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
   /* Three more multiplies by t supply the t^3 factor:
      6t^5 - 15t^4 + 10t^3 = ( 6t^2 - 15t + 10 ) * t^3. */
   for( j = 0; j < 3; j++ )
      for( i = 0; i < 4; i++ )
	 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

   /* Mark the current address, as it will be a jump destination.  The
      following code will be executed twice: first, with the flag
      register clear indicating the w=0 case, and second with flags
      set for w=1. */
   loop = p->nr_insn;

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Since we have only 16 bits of precision in the hash, we
      must be careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
	    brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
	    brw_imm_uw( 0x9B93 ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 0xA359 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Advance the hashes by 4 bits (four components now share each
      16-bit hash, versus 5 bits per component in the 3D case). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time) -- the flag register set up before
      the loop (clear on pass one, set on pass two) selects which. */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) ); /* prepare t = x-1 for the front face below */

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Here we interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) ); /* prepare t = z-1 for the front z component */
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time) */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Interpolate in the y dimension: */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* Another interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* Exit the loop if we've computed both cubes...  This branch is
      predicated on the flag register: clear (pass one) falls through,
      set (pass two) takes the jump.  The jump distance is patched in
      below once the target address is known. */
   origin = p->nr_insn;
   brw_push_insn_state( p );
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
   brw_pop_insn_state( p );

   /* Save the result for the w=0 case, and increment the w coordinate: */
   brw_MOV( p, w0, tmp[ 0 ] );
   brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 1 ) );

   /* Loop around for the other cube.  Explicitly set the flag register
      (unfortunately we must spend an extra instruction to do this: we
      can't rely on a side effect of the previous MOV or ADD because
      conditional modifiers which are normally true might be false in
      exceptional circumstances, e.g. given a NaN input; the add to
      brw_ip_reg() is not suitable because the IP is not an 8-vector). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
	    brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
   brw_pop_insn_state( p );

   /* Patch the previous conditional branch now that we know the
      destination address (each instruction is 16 bytes, hence the
      << 4 on the instruction-count delta). */
   brw_set_src1( p->store + origin,
		 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

   /* The very last interpolation. */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1655
1656 static void emit_noise4( struct brw_wm_compile *c,
1657 const struct prog_instruction *inst )
1658 {
1659 struct brw_compile *p = &c->func;
1660 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
1661 GLuint mask = inst->DstReg.WriteMask;
1662 int i;
1663 int mark = mark_tmps( c );
1664
1665 assert( mark == 0 );
1666
1667 src0 = get_src_reg( c, inst, 0, 0 );
1668 src1 = get_src_reg( c, inst, 0, 1 );
1669 src2 = get_src_reg( c, inst, 0, 2 );
1670 src3 = get_src_reg( c, inst, 0, 3 );
1671
1672 param0 = alloc_tmp( c );
1673 param1 = alloc_tmp( c );
1674 param2 = alloc_tmp( c );
1675 param3 = alloc_tmp( c );
1676
1677 brw_MOV( p, param0, src0 );
1678 brw_MOV( p, param1, src1 );
1679 brw_MOV( p, param2, src2 );
1680 brw_MOV( p, param3, src3 );
1681
1682 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
1683
1684 /* Fill in the result: */
1685 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1686 for (i = 0 ; i < 4; i++) {
1687 if (mask & (1<<i)) {
1688 dst = get_dst_reg(c, inst, i);
1689 brw_MOV( p, dst, param0 );
1690 }
1691 }
1692 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1693 brw_set_saturate( p, 0 );
1694
1695 release_tmps( c, mark );
1696 }
1697
1698 /**
1699 * Resolve subroutine calls after code emit is done.
1700 */
1701 static void post_wm_emit( struct brw_wm_compile *c )
1702 {
1703 brw_resolve_cals(&c->func);
1704 }
1705
1706 static void
1707 get_argument_regs(struct brw_wm_compile *c,
1708 const struct prog_instruction *inst,
1709 int index,
1710 struct brw_reg *dst,
1711 struct brw_reg *regs,
1712 int mask)
1713 {
1714 struct brw_compile *p = &c->func;
1715 int i, j;
1716
1717 for (i = 0; i < 4; i++) {
1718 if (mask & (1 << i)) {
1719 regs[i] = get_src_reg(c, inst, index, i);
1720
1721 /* Unalias destination registers from our sources. */
1722 if (regs[i].file == BRW_GENERAL_REGISTER_FILE) {
1723 for (j = 0; j < 4; j++) {
1724 if (memcmp(&regs[i], &dst[j], sizeof(regs[0])) == 0) {
1725 struct brw_reg tmp = alloc_tmp(c);
1726 brw_MOV(p, tmp, regs[i]);
1727 regs[i] = tmp;
1728 break;
1729 }
1730 }
1731 }
1732 }
1733 }
1734 }
1735
1736 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
1737 {
1738 struct intel_context *intel = &brw->intel;
1739 #define MAX_IF_DEPTH 32
1740 #define MAX_LOOP_DEPTH 32
1741 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1742 GLuint i, if_depth = 0, loop_depth = 0;
1743 struct brw_compile *p = &c->func;
1744 struct brw_indirect stack_index = brw_indirect(0, 0);
1745
1746 c->out_of_regs = GL_FALSE;
1747
1748 prealloc_reg(c);
1749 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1750 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1751
1752 for (i = 0; i < c->nr_fp_insns; i++) {
1753 const struct prog_instruction *inst = &c->prog_instructions[i];
1754 int dst_flags;
1755 struct brw_reg args[3][4], dst[4];
1756 int j;
1757 int mark = mark_tmps( c );
1758
1759 c->cur_inst = i;
1760
1761 #if 0
1762 printf("Inst %d: ", i);
1763 _mesa_print_instruction(inst);
1764 #endif
1765
1766 /* fetch any constants that this instruction needs */
1767 if (c->fp->use_const_buffer)
1768 fetch_constants(c, inst);
1769
1770 if (inst->Opcode != OPCODE_ARL) {
1771 for (j = 0; j < 4; j++) {
1772 if (inst->DstReg.WriteMask & (1 << j))
1773 dst[j] = get_dst_reg(c, inst, j);
1774 else
1775 dst[j] = brw_null_reg();
1776 }
1777 }
1778 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
1779 get_argument_regs(c, inst, j, dst, args[j], WRITEMASK_XYZW);
1780
1781 dst_flags = inst->DstReg.WriteMask;
1782 if (inst->SaturateMode == SATURATE_ZERO_ONE)
1783 dst_flags |= SATURATE;
1784
1785 if (inst->CondUpdate)
1786 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
1787 else
1788 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
1789
1790 switch (inst->Opcode) {
1791 case WM_PIXELXY:
1792 emit_pixel_xy(c, dst, dst_flags);
1793 break;
1794 case WM_DELTAXY:
1795 emit_delta_xy(p, dst, dst_flags, args[0]);
1796 break;
1797 case WM_PIXELW:
1798 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1799 break;
1800 case WM_LINTERP:
1801 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1802 break;
1803 case WM_PINTERP:
1804 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1805 break;
1806 case WM_CINTERP:
1807 emit_cinterp(p, dst, dst_flags, args[0]);
1808 break;
1809 case WM_WPOSXY:
1810 emit_wpos_xy(c, dst, dst_flags, args[0]);
1811 break;
1812 case WM_FB_WRITE:
1813 emit_fb_write(c, args[0], args[1], args[2],
1814 INST_AUX_GET_TARGET(inst->Aux),
1815 inst->Aux & INST_AUX_EOT);
1816 break;
1817 case WM_FRONTFACING:
1818 emit_frontfacing(p, dst, dst_flags);
1819 break;
1820 case OPCODE_ADD:
1821 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1822 break;
1823 case OPCODE_ARL:
1824 emit_arl(c, inst);
1825 break;
1826 case OPCODE_FRC:
1827 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1828 break;
1829 case OPCODE_FLR:
1830 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1831 break;
1832 case OPCODE_LRP:
1833 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1834 break;
1835 case OPCODE_TRUNC:
1836 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1837 break;
1838 case OPCODE_MOV:
1839 case OPCODE_SWZ:
1840 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1841 break;
1842 case OPCODE_DP3:
1843 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1844 break;
1845 case OPCODE_DP4:
1846 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1847 break;
1848 case OPCODE_XPD:
1849 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1850 break;
1851 case OPCODE_DPH:
1852 emit_dph(p, dst, dst_flags, args[0], args[1]);
1853 break;
1854 case OPCODE_RCP:
1855 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1856 break;
1857 case OPCODE_RSQ:
1858 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1859 break;
1860 case OPCODE_SIN:
1861 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1862 break;
1863 case OPCODE_COS:
1864 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1865 break;
1866 case OPCODE_EX2:
1867 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1868 break;
1869 case OPCODE_LG2:
1870 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1871 break;
1872 case OPCODE_MIN:
1873 emit_min(p, dst, dst_flags, args[0], args[1]);
1874 break;
1875 case OPCODE_MAX:
1876 emit_max(p, dst, dst_flags, args[0], args[1]);
1877 break;
1878 case OPCODE_DDX:
1879 case OPCODE_DDY:
1880 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
1881 args[0]);
1882 break;
1883 case OPCODE_SLT:
1884 emit_sop(p, dst, dst_flags,
1885 BRW_CONDITIONAL_L, args[0], args[1]);
1886 break;
1887 case OPCODE_SLE:
1888 emit_sop(p, dst, dst_flags,
1889 BRW_CONDITIONAL_LE, args[0], args[1]);
1890 break;
1891 case OPCODE_SGT:
1892 emit_sop(p, dst, dst_flags,
1893 BRW_CONDITIONAL_G, args[0], args[1]);
1894 break;
1895 case OPCODE_SGE:
1896 emit_sop(p, dst, dst_flags,
1897 BRW_CONDITIONAL_GE, args[0], args[1]);
1898 break;
1899 case OPCODE_SEQ:
1900 emit_sop(p, dst, dst_flags,
1901 BRW_CONDITIONAL_EQ, args[0], args[1]);
1902 break;
1903 case OPCODE_SNE:
1904 emit_sop(p, dst, dst_flags,
1905 BRW_CONDITIONAL_NEQ, args[0], args[1]);
1906 break;
1907 case OPCODE_MUL:
1908 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1909 break;
1910 case OPCODE_POW:
1911 emit_math2(c, BRW_MATH_FUNCTION_POW,
1912 dst, dst_flags, args[0], args[1]);
1913 break;
1914 case OPCODE_MAD:
1915 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1916 break;
1917 case OPCODE_NOISE1:
1918 emit_noise1(c, inst);
1919 break;
1920 case OPCODE_NOISE2:
1921 emit_noise2(c, inst);
1922 break;
1923 case OPCODE_NOISE3:
1924 emit_noise3(c, inst);
1925 break;
1926 case OPCODE_NOISE4:
1927 emit_noise4(c, inst);
1928 break;
1929 case OPCODE_TEX:
1930 emit_tex(c, dst, dst_flags, args[0],
1931 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1932 0, 1, 0, 0),
1933 inst->TexSrcTarget,
1934 inst->TexSrcUnit,
1935 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
1936 break;
1937 case OPCODE_TXB:
1938 emit_txb(c, dst, dst_flags, args[0],
1939 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1940 0, 1, 0, 0),
1941 inst->TexSrcTarget,
1942 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
1943 break;
1944 case OPCODE_KIL_NV:
1945 emit_kil(c);
1946 break;
1947 case OPCODE_IF:
1948 assert(if_depth < MAX_IF_DEPTH);
1949 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
1950 break;
1951 case OPCODE_ELSE:
1952 assert(if_depth > 0);
1953 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1954 break;
1955 case OPCODE_ENDIF:
1956 assert(if_depth > 0);
1957 brw_ENDIF(p, if_inst[--if_depth]);
1958 break;
1959 case OPCODE_BGNSUB:
1960 brw_save_label(p, inst->Comment, p->nr_insn);
1961 break;
1962 case OPCODE_ENDSUB:
1963 /* no-op */
1964 break;
1965 case OPCODE_CAL:
1966 brw_push_insn_state(p);
1967 brw_set_mask_control(p, BRW_MASK_DISABLE);
1968 brw_set_access_mode(p, BRW_ALIGN_1);
1969 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1970 brw_set_access_mode(p, BRW_ALIGN_16);
1971 brw_ADD(p, get_addr_reg(stack_index),
1972 get_addr_reg(stack_index), brw_imm_d(4));
1973 brw_save_call(&c->func, inst->Comment, p->nr_insn);
1974 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1975 brw_pop_insn_state(p);
1976 break;
1977
1978 case OPCODE_RET:
1979 brw_push_insn_state(p);
1980 brw_set_mask_control(p, BRW_MASK_DISABLE);
1981 brw_ADD(p, get_addr_reg(stack_index),
1982 get_addr_reg(stack_index), brw_imm_d(-4));
1983 brw_set_access_mode(p, BRW_ALIGN_1);
1984 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
1985 brw_set_access_mode(p, BRW_ALIGN_16);
1986 brw_pop_insn_state(p);
1987
1988 break;
1989 case OPCODE_BGNLOOP:
1990 /* XXX may need to invalidate the current_constant regs */
1991 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
1992 break;
1993 case OPCODE_BRK:
1994 brw_BREAK(p);
1995 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1996 break;
1997 case OPCODE_CONT:
1998 brw_CONT(p);
1999 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2000 break;
2001 case OPCODE_ENDLOOP:
2002 {
2003 struct brw_instruction *inst0, *inst1;
2004 GLuint br = 1;
2005
2006 if (intel->is_ironlake)
2007 br = 2;
2008
2009 assert(loop_depth > 0);
2010 loop_depth--;
2011 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2012 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2013 while (inst0 > loop_inst[loop_depth]) {
2014 inst0--;
2015 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2016 inst0->bits3.if_else.jump_count == 0) {
2017 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2018 inst0->bits3.if_else.pop_count = 0;
2019 }
2020 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2021 inst0->bits3.if_else.jump_count == 0) {
2022 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2023 inst0->bits3.if_else.pop_count = 0;
2024 }
2025 }
2026 }
2027 break;
2028 default:
2029 printf("unsupported IR in fragment shader %d\n",
2030 inst->Opcode);
2031 }
2032
2033 /* Release temporaries containing any unaliased source regs. */
2034 release_tmps( c, mark );
2035
2036 if (inst->CondUpdate)
2037 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2038 else
2039 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2040 }
2041 post_wm_emit(c);
2042
2043 if (INTEL_DEBUG & DEBUG_WM) {
2044 printf("wm-native:\n");
2045 for (i = 0; i < p->nr_insn; i++)
2046 brw_disasm(stderr, &p->store[i]);
2047 printf("\n");
2048 }
2049 }
2050
2051 /**
2052 * Do GPU code generation for shaders that use GLSL features such as
2053 * flow control. Other shaders will be compiled with the
2054 */
2055 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2056 {
2057 if (INTEL_DEBUG & DEBUG_WM) {
2058 printf("brw_wm_glsl_emit:\n");
2059 }
2060
2061 /* initial instruction translation/simplification */
2062 brw_wm_pass_fp(c);
2063
2064 /* actual code generation */
2065 brw_wm_emit_glsl(brw, c);
2066
2067 if (INTEL_DEBUG & DEBUG_WM) {
2068 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2069 }
2070
2071 c->prog_data.total_grf = num_grf_used(c);
2072 c->prog_data.total_scratch = 0;
2073 }