i965: Add INTEL_DEBUG=glsl_force to force brw_wm_glsl.c.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
9 enum _subroutine {
10 SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
11 };
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 if (INTEL_DEBUG & DEBUG_GLSL_FORCE)
27 return GL_TRUE;
28
29 for (i = 0; i < fp->Base.NumInstructions; i++) {
30 const struct prog_instruction *inst = &fp->Base.Instructions[i];
31 switch (inst->Opcode) {
32 case OPCODE_ARL:
33 case OPCODE_IF:
34 case OPCODE_ENDIF:
35 case OPCODE_CAL:
36 case OPCODE_BRK:
37 case OPCODE_RET:
38 case OPCODE_NOISE1:
39 case OPCODE_NOISE2:
40 case OPCODE_NOISE3:
41 case OPCODE_NOISE4:
42 case OPCODE_BGNLOOP:
43 return GL_TRUE;
44 default:
45 break;
46 }
47 }
48 return GL_FALSE;
49 }
50
51
52
53 static void
54 reclaim_temps(struct brw_wm_compile *c);
55
56
57 /** Mark GRF register as used. */
58 static void
59 prealloc_grf(struct brw_wm_compile *c, int r)
60 {
61 c->used_grf[r] = GL_TRUE;
62 }
63
64
65 /** Mark given GRF register as not in use. */
66 static void
67 release_grf(struct brw_wm_compile *c, int r)
68 {
69 /*assert(c->used_grf[r]);*/
70 c->used_grf[r] = GL_FALSE;
71 c->first_free_grf = MIN2(c->first_free_grf, r);
72 }
73
74
75 /** Return index of a free GRF, mark it as used. */
76 static int
77 alloc_grf(struct brw_wm_compile *c)
78 {
79 GLuint r;
80 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
81 if (!c->used_grf[r]) {
82 c->used_grf[r] = GL_TRUE;
83 c->first_free_grf = r + 1; /* a guess */
84 return r;
85 }
86 }
87
88 /* no free temps, try to reclaim some */
89 reclaim_temps(c);
90 c->first_free_grf = 0;
91
92 /* try alloc again */
93 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
94 if (!c->used_grf[r]) {
95 c->used_grf[r] = GL_TRUE;
96 c->first_free_grf = r + 1; /* a guess */
97 return r;
98 }
99 }
100
101 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
102 assert(c->used_grf[r]);
103 }
104
105 /* really, no free GRF regs found */
106 if (!c->out_of_regs) {
107 /* print warning once per compilation */
108 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
109 c->out_of_regs = GL_TRUE;
110 }
111
112 return -1;
113 }
114
115
116 /** Return number of GRF registers used */
117 static int
118 num_grf_used(const struct brw_wm_compile *c)
119 {
120 int r;
121 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
122 if (c->used_grf[r])
123 return r + 1;
124 return 0;
125 }
126
127
128
129 /**
130 * Record the mapping of a Mesa register to a hardware register.
131 */
132 static void set_reg(struct brw_wm_compile *c, int file, int index,
133 int component, struct brw_reg reg)
134 {
135 c->wm_regs[file][index][component].reg = reg;
136 c->wm_regs[file][index][component].inited = GL_TRUE;
137 }
138
139 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
140 {
141 struct brw_reg reg;
142
143 /* if we need to allocate another temp, grow the tmp_regs[] array */
144 if (c->tmp_index == c->tmp_max) {
145 int r = alloc_grf(c);
146 if (r < 0) {
147 /*printf("Out of temps in %s\n", __FUNCTION__);*/
148 r = 50; /* XXX random register! */
149 }
150 c->tmp_regs[ c->tmp_max++ ] = r;
151 }
152
153 /* form the GRF register */
154 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
155 /*printf("alloc_temp %d\n", reg.nr);*/
156 assert(reg.nr < BRW_WM_MAX_GRF);
157 return reg;
158
159 }
160
/**
 * Save current temp register info.
 * There must be a matching call to release_tmps().
 */
static int mark_tmps(struct brw_wm_compile *c)
{
   /* The mark is simply the current depth of the temp stack. */
   return c->tmp_index;
}
169
170 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
171 {
172 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
173 }
174
/** Pop the temp stack back to the state saved by mark_tmps(). */
static void release_tmps(struct brw_wm_compile *c, int mark)
{
   c->tmp_index = mark;
}
179
/**
 * Convert Mesa src register to brw register.
 *
 * Since we're running in SOA mode each Mesa register corresponds to four
 * hardware registers.  We allocate the hardware registers as needed here.
 *
 * \param file register file, one of PROGRAM_x
 * \param index register number
 * \param component src component (X=0, Y=1, Z=2, W=3)
 * \param nr not used?!?
 * \param neg per-component negate bitmask
 * \param abs take absolute value?
 */
static struct brw_reg
get_reg(struct brw_wm_compile *c, int file, int index, int component,
        int nr, GLuint neg, GLuint abs)
{
   struct brw_reg reg;
   switch (file) {
   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
      /* all constant-like files share one slot in the register map */
      file = PROGRAM_STATE_VAR;
      break;
   case PROGRAM_UNDEFINED:
      return brw_null_reg();
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
   case PROGRAM_PAYLOAD:
      break;
   default:
      _mesa_problem(NULL, "Unexpected file in get_reg()");
      return brw_null_reg();
   }

   assert(index < 256);
   assert(component < 4);

   /* see if we've already allocated a HW register for this Mesa register */
   if (c->wm_regs[file][index][component].inited) {
      /* yes, re-use */
      reg = c->wm_regs[file][index][component].reg;
   }
   else {
      /* no, allocate new register */
      int grf = alloc_grf(c);
      if (grf < 0) {
         /* totally out of temps */
         grf = 51; /* XXX random register! */
      }

      reg = brw_vec8_grf(grf, 0);

      /* remember the mapping for subsequent lookups */
      set_reg(c, file, index, component, reg);
   }

   /* apply source modifiers */
   if (neg & (1 << component)) {
      reg = negate(reg);
   }
   if (abs)
      reg = brw_abs(reg);
   return reg;
}
246
247
248
/**
 * This is called if we run out of GRF registers.  Examine the live intervals
 * of temp regs in the program and free those which won't be used again.
 */
static void
reclaim_temps(struct brw_wm_compile *c)
{
   GLint intBegin[MAX_PROGRAM_TEMPS];
   GLint intEnd[MAX_PROGRAM_TEMPS];
   int index;

   /* compute [first use, last use] instruction intervals per temp */
   _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
                             intBegin, intEnd);

   for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
      /* a temp is dead if its last use precedes the current instruction */
      if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
         /* program temp[i] can be freed */
         int component;
         for (component = 0; component < 4; component++) {
            if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
               int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
               release_grf(c, r);
               /* invalidate the mapping so the slot can be re-allocated */
               c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
            }
         }
      }
   }
}
284
285
286
287
/**
 * Preallocate registers.  This sets up the Mesa to hardware register
 * mapping for certain registers, such as constants (uniforms/state vars)
 * and shader inputs.
 */
static void prealloc_reg(struct brw_wm_compile *c)
{
   struct intel_context *intel = &c->func.brw->intel;
   int i, j;
   struct brw_reg reg;
   int urb_read_length = 0;
   GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
   GLuint reg_index = 0;

   /* start with an empty allocation map */
   memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
   c->first_free_grf = 0;

   /* Depth payload: each depth reg spans two GRFs (hence i * 2);
    * unused components just alias g0. */
   for (i = 0; i < 4; i++) {
      if (i < c->key.nr_depth_regs)
         reg = brw_vec8_grf(i * 2, 0);
      else
         reg = brw_vec8_grf(0, 0);
      set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
   }
   reg_index += 2 * c->key.nr_depth_regs;

   /* constants */
   {
      const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
      const GLuint nr_temps = c->fp->program.Base.NumTemporaries;

      /* use a real constant buffer, or just use a section of the GRF? */
      /* XXX this heuristic may need adjustment... */
      if ((nr_params + nr_temps) * 4 + reg_index > 80)
         c->fp->use_const_buffer = GL_TRUE;
      else
         c->fp->use_const_buffer = GL_FALSE;

      if (c->fp->use_const_buffer) {
         /* We'll use a real constant buffer and fetch constants from
          * it with a dataport read message.
          */

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 0;
      }
      else {
         const struct gl_program_parameter_list *plist =
            c->fp->program.Base.Parameters;
         int index = 0;

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 4 * nr_params;

         /* loop over program constants (float[4]) */
         for (i = 0; i < nr_params; i++) {
            /* loop over XYZW channels */
            for (j = 0; j < 4; j++, index++) {
               reg = brw_vec1_grf(reg_index + index / 8, index % 8);
               /* Save pointer to parameter/constant value.
                * Constants will be copied in prepare_constant_buffer()
                */
               c->prog_data.param[index] = &plist->ParameterValues[i][j];
               set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
            }
         }
         /* number of constant regs used (each reg is float[8]) */
         c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
         reg_index += c->nr_creg;
      }
   }

   /* fragment shader inputs: map vertex results onto FP attributes */
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      int fp_input;

      if (i >= VERT_RESULT_VAR0)
         fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
      else if (i <= VERT_RESULT_TEX7)
         fp_input = i;
      else
         fp_input = -1;   /* vertex result with no FP counterpart */

      if (fp_input >= 0 && inputs & (1 << fp_input)) {
         /* NOTE(review): this records the GRF offset of the last used
          * input, not a cumulative count — confirm intended semantics. */
         urb_read_length = reg_index;
         reg = brw_vec8_grf(reg_index, 0);
         for (j = 0; j < 4; j++)
            set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
      }
      if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
         /* every written VP output advances the payload by two GRFs,
          * even if the FP doesn't read it */
         reg_index += 2;
      }
   }

   c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
   c->prog_data.urb_read_length = urb_read_length;
   c->prog_data.curb_read_length = c->nr_creg;
   c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index++;
   c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index += 2;

   /* mark GRF regs [0..reg_index-1] as in-use */
   for (i = 0; i < reg_index; i++)
      prealloc_grf(c, i);

   /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
   prealloc_grf(c, 126);
   prealloc_grf(c, 127);

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];
      struct brw_reg dst[4];

      switch (inst->Opcode) {
      case OPCODE_TEX:
      case OPCODE_TXB:
         /* Allocate the channels of texture results contiguously,
          * since they are written out that way by the sampler unit.
          */
         for (j = 0; j < 4; j++) {
            dst[j] = get_dst_reg(c, inst, j);
            if (j != 0)
               assert(dst[j].nr == dst[j - 1].nr + 1);
         }
         break;
      default:
         break;
      }
   }

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];

      switch (inst->Opcode) {
      case WM_DELTAXY:
         /* Allocate WM_DELTAXY destination on G45/GM45 to an
          * even-numbered GRF if possible so that we can use the PLN
          * instruction.
          */
         if (inst->DstReg.WriteMask == WRITEMASK_XY &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][0].inited &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][1].inited &&
             (IS_G4X(intel->intelScreen->deviceID) || intel->gen == 5)) {
            int grf;

            /* scan even-aligned GRF pairs */
            for (grf = c->first_free_grf & ~1;
                 grf < BRW_WM_MAX_GRF;
                 grf += 2)
            {
               if (!c->used_grf[grf] && !c->used_grf[grf + 1]) {
                  c->used_grf[grf] = GL_TRUE;
                  c->used_grf[grf + 1] = GL_TRUE;
                  c->first_free_grf = grf + 2;  /* a guess */

                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 0,
                          brw_vec8_grf(grf, 0));
                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 1,
                          brw_vec8_grf(grf + 1, 0));
                  break;
               }
            }
         }
         /* falls through to default, which is a no-op */
      default:
         break;
      }
   }

   /* An instruction may reference up to three constants.
    * They'll be found in these registers.
    * XXX alloc these on demand!
    */
   if (c->fp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
      }
   }
#if 0
   printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
   printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
#endif
}
472
473
/**
 * Check if any of the instruction's src registers are constants, uniforms,
 * or statevars.  If so, fetch any constants that we don't already have in
 * the three GRF slots.
 */
static void fetch_constants(struct brw_wm_compile *c,
                            const struct prog_instruction *inst)
{
   struct brw_compile *p = &c->func;
   GLuint i;

   /* loop over instruction src regs */
   for (i = 0; i < 3; i++) {
      const struct prog_src_register *src = &inst->SrcReg[i];
      if (src->File == PROGRAM_STATE_VAR ||
          src->File == PROGRAM_CONSTANT ||
          src->File == PROGRAM_UNIFORM) {
         /* NOTE(review): the fetch is issued even when current_const[i].index
          * already equals src->Index — possibly a redundant read; confirm. */
         c->current_const[i].index = src->Index;

#if 0
         printf("  fetch const[%d] for arg %d into reg %d\n",
                src->Index, i, c->current_const[i].reg.nr);
#endif

         /* need to fetch the constant now */
         brw_dp_READ_4(p,
                       c->current_const[i].reg,  /* writeback dest */
                       src->RelAddr,             /* relative indexing? */
                       16 * src->Index,          /* byte offset */
                       SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
                       );
      }
   }
}
508
509
510 /**
511 * Convert Mesa dst register to brw register.
512 */
513 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
514 const struct prog_instruction *inst,
515 GLuint component)
516 {
517 const int nr = 1;
518 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
519 0, 0);
520 }
521
522
static struct brw_reg
get_src_reg_const(struct brw_wm_compile *c,
                  const struct prog_instruction *inst,
                  GLuint srcRegIndex, GLuint component)
{
   /* We should have already fetched the constant from the constant
    * buffer in fetch_constants().  Now we just have to return a
    * register description that extracts the needed component and
    * smears it across all eight vector components.
    */
   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
   struct brw_reg const_reg;

   assert(component < 4);
   assert(srcRegIndex < 3);
   /* fetch_constants() must have populated this slot */
   assert(c->current_const[srcRegIndex].index != -1);
   const_reg = c->current_const[srcRegIndex].reg;

   /* extract desired float from the const_reg, and smear */
   const_reg = stride(const_reg, 0, 1, 0);
   const_reg.subnr = component * 4;   /* byte offset of the float */

   /* apply per-component source modifiers */
   if (src->Negate & (1 << component))
      const_reg = negate(const_reg);
   if (src->Abs)
      const_reg = brw_abs(const_reg);

#if 0
   printf("  form const[%d].%d for arg %d, reg %d\n",
          c->current_const[srcRegIndex].index,
          component,
          srcRegIndex,
          const_reg.nr);
#endif

   return const_reg;
}
560
561
562 /**
563 * Convert Mesa src register to brw register.
564 */
565 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
566 const struct prog_instruction *inst,
567 GLuint srcRegIndex, GLuint channel)
568 {
569 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
570 const GLuint nr = 1;
571 const GLuint component = GET_SWZ(src->Swizzle, channel);
572
573 /* Extended swizzle terms */
574 if (component == SWIZZLE_ZERO) {
575 return brw_imm_f(0.0F);
576 }
577 else if (component == SWIZZLE_ONE) {
578 return brw_imm_f(1.0F);
579 }
580
581 if (c->fp->use_const_buffer &&
582 (src->File == PROGRAM_STATE_VAR ||
583 src->File == PROGRAM_CONSTANT ||
584 src->File == PROGRAM_UNIFORM)) {
585 return get_src_reg_const(c, inst, srcRegIndex, component);
586 }
587 else {
588 /* other type of source register */
589 return get_reg(c, src->File, src->Index, component, nr,
590 src->Negate, src->Abs);
591 }
592 }
593
/**
 * Subroutines are minimal support for resusable instruction sequences.
 * They are implemented as simply as possible to minimise overhead: there
 * is no explicit support for communication between the caller and callee
 * other than saving the return address in a temporary register, nor is
 * there any automatic local storage.  This implies that great care is
 * required before attempting reentrancy or any kind of nested
 * subroutine invocations.
 */
static void invoke_subroutine( struct brw_wm_compile *c,
                               enum _subroutine subroutine,
                               void (*emit)( struct brw_wm_compile * ) )
{
   struct brw_compile *p = &c->func;

   assert( subroutine < BRW_WM_MAX_SUBROUTINE );

   if( c->subroutines[ subroutine ] ) {
      /* subroutine previously emitted: reuse existing instructions */

      int mark = mark_tmps( c );
      struct brw_reg return_address = retype( alloc_tmp( c ),
                                              BRW_REGISTER_TYPE_UD );
      int here = p->nr_insn;

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* return address = IP + 2 instructions (this ADD plus the jump
       * below); offsets appear to be in 16-byte instruction units,
       * hence the << 4 */
      brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );

      /* jump into the previously-emitted body via a relative IP add */
      brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
               brw_imm_d( ( c->subroutines[ subroutine ] -
                            here - 1 ) << 4 ) );
      brw_pop_insn_state(p);

      release_tmps( c, mark );
   } else {
      /* previously unused subroutine: emit, and mark for later reuse */

      int mark = mark_tmps( c );
      struct brw_reg return_address = retype( alloc_tmp( c ),
                                              BRW_REGISTER_TYPE_UD );
      struct brw_instruction *calc;
      int base = p->nr_insn;

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* The skip distance past the inline body isn't known yet, so
       * emit with a 0 immediate and patch it below via brw_set_src1(). */
      calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
      brw_pop_insn_state(p);

      /* remember where this subroutine's body begins */
      c->subroutines[ subroutine ] = p->nr_insn;

      emit( c );

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* return: restore the saved IP */
      brw_MOV( p, brw_ip_reg(), return_address );
      brw_pop_insn_state(p);

      /* patch the ADD so inline execution falls past the emitted body */
      brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );

      release_tmps( c, mark );
   }
}
657
/** ARL: move channel 0 of src into the address register (a0). */
static void emit_arl(struct brw_wm_compile *c,
                     const struct prog_instruction *inst)
{
   struct brw_compile *p = &c->func;
   struct brw_reg src0, addr_reg;
   /* honour the instruction's saturate mode for the MOV below */
   brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
   addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                          BRW_ARF_ADDRESS, 0);
   src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
   brw_MOV(p, addr_reg, src0);
   brw_set_saturate(p, 0);
}
670
/**
 * For GLSL shaders, this KIL will be unconditional.
 * It may be contained inside an IF/ENDIF structure of course.
 */
static void emit_kil(struct brw_wm_compile *c)
{
   struct brw_compile *p = &c->func;
   /* NOTE(review): g0.0 is treated here as the live-pixel mask word in
    * the thread payload — confirm against the payload layout. */
   struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   /* invert the invocation mask, then AND it into the payload word so
    * channels active at this point are cleared (killed) */
   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
   brw_AND(p, depth, c->emit_mask_reg, depth);
   brw_pop_insn_state(p);
}
685
686 static INLINE struct brw_reg high_words( struct brw_reg reg )
687 {
688 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
689 0, 8, 2 );
690 }
691
692 static INLINE struct brw_reg low_words( struct brw_reg reg )
693 {
694 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
695 }
696
697 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
698 {
699 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
700 }
701
702 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
703 {
704 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
705 0, 16, 2 );
706 }
707
/* One-, two- and three-dimensional Perlin noise, similar to the description
   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */

/** Body of the SUB_NOISE1 subroutine.  The input coordinate (and, on
 *  exit, the result) lives in the temp slot two below this function's
 *  mark — see emit_noise1, which stores it there before the call. */
static void noise1_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param,
      x0, x1, /* gradients at each end */
      t, tmp[ 2 ], /* float temporaries */
      itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0 = alloc_tmp( c );
   x1 = alloc_tmp( c );
   t = alloc_tmp( c );
   tmp[ 0 ] = alloc_tmp( c );
   tmp[ 1 ] = alloc_tmp( c );
   itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
   itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
   itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
   itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
   itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );

   /* the caller's input coordinate, stored just below our mark */
   param = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

   /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
      be hashed.  Also compute the remainder (offset within the unit
      length), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
   brw_FRC( p, param, param );
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
   brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

   /* We're now ready to perform the hashing.  The two hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );

   /* Now we want to initialise the two gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 31 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
   brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

   brw_MUL( p, x0, x0, param );
   brw_MUL( p, x1, x1, t );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
                                          pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_MUL( p, param, tmp[ 0 ], param );
   brw_MUL( p, x1, x1, param );
   brw_ADD( p, x0, x0, x1 );
   /* scale by pow( 2, -30 ), to compensate for the format conversion
      above and an extra factor of 2 so that a single gradient covers
      the [-1,1] range */
   brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

   release_tmps( c, mark );
}
800
/** NOISE1: emit a call to the shared 1-D noise subroutine and fan the
 *  scalar result out to the write-masked destination channels. */
static void emit_noise1( struct brw_wm_compile *c,
                         const struct prog_instruction *inst )
{
   struct brw_compile *p = &c->func;
   struct brw_reg src, param, dst;
   GLuint mask = inst->DstReg.WriteMask;
   int i;
   int mark = mark_tmps( c );

   /* the subroutine locates its argument relative to an empty temp stack */
   assert( mark == 0 );

   src = get_src_reg( c, inst, 0, 0 );

   /* pass the coordinate in a temp; noise1_sub also returns its result here */
   param = alloc_tmp( c );

   brw_MOV( p, param, src );

   invoke_subroutine( c, SUB_NOISE1, noise1_sub );

   /* Fill in the result: */
   brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
   for (i = 0 ; i < 4; i++) {
      if (mask & (1<<i)) {
         dst = get_dst_reg(c, inst, i);
         brw_MOV( p, dst, param );
      }
   }
   if( inst->SaturateMode == SATURATE_ZERO_ONE )
      brw_set_saturate( p, 0 );

   release_tmps( c, mark );
}
833
/** Body of the SUB_NOISE2 subroutine.  The two input coordinates live in
 *  the temp slots three and two below this function's mark (stored there
 *  by emit_noise2); the result is left in the first of them. */
static void noise2_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1,
      x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
      t, tmp[ 4 ], /* float temporaries */
      itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 4; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
   }
   itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
   itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
   itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );

   /* the caller's input coordinates, stored just below our mark */
   param0 = lookup_tmp( c, mark - 3 );
   param1 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      square), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
   brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
            low_words( itmp[ 1 ] ) );
   brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
   brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
   brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

   /* We're now ready to perform the hashing.  The four hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );

   /* Now we want to initialise the four gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );

   brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
   brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
   brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
                                                pipeline */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
                                                pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_MUL( p, param0, tmp[ 0 ], param0 );
   brw_MUL( p, param1, tmp[ 1 ], param1 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, param1 );
   brw_MUL( p, x1y1, x1y1, param1 );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  There are horrible register dependencies here,
      but we have nothing else to do. */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, param0 );
   brw_ADD( p, x0y0, x0y0, x1y0 );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
970
/** NOISE2: emit a call to the shared 2-D noise subroutine and fan the
 *  scalar result out to the write-masked destination channels. */
static void emit_noise2( struct brw_wm_compile *c,
                         const struct prog_instruction *inst )
{
   struct brw_compile *p = &c->func;
   struct brw_reg src0, src1, param0, param1, dst;
   GLuint mask = inst->DstReg.WriteMask;
   int i;
   int mark = mark_tmps( c );

   /* the subroutine locates its arguments relative to an empty temp stack */
   assert( mark == 0 );

   src0 = get_src_reg( c, inst, 0, 0 );
   src1 = get_src_reg( c, inst, 0, 1 );

   /* pass the coordinates in temps; noise2_sub returns its result in param0 */
   param0 = alloc_tmp( c );
   param1 = alloc_tmp( c );

   brw_MOV( p, param0, src0 );
   brw_MOV( p, param1, src1 );

   invoke_subroutine( c, SUB_NOISE2, noise2_sub );

   /* Fill in the result: */
   brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
   for (i = 0 ; i < 4; i++) {
      if (mask & (1<<i)) {
         dst = get_dst_reg(c, inst, i);
         brw_MOV( p, dst, param0 );
      }
   }
   if( inst->SaturateMode == SATURATE_ZERO_ONE )
      brw_set_saturate( p, 0 );

   release_tmps( c, mark );
}
1006
/**
 * The three-dimensional case is much like the one- and two- versions above,
 * but since the number of corners is rapidly growing we now pack 16 16-bit
 * hashes into each register to extract more parallelism from the EUs.
 *
 * Subroutine body used by emit_noise3 via invoke_subroutine: the three
 * input coordinates are read from the caller's temporaries located at
 * mark - 4 .. mark - 2 (NOTE(review): the off-by-one slack relative to
 * the three allocations in emit_noise3 presumably accounts for a temp
 * taken by invoke_subroutine itself — confirm against that helper),
 * and the scalar result is written back into param0, pre-scaled so the
 * 2^15 fixed-point factor introduced by the 16-bit hashes cancels out.
 */
static void noise3_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1, param2,
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      xi, yi, zi, /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   xi = alloc_tmp( c );
   yi = alloc_tmp( c );
   zi = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      /* itmp/wtmp alias the same registers as tmp, just reinterpreted
         as dwords and 16-way words respectively. */
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   /* Pick up the caller's parameter temporaries (see header comment). */
   param0 = lookup_tmp( c, mark - 4 );
   param1 = lookup_tmp( c, mark - 3 );
   param2 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      cube), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_FRC( p, param2, param2 );
   /* Since we now have only 16 bits of precision in the hash, we must
      be more careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
            brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
            brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
               odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
               odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hashes left so the next component sees fresh bits. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
   brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
   brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
   /* Three further multiplies by the parameter complete the quintic. */
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* Front face sits at z=1, so the z weight is param2 - 1 from here on. */
   brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* The interpolation coefficients are still around from last time, so
      again interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* The final interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1273
1274 static void emit_noise3( struct brw_wm_compile *c,
1275 const struct prog_instruction *inst )
1276 {
1277 struct brw_compile *p = &c->func;
1278 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1279 GLuint mask = inst->DstReg.WriteMask;
1280 int i;
1281 int mark = mark_tmps( c );
1282
1283 assert( mark == 0 );
1284
1285 src0 = get_src_reg( c, inst, 0, 0 );
1286 src1 = get_src_reg( c, inst, 0, 1 );
1287 src2 = get_src_reg( c, inst, 0, 2 );
1288
1289 param0 = alloc_tmp( c );
1290 param1 = alloc_tmp( c );
1291 param2 = alloc_tmp( c );
1292
1293 brw_MOV( p, param0, src0 );
1294 brw_MOV( p, param1, src1 );
1295 brw_MOV( p, param2, src2 );
1296
1297 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1298
1299 /* Fill in the result: */
1300 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1301 for (i = 0 ; i < 4; i++) {
1302 if (mask & (1<<i)) {
1303 dst = get_dst_reg(c, inst, i);
1304 brw_MOV( p, dst, param0 );
1305 }
1306 }
1307 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1308 brw_set_saturate( p, 0 );
1309
1310 release_tmps( c, mark );
1311 }
1312
/**
 * For the four-dimensional case, the little micro-optimisation benefits
 * we obtain by unrolling all the loops aren't worth the massive bloat it
 * now causes. Instead, we loop twice around performing a similar operation
 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
 * code to glue it all together.
 *
 * Subroutine body used by emit_noise4 via invoke_subroutine: the four
 * input coordinates are read from the caller's temporaries located at
 * mark - 5 .. mark - 2 (NOTE(review): the extra slot relative to the
 * four allocations in emit_noise4 presumably belongs to
 * invoke_subroutine — confirm against that helper), and the scalar
 * result is written back into param[ 0 ], pre-scaled so the 2^15
 * fixed-point factor introduced by the 16-bit hashes cancels out.
 *
 * The two-pass loop is built by hand out of IP-register arithmetic:
 * the flag register distinguishes the w=0 pass (flags clear) from the
 * w=1 pass (flags set), and the conditional forward branch at `origin`
 * is patched once the loop end address is known.
 */
static void noise4_sub( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_reg param[ 4 ],
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      w0, /* noise for the w=0 cube */
      floors[ 2 ], /* integer coordinates of base corner of hypercube */
      interp[ 4 ], /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i, j;
   int mark = mark_tmps( c );
   GLuint loop, origin; /* instruction indices: loop head and branch to patch */

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   w0 = alloc_tmp( c );
   floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
   floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

   for( i = 0; i < 4; i++ ) {
      param[ i ] = lookup_tmp( c, mark - 5 + i );
      interp[ i ] = alloc_tmp( c );
   }

   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      /* itmp/wtmp alias the same registers as tmp, just reinterpreted
         as dwords and 16-way words respectively. */
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* We only want 16 bits of precision from the integral part of each
      co-ordinate, but unfortunately the RNDD semantics would saturate
      at 16 bits if we performed the operation directly to a 16-bit
      destination.  Therefore, we round to 32-bit temporaries where
      appropriate, and then store only the lower 16 bits. */
   brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
   brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
   brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
   brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

   /* Modify the flag register here, because the side effect is useful
      later (see below).  We know for certain that all flags will be
      cleared, since the FRC instruction cannot possibly generate
      negative results.  Even for exceptional inputs (infinities, denormals,
      NaNs), the architecture guarantees that the L conditional is false. */
   brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
   brw_FRC( p, param[ 0 ], param[ 0 ] );
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   for( i = 1; i < 4; i++ )
      brw_FRC( p, param[ i ], param[ i ] );

   /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
      of all. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
   for( j = 0; j < 3; j++ )
      for( i = 0; i < 4; i++ )
         brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

   /* Mark the current address, as it will be a jump destination.  The
      following code will be executed twice: first, with the flag
      register clear indicating the w=0 case, and second with flags
      set for w=1. */
   loop = p->nr_insn;

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Since we have only 16 bits of precision in the hash, we
      must be careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
            brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
            brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
            brw_imm_uw( 0x9B93 ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
            brw_imm_uw( 0xA359 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
            brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
               odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
               odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hashes left so the next component sees fresh bits. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time).  The flags set up by the FRC above
      (or by the explicit flag MOV at the loop end) select which. */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Here we interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time) */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Interpolate in the y dimension: */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* Another interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* Exit the loop if we've computed both cubes: a predicated forward
      jump whose offset is patched below once the end is known. */
   origin = p->nr_insn;
   brw_push_insn_state( p );
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
   brw_pop_insn_state( p );

   /* Save the result for the w=0 case, and increment the w coordinate: */
   brw_MOV( p, w0, tmp[ 0 ] );
   brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
            brw_imm_uw( 1 ) );

   /* Loop around for the other cube.  Explicitly set the flag register
      (unfortunately we must spend an extra instruction to do this: we
      can't rely on a side effect of the previous MOV or ADD because
      conditional modifiers which are normally true might be false in
      exceptional circumstances, e.g. given a NaN input; the add to
      brw_ip_reg() is not suitable because the IP is not an 8-vector). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
            brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
   brw_pop_insn_state( p );

   /* Patch the previous conditional branch now that we know the
      destination address.  (Offsets are in bytes; each instruction
      is 16 bytes, hence the << 4.) */
   brw_set_src1( p->store + origin,
                 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

   /* The very last interpolation. */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1696
1697 static void emit_noise4( struct brw_wm_compile *c,
1698 const struct prog_instruction *inst )
1699 {
1700 struct brw_compile *p = &c->func;
1701 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
1702 GLuint mask = inst->DstReg.WriteMask;
1703 int i;
1704 int mark = mark_tmps( c );
1705
1706 assert( mark == 0 );
1707
1708 src0 = get_src_reg( c, inst, 0, 0 );
1709 src1 = get_src_reg( c, inst, 0, 1 );
1710 src2 = get_src_reg( c, inst, 0, 2 );
1711 src3 = get_src_reg( c, inst, 0, 3 );
1712
1713 param0 = alloc_tmp( c );
1714 param1 = alloc_tmp( c );
1715 param2 = alloc_tmp( c );
1716 param3 = alloc_tmp( c );
1717
1718 brw_MOV( p, param0, src0 );
1719 brw_MOV( p, param1, src1 );
1720 brw_MOV( p, param2, src2 );
1721 brw_MOV( p, param3, src3 );
1722
1723 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
1724
1725 /* Fill in the result: */
1726 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1727 for (i = 0 ; i < 4; i++) {
1728 if (mask & (1<<i)) {
1729 dst = get_dst_reg(c, inst, i);
1730 brw_MOV( p, dst, param0 );
1731 }
1732 }
1733 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1734 brw_set_saturate( p, 0 );
1735
1736 release_tmps( c, mark );
1737 }
1738
1739 /**
1740 * Resolve subroutine calls after code emit is done.
1741 */
1742 static void post_wm_emit( struct brw_wm_compile *c )
1743 {
1744 brw_resolve_cals(&c->func);
1745 }
1746
1747 static void
1748 get_argument_regs(struct brw_wm_compile *c,
1749 const struct prog_instruction *inst,
1750 int index,
1751 struct brw_reg *dst,
1752 struct brw_reg *regs,
1753 int mask)
1754 {
1755 struct brw_compile *p = &c->func;
1756 int i, j;
1757
1758 for (i = 0; i < 4; i++) {
1759 if (mask & (1 << i)) {
1760 regs[i] = get_src_reg(c, inst, index, i);
1761
1762 /* Unalias destination registers from our sources. */
1763 if (regs[i].file == BRW_GENERAL_REGISTER_FILE) {
1764 for (j = 0; j < 4; j++) {
1765 if (memcmp(&regs[i], &dst[j], sizeof(regs[0])) == 0) {
1766 struct brw_reg tmp = alloc_tmp(c);
1767 brw_MOV(p, tmp, regs[i]);
1768 regs[i] = tmp;
1769 break;
1770 }
1771 }
1772 }
1773 }
1774 }
1775 }
1776
1777 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
1778 {
1779 struct intel_context *intel = &brw->intel;
1780 #define MAX_IF_DEPTH 32
1781 #define MAX_LOOP_DEPTH 32
1782 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1783 GLuint i, if_depth = 0, loop_depth = 0;
1784 struct brw_compile *p = &c->func;
1785 struct brw_indirect stack_index = brw_indirect(0, 0);
1786
1787 c->out_of_regs = GL_FALSE;
1788
1789 prealloc_reg(c);
1790 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1791 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1792
1793 for (i = 0; i < c->nr_fp_insns; i++) {
1794 const struct prog_instruction *inst = &c->prog_instructions[i];
1795 int dst_flags;
1796 struct brw_reg args[3][4], dst[4];
1797 int j;
1798 int mark = mark_tmps( c );
1799
1800 c->cur_inst = i;
1801
1802 #if 0
1803 printf("Inst %d: ", i);
1804 _mesa_print_instruction(inst);
1805 #endif
1806
1807 /* fetch any constants that this instruction needs */
1808 if (c->fp->use_const_buffer)
1809 fetch_constants(c, inst);
1810
1811 if (inst->Opcode != OPCODE_ARL) {
1812 for (j = 0; j < 4; j++) {
1813 if (inst->DstReg.WriteMask & (1 << j))
1814 dst[j] = get_dst_reg(c, inst, j);
1815 else
1816 dst[j] = brw_null_reg();
1817 }
1818 }
1819 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
1820 get_argument_regs(c, inst, j, dst, args[j], WRITEMASK_XYZW);
1821
1822 dst_flags = inst->DstReg.WriteMask;
1823 if (inst->SaturateMode == SATURATE_ZERO_ONE)
1824 dst_flags |= SATURATE;
1825
1826 if (inst->CondUpdate)
1827 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
1828 else
1829 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
1830
1831 switch (inst->Opcode) {
1832 case WM_PIXELXY:
1833 emit_pixel_xy(c, dst, dst_flags);
1834 break;
1835 case WM_DELTAXY:
1836 emit_delta_xy(p, dst, dst_flags, args[0]);
1837 break;
1838 case WM_PIXELW:
1839 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1840 break;
1841 case WM_LINTERP:
1842 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1843 break;
1844 case WM_PINTERP:
1845 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1846 break;
1847 case WM_CINTERP:
1848 emit_cinterp(p, dst, dst_flags, args[0]);
1849 break;
1850 case WM_WPOSXY:
1851 emit_wpos_xy(c, dst, dst_flags, args[0]);
1852 break;
1853 case WM_FB_WRITE:
1854 emit_fb_write(c, args[0], args[1], args[2],
1855 INST_AUX_GET_TARGET(inst->Aux),
1856 inst->Aux & INST_AUX_EOT);
1857 break;
1858 case WM_FRONTFACING:
1859 emit_frontfacing(p, dst, dst_flags);
1860 break;
1861 case OPCODE_ADD:
1862 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1863 break;
1864 case OPCODE_ARL:
1865 emit_arl(c, inst);
1866 break;
1867 case OPCODE_FRC:
1868 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1869 break;
1870 case OPCODE_FLR:
1871 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1872 break;
1873 case OPCODE_LRP:
1874 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1875 break;
1876 case OPCODE_TRUNC:
1877 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1878 break;
1879 case OPCODE_MOV:
1880 case OPCODE_SWZ:
1881 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1882 break;
1883 case OPCODE_DP3:
1884 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1885 break;
1886 case OPCODE_DP4:
1887 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1888 break;
1889 case OPCODE_XPD:
1890 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1891 break;
1892 case OPCODE_DPH:
1893 emit_dph(p, dst, dst_flags, args[0], args[1]);
1894 break;
1895 case OPCODE_RCP:
1896 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1897 break;
1898 case OPCODE_RSQ:
1899 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1900 break;
1901 case OPCODE_SIN:
1902 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1903 break;
1904 case OPCODE_COS:
1905 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1906 break;
1907 case OPCODE_EX2:
1908 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1909 break;
1910 case OPCODE_LG2:
1911 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1912 break;
1913 case OPCODE_CMP:
1914 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1915 break;
1916 case OPCODE_MIN:
1917 emit_min(p, dst, dst_flags, args[0], args[1]);
1918 break;
1919 case OPCODE_MAX:
1920 emit_max(p, dst, dst_flags, args[0], args[1]);
1921 break;
1922 case OPCODE_DDX:
1923 case OPCODE_DDY:
1924 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
1925 args[0]);
1926 break;
1927 case OPCODE_SLT:
1928 emit_sop(p, dst, dst_flags,
1929 BRW_CONDITIONAL_L, args[0], args[1]);
1930 break;
1931 case OPCODE_SLE:
1932 emit_sop(p, dst, dst_flags,
1933 BRW_CONDITIONAL_LE, args[0], args[1]);
1934 break;
1935 case OPCODE_SGT:
1936 emit_sop(p, dst, dst_flags,
1937 BRW_CONDITIONAL_G, args[0], args[1]);
1938 break;
1939 case OPCODE_SGE:
1940 emit_sop(p, dst, dst_flags,
1941 BRW_CONDITIONAL_GE, args[0], args[1]);
1942 break;
1943 case OPCODE_SEQ:
1944 emit_sop(p, dst, dst_flags,
1945 BRW_CONDITIONAL_EQ, args[0], args[1]);
1946 break;
1947 case OPCODE_SNE:
1948 emit_sop(p, dst, dst_flags,
1949 BRW_CONDITIONAL_NEQ, args[0], args[1]);
1950 break;
1951 case OPCODE_MUL:
1952 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1953 break;
1954 case OPCODE_POW:
1955 emit_math2(c, BRW_MATH_FUNCTION_POW,
1956 dst, dst_flags, args[0], args[1]);
1957 break;
1958 case OPCODE_MAD:
1959 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1960 break;
1961 case OPCODE_NOISE1:
1962 emit_noise1(c, inst);
1963 break;
1964 case OPCODE_NOISE2:
1965 emit_noise2(c, inst);
1966 break;
1967 case OPCODE_NOISE3:
1968 emit_noise3(c, inst);
1969 break;
1970 case OPCODE_NOISE4:
1971 emit_noise4(c, inst);
1972 break;
1973 case OPCODE_TEX:
1974 emit_tex(c, dst, dst_flags, args[0],
1975 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1976 0, 1, 0, 0),
1977 inst->TexSrcTarget,
1978 inst->TexSrcUnit,
1979 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
1980 break;
1981 case OPCODE_TXB:
1982 emit_txb(c, dst, dst_flags, args[0],
1983 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1984 0, 1, 0, 0),
1985 inst->TexSrcTarget,
1986 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
1987 break;
1988 case OPCODE_KIL_NV:
1989 emit_kil(c);
1990 break;
1991 case OPCODE_IF:
1992 assert(if_depth < MAX_IF_DEPTH);
1993 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
1994 break;
1995 case OPCODE_ELSE:
1996 assert(if_depth > 0);
1997 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1998 break;
1999 case OPCODE_ENDIF:
2000 assert(if_depth > 0);
2001 brw_ENDIF(p, if_inst[--if_depth]);
2002 break;
2003 case OPCODE_BGNSUB:
2004 brw_save_label(p, inst->Comment, p->nr_insn);
2005 break;
2006 case OPCODE_ENDSUB:
2007 /* no-op */
2008 break;
2009 case OPCODE_CAL:
2010 brw_push_insn_state(p);
2011 brw_set_mask_control(p, BRW_MASK_DISABLE);
2012 brw_set_access_mode(p, BRW_ALIGN_1);
2013 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2014 brw_set_access_mode(p, BRW_ALIGN_16);
2015 brw_ADD(p, get_addr_reg(stack_index),
2016 get_addr_reg(stack_index), brw_imm_d(4));
2017 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2018 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2019 brw_pop_insn_state(p);
2020 break;
2021
2022 case OPCODE_RET:
2023 brw_push_insn_state(p);
2024 brw_set_mask_control(p, BRW_MASK_DISABLE);
2025 brw_ADD(p, get_addr_reg(stack_index),
2026 get_addr_reg(stack_index), brw_imm_d(-4));
2027 brw_set_access_mode(p, BRW_ALIGN_1);
2028 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2029 brw_set_access_mode(p, BRW_ALIGN_16);
2030 brw_pop_insn_state(p);
2031
2032 break;
2033 case OPCODE_BGNLOOP:
2034 /* XXX may need to invalidate the current_constant regs */
2035 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2036 break;
2037 case OPCODE_BRK:
2038 brw_BREAK(p);
2039 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2040 break;
2041 case OPCODE_CONT:
2042 brw_CONT(p);
2043 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2044 break;
2045 case OPCODE_ENDLOOP:
2046 {
2047 struct brw_instruction *inst0, *inst1;
2048 GLuint br = 1;
2049
2050 if (intel->is_ironlake)
2051 br = 2;
2052
2053 assert(loop_depth > 0);
2054 loop_depth--;
2055 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2056 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2057 while (inst0 > loop_inst[loop_depth]) {
2058 inst0--;
2059 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2060 inst0->bits3.if_else.jump_count == 0) {
2061 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2062 inst0->bits3.if_else.pop_count = 0;
2063 }
2064 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2065 inst0->bits3.if_else.jump_count == 0) {
2066 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2067 inst0->bits3.if_else.pop_count = 0;
2068 }
2069 }
2070 }
2071 break;
2072 default:
2073 printf("unsupported opcode %d (%s) in fragment shader\n",
2074 inst->Opcode, inst->Opcode < MAX_OPCODE ?
2075 _mesa_opcode_string(inst->Opcode) : "unknown");
2076 }
2077
2078 /* Release temporaries containing any unaliased source regs. */
2079 release_tmps( c, mark );
2080
2081 if (inst->CondUpdate)
2082 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2083 else
2084 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2085 }
2086 post_wm_emit(c);
2087
2088 if (INTEL_DEBUG & DEBUG_WM) {
2089 printf("wm-native:\n");
2090 for (i = 0; i < p->nr_insn; i++)
2091 brw_disasm(stderr, &p->store[i]);
2092 printf("\n");
2093 }
2094 }
2095
2096 /**
2097 * Do GPU code generation for shaders that use GLSL features such as
2098 * flow control. Other shaders will be compiled with the
2099 */
2100 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2101 {
2102 if (INTEL_DEBUG & DEBUG_WM) {
2103 printf("brw_wm_glsl_emit:\n");
2104 }
2105
2106 /* initial instruction translation/simplification */
2107 brw_wm_pass_fp(c);
2108
2109 /* actual code generation */
2110 brw_wm_emit_glsl(brw, c);
2111
2112 if (INTEL_DEBUG & DEBUG_WM) {
2113 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2114 }
2115
2116 c->prog_data.total_grf = num_grf_used(c);
2117 c->prog_data.total_scratch = 0;
2118 }