/*
 * src/mesa/drivers/dri/i965/brw_wm_glsl.c
 * (from merge of branch '7.8' into master, mesa.git)
 */
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/** IDs of the shared subroutines emitted via invoke_subroutine(). */
enum _subroutine {
    SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
};
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 for (i = 0; i < fp->Base.NumInstructions; i++) {
27 const struct prog_instruction *inst = &fp->Base.Instructions[i];
28 switch (inst->Opcode) {
29 case OPCODE_ARL:
30 case OPCODE_IF:
31 case OPCODE_ENDIF:
32 case OPCODE_CAL:
33 case OPCODE_BRK:
34 case OPCODE_RET:
35 case OPCODE_NOISE1:
36 case OPCODE_NOISE2:
37 case OPCODE_NOISE3:
38 case OPCODE_NOISE4:
39 case OPCODE_BGNLOOP:
40 return GL_TRUE;
41 default:
42 break;
43 }
44 }
45 return GL_FALSE;
46 }
47
48
49
50 static void
51 reclaim_temps(struct brw_wm_compile *c);
52
53
/** Mark GRF register \p r as used (preallocated; never given out by alloc_grf). */
static void
prealloc_grf(struct brw_wm_compile *c, int r)
{
   c->used_grf[r] = GL_TRUE;
}
60
61
62 /** Mark given GRF register as not in use. */
63 static void
64 release_grf(struct brw_wm_compile *c, int r)
65 {
66 /*assert(c->used_grf[r]);*/
67 c->used_grf[r] = GL_FALSE;
68 c->first_free_grf = MIN2(c->first_free_grf, r);
69 }
70
71
72 /** Return index of a free GRF, mark it as used. */
73 static int
74 alloc_grf(struct brw_wm_compile *c)
75 {
76 GLuint r;
77 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
78 if (!c->used_grf[r]) {
79 c->used_grf[r] = GL_TRUE;
80 c->first_free_grf = r + 1; /* a guess */
81 return r;
82 }
83 }
84
85 /* no free temps, try to reclaim some */
86 reclaim_temps(c);
87 c->first_free_grf = 0;
88
89 /* try alloc again */
90 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
91 if (!c->used_grf[r]) {
92 c->used_grf[r] = GL_TRUE;
93 c->first_free_grf = r + 1; /* a guess */
94 return r;
95 }
96 }
97
98 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
99 assert(c->used_grf[r]);
100 }
101
102 /* really, no free GRF regs found */
103 if (!c->out_of_regs) {
104 /* print warning once per compilation */
105 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
106 c->out_of_regs = GL_TRUE;
107 }
108
109 return -1;
110 }
111
112
113 /** Return number of GRF registers used */
114 static int
115 num_grf_used(const struct brw_wm_compile *c)
116 {
117 int r;
118 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
119 if (c->used_grf[r])
120 return r + 1;
121 return 0;
122 }
123
124
125
/**
 * Record the mapping of a Mesa register to a hardware register.
 * One entry per (file, index, component); marks the slot initialized
 * so get_reg() will re-use it.
 */
static void set_reg(struct brw_wm_compile *c, int file, int index,
                    int component, struct brw_reg reg)
{
   c->wm_regs[file][index][component].reg = reg;
   c->wm_regs[file][index][component].inited = GL_TRUE;
}
135
/**
 * Allocate the next temporary GRF in stack order.
 * Temps are tracked in c->tmp_regs[]; the array grows on demand via
 * alloc_grf().  Scope temp lifetimes with mark_tmps()/release_tmps().
 */
static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
{
   struct brw_reg reg;

   /* if we need to allocate another temp, grow the tmp_regs[] array */
   if (c->tmp_index == c->tmp_max) {
      int r = alloc_grf(c);
      if (r < 0) {
         /*printf("Out of temps in %s\n", __FUNCTION__);*/
         r = 50; /* XXX random register! */
      }
      c->tmp_regs[ c->tmp_max++ ] = r;
   }

   /* form the GRF register */
   reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
   /*printf("alloc_temp %d\n", reg.nr);*/
   assert(reg.nr < BRW_WM_MAX_GRF);
   return reg;

}
157
/**
 * Save current temp register info.
 * There must be a matching call to release_tmps().
 * \return the current temp stack depth (c->tmp_index).
 */
static int mark_tmps(struct brw_wm_compile *c)
{
   return c->tmp_index;
}
166
/** Return the GRF backing temp stack slot \p index (as a vec8 register). */
static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
{
   return brw_vec8_grf( c->tmp_regs[ index ], 0 );
}
171
/** Pop the temp stack back to \p mark (a value returned by mark_tmps()). */
static void release_tmps(struct brw_wm_compile *c, int mark)
{
   c->tmp_index = mark;
}
176
/**
 * Convert Mesa src register to brw register.
 *
 * Since we're running in SOA mode each Mesa register corresponds to four
 * hardware registers.  We allocate the hardware registers as needed here.
 *
 * \param file register file, one of PROGRAM_x
 * \param index register number
 * \param component src component (X=0, Y=1, Z=2, W=3)
 * \param nr not used?!?
 * \param neg per-component negate bitmask (bit \p component tested)
 * \param abs take absolute value?
 */
static struct brw_reg
get_reg(struct brw_wm_compile *c, int file, int index, int component,
        int nr, GLuint neg, GLuint abs)
{
   struct brw_reg reg;
   switch (file) {
   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
      /* all constant-like files share the PROGRAM_STATE_VAR mapping table */
      file = PROGRAM_STATE_VAR;
      break;
   case PROGRAM_UNDEFINED:
      return brw_null_reg();
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
   case PROGRAM_PAYLOAD:
      break;
   default:
      _mesa_problem(NULL, "Unexpected file in get_reg()");
      return brw_null_reg();
   }

   assert(index < 256);
   assert(component < 4);

   /* see if we've already allocated a HW register for this Mesa register */
   if (c->wm_regs[file][index][component].inited) {
      /* yes, re-use */
      reg = c->wm_regs[file][index][component].reg;
   }
   else {
      /* no, allocate new register */
      int grf = alloc_grf(c);
      /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
      if (grf < 0) {
         /* totally out of temps */
         grf = 51; /* XXX random register! */
      }

      reg = brw_vec8_grf(grf, 0);
      /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/

      set_reg(c, file, index, component, reg);
   }

   /* apply source modifiers to the register description (not the data) */
   if (neg & (1 << component)) {
      reg = negate(reg);
   }
   if (abs)
      reg = brw_abs(reg);
   return reg;
}
243
244
245
/**
 * This is called if we run out of GRF registers.  Examine the live intervals
 * of temp regs in the program and free those which won't be used again.
 * Uses _mesa_find_temp_intervals() over the emitted instruction list and
 * releases every component GRF of any temp whose interval ends before
 * c->cur_inst.
 */
static void
reclaim_temps(struct brw_wm_compile *c)
{
   GLint intBegin[MAX_PROGRAM_TEMPS];
   GLint intEnd[MAX_PROGRAM_TEMPS];
   int index;

   /*printf("Reclaim temps:\n");*/

   _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
                             intBegin, intEnd);

   for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
      /* intEnd == -1 means the temp is never used at all */
      if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
         /* program temp[i] can be freed */
         int component;
         /*printf("  temp[%d] is dead\n", index);*/
         for (component = 0; component < 4; component++) {
            if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
               int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
               release_grf(c, r);
               /*
               printf("  Reclaim temp %d, reg %d at inst %d\n",
                      index, r, c->cur_inst);
               */
               c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
            }
         }
      }
   }
}
281
282
283
284
/**
 * Preallocate registers.  This sets up the Mesa to hardware register
 * mapping for certain registers, such as constants (uniforms/state vars)
 * and shader inputs.
 *
 * Layout established here (in ascending GRF order): payload depth regs,
 * optional CURBE constants, URB-delivered FS inputs, the emit-mask
 * register and the call stack.  All of those GRFs are then marked
 * in-use so alloc_grf() won't hand them out.
 */
static void prealloc_reg(struct brw_wm_compile *c)
{
   struct intel_context *intel = &c->func.brw->intel;
   int i, j;
   struct brw_reg reg;
   int urb_read_length = 0;
   GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
   GLuint reg_index = 0;

   memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
   c->first_free_grf = 0;

   /* depth payload: two GRFs per depth reg; unused slots alias GRF 0 */
   for (i = 0; i < 4; i++) {
      if (i < c->key.nr_depth_regs)
         reg = brw_vec8_grf(i * 2, 0);
      else
         reg = brw_vec8_grf(0, 0);
      set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
   }
   reg_index += 2 * c->key.nr_depth_regs;

   /* constants */
   {
      const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
      const GLuint nr_temps = c->fp->program.Base.NumTemporaries;

      /* use a real constant buffer, or just use a section of the GRF? */
      /* XXX this heuristic may need adjustment... */
      if ((nr_params + nr_temps) * 4 + reg_index > 80)
         c->fp->use_const_buffer = GL_TRUE;
      else
         c->fp->use_const_buffer = GL_FALSE;
      /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/

      if (c->fp->use_const_buffer) {
         /* We'll use a real constant buffer and fetch constants from
          * it with a dataport read message.
          */

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 0;
      }
      else {
         const struct gl_program_parameter_list *plist = 
            c->fp->program.Base.Parameters;
         int index = 0;

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 4 * nr_params;

         /* loop over program constants (float[4]) */
         for (i = 0; i < nr_params; i++) {
            /* loop over XYZW channels */
            for (j = 0; j < 4; j++, index++) {
               /* 8 floats per GRF; map each scalar to a vec1 view */
               reg = brw_vec1_grf(reg_index + index / 8, index % 8);
               /* Save pointer to parameter/constant value.
                * Constants will be copied in prepare_constant_buffer()
                */
               c->prog_data.param[index] = &plist->ParameterValues[i][j];
               set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
            }
         }
         /* number of constant regs used (each reg is float[8]) */
         c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
         reg_index += c->nr_creg;
      }
   }

   /* fragment shader inputs: walk VS output slots and map the ones the
    * FS actually reads; reg_index tracks the URB delivery position.
    */
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      int fp_input;

      if (i >= VERT_RESULT_VAR0)
         fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
      else if (i <= VERT_RESULT_TEX7)
         fp_input = i;
      else
         fp_input = -1; /* VS-only slot with no FS equivalent */

      if (fp_input >= 0 && inputs & (1 << fp_input)) {
         urb_read_length = reg_index;
         reg = brw_vec8_grf(reg_index, 0);
         for (j = 0; j < 4; j++)
            set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
      }
      if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
         reg_index += 2;
      }
   }

   c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
   c->prog_data.urb_read_length = urb_read_length;
   c->prog_data.curb_read_length = c->nr_creg;
   c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index++;
   c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index += 2;

   /* mark GRF regs [0..reg_index-1] as in-use */
   for (i = 0; i < reg_index; i++)
      prealloc_grf(c, i);

   /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
   prealloc_grf(c, 126);
   prealloc_grf(c, 127);

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];
      struct brw_reg dst[4];

      switch (inst->Opcode) {
      case OPCODE_TEX:
      case OPCODE_TXB:
         /* Allocate the channels of texture results contiguously,
          * since they are written out that way by the sampler unit.
          */
         for (j = 0; j < 4; j++) {
            dst[j] = get_dst_reg(c, inst, j);
            if (j != 0)
               assert(dst[j].nr == dst[j - 1].nr + 1);
         }
         break;
      default:
         break;
      }
   }

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];

      switch (inst->Opcode) {
      case WM_DELTAXY:
         /* Allocate WM_DELTAXY destination on G45/GM45 to an
          * even-numbered GRF if possible so that we can use the PLN
          * instruction.
          */
         if (inst->DstReg.WriteMask == WRITEMASK_XY &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][0].inited &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][1].inited &&
             (IS_G4X(intel->intelScreen->deviceID) || intel->gen == 5)) {
            int grf;

            for (grf = c->first_free_grf & ~1;
                 grf < BRW_WM_MAX_GRF;
                 grf += 2)
            {
               if (!c->used_grf[grf] && !c->used_grf[grf + 1]) {
                  c->used_grf[grf] = GL_TRUE;
                  c->used_grf[grf + 1] = GL_TRUE;
                  c->first_free_grf = grf + 2; /* a guess */

                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 0,
                          brw_vec8_grf(grf, 0));
                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 1,
                          brw_vec8_grf(grf + 1, 0));
                  break;
               }
            }
         }
         /* fall through (default only breaks, so this is harmless) */
      default:
         break;
      }
   }

   /* An instruction may reference up to three constants.
    * They'll be found in these registers.
    * XXX alloc these on demand!
    */
   if (c->fp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
      }
   }
#if 0
   printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
   printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
#endif
}
469
470
/**
 * Check if any of the instruction's src registers are constants, uniforms,
 * or statevars.  If so, fetch any constants that we don't already have in
 * the three GRF slots.
 *
 * Emits a dataport read for each constant-like source; the value lands
 * in c->current_const[i].reg and is later extracted by
 * get_src_reg_const().
 */
static void fetch_constants(struct brw_wm_compile *c,
                            const struct prog_instruction *inst)
{
   struct brw_compile *p = &c->func;
   GLuint i;

   /* loop over instruction src regs */
   for (i = 0; i < 3; i++) {
      const struct prog_src_register *src = &inst->SrcReg[i];
      if (src->File == PROGRAM_STATE_VAR ||
          src->File == PROGRAM_CONSTANT ||
          src->File == PROGRAM_UNIFORM) {
         c->current_const[i].index = src->Index;

#if 0
         printf("  fetch const[%d] for arg %d into reg %d\n",
                src->Index, i, c->current_const[i].reg.nr);
#endif

         /* need to fetch the constant now */
         brw_dp_READ_4(p,
                       c->current_const[i].reg,  /* writeback dest */
                       src->RelAddr,             /* relative indexing? */
                       16 * src->Index,          /* byte offset */
                       SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
                       );
      }
   }
}
505
506
/**
 * Convert Mesa dst register to brw register.
 * Thin wrapper around get_reg() with no negate/abs modifiers.
 */
static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
                                  const struct prog_instruction *inst,
                                  GLuint component)
{
   const int nr = 1;
   return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
                  0, 0);
}
518
519
/**
 * Build a register description for one component of a constant that was
 * fetched from the constant buffer.
 *
 * We should have already fetched the constant from the constant
 * buffer in fetch_constants().  Now we just have to return a
 * register description that extracts the needed component and
 * smears it across all eight vector components.
 */
static struct brw_reg
get_src_reg_const(struct brw_wm_compile *c,
                  const struct prog_instruction *inst,
                  GLuint srcRegIndex, GLuint component)
{
   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
   struct brw_reg const_reg;

   assert(component < 4);
   assert(srcRegIndex < 3);
   assert(c->current_const[srcRegIndex].index != -1);
   const_reg = c->current_const[srcRegIndex].reg;

   /* extract desired float from the const_reg, and smear */
   const_reg = stride(const_reg, 0, 1, 0);
   const_reg.subnr = component * 4;   /* byte offset of the float */

   /* apply per-component source modifiers */
   if (src->Negate & (1 << component))
      const_reg = negate(const_reg);
   if (src->Abs)
      const_reg = brw_abs(const_reg);

#if 0
   printf("  form const[%d].%d for arg %d, reg %d\n",
          c->current_const[srcRegIndex].index,
          component,
          srcRegIndex,
          const_reg.nr);
#endif

   return const_reg;
}
557
558
/**
 * Convert Mesa src register to brw register.
 *
 * Resolves the swizzle for \p channel first; SWIZZLE_ZERO/ONE become
 * float immediates, constant-buffer sources go through
 * get_src_reg_const(), everything else through get_reg().
 */
static struct brw_reg get_src_reg(struct brw_wm_compile *c,
                                  const struct prog_instruction *inst,
                                  GLuint srcRegIndex, GLuint channel)
{
   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
   const GLuint nr = 1;
   const GLuint component = GET_SWZ(src->Swizzle, channel);

   /* Extended swizzle terms */
   if (component == SWIZZLE_ZERO) {
      return brw_imm_f(0.0F);
   }
   else if (component == SWIZZLE_ONE) {
      return brw_imm_f(1.0F);
   }

   if (c->fp->use_const_buffer &&
       (src->File == PROGRAM_STATE_VAR ||
        src->File == PROGRAM_CONSTANT ||
        src->File == PROGRAM_UNIFORM)) {
      return get_src_reg_const(c, inst, srcRegIndex, component);
   }
   else {
      /* other type of source register */
      return get_reg(c, src->File, src->Index, component, nr,
                     src->Negate, src->Abs);
   }
}
590
/**
 * Subroutines are minimal support for resusable instruction sequences.
 * They are implemented as simply as possible to minimise overhead: there
 * is no explicit support for communication between the caller and callee
 * other than saving the return address in a temporary register, nor is
 * there any automatic local storage.  This implies that great care is
 * required before attempting reentrancy or any kind of nested
 * subroutine invocations.
 *
 * On first use the subroutine body is emitted inline via \p emit and its
 * start recorded in c->subroutines[]; later uses jump to that copy by
 * adding a relative offset to the IP register.  All IP math is in bytes
 * (each instruction is 16 bytes, hence the "<< 4" shifts).
 */
static void invoke_subroutine( struct brw_wm_compile *c,
			       enum _subroutine subroutine,
			       void (*emit)( struct brw_wm_compile * ) )
{
    struct brw_compile *p = &c->func;

    assert( subroutine < BRW_WM_MAX_SUBROUTINE );
    
    if( c->subroutines[ subroutine ] ) {
	/* subroutine previously emitted: reuse existing instructions */

	int mark = mark_tmps( c );
	struct brw_reg return_address = retype( alloc_tmp( c ),
						BRW_REGISTER_TYPE_UD );
	int here = p->nr_insn;
	
	brw_push_insn_state(p);
	brw_set_mask_control(p, BRW_MASK_DISABLE);
	/* return address = IP + 2 instructions (skip the jump below) */
	brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );

	/* jump backwards to the previously-emitted subroutine body */
	brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
		 brw_imm_d( ( c->subroutines[ subroutine ] -
			      here - 1 ) << 4 ) );
	brw_pop_insn_state(p);

	release_tmps( c, mark );
    } else {
	/* previously unused subroutine: emit, and mark for later reuse */
	
	int mark = mark_tmps( c );
	struct brw_reg return_address = retype( alloc_tmp( c ),
						BRW_REGISTER_TYPE_UD );
	struct brw_instruction *calc;
	int base = p->nr_insn;
	
	brw_push_insn_state(p);
	brw_set_mask_control(p, BRW_MASK_DISABLE);
	/* placeholder: src1 is patched below once the body length is known */
	calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
	brw_pop_insn_state(p);
	
	c->subroutines[ subroutine ] = p->nr_insn;

	emit( c );
	
	brw_push_insn_state(p);
	brw_set_mask_control(p, BRW_MASK_DISABLE);
	/* return: restore the saved IP */
	brw_MOV( p, brw_ip_reg(), return_address );
	brw_pop_insn_state(p);
	
	/* patch the return-address calculation to skip over the body */
	brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
	
	release_tmps( c, mark );
    }
}
654
/**
 * Emit OPCODE_ARL: move src channel 0 into the hardware address register.
 */
static void emit_arl(struct brw_wm_compile *c,
                     const struct prog_instruction *inst)
{
    struct brw_compile *p = &c->func;
    struct brw_reg src0, addr_reg;
    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
    addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE, 
                           BRW_ARF_ADDRESS, 0);
    src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
    brw_MOV(p, addr_reg, src0);
    brw_set_saturate(p, 0);
}
667
/**
 * For GLSL shaders, this KIL will be unconditional.
 * It may be contained inside an IF/ENDIF structure of course.
 *
 * Kills the currently-active channels by ANDing the inverted execution
 * mask into the pixel-valid bits in the R0 payload.
 */
static void emit_kil(struct brw_wm_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
   brw_AND(p, depth, c->emit_mask_reg, depth);
   brw_pop_insn_state(p);
}
682
/** View \p reg as the upper 16-bit word of each 32-bit element. */
static INLINE struct brw_reg high_words( struct brw_reg reg )
{
   return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
		  0, 8, 2 );
}
688
/** View \p reg as the lower 16-bit word of each 32-bit element. */
static INLINE struct brw_reg low_words( struct brw_reg reg )
{
   return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
}
693
/** View \p reg as the even-indexed bytes (stride 2). */
static INLINE struct brw_reg even_bytes( struct brw_reg reg )
{
   return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
}
698
/** View \p reg as the odd-indexed bytes (stride 2, offset 1). */
static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
{
   return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
		  0, 16, 2 );
}
704
/* One-, two- and three-dimensional Perlin noise, similar to the description
   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */

/** Subroutine body for 1D noise.
 * Calling convention (see emit_noise1): the caller allocates one temp
 * holding the input coordinate; we find it via lookup_tmp(mark - 2) and
 * overwrite it with the result.
 */
static void noise1_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param,
      x0, x1, /* gradients at each end */       
      t, tmp[ 2 ], /* float temporaries */
      itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0 = alloc_tmp( c );
   x1 = alloc_tmp( c );
   t = alloc_tmp( c );
   tmp[ 0 ] = alloc_tmp( c );
   tmp[ 1 ] = alloc_tmp( c );
   itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
   itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
   itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
   itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
   itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
   
   /* the input parameter lives just below our mark on the temp stack */
   param = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

   /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
      be hashed.  Also compute the remainder (offset within the unit
      length), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
   brw_FRC( p, param, param );
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
   brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

   /* We're now ready to perform the hashing.  The two hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );

   /* Now we want to initialise the two gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 31 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
   brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

   brw_MUL( p, x0, x0, param );
   brw_MUL( p, x1, x1, t );
   
   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
					  pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_MUL( p, param, tmp[ 0 ], param );
   brw_MUL( p, x1, x1, param );
   brw_ADD( p, x0, x0, x1 );
   /* scale by pow( 2, -30 ), to compensate for the format conversion
      above and an extra factor of 2 so that a single gradient covers
      the [-1,1] range */
   brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

   release_tmps( c, mark );
}
797
/** Emit OPCODE_NOISE1: 1D Perlin-style noise.
 * Copies the input into a temp (the subroutine's calling convention),
 * invokes the shared noise1_sub body, then broadcasts the scalar result
 * to every write-enabled destination channel.
 */
static void emit_noise1( struct brw_wm_compile *c,
			 const struct prog_instruction *inst )
{
   struct brw_compile *p = &c->func;
   struct brw_reg src, param, dst;
   GLuint mask = inst->DstReg.WriteMask;
   int i;
   int mark = mark_tmps( c );

   assert( mark == 0 );
   
   src = get_src_reg( c, inst, 0, 0 );

   param = alloc_tmp( c );

   brw_MOV( p, param, src );

   invoke_subroutine( c, SUB_NOISE1, noise1_sub );
   
   /* Fill in the result: */
   brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
   for (i = 0 ; i < 4; i++) {
      if (mask & (1<<i)) {
	 dst = get_dst_reg(c, inst, i);
	 brw_MOV( p, dst, param );
      }
   }
   if( inst->SaturateMode == SATURATE_ZERO_ONE )
      brw_set_saturate( p, 0 );
   
   release_tmps( c, mark );
}
830
/** Subroutine body for 2D noise.
 * Calling convention (see emit_noise2): the caller allocates two temps
 * holding the (x, y) input; we find them via lookup_tmp(mark - 3 / - 2)
 * and leave the result in the first one.
 */
static void noise2_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1,
      x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */       
      t, tmp[ 4 ], /* float temporaries */
      itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 4; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
   }
   itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
   itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
   itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
   
   /* the input parameters live just below our mark on the temp stack */
   param0 = lookup_tmp( c, mark - 3 );
   param1 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );
   
   /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      square), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
   brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
	    low_words( itmp[ 1 ] ) );
   brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
   brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
   brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

   /* We're now ready to perform the hashing.  The four hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );

   /* Now we want to initialise the four gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
   
   brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
   
   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
   brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
   brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
   
   brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
						pipeline */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
						pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_MUL( p, param0, tmp[ 0 ], param0 );
   brw_MUL( p, param1, tmp[ 1 ], param1 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, param1 );
   brw_MUL( p, x1y1, x1y1, param1 );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  There are horrible register dependencies here,
      but we have nothing else to do. */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, param0 );
   brw_ADD( p, x0y0, x0y0, x1y0 );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
967
/** Emit OPCODE_NOISE2: 2D Perlin-style noise.
 * Copies the x/y channels of src 0 into two temps (the subroutine's
 * calling convention), invokes the shared noise2_sub body, then
 * broadcasts the scalar result (left in param0) to every write-enabled
 * destination channel.
 */
static void emit_noise2( struct brw_wm_compile *c,
			 const struct prog_instruction *inst )
{
   struct brw_compile *p = &c->func;
   struct brw_reg src0, src1, param0, param1, dst;
   GLuint mask = inst->DstReg.WriteMask;
   int i;
   int mark = mark_tmps( c );

   assert( mark == 0 );

   src0 = get_src_reg( c, inst, 0, 0 );
   src1 = get_src_reg( c, inst, 0, 1 );

   param0 = alloc_tmp( c );
   param1 = alloc_tmp( c );

   brw_MOV( p, param0, src0 );
   brw_MOV( p, param1, src1 );

   invoke_subroutine( c, SUB_NOISE2, noise2_sub );

   /* Fill in the result: */
   brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
   for (i = 0 ; i < 4; i++) {
      if (mask & (1<<i)) {
	 dst = get_dst_reg(c, inst, i);
	 brw_MOV( p, dst, param0 );
      }
   }
   if( inst->SaturateMode == SATURATE_ZERO_ONE )
      brw_set_saturate( p, 0 );

   release_tmps( c, mark );
}
1003
1004 /**
1005 * The three-dimensional case is much like the one- and two- versions above,
1006 * but since the number of corners is rapidly growing we now pack 16 16-bit
1007 * hashes into each register to extract more parallelism from the EUs.
1008 */
static void noise3_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1, param2,
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      xi, yi, zi, /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   xi = alloc_tmp( c );
   yi = alloc_tmp( c );
   zi = alloc_tmp( c );
   t = alloc_tmp( c );
   /* tmp[ i ], itmp[ i ] and wtmp[ i ] are three views (float, dword,
      16-way word) of the SAME registers -- writes through one alias are
      visible through the others. */
   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   /* The caller (emit_noise3) copied the three input coordinates into
      temporaries just below this subroutine's mark; the remaining slot
      at mark - 1 is presumably consumed by invoke_subroutine's
      bookkeeping -- TODO confirm against invoke_subroutine. */
   param0 = lookup_tmp( c, mark - 4 );
   param1 = lookup_tmp( c, mark - 3 );
   param2 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      cube), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_FRC( p, param2, param2 );
   /* Since we now have only 16 bits of precision in the hash, we must
      be more careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
	    brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hash words left so the following component reads fresh
      pseudo-random bits from the same registers. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
   brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
   brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
   /* Three more multiplies by the coordinate complete t^3 * (6t^2 - 15t + 10). */
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   /* t still holds param2 - 1 from the y-component section above. */
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* The interpolation coefficients are still around from last time, so
      again interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* The final interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* scale by pow( 2, -15 ), as described above; the result is returned
      to the caller in param0 (0.000030517578125 == 2^-15). */
   brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1270
1271 static void emit_noise3( struct brw_wm_compile *c,
1272 const struct prog_instruction *inst )
1273 {
1274 struct brw_compile *p = &c->func;
1275 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1276 GLuint mask = inst->DstReg.WriteMask;
1277 int i;
1278 int mark = mark_tmps( c );
1279
1280 assert( mark == 0 );
1281
1282 src0 = get_src_reg( c, inst, 0, 0 );
1283 src1 = get_src_reg( c, inst, 0, 1 );
1284 src2 = get_src_reg( c, inst, 0, 2 );
1285
1286 param0 = alloc_tmp( c );
1287 param1 = alloc_tmp( c );
1288 param2 = alloc_tmp( c );
1289
1290 brw_MOV( p, param0, src0 );
1291 brw_MOV( p, param1, src1 );
1292 brw_MOV( p, param2, src2 );
1293
1294 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1295
1296 /* Fill in the result: */
1297 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1298 for (i = 0 ; i < 4; i++) {
1299 if (mask & (1<<i)) {
1300 dst = get_dst_reg(c, inst, i);
1301 brw_MOV( p, dst, param0 );
1302 }
1303 }
1304 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1305 brw_set_saturate( p, 0 );
1306
1307 release_tmps( c, mark );
1308 }
1309
1310 /**
1311 * For the four-dimensional case, the little micro-optimisation benefits
1312 * we obtain by unrolling all the loops aren't worth the massive bloat it
1313 * now causes. Instead, we loop twice around performing a similar operation
1314 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
1315 * code to glue it all together.
1316 */
static void noise4_sub( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_reg param[ 4 ],
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      w0, /* noise for the w=0 cube */
      floors[ 2 ], /* integer coordinates of base corner of hypercube */
      interp[ 4 ], /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i, j;
   int mark = mark_tmps( c );
   GLuint loop, origin; /* instruction indices for the w-loop back-branch */

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   w0 = alloc_tmp( c );
   floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
   floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

   /* The caller (emit_noise4) copied the four input coordinates into
      temporaries just below this subroutine's mark; the remaining slot
      at mark - 1 is presumably consumed by invoke_subroutine's
      bookkeeping -- TODO confirm against invoke_subroutine. */
   for( i = 0; i < 4; i++ ) {
      param[ i ] = lookup_tmp( c, mark - 5 + i );
      interp[ i ] = alloc_tmp( c );
   }

   /* tmp[ i ], itmp[ i ] and wtmp[ i ] are three views (float, dword,
      16-way word) of the SAME registers -- writes through one alias are
      visible through the others. */
   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* We only want 16 bits of precision from the integral part of each
      co-ordinate, but unfortunately the RNDD semantics would saturate
      at 16 bits if we performed the operation directly to a 16-bit
      destination.  Therefore, we round to 32-bit temporaries where
      appropriate, and then store only the lower 16 bits. */
   brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
   brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
   brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
   brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

   /* Modify the flag register here, because the side effect is useful
      later (see below).  We know for certain that all flags will be
      cleared, since the FRC instruction cannot possibly generate
      negative results.  Even for exceptional inputs (infinities, denormals,
      NaNs), the architecture guarantees that the L conditional is false. */
   brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
   brw_FRC( p, param[ 0 ], param[ 0 ] );
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   for( i = 1; i < 4; i++ )
      brw_FRC( p, param[ i ], param[ i ] );

   /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
      of all. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
   for( j = 0; j < 3; j++ )
      for( i = 0; i < 4; i++ )
	 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

   /* Mark the current address, as it will be a jump destination.  The
      following code will be executed twice: first, with the flag
      register clear indicating the w=0 case, and second with flags
      set for w=1. */
   loop = p->nr_insn;

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Since we have only 16 bits of precision in the hash, we
      must be careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
	    brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
	    brw_imm_uw( 0x9B93 ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 0xA359 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hash words left so the following component reads fresh
      pseudo-random bits from the same registers (4 bits per component
      here, since four components share each 16-bit hash). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time)  The flag register set before the
      loop (clear on the first pass, set on the second) selects between
      the two predicated instructions. */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   /* t holds w (first pass) or w - 1 (second pass); see above. */
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Here we interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time) */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Interpolate in the y dimension: */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* Another interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* Exit the loop if we've computed both cubes...  The branch offset
      (src1) is a placeholder here; it is patched once the exit target
      is known (see brw_set_src1 below). */
   origin = p->nr_insn;
   brw_push_insn_state( p );
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
   brw_pop_insn_state( p );

   /* Save the result for the w=0 case, and increment the w coordinate: */
   brw_MOV( p, w0, tmp[ 0 ] );
   brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 1 ) );

   /* Loop around for the other cube.  Explicitly set the flag register
      (unfortunately we must spend an extra instruction to do this: we
      can't rely on a side effect of the previous MOV or ADD because
      conditional modifiers which are normally true might be false in
      exceptional circumstances, e.g. given a NaN input; the add to
      brw_ip_reg() is not suitable because the IP is not an 8-vector). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
	    brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
   brw_pop_insn_state( p );

   /* Patch the previous conditional branch now that we know the
      destination address. */
   brw_set_src1( p->store + origin,
		 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

   /* The very last interpolation. */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

   /* scale by pow( 2, -15 ), as described above; the result is returned
      to the caller in param[ 0 ] (0.000030517578125 == 2^-15). */
   brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1693
1694 static void emit_noise4( struct brw_wm_compile *c,
1695 const struct prog_instruction *inst )
1696 {
1697 struct brw_compile *p = &c->func;
1698 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
1699 GLuint mask = inst->DstReg.WriteMask;
1700 int i;
1701 int mark = mark_tmps( c );
1702
1703 assert( mark == 0 );
1704
1705 src0 = get_src_reg( c, inst, 0, 0 );
1706 src1 = get_src_reg( c, inst, 0, 1 );
1707 src2 = get_src_reg( c, inst, 0, 2 );
1708 src3 = get_src_reg( c, inst, 0, 3 );
1709
1710 param0 = alloc_tmp( c );
1711 param1 = alloc_tmp( c );
1712 param2 = alloc_tmp( c );
1713 param3 = alloc_tmp( c );
1714
1715 brw_MOV( p, param0, src0 );
1716 brw_MOV( p, param1, src1 );
1717 brw_MOV( p, param2, src2 );
1718 brw_MOV( p, param3, src3 );
1719
1720 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
1721
1722 /* Fill in the result: */
1723 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1724 for (i = 0 ; i < 4; i++) {
1725 if (mask & (1<<i)) {
1726 dst = get_dst_reg(c, inst, i);
1727 brw_MOV( p, dst, param0 );
1728 }
1729 }
1730 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1731 brw_set_saturate( p, 0 );
1732
1733 release_tmps( c, mark );
1734 }
1735
1736 /**
1737 * Resolve subroutine calls after code emit is done.
1738 */
1739 static void post_wm_emit( struct brw_wm_compile *c )
1740 {
1741 brw_resolve_cals(&c->func);
1742 }
1743
1744 static void
1745 get_argument_regs(struct brw_wm_compile *c,
1746 const struct prog_instruction *inst,
1747 int index,
1748 struct brw_reg *dst,
1749 struct brw_reg *regs,
1750 int mask)
1751 {
1752 struct brw_compile *p = &c->func;
1753 int i, j;
1754
1755 for (i = 0; i < 4; i++) {
1756 if (mask & (1 << i)) {
1757 regs[i] = get_src_reg(c, inst, index, i);
1758
1759 /* Unalias destination registers from our sources. */
1760 if (regs[i].file == BRW_GENERAL_REGISTER_FILE) {
1761 for (j = 0; j < 4; j++) {
1762 if (memcmp(&regs[i], &dst[j], sizeof(regs[0])) == 0) {
1763 struct brw_reg tmp = alloc_tmp(c);
1764 brw_MOV(p, tmp, regs[i]);
1765 regs[i] = tmp;
1766 break;
1767 }
1768 }
1769 }
1770 }
1771 }
1772 }
1773
1774 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
1775 {
1776 struct intel_context *intel = &brw->intel;
1777 #define MAX_IF_DEPTH 32
1778 #define MAX_LOOP_DEPTH 32
1779 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1780 GLuint i, if_depth = 0, loop_depth = 0;
1781 struct brw_compile *p = &c->func;
1782 struct brw_indirect stack_index = brw_indirect(0, 0);
1783
1784 c->out_of_regs = GL_FALSE;
1785
1786 prealloc_reg(c);
1787 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1788 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1789
1790 for (i = 0; i < c->nr_fp_insns; i++) {
1791 const struct prog_instruction *inst = &c->prog_instructions[i];
1792 int dst_flags;
1793 struct brw_reg args[3][4], dst[4];
1794 int j;
1795 int mark = mark_tmps( c );
1796
1797 c->cur_inst = i;
1798
1799 #if 0
1800 printf("Inst %d: ", i);
1801 _mesa_print_instruction(inst);
1802 #endif
1803
1804 /* fetch any constants that this instruction needs */
1805 if (c->fp->use_const_buffer)
1806 fetch_constants(c, inst);
1807
1808 if (inst->Opcode != OPCODE_ARL) {
1809 for (j = 0; j < 4; j++) {
1810 if (inst->DstReg.WriteMask & (1 << j))
1811 dst[j] = get_dst_reg(c, inst, j);
1812 else
1813 dst[j] = brw_null_reg();
1814 }
1815 }
1816 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
1817 get_argument_regs(c, inst, j, dst, args[j], WRITEMASK_XYZW);
1818
1819 dst_flags = inst->DstReg.WriteMask;
1820 if (inst->SaturateMode == SATURATE_ZERO_ONE)
1821 dst_flags |= SATURATE;
1822
1823 if (inst->CondUpdate)
1824 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
1825 else
1826 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
1827
1828 switch (inst->Opcode) {
1829 case WM_PIXELXY:
1830 emit_pixel_xy(c, dst, dst_flags);
1831 break;
1832 case WM_DELTAXY:
1833 emit_delta_xy(p, dst, dst_flags, args[0]);
1834 break;
1835 case WM_PIXELW:
1836 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1837 break;
1838 case WM_LINTERP:
1839 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1840 break;
1841 case WM_PINTERP:
1842 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1843 break;
1844 case WM_CINTERP:
1845 emit_cinterp(p, dst, dst_flags, args[0]);
1846 break;
1847 case WM_WPOSXY:
1848 emit_wpos_xy(c, dst, dst_flags, args[0]);
1849 break;
1850 case WM_FB_WRITE:
1851 emit_fb_write(c, args[0], args[1], args[2],
1852 INST_AUX_GET_TARGET(inst->Aux),
1853 inst->Aux & INST_AUX_EOT);
1854 break;
1855 case WM_FRONTFACING:
1856 emit_frontfacing(p, dst, dst_flags);
1857 break;
1858 case OPCODE_ADD:
1859 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1860 break;
1861 case OPCODE_ARL:
1862 emit_arl(c, inst);
1863 break;
1864 case OPCODE_FRC:
1865 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1866 break;
1867 case OPCODE_FLR:
1868 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1869 break;
1870 case OPCODE_LRP:
1871 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1872 break;
1873 case OPCODE_TRUNC:
1874 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1875 break;
1876 case OPCODE_MOV:
1877 case OPCODE_SWZ:
1878 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1879 break;
1880 case OPCODE_DP3:
1881 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1882 break;
1883 case OPCODE_DP4:
1884 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1885 break;
1886 case OPCODE_XPD:
1887 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1888 break;
1889 case OPCODE_DPH:
1890 emit_dph(p, dst, dst_flags, args[0], args[1]);
1891 break;
1892 case OPCODE_RCP:
1893 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1894 break;
1895 case OPCODE_RSQ:
1896 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1897 break;
1898 case OPCODE_SIN:
1899 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1900 break;
1901 case OPCODE_COS:
1902 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1903 break;
1904 case OPCODE_EX2:
1905 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1906 break;
1907 case OPCODE_LG2:
1908 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1909 break;
1910 case OPCODE_CMP:
1911 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1912 break;
1913 case OPCODE_MIN:
1914 emit_min(p, dst, dst_flags, args[0], args[1]);
1915 break;
1916 case OPCODE_MAX:
1917 emit_max(p, dst, dst_flags, args[0], args[1]);
1918 break;
1919 case OPCODE_DDX:
1920 case OPCODE_DDY:
1921 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
1922 args[0]);
1923 break;
1924 case OPCODE_SLT:
1925 emit_sop(p, dst, dst_flags,
1926 BRW_CONDITIONAL_L, args[0], args[1]);
1927 break;
1928 case OPCODE_SLE:
1929 emit_sop(p, dst, dst_flags,
1930 BRW_CONDITIONAL_LE, args[0], args[1]);
1931 break;
1932 case OPCODE_SGT:
1933 emit_sop(p, dst, dst_flags,
1934 BRW_CONDITIONAL_G, args[0], args[1]);
1935 break;
1936 case OPCODE_SGE:
1937 emit_sop(p, dst, dst_flags,
1938 BRW_CONDITIONAL_GE, args[0], args[1]);
1939 break;
1940 case OPCODE_SEQ:
1941 emit_sop(p, dst, dst_flags,
1942 BRW_CONDITIONAL_EQ, args[0], args[1]);
1943 break;
1944 case OPCODE_SNE:
1945 emit_sop(p, dst, dst_flags,
1946 BRW_CONDITIONAL_NEQ, args[0], args[1]);
1947 break;
1948 case OPCODE_MUL:
1949 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1950 break;
1951 case OPCODE_POW:
1952 emit_math2(c, BRW_MATH_FUNCTION_POW,
1953 dst, dst_flags, args[0], args[1]);
1954 break;
1955 case OPCODE_MAD:
1956 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1957 break;
1958 case OPCODE_NOISE1:
1959 emit_noise1(c, inst);
1960 break;
1961 case OPCODE_NOISE2:
1962 emit_noise2(c, inst);
1963 break;
1964 case OPCODE_NOISE3:
1965 emit_noise3(c, inst);
1966 break;
1967 case OPCODE_NOISE4:
1968 emit_noise4(c, inst);
1969 break;
1970 case OPCODE_TEX:
1971 emit_tex(c, dst, dst_flags, args[0],
1972 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1973 0, 1, 0, 0),
1974 inst->TexSrcTarget,
1975 inst->TexSrcUnit,
1976 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
1977 break;
1978 case OPCODE_TXB:
1979 emit_txb(c, dst, dst_flags, args[0],
1980 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1981 0, 1, 0, 0),
1982 inst->TexSrcTarget,
1983 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
1984 break;
1985 case OPCODE_KIL_NV:
1986 emit_kil(c);
1987 break;
1988 case OPCODE_IF:
1989 assert(if_depth < MAX_IF_DEPTH);
1990 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
1991 break;
1992 case OPCODE_ELSE:
1993 assert(if_depth > 0);
1994 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
1995 break;
1996 case OPCODE_ENDIF:
1997 assert(if_depth > 0);
1998 brw_ENDIF(p, if_inst[--if_depth]);
1999 break;
2000 case OPCODE_BGNSUB:
2001 brw_save_label(p, inst->Comment, p->nr_insn);
2002 break;
2003 case OPCODE_ENDSUB:
2004 /* no-op */
2005 break;
2006 case OPCODE_CAL:
2007 brw_push_insn_state(p);
2008 brw_set_mask_control(p, BRW_MASK_DISABLE);
2009 brw_set_access_mode(p, BRW_ALIGN_1);
2010 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2011 brw_set_access_mode(p, BRW_ALIGN_16);
2012 brw_ADD(p, get_addr_reg(stack_index),
2013 get_addr_reg(stack_index), brw_imm_d(4));
2014 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2015 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2016 brw_pop_insn_state(p);
2017 break;
2018
2019 case OPCODE_RET:
2020 brw_push_insn_state(p);
2021 brw_set_mask_control(p, BRW_MASK_DISABLE);
2022 brw_ADD(p, get_addr_reg(stack_index),
2023 get_addr_reg(stack_index), brw_imm_d(-4));
2024 brw_set_access_mode(p, BRW_ALIGN_1);
2025 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2026 brw_set_access_mode(p, BRW_ALIGN_16);
2027 brw_pop_insn_state(p);
2028
2029 break;
2030 case OPCODE_BGNLOOP:
2031 /* XXX may need to invalidate the current_constant regs */
2032 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2033 break;
2034 case OPCODE_BRK:
2035 brw_BREAK(p);
2036 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2037 break;
2038 case OPCODE_CONT:
2039 brw_CONT(p);
2040 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2041 break;
2042 case OPCODE_ENDLOOP:
2043 {
2044 struct brw_instruction *inst0, *inst1;
2045 GLuint br = 1;
2046
2047 if (intel->is_ironlake)
2048 br = 2;
2049
2050 assert(loop_depth > 0);
2051 loop_depth--;
2052 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2053 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2054 while (inst0 > loop_inst[loop_depth]) {
2055 inst0--;
2056 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2057 inst0->bits3.if_else.jump_count == 0) {
2058 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2059 inst0->bits3.if_else.pop_count = 0;
2060 }
2061 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2062 inst0->bits3.if_else.jump_count == 0) {
2063 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2064 inst0->bits3.if_else.pop_count = 0;
2065 }
2066 }
2067 }
2068 break;
2069 default:
2070 printf("unsupported opcode %d (%s) in fragment shader\n",
2071 inst->Opcode, inst->Opcode < MAX_OPCODE ?
2072 _mesa_opcode_string(inst->Opcode) : "unknown");
2073 }
2074
2075 /* Release temporaries containing any unaliased source regs. */
2076 release_tmps( c, mark );
2077
2078 if (inst->CondUpdate)
2079 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2080 else
2081 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2082 }
2083 post_wm_emit(c);
2084
2085 if (INTEL_DEBUG & DEBUG_WM) {
2086 printf("wm-native:\n");
2087 for (i = 0; i < p->nr_insn; i++)
2088 brw_disasm(stderr, &p->store[i]);
2089 printf("\n");
2090 }
2091 }
2092
/**
 * Do GPU code generation for shaders that use GLSL features such as
 * flow control.  Other (simpler) shaders will be compiled with the
 * non-GLSL code path in brw_wm_emit.c instead.
 */
2097 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2098 {
2099 if (INTEL_DEBUG & DEBUG_WM) {
2100 printf("brw_wm_glsl_emit:\n");
2101 }
2102
2103 /* initial instruction translation/simplification */
2104 brw_wm_pass_fp(c);
2105
2106 /* actual code generation */
2107 brw_wm_emit_glsl(brw, c);
2108
2109 if (INTEL_DEBUG & DEBUG_WM) {
2110 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2111 }
2112
2113 c->prog_data.total_grf = num_grf_used(c);
2114 c->prog_data.total_scratch = 0;
2115 }