Merge commit 'origin/7.8'
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/** IDs of the shared helper subroutines emitted via invoke_subroutine(). */
enum _subroutine {
    SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
};
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 if (INTEL_DEBUG & DEBUG_GLSL_FORCE)
27 return GL_TRUE;
28
29 for (i = 0; i < fp->Base.NumInstructions; i++) {
30 const struct prog_instruction *inst = &fp->Base.Instructions[i];
31 switch (inst->Opcode) {
32 case OPCODE_ARL:
33 case OPCODE_IF:
34 case OPCODE_ENDIF:
35 case OPCODE_CAL:
36 case OPCODE_BRK:
37 case OPCODE_RET:
38 case OPCODE_NOISE1:
39 case OPCODE_NOISE2:
40 case OPCODE_NOISE3:
41 case OPCODE_NOISE4:
42 case OPCODE_BGNLOOP:
43 return GL_TRUE;
44 default:
45 break;
46 }
47 }
48 return GL_FALSE;
49 }
50
51
52
53 static void
54 reclaim_temps(struct brw_wm_compile *c);
55
56
/** Mark GRF register \p r as in use (does not touch first_free_grf). */
static void
prealloc_grf(struct brw_wm_compile *c, int r)
{
   c->used_grf[r] = GL_TRUE;
}
63
64
/**
 * Mark given GRF register as not in use.
 * Also lowers first_free_grf so the next allocation scan starts
 * no later than the freed register.
 */
static void
release_grf(struct brw_wm_compile *c, int r)
{
   /*assert(c->used_grf[r]);*/
   c->used_grf[r] = GL_FALSE;
   c->first_free_grf = MIN2(c->first_free_grf, r);
}
73
74
75 /** Return index of a free GRF, mark it as used. */
76 static int
77 alloc_grf(struct brw_wm_compile *c)
78 {
79 GLuint r;
80 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
81 if (!c->used_grf[r]) {
82 c->used_grf[r] = GL_TRUE;
83 c->first_free_grf = r + 1; /* a guess */
84 return r;
85 }
86 }
87
88 /* no free temps, try to reclaim some */
89 reclaim_temps(c);
90 c->first_free_grf = 0;
91
92 /* try alloc again */
93 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
94 if (!c->used_grf[r]) {
95 c->used_grf[r] = GL_TRUE;
96 c->first_free_grf = r + 1; /* a guess */
97 return r;
98 }
99 }
100
101 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
102 assert(c->used_grf[r]);
103 }
104
105 /* really, no free GRF regs found */
106 if (!c->out_of_regs) {
107 /* print warning once per compilation */
108 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
109 c->out_of_regs = GL_TRUE;
110 }
111
112 return -1;
113 }
114
115
116 /** Return number of GRF registers used */
117 static int
118 num_grf_used(const struct brw_wm_compile *c)
119 {
120 int r;
121 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
122 if (c->used_grf[r])
123 return r + 1;
124 return 0;
125 }
126
127
128
/**
 * Record the mapping of a Mesa register to a hardware register.
 *
 * \param file       Mesa register file (PROGRAM_x)
 * \param index      register index within that file
 * \param component  channel (X=0, Y=1, Z=2, W=3)
 * \param reg        the hardware register to associate with the slot
 */
static void set_reg(struct brw_wm_compile *c, int file, int index, 
        int component, struct brw_reg reg)
{
    c->wm_regs[file][index][component].reg = reg;
    c->wm_regs[file][index][component].inited = GL_TRUE;
}
138
139 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
140 {
141 struct brw_reg reg;
142
143 /* if we need to allocate another temp, grow the tmp_regs[] array */
144 if (c->tmp_index == c->tmp_max) {
145 int r = alloc_grf(c);
146 if (r < 0) {
147 /*printf("Out of temps in %s\n", __FUNCTION__);*/
148 r = 50; /* XXX random register! */
149 }
150 c->tmp_regs[ c->tmp_max++ ] = r;
151 }
152
153 /* form the GRF register */
154 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
155 /*printf("alloc_temp %d\n", reg.nr);*/
156 assert(reg.nr < BRW_WM_MAX_GRF);
157 return reg;
158
159 }
160
/**
 * Save current temp register info (a watermark into the tmp pool).
 * There must be a matching call to release_tmps().
 * \return the current tmp_index, to be passed back to release_tmps()
 */
static int mark_tmps(struct brw_wm_compile *c)
{
   return c->tmp_index;
}
169
/** Return the vec8 GRF for the temp at the given pool index (no allocation). */
static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
{
   return brw_vec8_grf( c->tmp_regs[ index ], 0 );
}
174
/** Roll the temp pool back to a watermark previously taken with mark_tmps(). */
static void release_tmps(struct brw_wm_compile *c, int mark)
{
   c->tmp_index = mark;
}
179
180 /**
181 * Convert Mesa src register to brw register.
182 *
183 * Since we're running in SOA mode each Mesa register corresponds to four
184 * hardware registers. We allocate the hardware registers as needed here.
185 *
186 * \param file register file, one of PROGRAM_x
187 * \param index register number
188 * \param component src component (X=0, Y=1, Z=2, W=3)
189 * \param nr not used?!?
190 * \param neg negate value?
191 * \param abs take absolute value?
192 */
193 static struct brw_reg
194 get_reg(struct brw_wm_compile *c, int file, int index, int component,
195 int nr, GLuint neg, GLuint abs)
196 {
197 struct brw_reg reg;
198 switch (file) {
199 case PROGRAM_STATE_VAR:
200 case PROGRAM_CONSTANT:
201 case PROGRAM_UNIFORM:
202 file = PROGRAM_STATE_VAR;
203 break;
204 case PROGRAM_UNDEFINED:
205 return brw_null_reg();
206 case PROGRAM_TEMPORARY:
207 case PROGRAM_INPUT:
208 case PROGRAM_OUTPUT:
209 case PROGRAM_PAYLOAD:
210 break;
211 default:
212 _mesa_problem(NULL, "Unexpected file in get_reg()");
213 return brw_null_reg();
214 }
215
216 assert(index < 256);
217 assert(component < 4);
218
219 /* see if we've already allocated a HW register for this Mesa register */
220 if (c->wm_regs[file][index][component].inited) {
221 /* yes, re-use */
222 reg = c->wm_regs[file][index][component].reg;
223 }
224 else {
225 /* no, allocate new register */
226 int grf = alloc_grf(c);
227 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
228 if (grf < 0) {
229 /* totally out of temps */
230 grf = 51; /* XXX random register! */
231 }
232
233 reg = brw_vec8_grf(grf, 0);
234 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
235
236 set_reg(c, file, index, component, reg);
237 }
238
239 if (neg & (1 << component)) {
240 reg = negate(reg);
241 }
242 if (abs)
243 reg = brw_abs(reg);
244 return reg;
245 }
246
247
248
249 /**
250 * This is called if we run out of GRF registers. Examine the live intervals
251 * of temp regs in the program and free those which won't be used again.
252 */
253 static void
254 reclaim_temps(struct brw_wm_compile *c)
255 {
256 GLint intBegin[MAX_PROGRAM_TEMPS];
257 GLint intEnd[MAX_PROGRAM_TEMPS];
258 int index;
259
260 /*printf("Reclaim temps:\n");*/
261
262 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
263 intBegin, intEnd);
264
265 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
266 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
267 /* program temp[i] can be freed */
268 int component;
269 /*printf(" temp[%d] is dead\n", index);*/
270 for (component = 0; component < 4; component++) {
271 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
272 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
273 release_grf(c, r);
274 /*
275 printf(" Reclaim temp %d, reg %d at inst %d\n",
276 index, r, c->cur_inst);
277 */
278 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
279 }
280 }
281 }
282 }
283 }
284
285
286
287
/**
 * Preallocate registers.  This sets up the Mesa to hardware register
 * mapping for certain registers, such as constants (uniforms/state vars)
 * and shader inputs.
 *
 * Layout built here (in GRF order): depth payload regs, then either the
 * CURBE constants or nothing (when a real constant buffer is used), then
 * interpolated FS inputs, then the execution-mask reg and the call stack.
 */
static void prealloc_reg(struct brw_wm_compile *c)
{
   struct intel_context *intel = &c->func.brw->intel;
   int i, j;
   struct brw_reg reg;
   int urb_read_length = 0;
   GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
   GLuint reg_index = 0;

   memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
   c->first_free_grf = 0;

   /* depth payload: one reg pair per depth reg; unused slots alias g0 */
   for (i = 0; i < 4; i++) {
      if (i < c->key.nr_depth_regs)
         reg = brw_vec8_grf(i * 2, 0);
      else
         reg = brw_vec8_grf(0, 0);
      set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
   }
   reg_index += 2 * c->key.nr_depth_regs;

   /* constants */
   {
      const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
      const GLuint nr_temps = c->fp->program.Base.NumTemporaries;

      /* use a real constant buffer, or just use a section of the GRF? */
      /* XXX this heuristic may need adjustment... */
      if ((nr_params + nr_temps) * 4 + reg_index > 80)
         c->fp->use_const_buffer = GL_TRUE;
      else
         c->fp->use_const_buffer = GL_FALSE;
      /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/

      if (c->fp->use_const_buffer) {
         /* We'll use a real constant buffer and fetch constants from
          * it with a dataport read message.
          */

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 0;
      }
      else {
         const struct gl_program_parameter_list *plist = 
            c->fp->program.Base.Parameters;
         int index = 0;

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 4 * nr_params;

         /* loop over program constants (float[4]) */
         for (i = 0; i < nr_params; i++) {
            /* loop over XYZW channels; 8 scalar constants pack per GRF */
            for (j = 0; j < 4; j++, index++) {
               reg = brw_vec1_grf(reg_index + index / 8, index % 8);
               /* Save pointer to parameter/constant value.
                * Constants will be copied in prepare_constant_buffer()
                */
               c->prog_data.param[index] = &plist->ParameterValues[i][j];
               set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
            }
         }
         /* number of constant regs used (each reg is float[8]) */
         c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
         reg_index += c->nr_creg;
      }
   }

   /* fragment shader inputs: map each written VS output to the FS input
    * payload register it arrives in
    */
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      int fp_input;

      if (i >= VERT_RESULT_VAR0)
         fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
      else if (i <= VERT_RESULT_TEX7)
         fp_input = i;
      else
         fp_input = -1;   /* VS-only outputs with no FS counterpart */

      if (fp_input >= 0 && inputs & (1 << fp_input)) {
         urb_read_length = reg_index;
         reg = brw_vec8_grf(reg_index, 0);
         for (j = 0; j < 4; j++)
            set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
      }
      /* every written VS output occupies a reg pair, used by the FS or not */
      if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
         reg_index += 2;
      }
   }

   c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
   c->prog_data.urb_read_length = urb_read_length;
   c->prog_data.curb_read_length = c->nr_creg;
   c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index++;
   c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index += 2;

   /* mark GRF regs [0..reg_index-1] as in-use */
   for (i = 0; i < reg_index; i++)
      prealloc_grf(c, i);

   /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
   prealloc_grf(c, 126);
   prealloc_grf(c, 127);

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];
      struct brw_reg dst[4];

      switch (inst->Opcode) {
      case OPCODE_TEX:
      case OPCODE_TXB:
         /* Allocate the channels of texture results contiguously,
          * since they are written out that way by the sampler unit.
          */
         for (j = 0; j < 4; j++) {
            dst[j] = get_dst_reg(c, inst, j);
            if (j != 0)
               assert(dst[j].nr == dst[j - 1].nr + 1);
         }
         break;
      default:
         break;
      }
   }

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];

      switch (inst->Opcode) {
      case WM_DELTAXY:
         /* Allocate WM_DELTAXY destination on G45/GM45 to an
          * even-numbered GRF if possible so that we can use the PLN
          * instruction.
          */
         if (inst->DstReg.WriteMask == WRITEMASK_XY &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][0].inited &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][1].inited &&
             (IS_G4X(intel->intelScreen->deviceID) || intel->gen == 5)) {
            int grf;

            /* scan even-numbered GRFs for a free aligned pair */
            for (grf = c->first_free_grf & ~1;
                 grf < BRW_WM_MAX_GRF;
                 grf += 2)
            {
               if (!c->used_grf[grf] && !c->used_grf[grf + 1]) {
                  c->used_grf[grf] = GL_TRUE;
                  c->used_grf[grf + 1] = GL_TRUE;
                  c->first_free_grf = grf + 2;  /* a guess */

                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 0,
                          brw_vec8_grf(grf, 0));
                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 1,
                          brw_vec8_grf(grf + 1, 0));
                  break;
               }
            }
         }
         /* FALLTHROUGH -- benign, default case is a no-op */
      default:
         break;
      }
   }

   /* An instruction may reference up to three constants.
    * They'll be found in these registers.
    * XXX alloc these on demand!
    */
   if (c->fp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
      }
   }
#if 0
   printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
   printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
#endif
}
472
473
474 /**
475 * Check if any of the instruction's src registers are constants, uniforms,
476 * or statevars. If so, fetch any constants that we don't already have in
477 * the three GRF slots.
478 */
479 static void fetch_constants(struct brw_wm_compile *c,
480 const struct prog_instruction *inst)
481 {
482 struct brw_compile *p = &c->func;
483 GLuint i;
484
485 /* loop over instruction src regs */
486 for (i = 0; i < 3; i++) {
487 const struct prog_src_register *src = &inst->SrcReg[i];
488 if (src->File == PROGRAM_STATE_VAR ||
489 src->File == PROGRAM_CONSTANT ||
490 src->File == PROGRAM_UNIFORM) {
491 c->current_const[i].index = src->Index;
492
493 #if 0
494 printf(" fetch const[%d] for arg %d into reg %d\n",
495 src->Index, i, c->current_const[i].reg.nr);
496 #endif
497
498 /* need to fetch the constant now */
499 brw_dp_READ_4(p,
500 c->current_const[i].reg, /* writeback dest */
501 src->RelAddr, /* relative indexing? */
502 16 * src->Index, /* byte offset */
503 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
504 );
505 }
506 }
507 }
508
509
/**
 * Convert Mesa dst register to brw register.
 * Thin wrapper over get_reg() with negate/abs disabled.
 */
static struct brw_reg get_dst_reg(struct brw_wm_compile *c, 
                                  const struct prog_instruction *inst,
                                  GLuint component)
{
    const int nr = 1;
    return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
                   0, 0);
}
521
522
523 static struct brw_reg
524 get_src_reg_const(struct brw_wm_compile *c,
525 const struct prog_instruction *inst,
526 GLuint srcRegIndex, GLuint component)
527 {
528 /* We should have already fetched the constant from the constant
529 * buffer in fetch_constants(). Now we just have to return a
530 * register description that extracts the needed component and
531 * smears it across all eight vector components.
532 */
533 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
534 struct brw_reg const_reg;
535
536 assert(component < 4);
537 assert(srcRegIndex < 3);
538 assert(c->current_const[srcRegIndex].index != -1);
539 const_reg = c->current_const[srcRegIndex].reg;
540
541 /* extract desired float from the const_reg, and smear */
542 const_reg = stride(const_reg, 0, 1, 0);
543 const_reg.subnr = component * 4;
544
545 if (src->Negate & (1 << component))
546 const_reg = negate(const_reg);
547 if (src->Abs)
548 const_reg = brw_abs(const_reg);
549
550 #if 0
551 printf(" form const[%d].%d for arg %d, reg %d\n",
552 c->current_const[srcRegIndex].index,
553 component,
554 srcRegIndex,
555 const_reg.nr);
556 #endif
557
558 return const_reg;
559 }
560
561
562 /**
563 * Convert Mesa src register to brw register.
564 */
565 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
566 const struct prog_instruction *inst,
567 GLuint srcRegIndex, GLuint channel)
568 {
569 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
570 const GLuint nr = 1;
571 const GLuint component = GET_SWZ(src->Swizzle, channel);
572
573 /* Only one immediate value can be used per native opcode, and it
574 * has be in the src1 slot, so not all Mesa instructions will get
575 * to take advantage of immediate constants.
576 */
577 if (brw_wm_arg_can_be_immediate(inst->Opcode, srcRegIndex)) {
578 const struct gl_program_parameter_list *params;
579
580 params = c->fp->program.Base.Parameters;
581
582 /* Extended swizzle terms */
583 if (component == SWIZZLE_ZERO) {
584 return brw_imm_f(0.0F);
585 } else if (component == SWIZZLE_ONE) {
586 if (src->Negate)
587 return brw_imm_f(-1.0F);
588 else
589 return brw_imm_f(1.0F);
590 }
591
592 if (src->File == PROGRAM_CONSTANT) {
593 float f = params->ParameterValues[src->Index][component];
594
595 if (src->Abs)
596 f = fabs(f);
597 if (src->Negate)
598 f = -f;
599
600 return brw_imm_f(f);
601 }
602 }
603
604 if (c->fp->use_const_buffer &&
605 (src->File == PROGRAM_STATE_VAR ||
606 src->File == PROGRAM_CONSTANT ||
607 src->File == PROGRAM_UNIFORM)) {
608 return get_src_reg_const(c, inst, srcRegIndex, component);
609 }
610 else {
611 /* other type of source register */
612 return get_reg(c, src->File, src->Index, component, nr,
613 src->Negate, src->Abs);
614 }
615 }
616
/**
 * Subroutines are minimal support for resusable instruction sequences.
 * They are implemented as simply as possible to minimise overhead: there
 * is no explicit support for communication between the caller and callee
 * other than saving the return address in a temporary register, nor is
 * there any automatic local storage.  This implies that great care is
 * required before attempting reentrancy or any kind of nested
 * subroutine invocations.
 *
 * Control transfer is done by arithmetic on the IP register; all
 * offsets are in units of 16-byte native instructions (hence the << 4
 * scaling of instruction-count deltas below).
 */
static void invoke_subroutine( struct brw_wm_compile *c,
                               enum _subroutine subroutine,
                               void (*emit)( struct brw_wm_compile * ) )
{
   struct brw_compile *p = &c->func;

   assert( subroutine < BRW_WM_MAX_SUBROUTINE );

   if( c->subroutines[ subroutine ] ) {
      /* subroutine previously emitted: reuse existing instructions */

      int mark = mark_tmps( c );
      struct brw_reg return_address = retype( alloc_tmp( c ),
                                              BRW_REGISTER_TYPE_UD );
      int here = p->nr_insn;

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* return address = two instructions past this ADD, i.e. the
       * instruction following the jump below
       */
      brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );

      /* jump backward (or forward) to the stored subroutine entry */
      brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
               brw_imm_d( ( c->subroutines[ subroutine ] -
                            here - 1 ) << 4 ) );
      brw_pop_insn_state(p);

      release_tmps( c, mark );
   } else {
      /* previously unused subroutine: emit, and mark for later reuse */

      int mark = mark_tmps( c );
      struct brw_reg return_address = retype( alloc_tmp( c ),
                                              BRW_REGISTER_TYPE_UD );
      struct brw_instruction *calc;
      int base = p->nr_insn;

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* placeholder: src1 is patched below once the body length is known,
       * so the fall-through path skips over the inlined subroutine
       */
      calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
      brw_pop_insn_state(p);

      c->subroutines[ subroutine ] = p->nr_insn;

      emit( c );

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* return to the caller-supplied address */
      brw_MOV( p, brw_ip_reg(), return_address );
      brw_pop_insn_state(p);

      /* back-patch the return address now that the body size is known */
      brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );

      release_tmps( c, mark );
   }
}
680
681 static void emit_arl(struct brw_wm_compile *c,
682 const struct prog_instruction *inst)
683 {
684 struct brw_compile *p = &c->func;
685 struct brw_reg src0, addr_reg;
686 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
687 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
688 BRW_ARF_ADDRESS, 0);
689 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
690 brw_MOV(p, addr_reg, src0);
691 brw_set_saturate(p, 0);
692 }
693
/**
 * For GLSL shaders, this KIL will be unconditional.
 * It may be contained inside an IF/ENDIF structure of course.
 *
 * NOTE(review): 'depth' aliases a UW in payload register g0, which
 * presumably holds the pixel enable mask -- ANDing with the inverted
 * IMASK clears (kills) the channels disabled by flow control; confirm
 * against the hardware PRM / payload layout.
 */
static void emit_kil(struct brw_wm_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
   brw_AND(p, depth, c->emit_mask_reg, depth);
   brw_pop_insn_state(p);
}
708
/** View 'reg' as the high (odd-index) 16-bit word of each dword, 8-wide. */
static INLINE struct brw_reg high_words( struct brw_reg reg )
{
   return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
                  0, 8, 2 );
}
714
/** View 'reg' as the low (even-index) 16-bit word of each dword, 8-wide. */
static INLINE struct brw_reg low_words( struct brw_reg reg )
{
   return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
}
719
/** View 'reg' as its even-index bytes, 16-wide. */
static INLINE struct brw_reg even_bytes( struct brw_reg reg )
{
   return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
}
724
/** View 'reg' as its odd-index bytes, 16-wide. */
static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
{
   return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
                  0, 16, 2 );
}
730
731 /* One-, two- and three-dimensional Perlin noise, similar to the description
732 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
/**
 * Body of the 1D noise subroutine, emitted via invoke_subroutine().
 * Takes its input coordinate in the caller's temp at pool index
 * (mark - 2) ('param') and overwrites that same temp with the result.
 */
static void noise1_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param,
      x0, x1, /* gradients at each end */
      t, tmp[ 2 ], /* float temporaries */
      itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0 = alloc_tmp( c );
   x1 = alloc_tmp( c );
   t = alloc_tmp( c );
   tmp[ 0 ] = alloc_tmp( c );
   tmp[ 1 ] = alloc_tmp( c );
   itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
   itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
   itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
   itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
   itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );

   /* input argument lives in the caller's temp slot */
   param = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

   /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
      be hashed.  Also compute the remainder (offset within the unit
      length), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
   brw_FRC( p, param, param );
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
   brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

   /* We're now ready to perform the hashing.  The two hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );

   /* Now we want to initialise the two gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 31 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
   brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

   brw_MUL( p, x0, x0, param );
   brw_MUL( p, x1, x1, t );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
                                          pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_MUL( p, param, tmp[ 0 ], param );
   brw_MUL( p, x1, x1, param );
   brw_ADD( p, x0, x0, x1 );
   /* scale by pow( 2, -30 ), to compensate for the format conversion
      above and an extra factor of 2 so that a single gradient covers
      the [-1,1] range */
   brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

   release_tmps( c, mark );
}
823
824 static void emit_noise1( struct brw_wm_compile *c,
825 const struct prog_instruction *inst )
826 {
827 struct brw_compile *p = &c->func;
828 struct brw_reg src, param, dst;
829 GLuint mask = inst->DstReg.WriteMask;
830 int i;
831 int mark = mark_tmps( c );
832
833 assert( mark == 0 );
834
835 src = get_src_reg( c, inst, 0, 0 );
836
837 param = alloc_tmp( c );
838
839 brw_MOV( p, param, src );
840
841 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
842
843 /* Fill in the result: */
844 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
845 for (i = 0 ; i < 4; i++) {
846 if (mask & (1<<i)) {
847 dst = get_dst_reg(c, inst, i);
848 brw_MOV( p, dst, param );
849 }
850 }
851 if( inst->SaturateMode == SATURATE_ZERO_ONE )
852 brw_set_saturate( p, 0 );
853
854 release_tmps( c, mark );
855 }
856
/**
 * Body of the 2D noise subroutine, emitted via invoke_subroutine().
 * Takes its input coordinates in the caller's temps at pool indices
 * (mark - 3) and (mark - 2) and leaves the result in the (mark - 3)
 * slot ('param0').
 */
static void noise2_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1,
      x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
      t, tmp[ 4 ], /* float temporaries */
      itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 4; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
   }
   itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
   itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
   itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );

   /* input arguments live in the caller's temp slots */
   param0 = lookup_tmp( c, mark - 3 );
   param1 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      square), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
   brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
            low_words( itmp[ 1 ] ) );
   brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
   brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
   brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

   /* We're now ready to perform the hashing.  The four hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
               high_words( itmp[ i ] ) );

   /* Now we want to initialise the four gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );

   brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
   brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
   brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
                                                pipeline */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
                                                pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_MUL( p, param0, tmp[ 0 ], param0 );
   brw_MUL( p, param1, tmp[ 1 ], param1 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, param1 );
   brw_MUL( p, x1y1, x1y1, param1 );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  There are horrible register dependencies here,
      but we have nothing else to do. */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, param0 );
   brw_ADD( p, x0y0, x0y0, x1y0 );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
993
994 static void emit_noise2( struct brw_wm_compile *c,
995 const struct prog_instruction *inst )
996 {
997 struct brw_compile *p = &c->func;
998 struct brw_reg src0, src1, param0, param1, dst;
999 GLuint mask = inst->DstReg.WriteMask;
1000 int i;
1001 int mark = mark_tmps( c );
1002
1003 assert( mark == 0 );
1004
1005 src0 = get_src_reg( c, inst, 0, 0 );
1006 src1 = get_src_reg( c, inst, 0, 1 );
1007
1008 param0 = alloc_tmp( c );
1009 param1 = alloc_tmp( c );
1010
1011 brw_MOV( p, param0, src0 );
1012 brw_MOV( p, param1, src1 );
1013
1014 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1015
1016 /* Fill in the result: */
1017 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1018 for (i = 0 ; i < 4; i++) {
1019 if (mask & (1<<i)) {
1020 dst = get_dst_reg(c, inst, i);
1021 brw_MOV( p, dst, param0 );
1022 }
1023 }
1024 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1025 brw_set_saturate( p, 0 );
1026
1027 release_tmps( c, mark );
1028 }
1029
/**
 * The three-dimensional case is much like the one- and two- versions above,
 * but since the number of corners is rapidly growing we now pack 16 16-bit
 * hashes into each register to extract more parallelism from the EUs.
 *
 * On exit the scalar noise value (already rescaled by pow( 2, -15 )) is
 * left in the first of the caller-provided parameter temporaries.
 */
static void noise3_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1, param2,
       x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
       xi, yi, zi, /* interpolation coefficients */
       t, tmp[ 8 ], /* float temporaries */
       itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
       wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   xi = alloc_tmp( c );
   yi = alloc_tmp( c );
   zi = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   /* The caller (emit_noise3) copied its three operands into the most
      recently allocated temporaries before invoking this subroutine.
      NOTE(review): the offsets start at mark - 4 rather than mark - 3,
      presumably because one extra temporary slot is consumed by the
      subroutine-call mechanism -- confirm against invoke_subroutine. */
   param0 = lookup_tmp( c, mark - 4 );
   param1 = lookup_tmp( c, mark - 3 );
   param2 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      cube), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_FRC( p, param2, param2 );
   /* Since we now have only 16 bits of precision in the hash, we must
      be more careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
	    brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hashes so the next gradient component is derived from
      different pseudo-random bits. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
   brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
   brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   /* t still holds param2 - 1 from the x/y passes above. */
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* The interpolation coefficients are still around from last time, so
      again interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* The final interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1296
1297 static void emit_noise3( struct brw_wm_compile *c,
1298 const struct prog_instruction *inst )
1299 {
1300 struct brw_compile *p = &c->func;
1301 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1302 GLuint mask = inst->DstReg.WriteMask;
1303 int i;
1304 int mark = mark_tmps( c );
1305
1306 assert( mark == 0 );
1307
1308 src0 = get_src_reg( c, inst, 0, 0 );
1309 src1 = get_src_reg( c, inst, 0, 1 );
1310 src2 = get_src_reg( c, inst, 0, 2 );
1311
1312 param0 = alloc_tmp( c );
1313 param1 = alloc_tmp( c );
1314 param2 = alloc_tmp( c );
1315
1316 brw_MOV( p, param0, src0 );
1317 brw_MOV( p, param1, src1 );
1318 brw_MOV( p, param2, src2 );
1319
1320 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1321
1322 /* Fill in the result: */
1323 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1324 for (i = 0 ; i < 4; i++) {
1325 if (mask & (1<<i)) {
1326 dst = get_dst_reg(c, inst, i);
1327 brw_MOV( p, dst, param0 );
1328 }
1329 }
1330 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1331 brw_set_saturate( p, 0 );
1332
1333 release_tmps( c, mark );
1334 }
1335
/**
 * For the four-dimensional case, the little micro-optimisation benefits
 * we obtain by unrolling all the loops aren't worth the massive bloat it
 * now causes.  Instead, we loop twice around performing a similar operation
 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
 * code to glue it all together.
 *
 * The two passes are driven by the flag register: flags clear selects the
 * w=0 cube, flags set selects w=1.  On exit the scalar noise value
 * (already rescaled by pow( 2, -15 )) is left in the first of the
 * caller-provided parameter temporaries.
 */
static void noise4_sub( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_reg param[ 4 ],
       x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
       w0, /* noise for the w=0 cube */
       floors[ 2 ], /* integer coordinates of base corner of hypercube */
       interp[ 4 ], /* interpolation coefficients */
       t, tmp[ 8 ], /* float temporaries */
       itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
       wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i, j;
   int mark = mark_tmps( c );
   GLuint loop, origin;

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   w0 = alloc_tmp( c );
   floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
   floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

   /* The caller (emit_noise4) copied its four operands into the most
      recently allocated temporaries before invoking this subroutine.
      NOTE(review): the offsets start at mark - 5 rather than mark - 4,
      presumably because one extra temporary slot is consumed by the
      subroutine-call mechanism -- confirm against invoke_subroutine. */
   for( i = 0; i < 4; i++ ) {
      param[ i ] = lookup_tmp( c, mark - 5 + i );
      interp[ i ] = alloc_tmp( c );
   }

   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* We only want 16 bits of precision from the integral part of each
      co-ordinate, but unfortunately the RNDD semantics would saturate
      at 16 bits if we performed the operation directly to a 16-bit
      destination.  Therefore, we round to 32-bit temporaries where
      appropriate, and then store only the lower 16 bits. */
   brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
   brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
   brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
   brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

   /* Modify the flag register here, because the side effect is useful
      later (see below).  We know for certain that all flags will be
      cleared, since the FRC instruction cannot possibly generate
      negative results.  Even for exceptional inputs (infinities, denormals,
      NaNs), the architecture guarantees that the L conditional is false. */
   brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
   brw_FRC( p, param[ 0 ], param[ 0 ] );
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   for( i = 1; i < 4; i++ )
      brw_FRC( p, param[ i ], param[ i ] );

   /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
      of all. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
   for( j = 0; j < 3; j++ )
      for( i = 0; i < 4; i++ )
	 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

   /* Mark the current address, as it will be a jump destination.  The
      following code will be executed twice: first, with the flag
      register clear indicating the w=0 case, and second with flags
      set for w=1. */
   loop = p->nr_insn;

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Since we have only 16 bits of precision in the hash, we
      must be careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
	    brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
	    brw_imm_uw( 0x9B93 ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 0xA359 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hashes so the next gradient component is derived from
      different pseudo-random bits. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time) -- selected via the flag register
      side effect set up before the loop (w=0 pass) or by the explicit
      flag MOV at the bottom of the loop (w=1 pass). */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Here we interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time) */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Interpolate in the y dimension: */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* Another interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* Exit the loop if we've computed both cubes...  The branch offset is
      unknown at this point; it is patched in below once the loop body
      has been fully emitted. */
   origin = p->nr_insn;
   brw_push_insn_state( p );
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
   brw_pop_insn_state( p );

   /* Save the result for the w=0 case, and increment the w coordinate: */
   brw_MOV( p, w0, tmp[ 0 ] );
   brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 1 ) );

   /* Loop around for the other cube.  Explicitly set the flag register
      (unfortunately we must spend an extra instruction to do this: we
      can't rely on a side effect of the previous MOV or ADD because
      conditional modifiers which are normally true might be false in
      exceptional circumstances, e.g. given a NaN input; the add to
      brw_ip_reg() is not suitable because the IP is not an 8-vector). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
	    brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
   brw_pop_insn_state( p );

   /* Patch the previous conditional branch now that we know the
      destination address. */
   brw_set_src1( p->store + origin,
		 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

   /* The very last interpolation. */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1719
1720 static void emit_noise4( struct brw_wm_compile *c,
1721 const struct prog_instruction *inst )
1722 {
1723 struct brw_compile *p = &c->func;
1724 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
1725 GLuint mask = inst->DstReg.WriteMask;
1726 int i;
1727 int mark = mark_tmps( c );
1728
1729 assert( mark == 0 );
1730
1731 src0 = get_src_reg( c, inst, 0, 0 );
1732 src1 = get_src_reg( c, inst, 0, 1 );
1733 src2 = get_src_reg( c, inst, 0, 2 );
1734 src3 = get_src_reg( c, inst, 0, 3 );
1735
1736 param0 = alloc_tmp( c );
1737 param1 = alloc_tmp( c );
1738 param2 = alloc_tmp( c );
1739 param3 = alloc_tmp( c );
1740
1741 brw_MOV( p, param0, src0 );
1742 brw_MOV( p, param1, src1 );
1743 brw_MOV( p, param2, src2 );
1744 brw_MOV( p, param3, src3 );
1745
1746 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
1747
1748 /* Fill in the result: */
1749 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1750 for (i = 0 ; i < 4; i++) {
1751 if (mask & (1<<i)) {
1752 dst = get_dst_reg(c, inst, i);
1753 brw_MOV( p, dst, param0 );
1754 }
1755 }
1756 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1757 brw_set_saturate( p, 0 );
1758
1759 release_tmps( c, mark );
1760 }
1761
/**
 * Resolve subroutine calls after code emit is done.
 *
 * Fix up the recorded CAL sites now that the final location of every
 * subroutine within the program store is known (see brw_resolve_cals).
 */
static void post_wm_emit( struct brw_wm_compile *c )
{
   brw_resolve_cals(&c->func);
}
1769
1770 static void
1771 get_argument_regs(struct brw_wm_compile *c,
1772 const struct prog_instruction *inst,
1773 int index,
1774 struct brw_reg *dst,
1775 struct brw_reg *regs,
1776 int mask)
1777 {
1778 struct brw_compile *p = &c->func;
1779 int i, j;
1780
1781 for (i = 0; i < 4; i++) {
1782 if (mask & (1 << i)) {
1783 regs[i] = get_src_reg(c, inst, index, i);
1784
1785 /* Unalias destination registers from our sources. */
1786 if (regs[i].file == BRW_GENERAL_REGISTER_FILE) {
1787 for (j = 0; j < 4; j++) {
1788 if (memcmp(&regs[i], &dst[j], sizeof(regs[0])) == 0) {
1789 struct brw_reg tmp = alloc_tmp(c);
1790 brw_MOV(p, tmp, regs[i]);
1791 regs[i] = tmp;
1792 break;
1793 }
1794 }
1795 }
1796 }
1797 }
1798 }
1799
1800 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
1801 {
1802 struct intel_context *intel = &brw->intel;
1803 #define MAX_IF_DEPTH 32
1804 #define MAX_LOOP_DEPTH 32
1805 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1806 GLuint i, if_depth = 0, loop_depth = 0;
1807 struct brw_compile *p = &c->func;
1808 struct brw_indirect stack_index = brw_indirect(0, 0);
1809
1810 c->out_of_regs = GL_FALSE;
1811
1812 prealloc_reg(c);
1813 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1814 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1815
1816 for (i = 0; i < c->nr_fp_insns; i++) {
1817 const struct prog_instruction *inst = &c->prog_instructions[i];
1818 int dst_flags;
1819 struct brw_reg args[3][4], dst[4];
1820 int j;
1821 int mark = mark_tmps( c );
1822
1823 c->cur_inst = i;
1824
1825 #if 0
1826 printf("Inst %d: ", i);
1827 _mesa_print_instruction(inst);
1828 #endif
1829
1830 /* fetch any constants that this instruction needs */
1831 if (c->fp->use_const_buffer)
1832 fetch_constants(c, inst);
1833
1834 if (inst->Opcode != OPCODE_ARL) {
1835 for (j = 0; j < 4; j++) {
1836 if (inst->DstReg.WriteMask & (1 << j))
1837 dst[j] = get_dst_reg(c, inst, j);
1838 else
1839 dst[j] = brw_null_reg();
1840 }
1841 }
1842 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
1843 get_argument_regs(c, inst, j, dst, args[j], WRITEMASK_XYZW);
1844
1845 dst_flags = inst->DstReg.WriteMask;
1846 if (inst->SaturateMode == SATURATE_ZERO_ONE)
1847 dst_flags |= SATURATE;
1848
1849 if (inst->CondUpdate)
1850 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
1851 else
1852 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
1853
1854 switch (inst->Opcode) {
1855 case WM_PIXELXY:
1856 emit_pixel_xy(c, dst, dst_flags);
1857 break;
1858 case WM_DELTAXY:
1859 emit_delta_xy(p, dst, dst_flags, args[0]);
1860 break;
1861 case WM_PIXELW:
1862 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1863 break;
1864 case WM_LINTERP:
1865 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1866 break;
1867 case WM_PINTERP:
1868 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1869 break;
1870 case WM_CINTERP:
1871 emit_cinterp(p, dst, dst_flags, args[0]);
1872 break;
1873 case WM_WPOSXY:
1874 emit_wpos_xy(c, dst, dst_flags, args[0]);
1875 break;
1876 case WM_FB_WRITE:
1877 emit_fb_write(c, args[0], args[1], args[2],
1878 INST_AUX_GET_TARGET(inst->Aux),
1879 inst->Aux & INST_AUX_EOT);
1880 break;
1881 case WM_FRONTFACING:
1882 emit_frontfacing(p, dst, dst_flags);
1883 break;
1884 case OPCODE_ADD:
1885 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1886 break;
1887 case OPCODE_ARL:
1888 emit_arl(c, inst);
1889 break;
1890 case OPCODE_FRC:
1891 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1892 break;
1893 case OPCODE_FLR:
1894 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1895 break;
1896 case OPCODE_LRP:
1897 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1898 break;
1899 case OPCODE_TRUNC:
1900 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1901 break;
1902 case OPCODE_MOV:
1903 case OPCODE_SWZ:
1904 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1905 break;
1906 case OPCODE_DP3:
1907 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1908 break;
1909 case OPCODE_DP4:
1910 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1911 break;
1912 case OPCODE_XPD:
1913 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1914 break;
1915 case OPCODE_DPH:
1916 emit_dph(p, dst, dst_flags, args[0], args[1]);
1917 break;
1918 case OPCODE_RCP:
1919 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1920 break;
1921 case OPCODE_RSQ:
1922 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1923 break;
1924 case OPCODE_SIN:
1925 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1926 break;
1927 case OPCODE_COS:
1928 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1929 break;
1930 case OPCODE_EX2:
1931 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1932 break;
1933 case OPCODE_LG2:
1934 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1935 break;
1936 case OPCODE_CMP:
1937 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1938 break;
1939 case OPCODE_MIN:
1940 emit_min(p, dst, dst_flags, args[0], args[1]);
1941 break;
1942 case OPCODE_MAX:
1943 emit_max(p, dst, dst_flags, args[0], args[1]);
1944 break;
1945 case OPCODE_DDX:
1946 case OPCODE_DDY:
1947 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
1948 args[0]);
1949 break;
1950 case OPCODE_SLT:
1951 emit_sop(p, dst, dst_flags,
1952 BRW_CONDITIONAL_L, args[0], args[1]);
1953 break;
1954 case OPCODE_SLE:
1955 emit_sop(p, dst, dst_flags,
1956 BRW_CONDITIONAL_LE, args[0], args[1]);
1957 break;
1958 case OPCODE_SGT:
1959 emit_sop(p, dst, dst_flags,
1960 BRW_CONDITIONAL_G, args[0], args[1]);
1961 break;
1962 case OPCODE_SGE:
1963 emit_sop(p, dst, dst_flags,
1964 BRW_CONDITIONAL_GE, args[0], args[1]);
1965 break;
1966 case OPCODE_SEQ:
1967 emit_sop(p, dst, dst_flags,
1968 BRW_CONDITIONAL_EQ, args[0], args[1]);
1969 break;
1970 case OPCODE_SNE:
1971 emit_sop(p, dst, dst_flags,
1972 BRW_CONDITIONAL_NEQ, args[0], args[1]);
1973 break;
1974 case OPCODE_MUL:
1975 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1976 break;
1977 case OPCODE_POW:
1978 emit_math2(c, BRW_MATH_FUNCTION_POW,
1979 dst, dst_flags, args[0], args[1]);
1980 break;
1981 case OPCODE_MAD:
1982 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1983 break;
1984 case OPCODE_NOISE1:
1985 emit_noise1(c, inst);
1986 break;
1987 case OPCODE_NOISE2:
1988 emit_noise2(c, inst);
1989 break;
1990 case OPCODE_NOISE3:
1991 emit_noise3(c, inst);
1992 break;
1993 case OPCODE_NOISE4:
1994 emit_noise4(c, inst);
1995 break;
1996 case OPCODE_TEX:
1997 emit_tex(c, dst, dst_flags, args[0],
1998 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1999 0, 1, 0, 0),
2000 inst->TexSrcTarget,
2001 inst->TexSrcUnit,
2002 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
2003 break;
2004 case OPCODE_TXB:
2005 emit_txb(c, dst, dst_flags, args[0],
2006 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
2007 0, 1, 0, 0),
2008 inst->TexSrcTarget,
2009 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
2010 break;
2011 case OPCODE_KIL_NV:
2012 emit_kil(c);
2013 break;
2014 case OPCODE_IF:
2015 assert(if_depth < MAX_IF_DEPTH);
2016 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2017 break;
2018 case OPCODE_ELSE:
2019 assert(if_depth > 0);
2020 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
2021 break;
2022 case OPCODE_ENDIF:
2023 assert(if_depth > 0);
2024 brw_ENDIF(p, if_inst[--if_depth]);
2025 break;
2026 case OPCODE_BGNSUB:
2027 brw_save_label(p, inst->Comment, p->nr_insn);
2028 break;
2029 case OPCODE_ENDSUB:
2030 /* no-op */
2031 break;
2032 case OPCODE_CAL:
2033 brw_push_insn_state(p);
2034 brw_set_mask_control(p, BRW_MASK_DISABLE);
2035 brw_set_access_mode(p, BRW_ALIGN_1);
2036 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2037 brw_set_access_mode(p, BRW_ALIGN_16);
2038 brw_ADD(p, get_addr_reg(stack_index),
2039 get_addr_reg(stack_index), brw_imm_d(4));
2040 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2041 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2042 brw_pop_insn_state(p);
2043 break;
2044
2045 case OPCODE_RET:
2046 brw_push_insn_state(p);
2047 brw_set_mask_control(p, BRW_MASK_DISABLE);
2048 brw_ADD(p, get_addr_reg(stack_index),
2049 get_addr_reg(stack_index), brw_imm_d(-4));
2050 brw_set_access_mode(p, BRW_ALIGN_1);
2051 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2052 brw_set_access_mode(p, BRW_ALIGN_16);
2053 brw_pop_insn_state(p);
2054
2055 break;
2056 case OPCODE_BGNLOOP:
2057 /* XXX may need to invalidate the current_constant regs */
2058 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2059 break;
2060 case OPCODE_BRK:
2061 brw_BREAK(p);
2062 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2063 break;
2064 case OPCODE_CONT:
2065 brw_CONT(p);
2066 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2067 break;
2068 case OPCODE_ENDLOOP:
2069 {
2070 struct brw_instruction *inst0, *inst1;
2071 GLuint br = 1;
2072
2073 if (intel->is_ironlake)
2074 br = 2;
2075
2076 assert(loop_depth > 0);
2077 loop_depth--;
2078 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2079 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2080 while (inst0 > loop_inst[loop_depth]) {
2081 inst0--;
2082 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2083 inst0->bits3.if_else.jump_count == 0) {
2084 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2085 inst0->bits3.if_else.pop_count = 0;
2086 }
2087 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2088 inst0->bits3.if_else.jump_count == 0) {
2089 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2090 inst0->bits3.if_else.pop_count = 0;
2091 }
2092 }
2093 }
2094 break;
2095 default:
2096 printf("unsupported opcode %d (%s) in fragment shader\n",
2097 inst->Opcode, inst->Opcode < MAX_OPCODE ?
2098 _mesa_opcode_string(inst->Opcode) : "unknown");
2099 }
2100
2101 /* Release temporaries containing any unaliased source regs. */
2102 release_tmps( c, mark );
2103
2104 if (inst->CondUpdate)
2105 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2106 else
2107 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2108 }
2109 post_wm_emit(c);
2110
2111 if (INTEL_DEBUG & DEBUG_WM) {
2112 printf("wm-native:\n");
2113 for (i = 0; i < p->nr_insn; i++)
2114 brw_disasm(stderr, &p->store[i]);
2115 printf("\n");
2116 }
2117 }
2118
2119 /**
2120 * Do GPU code generation for shaders that use GLSL features such as
2121 * flow control. Other shaders will be compiled with the
2122 */
2123 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2124 {
2125 if (INTEL_DEBUG & DEBUG_WM) {
2126 printf("brw_wm_glsl_emit:\n");
2127 }
2128
2129 /* initial instruction translation/simplification */
2130 brw_wm_pass_fp(c);
2131
2132 /* actual code generation */
2133 brw_wm_emit_glsl(brw, c);
2134
2135 if (INTEL_DEBUG & DEBUG_WM) {
2136 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2137 }
2138
2139 c->prog_data.total_grf = num_grf_used(c);
2140 c->prog_data.total_scratch = 0;
2141 }