/* Merge branch '7.8'
 * [mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
 */
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/* Identifiers for shared subroutines that are emitted once and then
 * re-invoked via invoke_subroutine().
 */
enum _subroutine {
    SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
};
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 if (INTEL_DEBUG & DEBUG_GLSL_FORCE)
27 return GL_TRUE;
28
29 for (i = 0; i < fp->Base.NumInstructions; i++) {
30 const struct prog_instruction *inst = &fp->Base.Instructions[i];
31 switch (inst->Opcode) {
32 case OPCODE_ARL:
33 case OPCODE_IF:
34 case OPCODE_ENDIF:
35 case OPCODE_CAL:
36 case OPCODE_BRK:
37 case OPCODE_RET:
38 case OPCODE_NOISE1:
39 case OPCODE_NOISE2:
40 case OPCODE_NOISE3:
41 case OPCODE_NOISE4:
42 case OPCODE_BGNLOOP:
43 return GL_TRUE;
44 default:
45 break;
46 }
47 }
48 return GL_FALSE;
49 }
50
51
52
53 static void
54 reclaim_temps(struct brw_wm_compile *c);
55
56
/** Mark GRF register \p r as used (no-op if it already was). */
static void
prealloc_grf(struct brw_wm_compile *c, int r)
{
   c->used_grf[r] = GL_TRUE;
}
63
64
65 /** Mark given GRF register as not in use. */
66 static void
67 release_grf(struct brw_wm_compile *c, int r)
68 {
69 /*assert(c->used_grf[r]);*/
70 c->used_grf[r] = GL_FALSE;
71 c->first_free_grf = MIN2(c->first_free_grf, r);
72 }
73
74
75 /** Return index of a free GRF, mark it as used. */
76 static int
77 alloc_grf(struct brw_wm_compile *c)
78 {
79 GLuint r;
80 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
81 if (!c->used_grf[r]) {
82 c->used_grf[r] = GL_TRUE;
83 c->first_free_grf = r + 1; /* a guess */
84 return r;
85 }
86 }
87
88 /* no free temps, try to reclaim some */
89 reclaim_temps(c);
90 c->first_free_grf = 0;
91
92 /* try alloc again */
93 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
94 if (!c->used_grf[r]) {
95 c->used_grf[r] = GL_TRUE;
96 c->first_free_grf = r + 1; /* a guess */
97 return r;
98 }
99 }
100
101 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
102 assert(c->used_grf[r]);
103 }
104
105 /* really, no free GRF regs found */
106 if (!c->out_of_regs) {
107 /* print warning once per compilation */
108 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
109 c->out_of_regs = GL_TRUE;
110 }
111
112 return -1;
113 }
114
115
116 /** Return number of GRF registers used */
117 static int
118 num_grf_used(const struct brw_wm_compile *c)
119 {
120 int r;
121 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
122 if (c->used_grf[r])
123 return r + 1;
124 return 0;
125 }
126
127
128
/**
 * Record the mapping of a Mesa register to a hardware register.
 *
 * \param file       Mesa register file (one of PROGRAM_x)
 * \param index      register index within that file
 * \param component  XYZW channel (0..3)
 * \param reg        hardware register now backing that channel
 */
static void set_reg(struct brw_wm_compile *c, int file, int index,
                    int component, struct brw_reg reg)
{
   c->wm_regs[file][index][component].reg = reg;
   c->wm_regs[file][index][component].inited = GL_TRUE;
}
138
139 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
140 {
141 struct brw_reg reg;
142
143 /* if we need to allocate another temp, grow the tmp_regs[] array */
144 if (c->tmp_index == c->tmp_max) {
145 int r = alloc_grf(c);
146 if (r < 0) {
147 /*printf("Out of temps in %s\n", __FUNCTION__);*/
148 r = 50; /* XXX random register! */
149 }
150 c->tmp_regs[ c->tmp_max++ ] = r;
151 }
152
153 /* form the GRF register */
154 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
155 /*printf("alloc_temp %d\n", reg.nr);*/
156 assert(reg.nr < BRW_WM_MAX_GRF);
157 return reg;
158
159 }
160
/**
 * Save current temp register info.
 * There must be a matching call to release_tmps().
 * \return the current temp-pool position, to be passed to release_tmps()
 */
static int mark_tmps(struct brw_wm_compile *c)
{
   return c->tmp_index;
}
169
/** Return the vec8 GRF backing the temp at pool position \p index. */
static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
{
   return brw_vec8_grf( c->tmp_regs[ index ], 0 );
}
174
/** Release all temps allocated since the matching mark_tmps() call. */
static void release_tmps(struct brw_wm_compile *c, int mark)
{
   c->tmp_index = mark;
}
179
180 /**
181 * Convert Mesa src register to brw register.
182 *
183 * Since we're running in SOA mode each Mesa register corresponds to four
184 * hardware registers. We allocate the hardware registers as needed here.
185 *
186 * \param file register file, one of PROGRAM_x
187 * \param index register number
188 * \param component src component (X=0, Y=1, Z=2, W=3)
189 * \param nr not used?!?
190 * \param neg negate value?
191 * \param abs take absolute value?
192 */
193 static struct brw_reg
194 get_reg(struct brw_wm_compile *c, int file, int index, int component,
195 int nr, GLuint neg, GLuint abs)
196 {
197 struct brw_reg reg;
198 switch (file) {
199 case PROGRAM_STATE_VAR:
200 case PROGRAM_CONSTANT:
201 case PROGRAM_UNIFORM:
202 file = PROGRAM_STATE_VAR;
203 break;
204 case PROGRAM_UNDEFINED:
205 return brw_null_reg();
206 case PROGRAM_TEMPORARY:
207 case PROGRAM_INPUT:
208 case PROGRAM_OUTPUT:
209 case PROGRAM_PAYLOAD:
210 break;
211 default:
212 _mesa_problem(NULL, "Unexpected file in get_reg()");
213 return brw_null_reg();
214 }
215
216 assert(index < 256);
217 assert(component < 4);
218
219 /* see if we've already allocated a HW register for this Mesa register */
220 if (c->wm_regs[file][index][component].inited) {
221 /* yes, re-use */
222 reg = c->wm_regs[file][index][component].reg;
223 }
224 else {
225 /* no, allocate new register */
226 int grf = alloc_grf(c);
227 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
228 if (grf < 0) {
229 /* totally out of temps */
230 grf = 51; /* XXX random register! */
231 }
232
233 reg = brw_vec8_grf(grf, 0);
234 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
235
236 set_reg(c, file, index, component, reg);
237 }
238
239 if (neg & (1 << component)) {
240 reg = negate(reg);
241 }
242 if (abs)
243 reg = brw_abs(reg);
244 return reg;
245 }
246
247
248
/**
 * This is called if we run out of GRF registers.  Examine the live intervals
 * of temp regs in the program and free those which won't be used again.
 */
static void
reclaim_temps(struct brw_wm_compile *c)
{
   GLint intBegin[MAX_PROGRAM_TEMPS];
   GLint intEnd[MAX_PROGRAM_TEMPS];
   int index;

   /*printf("Reclaim temps:\n");*/

   /* compute first/last instruction index of each temp's live interval */
   _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
                             intBegin, intEnd);

   for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
      /* live interval ended before the instruction currently being emitted? */
      if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
         /* program temp[i] can be freed */
         int component;
         /*printf("  temp[%d] is dead\n", index);*/
         for (component = 0; component < 4; component++) {
            if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
               int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
               release_grf(c, r);
               /*
               printf("  Reclaim temp %d, reg %d at inst %d\n",
                      index, r, c->cur_inst);
               */
               c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
            }
         }
      }
   }
}
284
285
286
287
/**
 * Preallocate registers.  This sets up the Mesa to hardware register
 * mapping for certain registers, such as constants (uniforms/state vars)
 * and shader inputs.
 */
static void prealloc_reg(struct brw_wm_compile *c)
{
   struct intel_context *intel = &c->func.brw->intel;
   int i, j;
   struct brw_reg reg;
   int urb_read_length = 0;
   GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
   GLuint reg_index = 0;

   memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
   c->first_free_grf = 0;

   /* depth payload registers: two GRFs per depth reg, starting at g0 */
   for (i = 0; i < 4; i++) {
      if (i < c->key.nr_depth_regs)
         reg = brw_vec8_grf(i * 2, 0);
      else
         reg = brw_vec8_grf(0, 0);
      set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
   }
   reg_index += 2 * c->key.nr_depth_regs;

   /* constants */
   {
      const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
      const GLuint nr_temps = c->fp->program.Base.NumTemporaries;

      /* use a real constant buffer, or just use a section of the GRF? */
      /* XXX this heuristic may need adjustment... */
      if ((nr_params + nr_temps) * 4 + reg_index > 80)
         c->fp->use_const_buffer = GL_TRUE;
      else
         c->fp->use_const_buffer = GL_FALSE;
      /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/

      if (c->fp->use_const_buffer) {
         /* We'll use a real constant buffer and fetch constants from
          * it with a dataport read message.
          */

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 0;
      }
      else {
         const struct gl_program_parameter_list *plist =
            c->fp->program.Base.Parameters;
         int index = 0;

         /* number of float constants in CURBE */
         c->prog_data.nr_params = 4 * nr_params;

         /* loop over program constants (float[4]) */
         for (i = 0; i < nr_params; i++) {
            /* loop over XYZW channels */
            for (j = 0; j < 4; j++, index++) {
               /* eight floats fit per GRF, so advance a reg every 8 */
               reg = brw_vec1_grf(reg_index + index / 8, index % 8);
               /* Save pointer to parameter/constant value.
                * Constants will be copied in prepare_constant_buffer()
                */
               c->prog_data.param[index] = &plist->ParameterValues[i][j];
               set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
            }
         }
         /* number of constant regs used (each reg is float[8]) */
         c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
         reg_index += c->nr_creg;
      }
   }

   /* fragment shader inputs: map each written VP output to the payload
    * GRF where the URB data for it lands */
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      int fp_input;

      if (i >= VERT_RESULT_VAR0)
         fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
      else if (i <= VERT_RESULT_TEX7)
         fp_input = i;
      else
         fp_input = -1;  /* no corresponding FP input */

      if (fp_input >= 0 && inputs & (1 << fp_input)) {
         urb_read_length = reg_index;
         reg = brw_vec8_grf(reg_index, 0);
         for (j = 0; j < 4; j++)
            set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
      }
      if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
         /* each VP output occupies two GRFs in the payload */
         reg_index += 2;
      }
   }

   c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
   c->prog_data.urb_read_length = urb_read_length;
   c->prog_data.curb_read_length = c->nr_creg;
   c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index++;
   c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
   reg_index += 2;

   /* mark GRF regs [0..reg_index-1] as in-use */
   for (i = 0; i < reg_index; i++)
      prealloc_grf(c, i);

   /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
   prealloc_grf(c, 126);
   prealloc_grf(c, 127);

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];
      struct brw_reg dst[4];

      switch (inst->Opcode) {
      case OPCODE_TEX:
      case OPCODE_TXB:
         /* Allocate the channels of texture results contiguously,
          * since they are written out that way by the sampler unit.
          */
         for (j = 0; j < 4; j++) {
            dst[j] = get_dst_reg(c, inst, j);
            if (j != 0)
               assert(dst[j].nr == dst[j - 1].nr + 1);
         }
         break;
      default:
         break;
      }
   }

   for (i = 0; i < c->nr_fp_insns; i++) {
      const struct prog_instruction *inst = &c->prog_instructions[i];

      switch (inst->Opcode) {
      case WM_DELTAXY:
         /* Allocate WM_DELTAXY destination on G45/GM45 to an
          * even-numbered GRF if possible so that we can use the PLN
          * instruction.
          */
         if (inst->DstReg.WriteMask == WRITEMASK_XY &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][0].inited &&
             !c->wm_regs[inst->DstReg.File][inst->DstReg.Index][1].inited &&
             (IS_G4X(intel->intelScreen->deviceID) || intel->gen == 5)) {
            int grf;

            /* scan even-numbered GRFs for an adjacent free pair */
            for (grf = c->first_free_grf & ~1;
                 grf < BRW_WM_MAX_GRF;
                 grf += 2)
            {
               if (!c->used_grf[grf] && !c->used_grf[grf + 1]) {
                  c->used_grf[grf] = GL_TRUE;
                  c->used_grf[grf + 1] = GL_TRUE;
                  c->first_free_grf = grf + 2;  /* a guess */

                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 0,
                          brw_vec8_grf(grf, 0));
                  set_reg(c, inst->DstReg.File, inst->DstReg.Index, 1,
                          brw_vec8_grf(grf + 1, 0));
                  break;
               }
            }
         }
         /* fall through */
      default:
         break;
      }
   }

   /* An instruction may reference up to three constants.
    * They'll be found in these registers.
    * XXX alloc these on demand!
    */
   if (c->fp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
         c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
      }
   }
#if 0
   printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
   printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
#endif
}
472
473
474 /**
475 * Check if any of the instruction's src registers are constants, uniforms,
476 * or statevars. If so, fetch any constants that we don't already have in
477 * the three GRF slots.
478 */
479 static void fetch_constants(struct brw_wm_compile *c,
480 const struct prog_instruction *inst)
481 {
482 struct brw_compile *p = &c->func;
483 GLuint i;
484
485 /* loop over instruction src regs */
486 for (i = 0; i < 3; i++) {
487 const struct prog_src_register *src = &inst->SrcReg[i];
488 if (src->File == PROGRAM_STATE_VAR ||
489 src->File == PROGRAM_CONSTANT ||
490 src->File == PROGRAM_UNIFORM) {
491 c->current_const[i].index = src->Index;
492
493 #if 0
494 printf(" fetch const[%d] for arg %d into reg %d\n",
495 src->Index, i, c->current_const[i].reg.nr);
496 #endif
497
498 /* need to fetch the constant now */
499 brw_dp_READ_4(p,
500 c->current_const[i].reg, /* writeback dest */
501 src->RelAddr, /* relative indexing? */
502 16 * src->Index, /* byte offset */
503 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
504 );
505 }
506 }
507 }
508
509
/**
 * Convert Mesa dst register to brw register.
 * \param component  XYZW channel of the destination (0..3)
 */
static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
                                  const struct prog_instruction *inst,
                                  GLuint component)
{
   const int nr = 1;  /* unused by get_reg() */
   /* destinations never carry negate/abs modifiers */
   return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
                  0, 0);
}
521
522
/**
 * Return a register describing one component of a constant that was
 * previously fetched into a GRF by fetch_constants().
 *
 * \param srcRegIndex  which of the instruction's three src slots (0..2)
 * \param component    XYZW channel to extract (0..3)
 */
static struct brw_reg
get_src_reg_const(struct brw_wm_compile *c,
                  const struct prog_instruction *inst,
                  GLuint srcRegIndex, GLuint component)
{
   /* We should have already fetched the constant from the constant
    * buffer in fetch_constants().  Now we just have to return a
    * register description that extracts the needed component and
    * smears it across all eight vector components.
    */
   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
   struct brw_reg const_reg;

   assert(component < 4);
   assert(srcRegIndex < 3);
   assert(c->current_const[srcRegIndex].index != -1);
   const_reg = c->current_const[srcRegIndex].reg;

   /* extract desired float from the const_reg, and smear */
   const_reg = stride(const_reg, 0, 1, 0);
   const_reg.subnr = component * 4;  /* byte offset of the wanted float */

   /* apply the source's per-component negate / absolute-value modifiers */
   if (src->Negate & (1 << component))
      const_reg = negate(const_reg);
   if (src->Abs)
      const_reg = brw_abs(const_reg);

#if 0
   printf("  form const[%d].%d for arg %d, reg %d\n",
          c->current_const[srcRegIndex].index,
          component,
          srcRegIndex,
          const_reg.nr);
#endif

   return const_reg;
}
560
561
/**
 * Convert Mesa src register to brw register, applying the source's
 * swizzle to pick the channel.
 *
 * \param srcRegIndex  which of the instruction's src slots (0..2)
 * \param channel      XYZW channel before swizzling (0..3)
 */
static struct brw_reg get_src_reg(struct brw_wm_compile *c,
                                  const struct prog_instruction *inst,
                                  GLuint srcRegIndex, GLuint channel)
{
   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
   const GLuint nr = 1;  /* unused by get_reg() */
   /* resolve the swizzle: which component this channel actually reads */
   const GLuint component = GET_SWZ(src->Swizzle, channel);

   /* Only one immediate value can be used per native opcode, and it
    * has be in the src1 slot, so not all Mesa instructions will get
    * to take advantage of immediate constants.
    */
   if (brw_wm_arg_can_be_immediate(inst->Opcode, srcRegIndex)) {
      const struct gl_program_parameter_list *params;

      params = c->fp->program.Base.Parameters;

      /* Extended swizzle terms */
      if (component == SWIZZLE_ZERO) {
         return brw_imm_f(0.0F);
      } else if (component == SWIZZLE_ONE) {
         return brw_imm_f(1.0F);
      }

      if (src->File == PROGRAM_CONSTANT) {
         /* bake the constant's value directly into the instruction */
         return brw_imm_f(params->ParameterValues[src->Index][component]);
      }
   }

   if (c->fp->use_const_buffer &&
       (src->File == PROGRAM_STATE_VAR ||
        src->File == PROGRAM_CONSTANT ||
        src->File == PROGRAM_UNIFORM)) {
      /* the constant was fetched into a GRF slot by fetch_constants() */
      return get_src_reg_const(c, inst, srcRegIndex, component);
   }
   else {
      /* other type of source register */
      return get_reg(c, src->File, src->Index, component, nr,
                     src->Negate, src->Abs);
   }
}
606
/**
 * Subroutines are minimal support for resusable instruction sequences.
 * They are implemented as simply as possible to minimise overhead: there
 * is no explicit support for communication between the caller and callee
 * other than saving the return address in a temporary register, nor is
 * there any automatic local storage.  This implies that great care is
 * required before attempting reentrancy or any kind of nested
 * subroutine invocations.
 */
static void invoke_subroutine( struct brw_wm_compile *c,
			       enum _subroutine subroutine,
			       void (*emit)( struct brw_wm_compile * ) )
{
   struct brw_compile *p = &c->func;

   assert( subroutine < BRW_WM_MAX_SUBROUTINE );

   if( c->subroutines[ subroutine ] ) {
      /* subroutine previously emitted: reuse existing instructions */

      int mark = mark_tmps( c );
      struct brw_reg return_address = retype( alloc_tmp( c ),
					      BRW_REGISTER_TYPE_UD );
      int here = p->nr_insn;

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* return address = IP + 2 instructions (each insn is 16 bytes) */
      brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );

      /* jump to the already-emitted body: offset in instructions << 4 */
      brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
	       brw_imm_d( ( c->subroutines[ subroutine ] -
			    here - 1 ) << 4 ) );
      brw_pop_insn_state(p);

      release_tmps( c, mark );
   } else {
      /* previously unused subroutine: emit, and mark for later reuse */

      int mark = mark_tmps( c );
      struct brw_reg return_address = retype( alloc_tmp( c ),
					      BRW_REGISTER_TYPE_UD );
      struct brw_instruction *calc;
      int base = p->nr_insn;

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* src1 of this ADD is a placeholder, patched below once the
       * length of the subroutine body is known */
      calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
      brw_pop_insn_state(p);

      c->subroutines[ subroutine ] = p->nr_insn;

      emit( c );

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* return: jump back through the saved return address */
      brw_MOV( p, brw_ip_reg(), return_address );
      brw_pop_insn_state(p);

      /* patch the placeholder so return_address points just past the
       * return MOV emitted above */
      brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );

      release_tmps( c, mark );
   }
}
670
/**
 * Emit Mesa OPCODE_ARL: load the address register (a0) from channel 0
 * of the instruction's first source operand.
 */
static void emit_arl(struct brw_wm_compile *c,
                     const struct prog_instruction *inst)
{
   struct brw_compile *p = &c->func;
   struct brw_reg src0, addr_reg;
   brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
   addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                          BRW_ARF_ADDRESS, 0);
   src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
   brw_MOV(p, addr_reg, src0);
   brw_set_saturate(p, 0);  /* restore default saturate state */
}
683
/**
 * For GLSL shaders, this KIL will be unconditional.
 * It may be contained inside an IF/ENDIF structure of course.
 */
static void emit_kil(struct brw_wm_compile *c)
{
   struct brw_compile *p = &c->func;
   /* g0.0:UW in the thread payload (named "depth" here; NOTE(review):
    * appears to be the live-pixel mask rather than depth -- confirm
    * against the WM thread payload layout) */
   struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   /* clear payload bits for the channels currently executing */
   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
   brw_AND(p, depth, c->emit_mask_reg, depth);
   brw_pop_insn_state(p);
}
698
/** View \p reg as 16 words, selecting the high word of each dword. */
static INLINE struct brw_reg high_words( struct brw_reg reg )
{
   return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
		  0, 8, 2 );
}
704
/** View \p reg as 16 words, selecting the low word of each dword. */
static INLINE struct brw_reg low_words( struct brw_reg reg )
{
   return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
}
709
/** View \p reg as bytes, selecting the even-indexed byte of each word. */
static INLINE struct brw_reg even_bytes( struct brw_reg reg )
{
   return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
}
714
/** View \p reg as bytes, selecting the odd-indexed byte of each word. */
static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
{
   return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
		  0, 16, 2 );
}
720
721 /* One-, two- and three-dimensional Perlin noise, similar to the description
722 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
/* One-dimensional noise subroutine: hashes the two integer lattice ends
 * of the input coordinate, forms gradients from the hashes, and blends
 * them with Perlin's quintic polynomial.  The input coordinate (and the
 * result) live in the caller's temp at pool slot mark-2 (see
 * emit_noise1()). */
static void noise1_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param,
      x0, x1, /* gradients at each end */
      t, tmp[ 2 ], /* float temporaries */
      itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0 = alloc_tmp( c );
   x1 = alloc_tmp( c );
   t = alloc_tmp( c );
   tmp[ 0 ] = alloc_tmp( c );
   tmp[ 1 ] = alloc_tmp( c );
   itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
   itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
   itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
   itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
   itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );

   /* the caller stored the input coordinate just below our mark */
   param = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

   /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
      be hashed.  Also compute the remainder (offset within the unit
      length), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
   brw_FRC( p, param, param );
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
   brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

   /* We're now ready to perform the hashing.  The two hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 2; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 2; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );

   /* Now we want to initialise the two gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 31 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
   brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

   brw_MUL( p, x0, x0, param );
   brw_MUL( p, x1, x1, t );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
					  pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
   brw_MUL( p, param, tmp[ 0 ], param );
   brw_MUL( p, x1, x1, param );
   brw_ADD( p, x0, x0, x1 );
   /* scale by pow( 2, -30 ), to compensate for the format conversion
      above and an extra factor of 2 so that a single gradient covers
      the [-1,1] range */
   brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

   release_tmps( c, mark );
}
813
814 static void emit_noise1( struct brw_wm_compile *c,
815 const struct prog_instruction *inst )
816 {
817 struct brw_compile *p = &c->func;
818 struct brw_reg src, param, dst;
819 GLuint mask = inst->DstReg.WriteMask;
820 int i;
821 int mark = mark_tmps( c );
822
823 assert( mark == 0 );
824
825 src = get_src_reg( c, inst, 0, 0 );
826
827 param = alloc_tmp( c );
828
829 brw_MOV( p, param, src );
830
831 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
832
833 /* Fill in the result: */
834 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
835 for (i = 0 ; i < 4; i++) {
836 if (mask & (1<<i)) {
837 dst = get_dst_reg(c, inst, i);
838 brw_MOV( p, dst, param );
839 }
840 }
841 if( inst->SaturateMode == SATURATE_ZERO_ONE )
842 brw_set_saturate( p, 0 );
843
844 release_tmps( c, mark );
845 }
846
/* Two-dimensional noise subroutine: hashes the four corners of the unit
 * square containing the input point, forms a gradient at each corner,
 * then interpolates in y and then x with Perlin's quintic polynomial.
 * Inputs arrive in the caller's temps at pool slots mark-3/mark-2; the
 * result is left in the mark-3 slot (see emit_noise2()). */
static void noise2_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1,
      x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
      t, tmp[ 4 ], /* float temporaries */
      itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
   int i;
   int mark = mark_tmps( c );

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 4; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
   }
   itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
   itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
   itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );

   /* the caller stored the two input coordinates just below our mark */
   param0 = lookup_tmp( c, mark - 3 );
   param1 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      square), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
   brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
	    low_words( itmp[ 1 ] ) );
   brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
   brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
   brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
   brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
   brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

   /* We're now ready to perform the hashing.  The four hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 32x16
      bit multiplication, and 16-bit swizzles (which we get for
      free).  We can't use immediate operands in the multiplies,
      because immediates are permitted only in src1 and the 16-bit
      factor is permitted only in src0. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
	       high_words( itmp[ i ] ) );

   /* Now we want to initialise the four gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );

   brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
   brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
   brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin). */
   brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
						pipeline */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
						pipeline */
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
   brw_MUL( p, param0, tmp[ 0 ], param0 );
   brw_MUL( p, param1, tmp[ 1 ], param1 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, param1 );
   brw_MUL( p, x1y1, x1y1, param1 );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  There are horrible register dependencies here,
      but we have nothing else to do. */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, param0 );
   brw_ADD( p, x0y0, x0y0, x1y0 );

   /* scale by pow( 2, -15 ), as described above */
   brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
983
984 static void emit_noise2( struct brw_wm_compile *c,
985 const struct prog_instruction *inst )
986 {
987 struct brw_compile *p = &c->func;
988 struct brw_reg src0, src1, param0, param1, dst;
989 GLuint mask = inst->DstReg.WriteMask;
990 int i;
991 int mark = mark_tmps( c );
992
993 assert( mark == 0 );
994
995 src0 = get_src_reg( c, inst, 0, 0 );
996 src1 = get_src_reg( c, inst, 0, 1 );
997
998 param0 = alloc_tmp( c );
999 param1 = alloc_tmp( c );
1000
1001 brw_MOV( p, param0, src0 );
1002 brw_MOV( p, param1, src1 );
1003
1004 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1005
1006 /* Fill in the result: */
1007 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1008 for (i = 0 ; i < 4; i++) {
1009 if (mask & (1<<i)) {
1010 dst = get_dst_reg(c, inst, i);
1011 brw_MOV( p, dst, param0 );
1012 }
1013 }
1014 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1015 brw_set_saturate( p, 0 );
1016
1017 release_tmps( c, mark );
1018 }
1019
1020 /**
1021 * The three-dimensional case is much like the one- and two- versions above,
1022 * but since the number of corners is rapidly growing we now pack 16 16-bit
1023 * hashes into each register to extract more parallelism from the EUs.
1024 */
static void noise3_sub( struct brw_wm_compile *c ) {

   struct brw_compile *p = &c->func;
   struct brw_reg param0, param1, param2,
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      xi, yi, zi, /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i;
   int mark = mark_tmps( c );

   /* Scratch registers for the per-corner gradients and the smoothed
      interpolation coefficients. */
   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   xi = alloc_tmp( c );
   yi = alloc_tmp( c );
   zi = alloc_tmp( c );
   t = alloc_tmp( c );
   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   /* The caller (emit_noise3) passed the three input coordinates in the
      most recently allocated temporaries; locate them relative to our own
      mark.  The scalar result is handed back to the caller in param0
      (see the final MUL below). */
   param0 = lookup_tmp( c, mark - 4 );
   param1 = lookup_tmp( c, mark - 3 );
   param2 = lookup_tmp( c, mark - 2 );

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Also compute the remainders (offsets within the unit
      cube), interleaved to reduce register dependency penalties. */
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
   brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
   brw_FRC( p, param0, param0 );
   brw_FRC( p, param1, param1 );
   brw_FRC( p, param2, param2 );
   /* Since we now have only 16 bits of precision in the hash, we must
      be more careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
	    brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hash words so that the next gradient component is taken
      from different bits.  NOTE(review): the shift amount here is 5,
      while the 4D variant (noise4_sub) uses 4 — presumably tuned per
      case; confirm against the original derivation. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* t switches back to an x-based factor here, ready for the front
      face below. */
   brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* We interpolate between the gradients using the polynomial
      6t^5 - 15t^4 + 10t^3 (Perlin).  The three repeated MUL groups
      below supply the remaining t^3 factor of each term. */
   brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
   brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
   brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
   brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
   brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
   brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );
   brw_MUL( p, xi, xi, param0 );
   brw_MUL( p, yi, yi, param1 );
   brw_MUL( p, zi, zi, param2 );

   /* Here we interpolate in the y dimension... */
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param0 );
   brw_MUL( p, x0y1, x0y1, param0 );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* The interpolation coefficients are still around from last time, so
      again interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, yi );
   brw_MUL( p, x1y1, x1y1, yi );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, xi );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* The final interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* scale by pow( 2, -15 ), as described above; the scaled result is
      returned to the caller in param0 */
   brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1286
1287 static void emit_noise3( struct brw_wm_compile *c,
1288 const struct prog_instruction *inst )
1289 {
1290 struct brw_compile *p = &c->func;
1291 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1292 GLuint mask = inst->DstReg.WriteMask;
1293 int i;
1294 int mark = mark_tmps( c );
1295
1296 assert( mark == 0 );
1297
1298 src0 = get_src_reg( c, inst, 0, 0 );
1299 src1 = get_src_reg( c, inst, 0, 1 );
1300 src2 = get_src_reg( c, inst, 0, 2 );
1301
1302 param0 = alloc_tmp( c );
1303 param1 = alloc_tmp( c );
1304 param2 = alloc_tmp( c );
1305
1306 brw_MOV( p, param0, src0 );
1307 brw_MOV( p, param1, src1 );
1308 brw_MOV( p, param2, src2 );
1309
1310 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1311
1312 /* Fill in the result: */
1313 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1314 for (i = 0 ; i < 4; i++) {
1315 if (mask & (1<<i)) {
1316 dst = get_dst_reg(c, inst, i);
1317 brw_MOV( p, dst, param0 );
1318 }
1319 }
1320 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1321 brw_set_saturate( p, 0 );
1322
1323 release_tmps( c, mark );
1324 }
1325
1326 /**
1327 * For the four-dimensional case, the little micro-optimisation benefits
1328 * we obtain by unrolling all the loops aren't worth the massive bloat it
1329 * now causes. Instead, we loop twice around performing a similar operation
1330 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
1331 * code to glue it all together.
1332 */
static void noise4_sub( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   struct brw_reg param[ 4 ],
      x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
      w0, /* noise for the w=0 cube */
      floors[ 2 ], /* integer coordinates of base corner of hypercube */
      interp[ 4 ], /* interpolation coefficients */
      t, tmp[ 8 ], /* float temporaries */
      itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
      wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
   int i, j;
   int mark = mark_tmps( c );
   GLuint loop, origin;

   x0y0 = alloc_tmp( c );
   x0y1 = alloc_tmp( c );
   x1y0 = alloc_tmp( c );
   x1y1 = alloc_tmp( c );
   t = alloc_tmp( c );
   w0 = alloc_tmp( c );
   floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
   floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );

   /* The caller (emit_noise4) passed the four input coordinates in the
      most recently allocated temporaries; locate them relative to our
      own mark.  The scalar result is handed back in param[ 0 ]. */
   for( i = 0; i < 4; i++ ) {
      param[ i ] = lookup_tmp( c, mark - 5 + i );
      interp[ i ] = alloc_tmp( c );
   }

   for( i = 0; i < 8; i++ ) {
      tmp[ i ] = alloc_tmp( c );
      itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
      wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
   }

   brw_set_access_mode( p, BRW_ALIGN_1 );

   /* We only want 16 bits of precision from the integral part of each
      co-ordinate, but unfortunately the RNDD semantics would saturate
      at 16 bits if we performed the operation directly to a 16-bit
      destination.  Therefore, we round to 32-bit temporaries where
      appropriate, and then store only the lower 16 bits. */
   brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
   brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
   brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
   brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
   brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
   brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );

   /* Modify the flag register here, because the side effect is useful
      later (see below).  We know for certain that all flags will be
      cleared, since the FRC instruction cannot possibly generate
      negative results.  Even for exceptional inputs (infinities, denormals,
      NaNs), the architecture guarantees that the L conditional is false. */
   brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
   brw_FRC( p, param[ 0 ], param[ 0 ] );
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   for( i = 1; i < 4; i++ )
      brw_FRC( p, param[ i ], param[ i ] );

   /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
      of all. */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
   for( i = 0; i < 4; i++ )
      brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
   /* ...and the remaining t^3 factor of each term. */
   for( j = 0; j < 3; j++ )
      for( i = 0; i < 4; i++ )
	 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );

   /* Mark the current address, as it will be a jump destination.  The
      following code will be executed twice: first, with the flag
      register clear indicating the w=0 case, and second with flags
      set for w=1. */
   loop = p->nr_insn;

   /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
      be hashed.  Since we have only 16 bits of precision in the hash, we
      must be careful about thorough mixing to maintain entropy as we
      squash the input vector into a small scalar. */
   brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );
   brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
	    brw_imm_uw( 0xD0BD ) );
   brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
	    brw_imm_uw( 0x9B93 ) );
   brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 0xA359 ) );
   brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
	    brw_imm_uw( 0xBC8F ) );

   /* Temporarily disable the execution mask while we work with ExecSize=16
      channels (the mask is set for ExecSize=8 and is probably incorrect).
      Although this might cause execution of unwanted channels, the code
      writes only to temporary registers and has no side effects, so
      disabling the mask is harmless. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
   brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
   brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

   /* We're now ready to perform the hashing.  The eight hashes are
      interleaved for performance.  The hash function used is
      designed to rapidly achieve avalanche and require only 16x16
      bit multiplication, and 8-bit swizzles (which we get for
      free). */
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   for( i = 0; i < 4; i++ )
      brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
   for( i = 0; i < 4; i++ )
      brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
	       odd_bytes( wtmp[ i ] ) );
   brw_pop_insn_state( p );

   /* Now we want to initialise the four rear gradients based on the
      hashes.  Format conversion from signed integer to float leaves
      everything scaled too high by a factor of pow( 2, 15 ), but
      we correct for that right at the end. */
   /* x component */
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
   brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

   /* Shift the hash words so that the next gradient component is taken
      from different bits (shift amount 4 here versus 5 in noise3_sub,
      since four components must be extracted from each hash). */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time.  The predicate selects between the
      two using the flag register set up before/at the bottom of the
      loop. */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Here we interpolate in the y dimension... */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

   /* Now do the same thing for the front four gradients... */
   /* x component */
   brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
   brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
   brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
   brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, x1y0, x1y0, t );
   brw_MUL( p, x1y1, x1y1, t );
   brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, x0y0, x0y0, param[ 0 ] );
   brw_MUL( p, x0y1, x0y1, param[ 0 ] );

   /* y component */
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );

   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

   /* z component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
   brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
   brw_pop_insn_state( p );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
   /* prepare t for the w component (used below): w the first time through
      the loop; w - 1 the second time. */
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
   p->current->header.predicate_inverse = 1;
   brw_MOV( p, t, param[ 3 ] );
   p->current->header.predicate_inverse = 0;
   brw_set_predicate_control( p, BRW_PREDICATE_NONE );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* w component */
   brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
   brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
   brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

   brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
   brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
   brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
   brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

   brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
   brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
   brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
   brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

   /* Interpolate in the y dimension: */
   brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
   brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
   brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
   brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
   brw_ADD( p, x0y0, x0y0, x0y1 );
   brw_ADD( p, x1y0, x1y0, x1y1 );

   /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
      time put the front face in tmp[ 1 ] and we're nearly there... */
   brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
   brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
   brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

   /* Another interpolation, in the z dimension: */
   brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
   brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

   /* Exit the loop if we've computed both cubes...  The branch offset is
      a placeholder (0) here; it is patched below via brw_set_src1 once
      the destination address is known. */
   origin = p->nr_insn;
   brw_push_insn_state( p );
   brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
   brw_pop_insn_state( p );

   /* Save the result for the w=0 case, and increment the w coordinate: */
   brw_MOV( p, w0, tmp[ 0 ] );
   brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
	    brw_imm_uw( 1 ) );

   /* Loop around for the other cube.  Explicitly set the flag register
      (unfortunately we must spend an extra instruction to do this: we
      can't rely on a side effect of the previous MOV or ADD because
      conditional modifiers which are normally true might be false in
      exceptional circumstances, e.g. given a NaN input; the add to
      brw_ip_reg() is not suitable because the IP is not an 8-vector).
      NOTE(review): jump distances are scaled by << 4, i.e. expressed in
      16-byte (one instruction) units — confirm against the EU ISA. */
   brw_push_insn_state( p );
   brw_set_mask_control( p, BRW_MASK_DISABLE );
   brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
   brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
	    brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
   brw_pop_insn_state( p );

   /* Patch the previous conditional branch now that we know the
      destination address. */
   brw_set_src1( p->store + origin,
		 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );

   /* The very last interpolation. */
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
   brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
   brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );

   /* scale by pow( 2, -15 ), as described above; the scaled result is
      returned to the caller in param[ 0 ] */
   brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

   release_tmps( c, mark );
}
1709
1710 static void emit_noise4( struct brw_wm_compile *c,
1711 const struct prog_instruction *inst )
1712 {
1713 struct brw_compile *p = &c->func;
1714 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
1715 GLuint mask = inst->DstReg.WriteMask;
1716 int i;
1717 int mark = mark_tmps( c );
1718
1719 assert( mark == 0 );
1720
1721 src0 = get_src_reg( c, inst, 0, 0 );
1722 src1 = get_src_reg( c, inst, 0, 1 );
1723 src2 = get_src_reg( c, inst, 0, 2 );
1724 src3 = get_src_reg( c, inst, 0, 3 );
1725
1726 param0 = alloc_tmp( c );
1727 param1 = alloc_tmp( c );
1728 param2 = alloc_tmp( c );
1729 param3 = alloc_tmp( c );
1730
1731 brw_MOV( p, param0, src0 );
1732 brw_MOV( p, param1, src1 );
1733 brw_MOV( p, param2, src2 );
1734 brw_MOV( p, param3, src3 );
1735
1736 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
1737
1738 /* Fill in the result: */
1739 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1740 for (i = 0 ; i < 4; i++) {
1741 if (mask & (1<<i)) {
1742 dst = get_dst_reg(c, inst, i);
1743 brw_MOV( p, dst, param0 );
1744 }
1745 }
1746 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1747 brw_set_saturate( p, 0 );
1748
1749 release_tmps( c, mark );
1750 }
1751
1752 /**
1753 * Resolve subroutine calls after code emit is done.
1754 */
1755 static void post_wm_emit( struct brw_wm_compile *c )
1756 {
1757 brw_resolve_cals(&c->func);
1758 }
1759
1760 static void
1761 get_argument_regs(struct brw_wm_compile *c,
1762 const struct prog_instruction *inst,
1763 int index,
1764 struct brw_reg *dst,
1765 struct brw_reg *regs,
1766 int mask)
1767 {
1768 struct brw_compile *p = &c->func;
1769 int i, j;
1770
1771 for (i = 0; i < 4; i++) {
1772 if (mask & (1 << i)) {
1773 regs[i] = get_src_reg(c, inst, index, i);
1774
1775 /* Unalias destination registers from our sources. */
1776 if (regs[i].file == BRW_GENERAL_REGISTER_FILE) {
1777 for (j = 0; j < 4; j++) {
1778 if (memcmp(&regs[i], &dst[j], sizeof(regs[0])) == 0) {
1779 struct brw_reg tmp = alloc_tmp(c);
1780 brw_MOV(p, tmp, regs[i]);
1781 regs[i] = tmp;
1782 break;
1783 }
1784 }
1785 }
1786 }
1787 }
1788 }
1789
1790 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
1791 {
1792 struct intel_context *intel = &brw->intel;
1793 #define MAX_IF_DEPTH 32
1794 #define MAX_LOOP_DEPTH 32
1795 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
1796 GLuint i, if_depth = 0, loop_depth = 0;
1797 struct brw_compile *p = &c->func;
1798 struct brw_indirect stack_index = brw_indirect(0, 0);
1799
1800 c->out_of_regs = GL_FALSE;
1801
1802 prealloc_reg(c);
1803 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1804 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1805
1806 for (i = 0; i < c->nr_fp_insns; i++) {
1807 const struct prog_instruction *inst = &c->prog_instructions[i];
1808 int dst_flags;
1809 struct brw_reg args[3][4], dst[4];
1810 int j;
1811 int mark = mark_tmps( c );
1812
1813 c->cur_inst = i;
1814
1815 #if 0
1816 printf("Inst %d: ", i);
1817 _mesa_print_instruction(inst);
1818 #endif
1819
1820 /* fetch any constants that this instruction needs */
1821 if (c->fp->use_const_buffer)
1822 fetch_constants(c, inst);
1823
1824 if (inst->Opcode != OPCODE_ARL) {
1825 for (j = 0; j < 4; j++) {
1826 if (inst->DstReg.WriteMask & (1 << j))
1827 dst[j] = get_dst_reg(c, inst, j);
1828 else
1829 dst[j] = brw_null_reg();
1830 }
1831 }
1832 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
1833 get_argument_regs(c, inst, j, dst, args[j], WRITEMASK_XYZW);
1834
1835 dst_flags = inst->DstReg.WriteMask;
1836 if (inst->SaturateMode == SATURATE_ZERO_ONE)
1837 dst_flags |= SATURATE;
1838
1839 if (inst->CondUpdate)
1840 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
1841 else
1842 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
1843
1844 switch (inst->Opcode) {
1845 case WM_PIXELXY:
1846 emit_pixel_xy(c, dst, dst_flags);
1847 break;
1848 case WM_DELTAXY:
1849 emit_delta_xy(p, dst, dst_flags, args[0]);
1850 break;
1851 case WM_PIXELW:
1852 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1853 break;
1854 case WM_LINTERP:
1855 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1856 break;
1857 case WM_PINTERP:
1858 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1859 break;
1860 case WM_CINTERP:
1861 emit_cinterp(p, dst, dst_flags, args[0]);
1862 break;
1863 case WM_WPOSXY:
1864 emit_wpos_xy(c, dst, dst_flags, args[0]);
1865 break;
1866 case WM_FB_WRITE:
1867 emit_fb_write(c, args[0], args[1], args[2],
1868 INST_AUX_GET_TARGET(inst->Aux),
1869 inst->Aux & INST_AUX_EOT);
1870 break;
1871 case WM_FRONTFACING:
1872 emit_frontfacing(p, dst, dst_flags);
1873 break;
1874 case OPCODE_ADD:
1875 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1876 break;
1877 case OPCODE_ARL:
1878 emit_arl(c, inst);
1879 break;
1880 case OPCODE_FRC:
1881 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1882 break;
1883 case OPCODE_FLR:
1884 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1885 break;
1886 case OPCODE_LRP:
1887 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1888 break;
1889 case OPCODE_TRUNC:
1890 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
1891 break;
1892 case OPCODE_MOV:
1893 case OPCODE_SWZ:
1894 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1895 break;
1896 case OPCODE_DP3:
1897 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1898 break;
1899 case OPCODE_DP4:
1900 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1901 break;
1902 case OPCODE_XPD:
1903 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1904 break;
1905 case OPCODE_DPH:
1906 emit_dph(p, dst, dst_flags, args[0], args[1]);
1907 break;
1908 case OPCODE_RCP:
1909 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1910 break;
1911 case OPCODE_RSQ:
1912 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1913 break;
1914 case OPCODE_SIN:
1915 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1916 break;
1917 case OPCODE_COS:
1918 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1919 break;
1920 case OPCODE_EX2:
1921 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1922 break;
1923 case OPCODE_LG2:
1924 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1925 break;
1926 case OPCODE_CMP:
1927 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1928 break;
1929 case OPCODE_MIN:
1930 emit_min(p, dst, dst_flags, args[0], args[1]);
1931 break;
1932 case OPCODE_MAX:
1933 emit_max(p, dst, dst_flags, args[0], args[1]);
1934 break;
1935 case OPCODE_DDX:
1936 case OPCODE_DDY:
1937 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
1938 args[0]);
1939 break;
1940 case OPCODE_SLT:
1941 emit_sop(p, dst, dst_flags,
1942 BRW_CONDITIONAL_L, args[0], args[1]);
1943 break;
1944 case OPCODE_SLE:
1945 emit_sop(p, dst, dst_flags,
1946 BRW_CONDITIONAL_LE, args[0], args[1]);
1947 break;
1948 case OPCODE_SGT:
1949 emit_sop(p, dst, dst_flags,
1950 BRW_CONDITIONAL_G, args[0], args[1]);
1951 break;
1952 case OPCODE_SGE:
1953 emit_sop(p, dst, dst_flags,
1954 BRW_CONDITIONAL_GE, args[0], args[1]);
1955 break;
1956 case OPCODE_SEQ:
1957 emit_sop(p, dst, dst_flags,
1958 BRW_CONDITIONAL_EQ, args[0], args[1]);
1959 break;
1960 case OPCODE_SNE:
1961 emit_sop(p, dst, dst_flags,
1962 BRW_CONDITIONAL_NEQ, args[0], args[1]);
1963 break;
1964 case OPCODE_MUL:
1965 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1966 break;
1967 case OPCODE_POW:
1968 emit_math2(c, BRW_MATH_FUNCTION_POW,
1969 dst, dst_flags, args[0], args[1]);
1970 break;
1971 case OPCODE_MAD:
1972 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1973 break;
1974 case OPCODE_NOISE1:
1975 emit_noise1(c, inst);
1976 break;
1977 case OPCODE_NOISE2:
1978 emit_noise2(c, inst);
1979 break;
1980 case OPCODE_NOISE3:
1981 emit_noise3(c, inst);
1982 break;
1983 case OPCODE_NOISE4:
1984 emit_noise4(c, inst);
1985 break;
1986 case OPCODE_TEX:
1987 emit_tex(c, dst, dst_flags, args[0],
1988 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1989 0, 1, 0, 0),
1990 inst->TexSrcTarget,
1991 inst->TexSrcUnit,
1992 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
1993 break;
1994 case OPCODE_TXB:
1995 emit_txb(c, dst, dst_flags, args[0],
1996 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
1997 0, 1, 0, 0),
1998 inst->TexSrcTarget,
1999 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
2000 break;
2001 case OPCODE_KIL_NV:
2002 emit_kil(c);
2003 break;
2004 case OPCODE_IF:
2005 assert(if_depth < MAX_IF_DEPTH);
2006 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2007 break;
2008 case OPCODE_ELSE:
2009 assert(if_depth > 0);
2010 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
2011 break;
2012 case OPCODE_ENDIF:
2013 assert(if_depth > 0);
2014 brw_ENDIF(p, if_inst[--if_depth]);
2015 break;
2016 case OPCODE_BGNSUB:
2017 brw_save_label(p, inst->Comment, p->nr_insn);
2018 break;
2019 case OPCODE_ENDSUB:
2020 /* no-op */
2021 break;
2022 case OPCODE_CAL:
2023 brw_push_insn_state(p);
2024 brw_set_mask_control(p, BRW_MASK_DISABLE);
2025 brw_set_access_mode(p, BRW_ALIGN_1);
2026 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2027 brw_set_access_mode(p, BRW_ALIGN_16);
2028 brw_ADD(p, get_addr_reg(stack_index),
2029 get_addr_reg(stack_index), brw_imm_d(4));
2030 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2031 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2032 brw_pop_insn_state(p);
2033 break;
2034
2035 case OPCODE_RET:
2036 brw_push_insn_state(p);
2037 brw_set_mask_control(p, BRW_MASK_DISABLE);
2038 brw_ADD(p, get_addr_reg(stack_index),
2039 get_addr_reg(stack_index), brw_imm_d(-4));
2040 brw_set_access_mode(p, BRW_ALIGN_1);
2041 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2042 brw_set_access_mode(p, BRW_ALIGN_16);
2043 brw_pop_insn_state(p);
2044
2045 break;
2046 case OPCODE_BGNLOOP:
2047 /* XXX may need to invalidate the current_constant regs */
2048 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2049 break;
2050 case OPCODE_BRK:
2051 brw_BREAK(p);
2052 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2053 break;
2054 case OPCODE_CONT:
2055 brw_CONT(p);
2056 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2057 break;
2058 case OPCODE_ENDLOOP:
2059 {
2060 struct brw_instruction *inst0, *inst1;
2061 GLuint br = 1;
2062
2063 if (intel->is_ironlake)
2064 br = 2;
2065
2066 assert(loop_depth > 0);
2067 loop_depth--;
2068 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2069 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2070 while (inst0 > loop_inst[loop_depth]) {
2071 inst0--;
2072 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2073 inst0->bits3.if_else.jump_count == 0) {
2074 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2075 inst0->bits3.if_else.pop_count = 0;
2076 }
2077 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2078 inst0->bits3.if_else.jump_count == 0) {
2079 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2080 inst0->bits3.if_else.pop_count = 0;
2081 }
2082 }
2083 }
2084 break;
2085 default:
2086 printf("unsupported opcode %d (%s) in fragment shader\n",
2087 inst->Opcode, inst->Opcode < MAX_OPCODE ?
2088 _mesa_opcode_string(inst->Opcode) : "unknown");
2089 }
2090
2091 /* Release temporaries containing any unaliased source regs. */
2092 release_tmps( c, mark );
2093
2094 if (inst->CondUpdate)
2095 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2096 else
2097 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2098 }
2099 post_wm_emit(c);
2100
2101 if (INTEL_DEBUG & DEBUG_WM) {
2102 printf("wm-native:\n");
2103 for (i = 0; i < p->nr_insn; i++)
2104 brw_disasm(stderr, &p->store[i]);
2105 printf("\n");
2106 }
2107 }
2108
2109 /**
2110 * Do GPU code generation for shaders that use GLSL features such as
2111 * flow control. Other shaders will be compiled with the
2112 */
2113 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2114 {
2115 if (INTEL_DEBUG & DEBUG_WM) {
2116 printf("brw_wm_glsl_emit:\n");
2117 }
2118
2119 /* initial instruction translation/simplification */
2120 brw_wm_pass_fp(c);
2121
2122 /* actual code generation */
2123 brw_wm_emit_glsl(brw, c);
2124
2125 if (INTEL_DEBUG & DEBUG_WM) {
2126 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2127 }
2128
2129 c->prog_data.total_grf = num_grf_used(c);
2130 c->prog_data.total_scratch = 0;
2131 }