2df47344d5545a00402687b869eccffcfa90a754
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/** Slots for the reusable instruction sequences emitted via invoke_subroutine(). */
enum _subroutine {
   SUB_NOISE1,
   SUB_NOISE2,
   SUB_NOISE3,
   SUB_NOISE4
};
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 for (i = 0; i < fp->Base.NumInstructions; i++) {
27 const struct prog_instruction *inst = &fp->Base.Instructions[i];
28 switch (inst->Opcode) {
29 case OPCODE_ARL:
30 case OPCODE_IF:
31 case OPCODE_ENDIF:
32 case OPCODE_CAL:
33 case OPCODE_BRK:
34 case OPCODE_RET:
35 case OPCODE_NOISE1:
36 case OPCODE_NOISE2:
37 case OPCODE_NOISE3:
38 case OPCODE_NOISE4:
39 case OPCODE_BGNLOOP:
40 return GL_TRUE;
41 default:
42 break;
43 }
44 }
45 return GL_FALSE;
46 }
47
48
49
50 static void
51 reclaim_temps(struct brw_wm_compile *c);
52
53
54 /** Mark GRF register as used. */
55 static void
56 prealloc_grf(struct brw_wm_compile *c, int r)
57 {
58 c->used_grf[r] = GL_TRUE;
59 }
60
61
62 /** Mark given GRF register as not in use. */
63 static void
64 release_grf(struct brw_wm_compile *c, int r)
65 {
66 /*assert(c->used_grf[r]);*/
67 c->used_grf[r] = GL_FALSE;
68 c->first_free_grf = MIN2(c->first_free_grf, r);
69 }
70
71
72 /** Return index of a free GRF, mark it as used. */
73 static int
74 alloc_grf(struct brw_wm_compile *c)
75 {
76 GLuint r;
77 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
78 if (!c->used_grf[r]) {
79 c->used_grf[r] = GL_TRUE;
80 c->first_free_grf = r + 1; /* a guess */
81 return r;
82 }
83 }
84
85 /* no free temps, try to reclaim some */
86 reclaim_temps(c);
87 c->first_free_grf = 0;
88
89 /* try alloc again */
90 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
91 if (!c->used_grf[r]) {
92 c->used_grf[r] = GL_TRUE;
93 c->first_free_grf = r + 1; /* a guess */
94 return r;
95 }
96 }
97
98 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
99 assert(c->used_grf[r]);
100 }
101
102 /* really, no free GRF regs found */
103 if (!c->out_of_regs) {
104 /* print warning once per compilation */
105 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
106 c->out_of_regs = GL_TRUE;
107 }
108
109 return -1;
110 }
111
112
113 /** Return number of GRF registers used */
114 static int
115 num_grf_used(const struct brw_wm_compile *c)
116 {
117 int r;
118 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
119 if (c->used_grf[r])
120 return r + 1;
121 return 0;
122 }
123
124
125
126 /**
127 * Record the mapping of a Mesa register to a hardware register.
128 */
129 static void set_reg(struct brw_wm_compile *c, int file, int index,
130 int component, struct brw_reg reg)
131 {
132 c->wm_regs[file][index][component].reg = reg;
133 c->wm_regs[file][index][component].inited = GL_TRUE;
134 }
135
136 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
137 {
138 struct brw_reg reg;
139
140 /* if we need to allocate another temp, grow the tmp_regs[] array */
141 if (c->tmp_index == c->tmp_max) {
142 int r = alloc_grf(c);
143 if (r < 0) {
144 /*printf("Out of temps in %s\n", __FUNCTION__);*/
145 r = 50; /* XXX random register! */
146 }
147 c->tmp_regs[ c->tmp_max++ ] = r;
148 }
149
150 /* form the GRF register */
151 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
152 /*printf("alloc_temp %d\n", reg.nr);*/
153 assert(reg.nr < BRW_WM_MAX_GRF);
154 return reg;
155
156 }
157
158 /**
159 * Save current temp register info.
160 * There must be a matching call to release_tmps().
161 */
162 static int mark_tmps(struct brw_wm_compile *c)
163 {
164 return c->tmp_index;
165 }
166
167 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
168 {
169 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
170 }
171
172 static void release_tmps(struct brw_wm_compile *c, int mark)
173 {
174 c->tmp_index = mark;
175 }
176
177 /**
178 * Convert Mesa src register to brw register.
179 *
180 * Since we're running in SOA mode each Mesa register corresponds to four
181 * hardware registers. We allocate the hardware registers as needed here.
182 *
183 * \param file register file, one of PROGRAM_x
184 * \param index register number
185 * \param component src component (X=0, Y=1, Z=2, W=3)
186 * \param nr not used?!?
187 * \param neg negate value?
188 * \param abs take absolute value?
189 */
190 static struct brw_reg
191 get_reg(struct brw_wm_compile *c, int file, int index, int component,
192 int nr, GLuint neg, GLuint abs)
193 {
194 struct brw_reg reg;
195 switch (file) {
196 case PROGRAM_STATE_VAR:
197 case PROGRAM_CONSTANT:
198 case PROGRAM_UNIFORM:
199 file = PROGRAM_STATE_VAR;
200 break;
201 case PROGRAM_UNDEFINED:
202 return brw_null_reg();
203 case PROGRAM_TEMPORARY:
204 case PROGRAM_INPUT:
205 case PROGRAM_OUTPUT:
206 case PROGRAM_PAYLOAD:
207 break;
208 default:
209 _mesa_problem(NULL, "Unexpected file in get_reg()");
210 return brw_null_reg();
211 }
212
213 assert(index < 256);
214 assert(component < 4);
215
216 /* see if we've already allocated a HW register for this Mesa register */
217 if (c->wm_regs[file][index][component].inited) {
218 /* yes, re-use */
219 reg = c->wm_regs[file][index][component].reg;
220 }
221 else {
222 /* no, allocate new register */
223 int grf = alloc_grf(c);
224 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
225 if (grf < 0) {
226 /* totally out of temps */
227 grf = 51; /* XXX random register! */
228 }
229
230 reg = brw_vec8_grf(grf, 0);
231 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
232
233 set_reg(c, file, index, component, reg);
234 }
235
236 if (neg & (1 << component)) {
237 reg = negate(reg);
238 }
239 if (abs)
240 reg = brw_abs(reg);
241 return reg;
242 }
243
244
245
246 /**
247 * This is called if we run out of GRF registers. Examine the live intervals
248 * of temp regs in the program and free those which won't be used again.
249 */
250 static void
251 reclaim_temps(struct brw_wm_compile *c)
252 {
253 GLint intBegin[MAX_PROGRAM_TEMPS];
254 GLint intEnd[MAX_PROGRAM_TEMPS];
255 int index;
256
257 /*printf("Reclaim temps:\n");*/
258
259 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
260 intBegin, intEnd);
261
262 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
263 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
264 /* program temp[i] can be freed */
265 int component;
266 /*printf(" temp[%d] is dead\n", index);*/
267 for (component = 0; component < 4; component++) {
268 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
269 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
270 release_grf(c, r);
271 /*
272 printf(" Reclaim temp %d, reg %d at inst %d\n",
273 index, r, c->cur_inst);
274 */
275 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
276 }
277 }
278 }
279 }
280 }
281
282
283
284
285 /**
286 * Preallocate registers. This sets up the Mesa to hardware register
287 * mapping for certain registers, such as constants (uniforms/state vars)
288 * and shader inputs.
289 */
290 static void prealloc_reg(struct brw_wm_compile *c)
291 {
292 int i, j;
293 struct brw_reg reg;
294 int urb_read_length = 0;
295 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
296 GLuint reg_index = 0;
297
298 memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
299 c->first_free_grf = 0;
300
301 for (i = 0; i < 4; i++) {
302 if (i < c->key.nr_depth_regs)
303 reg = brw_vec8_grf(i * 2, 0);
304 else
305 reg = brw_vec8_grf(0, 0);
306 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
307 }
308 reg_index += 2 * c->key.nr_depth_regs;
309
310 /* constants */
311 {
312 const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
313 const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
314
315 /* use a real constant buffer, or just use a section of the GRF? */
316 /* XXX this heuristic may need adjustment... */
317 if ((nr_params + nr_temps) * 4 + reg_index > 80)
318 c->fp->use_const_buffer = GL_TRUE;
319 else
320 c->fp->use_const_buffer = GL_FALSE;
321 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
322
323 if (c->fp->use_const_buffer) {
324 /* We'll use a real constant buffer and fetch constants from
325 * it with a dataport read message.
326 */
327
328 /* number of float constants in CURBE */
329 c->prog_data.nr_params = 0;
330 }
331 else {
332 const struct gl_program_parameter_list *plist =
333 c->fp->program.Base.Parameters;
334 int index = 0;
335
336 /* number of float constants in CURBE */
337 c->prog_data.nr_params = 4 * nr_params;
338
339 /* loop over program constants (float[4]) */
340 for (i = 0; i < nr_params; i++) {
341 /* loop over XYZW channels */
342 for (j = 0; j < 4; j++, index++) {
343 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
344 /* Save pointer to parameter/constant value.
345 * Constants will be copied in prepare_constant_buffer()
346 */
347 c->prog_data.param[index] = &plist->ParameterValues[i][j];
348 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
349 }
350 }
351 /* number of constant regs used (each reg is float[8]) */
352 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
353 reg_index += c->nr_creg;
354 }
355 }
356
357 /* fragment shader inputs */
358 for (i = 0; i < VERT_RESULT_MAX; i++) {
359 int fp_input;
360
361 if (i >= VERT_RESULT_VAR0)
362 fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
363 else if (i <= VERT_RESULT_TEX7)
364 fp_input = i;
365 else
366 fp_input = -1;
367
368 if (fp_input >= 0 && inputs & (1 << fp_input)) {
369 urb_read_length = reg_index;
370 reg = brw_vec8_grf(reg_index, 0);
371 for (j = 0; j < 4; j++)
372 set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
373 }
374 if (c->key.vp_outputs_written & (1 << i)) {
375 reg_index += 2;
376 }
377 }
378
379 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
380 c->prog_data.urb_read_length = urb_read_length;
381 c->prog_data.curb_read_length = c->nr_creg;
382 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
383 reg_index++;
384 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
385 reg_index += 2;
386
387 /* mark GRF regs [0..reg_index-1] as in-use */
388 for (i = 0; i < reg_index; i++)
389 prealloc_grf(c, i);
390
391 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
392 prealloc_grf(c, 126);
393 prealloc_grf(c, 127);
394
395 for (i = 0; i < c->nr_fp_insns; i++) {
396 const struct prog_instruction *inst = &c->prog_instructions[i];
397 struct brw_reg dst[4];
398
399 switch (inst->Opcode) {
400 case OPCODE_TEX:
401 case OPCODE_TXB:
402 /* Allocate the channels of texture results contiguously,
403 * since they are written out that way by the sampler unit.
404 */
405 for (j = 0; j < 4; j++) {
406 dst[j] = get_dst_reg(c, inst, j);
407 if (j != 0)
408 assert(dst[j].nr == dst[j - 1].nr + 1);
409 }
410 break;
411 default:
412 break;
413 }
414 }
415
416 /* An instruction may reference up to three constants.
417 * They'll be found in these registers.
418 * XXX alloc these on demand!
419 */
420 if (c->fp->use_const_buffer) {
421 for (i = 0; i < 3; i++) {
422 c->current_const[i].index = -1;
423 c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
424 }
425 }
426 #if 0
427 printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
428 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
429 #endif
430 }
431
432
433 /**
434 * Check if any of the instruction's src registers are constants, uniforms,
435 * or statevars. If so, fetch any constants that we don't already have in
436 * the three GRF slots.
437 */
438 static void fetch_constants(struct brw_wm_compile *c,
439 const struct prog_instruction *inst)
440 {
441 struct brw_compile *p = &c->func;
442 GLuint i;
443
444 /* loop over instruction src regs */
445 for (i = 0; i < 3; i++) {
446 const struct prog_src_register *src = &inst->SrcReg[i];
447 if (src->File == PROGRAM_STATE_VAR ||
448 src->File == PROGRAM_CONSTANT ||
449 src->File == PROGRAM_UNIFORM) {
450 c->current_const[i].index = src->Index;
451
452 #if 0
453 printf(" fetch const[%d] for arg %d into reg %d\n",
454 src->Index, i, c->current_const[i].reg.nr);
455 #endif
456
457 /* need to fetch the constant now */
458 brw_dp_READ_4(p,
459 c->current_const[i].reg, /* writeback dest */
460 src->RelAddr, /* relative indexing? */
461 16 * src->Index, /* byte offset */
462 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
463 );
464 }
465 }
466 }
467
468
469 /**
470 * Convert Mesa dst register to brw register.
471 */
472 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
473 const struct prog_instruction *inst,
474 GLuint component)
475 {
476 const int nr = 1;
477 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
478 0, 0);
479 }
480
481
482 static struct brw_reg
483 get_src_reg_const(struct brw_wm_compile *c,
484 const struct prog_instruction *inst,
485 GLuint srcRegIndex, GLuint component)
486 {
487 /* We should have already fetched the constant from the constant
488 * buffer in fetch_constants(). Now we just have to return a
489 * register description that extracts the needed component and
490 * smears it across all eight vector components.
491 */
492 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
493 struct brw_reg const_reg;
494
495 assert(component < 4);
496 assert(srcRegIndex < 3);
497 assert(c->current_const[srcRegIndex].index != -1);
498 const_reg = c->current_const[srcRegIndex].reg;
499
500 /* extract desired float from the const_reg, and smear */
501 const_reg = stride(const_reg, 0, 1, 0);
502 const_reg.subnr = component * 4;
503
504 if (src->Negate & (1 << component))
505 const_reg = negate(const_reg);
506 if (src->Abs)
507 const_reg = brw_abs(const_reg);
508
509 #if 0
510 printf(" form const[%d].%d for arg %d, reg %d\n",
511 c->current_const[srcRegIndex].index,
512 component,
513 srcRegIndex,
514 const_reg.nr);
515 #endif
516
517 return const_reg;
518 }
519
520
521 /**
522 * Convert Mesa src register to brw register.
523 */
524 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
525 const struct prog_instruction *inst,
526 GLuint srcRegIndex, GLuint channel)
527 {
528 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
529 const GLuint nr = 1;
530 const GLuint component = GET_SWZ(src->Swizzle, channel);
531
532 /* Extended swizzle terms */
533 if (component == SWIZZLE_ZERO) {
534 return brw_imm_f(0.0F);
535 }
536 else if (component == SWIZZLE_ONE) {
537 return brw_imm_f(1.0F);
538 }
539
540 if (c->fp->use_const_buffer &&
541 (src->File == PROGRAM_STATE_VAR ||
542 src->File == PROGRAM_CONSTANT ||
543 src->File == PROGRAM_UNIFORM)) {
544 return get_src_reg_const(c, inst, srcRegIndex, component);
545 }
546 else {
547 /* other type of source register */
548 return get_reg(c, src->File, src->Index, component, nr,
549 src->Negate, src->Abs);
550 }
551 }
552
553
554 /**
555 * Same as \sa get_src_reg() but if the register is a literal, emit
556 * a brw_reg encoding the literal.
557 * Note that a brw instruction only allows one src operand to be a literal.
558 * For instructions with more than one operand, only the second can be a
559 * literal. This means that we treat some literals as constants/uniforms
560 * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
561 *
562 */
563 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
564 const struct prog_instruction *inst,
565 GLuint srcRegIndex, GLuint channel)
566 {
567 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
568 if (src->File == PROGRAM_CONSTANT) {
569 /* a literal */
570 const int component = GET_SWZ(src->Swizzle, channel);
571 const GLfloat *param =
572 c->fp->program.Base.Parameters->ParameterValues[src->Index];
573 GLfloat value = param[component];
574 if (src->Negate & (1 << channel))
575 value = -value;
576 if (src->Abs)
577 value = FABSF(value);
578 #if 0
579 printf(" form immed value %f for chan %d\n", value, channel);
580 #endif
581 return brw_imm_f(value);
582 }
583 else {
584 return get_src_reg(c, inst, srcRegIndex, channel);
585 }
586 }
587
588
589 /**
590 * Subroutines are minimal support for resusable instruction sequences.
591 * They are implemented as simply as possible to minimise overhead: there
592 * is no explicit support for communication between the caller and callee
593 * other than saving the return address in a temporary register, nor is
594 * there any automatic local storage. This implies that great care is
595 * required before attempting reentrancy or any kind of nested
596 * subroutine invocations.
597 */
598 static void invoke_subroutine( struct brw_wm_compile *c,
599 enum _subroutine subroutine,
600 void (*emit)( struct brw_wm_compile * ) )
601 {
602 struct brw_compile *p = &c->func;
603
604 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
605
606 if( c->subroutines[ subroutine ] ) {
607 /* subroutine previously emitted: reuse existing instructions */
608
609 int mark = mark_tmps( c );
610 struct brw_reg return_address = retype( alloc_tmp( c ),
611 BRW_REGISTER_TYPE_UD );
612 int here = p->nr_insn;
613
614 brw_push_insn_state(p);
615 brw_set_mask_control(p, BRW_MASK_DISABLE);
616 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
617
618 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
619 brw_imm_d( ( c->subroutines[ subroutine ] -
620 here - 1 ) << 4 ) );
621 brw_pop_insn_state(p);
622
623 release_tmps( c, mark );
624 } else {
625 /* previously unused subroutine: emit, and mark for later reuse */
626
627 int mark = mark_tmps( c );
628 struct brw_reg return_address = retype( alloc_tmp( c ),
629 BRW_REGISTER_TYPE_UD );
630 struct brw_instruction *calc;
631 int base = p->nr_insn;
632
633 brw_push_insn_state(p);
634 brw_set_mask_control(p, BRW_MASK_DISABLE);
635 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
636 brw_pop_insn_state(p);
637
638 c->subroutines[ subroutine ] = p->nr_insn;
639
640 emit( c );
641
642 brw_push_insn_state(p);
643 brw_set_mask_control(p, BRW_MASK_DISABLE);
644 brw_MOV( p, brw_ip_reg(), return_address );
645 brw_pop_insn_state(p);
646
647 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
648
649 release_tmps( c, mark );
650 }
651 }
652
653 /* Workaround for using brw_wm_emit.c's emit functions, which expect
654 * destination regs to be uniquely written. Moves arguments out to
655 * temporaries as necessary for instructions which use their destination as
656 * a temporary.
657 */
658 static void
659 unalias3(struct brw_wm_compile *c,
660 void (*func)(struct brw_compile *c,
661 const struct brw_reg *dst,
662 GLuint mask,
663 const struct brw_reg *arg0,
664 const struct brw_reg *arg1,
665 const struct brw_reg *arg2),
666 const struct brw_reg *dst,
667 GLuint mask,
668 const struct brw_reg *arg0,
669 const struct brw_reg *arg1,
670 const struct brw_reg *arg2)
671 {
672 struct brw_compile *p = &c->func;
673 struct brw_reg tmp_arg0[4], tmp_arg1[4], tmp_arg2[4];
674 int i, j;
675 int mark = mark_tmps(c);
676
677 for (j = 0; j < 4; j++) {
678 tmp_arg0[j] = arg0[j];
679 tmp_arg1[j] = arg1[j];
680 tmp_arg2[j] = arg2[j];
681 }
682
683 for (i = 0; i < 4; i++) {
684 if (mask & (1<<i)) {
685 for (j = 0; j < 4; j++) {
686 if (arg0[j].file == dst[i].file &&
687 dst[i].nr == arg0[j].nr) {
688 tmp_arg0[j] = alloc_tmp(c);
689 brw_MOV(p, tmp_arg0[j], arg0[j]);
690 }
691 if (arg1[j].file == dst[i].file &&
692 dst[i].nr == arg1[j].nr) {
693 tmp_arg1[j] = alloc_tmp(c);
694 brw_MOV(p, tmp_arg1[j], arg1[j]);
695 }
696 if (arg2[j].file == dst[i].file &&
697 dst[i].nr == arg2[j].nr) {
698 tmp_arg2[j] = alloc_tmp(c);
699 brw_MOV(p, tmp_arg2[j], arg2[j]);
700 }
701 }
702 }
703 }
704
705 func(p, dst, mask, tmp_arg0, tmp_arg1, tmp_arg2);
706
707 release_tmps(c, mark);
708 }
709
710 static void emit_pixel_xy(struct brw_wm_compile *c,
711 const struct prog_instruction *inst)
712 {
713 struct brw_reg r1 = brw_vec1_grf(1, 0);
714 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
715
716 struct brw_reg dst0, dst1;
717 struct brw_compile *p = &c->func;
718 GLuint mask = inst->DstReg.WriteMask;
719
720 dst0 = get_dst_reg(c, inst, 0);
721 dst1 = get_dst_reg(c, inst, 1);
722 /* Calculate pixel centers by adding 1 or 0 to each of the
723 * micro-tile coordinates passed in r1.
724 */
725 if (mask & WRITEMASK_X) {
726 brw_ADD(p,
727 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
728 stride(suboffset(r1_uw, 4), 2, 4, 0),
729 brw_imm_v(0x10101010));
730 }
731
732 if (mask & WRITEMASK_Y) {
733 brw_ADD(p,
734 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
735 stride(suboffset(r1_uw, 5), 2, 4, 0),
736 brw_imm_v(0x11001100));
737 }
738 }
739
740 static void emit_delta_xy(struct brw_wm_compile *c,
741 const struct prog_instruction *inst)
742 {
743 struct brw_reg r1 = brw_vec1_grf(1, 0);
744 struct brw_reg dst0, dst1, src0, src1;
745 struct brw_compile *p = &c->func;
746 GLuint mask = inst->DstReg.WriteMask;
747
748 dst0 = get_dst_reg(c, inst, 0);
749 dst1 = get_dst_reg(c, inst, 1);
750 src0 = get_src_reg(c, inst, 0, 0);
751 src1 = get_src_reg(c, inst, 0, 1);
752 /* Calc delta X,Y by subtracting origin in r1 from the pixel
753 * centers.
754 */
755 if (mask & WRITEMASK_X) {
756 brw_ADD(p,
757 dst0,
758 retype(src0, BRW_REGISTER_TYPE_UW),
759 negate(r1));
760 }
761
762 if (mask & WRITEMASK_Y) {
763 brw_ADD(p,
764 dst1,
765 retype(src1, BRW_REGISTER_TYPE_UW),
766 negate(suboffset(r1,1)));
767
768 }
769 }
770
771 static void fire_fb_write( struct brw_wm_compile *c,
772 GLuint base_reg,
773 GLuint nr,
774 GLuint target,
775 GLuint eot)
776 {
777 struct brw_compile *p = &c->func;
778 /* Pass through control information:
779 */
780 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
781 {
782 brw_push_insn_state(p);
783 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
784 brw_MOV(p,
785 brw_message_reg(base_reg + 1),
786 brw_vec8_grf(1, 0));
787 brw_pop_insn_state(p);
788 }
789 /* Send framebuffer write message: */
790 brw_fb_WRITE(p,
791 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
792 base_reg,
793 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
794 target,
795 nr,
796 0,
797 eot);
798 }
799
800 static void emit_fb_write(struct brw_wm_compile *c,
801 const struct prog_instruction *inst)
802 {
803 struct brw_compile *p = &c->func;
804 int nr = 2;
805 int channel;
806 GLuint target, eot;
807 struct brw_reg src0;
808
809 /* Reserve a space for AA - may not be needed:
810 */
811 if (c->key.aa_dest_stencil_reg)
812 nr += 1;
813
814 brw_push_insn_state(p);
815 for (channel = 0; channel < 4; channel++) {
816 src0 = get_src_reg(c, inst, 0, channel);
817 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
818 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
819 brw_MOV(p, brw_message_reg(nr + channel), src0);
820 }
821 /* skip over the regs populated above: */
822 nr += 8;
823 brw_pop_insn_state(p);
824
825 if (c->key.source_depth_to_render_target) {
826 if (c->key.computes_depth) {
827 src0 = get_src_reg(c, inst, 2, 2);
828 brw_MOV(p, brw_message_reg(nr), src0);
829 }
830 else {
831 src0 = get_src_reg(c, inst, 1, 1);
832 brw_MOV(p, brw_message_reg(nr), src0);
833 }
834
835 nr += 2;
836 }
837
838 if (c->key.dest_depth_reg) {
839 const GLuint comp = c->key.dest_depth_reg / 2;
840 const GLuint off = c->key.dest_depth_reg % 2;
841
842 if (off != 0) {
843 /* XXX this code needs review/testing */
844 struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
845 struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
846
847 brw_push_insn_state(p);
848 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
849
850 brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
851 /* 2nd half? */
852 brw_MOV(p, brw_message_reg(nr+1), arg1_1);
853 brw_pop_insn_state(p);
854 }
855 else
856 {
857 struct brw_reg src = get_src_reg(c, inst, 1, 1);
858 brw_MOV(p, brw_message_reg(nr), src);
859 }
860 nr += 2;
861 }
862
863 target = INST_AUX_GET_TARGET(inst->Aux);
864 eot = inst->Aux & INST_AUX_EOT;
865 fire_fb_write(c, 0, nr, target, eot);
866 }
867
868 static void emit_pixel_w( struct brw_wm_compile *c,
869 const struct prog_instruction *inst)
870 {
871 struct brw_compile *p = &c->func;
872 GLuint mask = inst->DstReg.WriteMask;
873 if (mask & WRITEMASK_W) {
874 struct brw_reg dst, src0, delta0, delta1;
875 struct brw_reg interp3;
876
877 dst = get_dst_reg(c, inst, 3);
878 src0 = get_src_reg(c, inst, 0, 0);
879 delta0 = get_src_reg(c, inst, 1, 0);
880 delta1 = get_src_reg(c, inst, 1, 1);
881
882 interp3 = brw_vec1_grf(src0.nr+1, 4);
883 /* Calc 1/w - just linterp wpos[3] optimized by putting the
884 * result straight into a message reg.
885 */
886 brw_LINE(p, brw_null_reg(), interp3, delta0);
887 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
888
889 /* Calc w */
890 brw_math_16( p, dst,
891 BRW_MATH_FUNCTION_INV,
892 BRW_MATH_SATURATE_NONE,
893 2, brw_null_reg(),
894 BRW_MATH_PRECISION_FULL);
895 }
896 }
897
898 static void emit_linterp(struct brw_wm_compile *c,
899 const struct prog_instruction *inst)
900 {
901 struct brw_compile *p = &c->func;
902 GLuint mask = inst->DstReg.WriteMask;
903 struct brw_reg interp[4];
904 struct brw_reg dst, delta0, delta1;
905 struct brw_reg src0;
906 GLuint nr, i;
907
908 src0 = get_src_reg(c, inst, 0, 0);
909 delta0 = get_src_reg(c, inst, 1, 0);
910 delta1 = get_src_reg(c, inst, 1, 1);
911 nr = src0.nr;
912
913 interp[0] = brw_vec1_grf(nr, 0);
914 interp[1] = brw_vec1_grf(nr, 4);
915 interp[2] = brw_vec1_grf(nr+1, 0);
916 interp[3] = brw_vec1_grf(nr+1, 4);
917
918 for(i = 0; i < 4; i++ ) {
919 if (mask & (1<<i)) {
920 dst = get_dst_reg(c, inst, i);
921 brw_LINE(p, brw_null_reg(), interp[i], delta0);
922 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
923 }
924 }
925 }
926
927 static void emit_cinterp(struct brw_wm_compile *c,
928 const struct prog_instruction *inst)
929 {
930 struct brw_compile *p = &c->func;
931 GLuint mask = inst->DstReg.WriteMask;
932
933 struct brw_reg interp[4];
934 struct brw_reg dst, src0;
935 GLuint nr, i;
936
937 src0 = get_src_reg(c, inst, 0, 0);
938 nr = src0.nr;
939
940 interp[0] = brw_vec1_grf(nr, 0);
941 interp[1] = brw_vec1_grf(nr, 4);
942 interp[2] = brw_vec1_grf(nr+1, 0);
943 interp[3] = brw_vec1_grf(nr+1, 4);
944
945 for(i = 0; i < 4; i++ ) {
946 if (mask & (1<<i)) {
947 dst = get_dst_reg(c, inst, i);
948 brw_MOV(p, dst, suboffset(interp[i],3));
949 }
950 }
951 }
952
953 static void emit_pinterp(struct brw_wm_compile *c,
954 const struct prog_instruction *inst)
955 {
956 struct brw_compile *p = &c->func;
957 GLuint mask = inst->DstReg.WriteMask;
958
959 struct brw_reg interp[4];
960 struct brw_reg dst, delta0, delta1;
961 struct brw_reg src0, w;
962 GLuint nr, i;
963
964 src0 = get_src_reg(c, inst, 0, 0);
965 delta0 = get_src_reg(c, inst, 1, 0);
966 delta1 = get_src_reg(c, inst, 1, 1);
967 w = get_src_reg(c, inst, 2, 3);
968 nr = src0.nr;
969
970 interp[0] = brw_vec1_grf(nr, 0);
971 interp[1] = brw_vec1_grf(nr, 4);
972 interp[2] = brw_vec1_grf(nr+1, 0);
973 interp[3] = brw_vec1_grf(nr+1, 4);
974
975 for(i = 0; i < 4; i++ ) {
976 if (mask & (1<<i)) {
977 dst = get_dst_reg(c, inst, i);
978 brw_LINE(p, brw_null_reg(), interp[i], delta0);
979 brw_MAC(p, dst, suboffset(interp[i],1),
980 delta1);
981 brw_MUL(p, dst, dst, w);
982 }
983 }
984 }
985
986 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
987 static void emit_frontfacing(struct brw_wm_compile *c,
988 const struct prog_instruction *inst)
989 {
990 struct brw_compile *p = &c->func;
991 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
992 struct brw_reg dst;
993 GLuint mask = inst->DstReg.WriteMask;
994 int i;
995
996 for (i = 0; i < 4; i++) {
997 if (mask & (1<<i)) {
998 dst = get_dst_reg(c, inst, i);
999 brw_MOV(p, dst, brw_imm_f(0.0));
1000 }
1001 }
1002
1003 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1004 * us front face
1005 */
1006 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
1007 for (i = 0; i < 4; i++) {
1008 if (mask & (1<<i)) {
1009 dst = get_dst_reg(c, inst, i);
1010 brw_MOV(p, dst, brw_imm_f(1.0));
1011 }
1012 }
1013 brw_set_predicate_control_flag_value(p, 0xff);
1014 }
1015
1016 static void emit_xpd(struct brw_wm_compile *c,
1017 const struct prog_instruction *inst)
1018 {
1019 int i;
1020 struct brw_compile *p = &c->func;
1021 GLuint mask = inst->DstReg.WriteMask;
1022 for (i = 0; i < 4; i++) {
1023 GLuint i2 = (i+2)%3;
1024 GLuint i1 = (i+1)%3;
1025 if (mask & (1<<i)) {
1026 struct brw_reg src0, src1, dst;
1027 dst = get_dst_reg(c, inst, i);
1028 src0 = negate(get_src_reg(c, inst, 0, i2));
1029 src1 = get_src_reg_imm(c, inst, 1, i1);
1030 brw_MUL(p, brw_null_reg(), src0, src1);
1031 src0 = get_src_reg(c, inst, 0, i1);
1032 src1 = get_src_reg_imm(c, inst, 1, i2);
1033 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1034 brw_MAC(p, dst, src0, src1);
1035 brw_set_saturate(p, 0);
1036 }
1037 }
1038 brw_set_saturate(p, 0);
1039 }
1040
1041 /**
1042 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1043 * Note that the result of the function is smeared across the dest
1044 * register's X, Y, Z and W channels (subject to writemasking of course).
1045 */
1046 static void emit_math1(struct brw_wm_compile *c,
1047 const struct prog_instruction *inst, GLuint func)
1048 {
1049 struct brw_compile *p = &c->func;
1050 struct brw_reg src0, dst;
1051 GLuint mask = inst->DstReg.WriteMask;
1052 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1053
1054 if (!(mask & WRITEMASK_XYZW))
1055 return;
1056
1057 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1058
1059 /* Get first component of source register */
1060 dst = get_dst_reg(c, inst, dst_chan);
1061 src0 = get_src_reg(c, inst, 0, 0);
1062
1063 brw_MOV(p, brw_message_reg(2), src0);
1064 brw_math(p,
1065 dst,
1066 func,
1067 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1068 2,
1069 brw_null_reg(),
1070 BRW_MATH_DATA_VECTOR,
1071 BRW_MATH_PRECISION_FULL);
1072 }
1073
1074 static void emit_rcp(struct brw_wm_compile *c,
1075 const struct prog_instruction *inst)
1076 {
1077 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1078 }
1079
1080 static void emit_rsq(struct brw_wm_compile *c,
1081 const struct prog_instruction *inst)
1082 {
1083 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1084 }
1085
1086 static void emit_sin(struct brw_wm_compile *c,
1087 const struct prog_instruction *inst)
1088 {
1089 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1090 }
1091
1092 static void emit_cos(struct brw_wm_compile *c,
1093 const struct prog_instruction *inst)
1094 {
1095 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1096 }
1097
1098 static void emit_ex2(struct brw_wm_compile *c,
1099 const struct prog_instruction *inst)
1100 {
1101 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1102 }
1103
1104 static void emit_lg2(struct brw_wm_compile *c,
1105 const struct prog_instruction *inst)
1106 {
1107 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1108 }
1109
1110 static void emit_arl(struct brw_wm_compile *c,
1111 const struct prog_instruction *inst)
1112 {
1113 struct brw_compile *p = &c->func;
1114 struct brw_reg src0, addr_reg;
1115 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1116 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1117 BRW_ARF_ADDRESS, 0);
1118 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1119 brw_MOV(p, addr_reg, src0);
1120 brw_set_saturate(p, 0);
1121 }
1122
1123
1124 static void emit_min_max(struct brw_wm_compile *c,
1125 const struct prog_instruction *inst)
1126 {
1127 struct brw_compile *p = &c->func;
1128 const GLuint mask = inst->DstReg.WriteMask;
1129 const int mark = mark_tmps(c);
1130 int i;
1131 brw_push_insn_state(p);
1132 for (i = 0; i < 4; i++) {
1133 if (mask & (1<<i)) {
1134 struct brw_reg real_dst = get_dst_reg(c, inst, i);
1135 struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1136 struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1137 struct brw_reg dst;
1138 /* if dst==src0 or dst==src1 we need to use a temp reg */
1139 GLboolean use_temp = brw_same_reg(dst, src0) ||
1140 brw_same_reg(dst, src1);
1141 if (use_temp)
1142 dst = alloc_tmp(c);
1143 else
1144 dst = real_dst;
1145
1146 /*
1147 printf(" Min/max: dst %d src0 %d src1 %d\n",
1148 dst.nr, src0.nr, src1.nr);
1149 */
1150 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1151 brw_MOV(p, dst, src0);
1152 brw_set_saturate(p, 0);
1153
1154 if (inst->Opcode == OPCODE_MIN)
1155 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1156 else
1157 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1158
1159 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1160 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1161 brw_MOV(p, dst, src1);
1162 brw_set_saturate(p, 0);
1163 brw_set_predicate_control_flag_value(p, 0xff);
1164 if (use_temp)
1165 brw_MOV(p, real_dst, dst);
1166 }
1167 }
1168 brw_pop_insn_state(p);
1169 release_tmps(c, mark);
1170 }
1171
1172 static void emit_pow(struct brw_wm_compile *c,
1173 const struct prog_instruction *inst)
1174 {
1175 struct brw_compile *p = &c->func;
1176 struct brw_reg dst, src0, src1;
1177 GLuint mask = inst->DstReg.WriteMask;
1178 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1179
1180 if (!(mask & WRITEMASK_XYZW))
1181 return;
1182
1183 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1184
1185 dst = get_dst_reg(c, inst, dst_chan);
1186 src0 = get_src_reg_imm(c, inst, 0, 0);
1187 src1 = get_src_reg_imm(c, inst, 1, 0);
1188
1189 brw_MOV(p, brw_message_reg(2), src0);
1190 brw_MOV(p, brw_message_reg(3), src1);
1191
1192 brw_math(p,
1193 dst,
1194 BRW_MATH_FUNCTION_POW,
1195 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1196 2,
1197 brw_null_reg(),
1198 BRW_MATH_DATA_VECTOR,
1199 BRW_MATH_PRECISION_FULL);
1200 }
1201
1202 /**
1203 * For GLSL shaders, this KIL will be unconditional.
1204 * It may be contained inside an IF/ENDIF structure of course.
1205 */
1206 static void emit_kil(struct brw_wm_compile *c)
1207 {
1208 struct brw_compile *p = &c->func;
1209 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1210 brw_push_insn_state(p);
1211 brw_set_mask_control(p, BRW_MASK_DISABLE);
1212 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1213 brw_AND(p, depth, c->emit_mask_reg, depth);
1214 brw_pop_insn_state(p);
1215 }
1216
1217 static INLINE struct brw_reg high_words( struct brw_reg reg )
1218 {
1219 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1220 0, 8, 2 );
1221 }
1222
1223 static INLINE struct brw_reg low_words( struct brw_reg reg )
1224 {
1225 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1226 }
1227
1228 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1229 {
1230 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1231 }
1232
1233 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1234 {
1235 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1236 0, 16, 2 );
1237 }
1238
/* One-, two- and three-dimensional Perlin noise, similar to the description
   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */

/* Body of the one-dimensional noise subroutine (emit_noise1() passes this
   function to invoke_subroutine()).

   The single coordinate is read from the temporary looked up at
   "mark - 2", and the result is written back to that same temporary,
   which is where emit_noise1() reads it after the call.  All other
   temporaries allocated here are released before returning.

   NOTE(review): the access mode is switched to BRW_ALIGN_1 and not
   restored within this function. */
static void noise1_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param,
        x0, x1, /* gradients at each end */
        t, tmp[ 2 ], /* float temporaries */
        itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0 = alloc_tmp( c );
    x1 = alloc_tmp( c );
    t = alloc_tmp( c );
    tmp[ 0 ] = alloc_tmp( c );
    tmp[ 1 ] = alloc_tmp( c );
    /* itmp[ 0..1 ] alias tmp[ 0..1 ]; itmp[ 2..4 ] alias x0, x1 and t, which
       are free to hold the hash multipliers until the gradients are set up. */
    itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
    itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
    itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
    itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
    itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );

    /* Presumably the temporary the caller allocated for the argument before
       invoking the subroutine -- confirm against lookup_tmp() and
       invoke_subroutine(). */
    param = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

    /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
       be hashed.  Also compute the remainder (offset within the unit
       length), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
    brw_FRC( p, param, param );
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
    brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

    /* We're now ready to perform the hashing.  The two hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free).  We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    for( i = 0; i < 2; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );

    /* Now we want to initialise the two gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 31 ), but
       we correct for that right at the end. */
    brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
    brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

    /* Each gradient is weighted by the offset from its own end point:
       param for x0, and t (= param - 1) for x1. */
    brw_MUL( p, x0, x0, param );
    brw_MUL( p, x1, x1, t );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
                                           pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_MUL( p, param, tmp[ 0 ], param );
    /* x0 + ( x1 - x0 ) * coefficient */
    brw_MUL( p, x1, x1, param );
    brw_ADD( p, x0, x0, x1 );
    /* scale by pow( 2, -30 ), to compensate for the format conversion
       above and an extra factor of 2 so that a single gradient covers
       the [-1,1] range */
    brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

    release_tmps( c, mark );
}
1331
1332 static void emit_noise1( struct brw_wm_compile *c,
1333 const struct prog_instruction *inst )
1334 {
1335 struct brw_compile *p = &c->func;
1336 struct brw_reg src, param, dst;
1337 GLuint mask = inst->DstReg.WriteMask;
1338 int i;
1339 int mark = mark_tmps( c );
1340
1341 assert( mark == 0 );
1342
1343 src = get_src_reg( c, inst, 0, 0 );
1344
1345 param = alloc_tmp( c );
1346
1347 brw_MOV( p, param, src );
1348
1349 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1350
1351 /* Fill in the result: */
1352 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1353 for (i = 0 ; i < 4; i++) {
1354 if (mask & (1<<i)) {
1355 dst = get_dst_reg(c, inst, i);
1356 brw_MOV( p, dst, param );
1357 }
1358 }
1359 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1360 brw_set_saturate( p, 0 );
1361
1362 release_tmps( c, mark );
1363 }
1364
/* Body of the two-dimensional noise subroutine (emit_noise2() passes this
   function to invoke_subroutine()).

   The two coordinates are read from the temporaries looked up at
   "mark - 3" and "mark - 2"; the result is written to the first of them,
   which is where emit_noise2() reads it after the call.  All other
   temporaries allocated here are released before returning.

   NOTE(review): the access mode is switched to BRW_ALIGN_1 and not
   restored within this function. */
static void noise2_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1,
        x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
        t, tmp[ 4 ], /* float temporaries */
        itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    t = alloc_tmp( c );
    for( i = 0; i < 4; i++ ) {
        tmp[ i ] = alloc_tmp( c );
        itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
    }
    /* itmp[ 4..6 ] alias three of the gradient registers, which are free to
       hold the hash multipliers until the gradients are set up. */
    itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
    itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
    itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );

    /* Presumably the temporaries the caller allocated for the arguments
       before invoking the subroutine -- confirm against lookup_tmp() and
       invoke_subroutine(). */
    param0 = lookup_tmp( c, mark - 3 );
    param1 = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Also compute the remainders (offsets within the unit
       square), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
    /* Fold the low word of the second coordinate's floor into the high
       word of itmp[ 0 ], so one dword carries both coordinates. */
    brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
             low_words( itmp[ 1 ] ) );
    brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
    /* The other three corners: +1 in the high word, +1 in the low word,
       and +1 in both. */
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
    brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
    brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

    /* We're now ready to perform the hashing.  The four hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free).  We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    for( i = 0; i < 4; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );

    /* Now we want to initialise the four gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    /* The low word of each hash supplies the first-coordinate component of
       the gradient... */
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );

    /* ...and the high word the second-coordinate component, converted to
       float in place. */
    brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );

    /* t is param0 - 1 for the first two multiplies, then becomes
       param1 - 1 for the second-coordinate terms below. */
    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
    brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
    brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );

    /* Sum the two components of each corner's contribution. */
    brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
                                                 pipeline */
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
                                                 pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    /* From here on param0/param1 hold the interpolation coefficients
       rather than the fractional coordinates. */
    brw_MUL( p, param0, tmp[ 0 ], param0 );
    brw_MUL( p, param1, tmp[ 1 ], param1 );

    /* Here we interpolate in the y dimension... */
    brw_MUL( p, x0y1, x0y1, param1 );
    brw_MUL( p, x1y1, x1y1, param1 );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  There are horrible register dependencies here,
       but we have nothing else to do. */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, param0 );
    brw_ADD( p, x0y0, x0y0, x1y0 );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
1501
1502 static void emit_noise2( struct brw_wm_compile *c,
1503 const struct prog_instruction *inst )
1504 {
1505 struct brw_compile *p = &c->func;
1506 struct brw_reg src0, src1, param0, param1, dst;
1507 GLuint mask = inst->DstReg.WriteMask;
1508 int i;
1509 int mark = mark_tmps( c );
1510
1511 assert( mark == 0 );
1512
1513 src0 = get_src_reg( c, inst, 0, 0 );
1514 src1 = get_src_reg( c, inst, 0, 1 );
1515
1516 param0 = alloc_tmp( c );
1517 param1 = alloc_tmp( c );
1518
1519 brw_MOV( p, param0, src0 );
1520 brw_MOV( p, param1, src1 );
1521
1522 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1523
1524 /* Fill in the result: */
1525 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1526 for (i = 0 ; i < 4; i++) {
1527 if (mask & (1<<i)) {
1528 dst = get_dst_reg(c, inst, i);
1529 brw_MOV( p, dst, param0 );
1530 }
1531 }
1532 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1533 brw_set_saturate( p, 0 );
1534
1535 release_tmps( c, mark );
1536 }
1537
/**
 * The three-dimensional case is much like the one- and two- versions above,
 * but since the number of corners is rapidly growing we now pack 16 16-bit
 * hashes into each register to extract more parallelism from the EUs.
 *
 * This is the body of the three-dimensional noise subroutine (emit_noise3()
 * passes this function to invoke_subroutine()).  The three coordinates are
 * read from the temporaries looked up at "mark - 4", "mark - 3" and
 * "mark - 2"; the result is written to the first of them, which is where
 * emit_noise3() reads it after the call.  All other temporaries allocated
 * here are released before returning.
 *
 * NOTE(review): the access mode is switched to BRW_ALIGN_1 and not restored
 * within this function.
 */
static void noise3_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1, param2,
        x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
        xi, yi, zi, /* interpolation coefficients */
        t, tmp[ 8 ], /* float temporaries */
        itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
        wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    xi = alloc_tmp( c );
    yi = alloc_tmp( c );
    zi = alloc_tmp( c );
    t = alloc_tmp( c );
    for( i = 0; i < 8; i++ ) {
        tmp[ i ] = alloc_tmp( c );
        itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
        wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
    }

    /* Presumably the temporaries the caller allocated for the arguments
       before invoking the subroutine -- confirm against lookup_tmp() and
       invoke_subroutine(). */
    param0 = lookup_tmp( c, mark - 4 );
    param1 = lookup_tmp( c, mark - 3 );
    param2 = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Also compute the remainders (offsets within the unit
       cube), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_FRC( p, param2, param2 );
    /* Since we now have only 16 bits of precision in the hash, we must
       be more careful about thorough mixing to maintain entropy as we
       squash the input vector into a small scalar. */
    brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
    brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
    brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
             brw_imm_uw( 0x9B93 ) );
    /* The high word gets the low word plus the first coordinate's
       multiplier (0xBC8F). */
    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
             brw_imm_uw( 0xBC8F ) );

    /* Temporarily disable the execution mask while we work with ExecSize=16
       channels (the mask is set for ExecSize=8 and is probably incorrect).
       Although this might cause execution of unwanted channels, the code
       writes only to temporary registers and has no side effects, so
       disabling the mask is harmless. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    /* wtmp[ 1..3 ] are wtmp[ 0 ] offset by the second coordinate's
       multiplier (0xD0BD), the third's (0x9B93), and both. */
    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

    /* We're now ready to perform the hashing.  The eight hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 16x16
       bit multiplication, and 8-bit swizzles (which we get for
       free). */
    for( i = 0; i < 4; i++ )
        brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
                 odd_bytes( wtmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
                 odd_bytes( wtmp[ i ] ) );
    brw_pop_insn_state( p );

    /* Now we want to initialise the four rear gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    /* x component */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

    /* Shift the hashes left by 5 bits so the next component is taken from
       different hash bits. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* t is reset to param0 - 1 here; it is next consumed by the x
       component of the front gradients below. */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    /* The rear face uses param2 itself as the z offset. */
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
    brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
    brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
    brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
    brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
    brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
    brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
    /* These two differences are consumed by the y interpolation below. */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );

    /* Here we interpolate in the y dimension... */
    brw_MUL( p, x0y1, x0y1, yi );
    brw_MUL( p, x1y1, x1y1, yi );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, xi );
    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

    /* Now do the same thing for the front four gradients... */
    /* x component */
    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    /* t still holds param0 - 1 from the rear-face pass above. */
    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* The front face uses param2 - 1 as the z offset. */
    brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* The interpolation coefficients are still around from last time, so
       again interpolate in the y dimension... */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, yi );
    brw_MUL( p, x1y1, x1y1, yi );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
       time put the front face in tmp[ 1 ] and we're nearly there... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, xi );
    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

    /* The final interpolation, in the z dimension: */
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
1804
1805 static void emit_noise3( struct brw_wm_compile *c,
1806 const struct prog_instruction *inst )
1807 {
1808 struct brw_compile *p = &c->func;
1809 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1810 GLuint mask = inst->DstReg.WriteMask;
1811 int i;
1812 int mark = mark_tmps( c );
1813
1814 assert( mark == 0 );
1815
1816 src0 = get_src_reg( c, inst, 0, 0 );
1817 src1 = get_src_reg( c, inst, 0, 1 );
1818 src2 = get_src_reg( c, inst, 0, 2 );
1819
1820 param0 = alloc_tmp( c );
1821 param1 = alloc_tmp( c );
1822 param2 = alloc_tmp( c );
1823
1824 brw_MOV( p, param0, src0 );
1825 brw_MOV( p, param1, src1 );
1826 brw_MOV( p, param2, src2 );
1827
1828 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1829
1830 /* Fill in the result: */
1831 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1832 for (i = 0 ; i < 4; i++) {
1833 if (mask & (1<<i)) {
1834 dst = get_dst_reg(c, inst, i);
1835 brw_MOV( p, dst, param0 );
1836 }
1837 }
1838 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1839 brw_set_saturate( p, 0 );
1840
1841 release_tmps( c, mark );
1842 }
1843
1844 /**
1845 * For the four-dimensional case, the little micro-optimisation benefits
1846 * we obtain by unrolling all the loops aren't worth the massive bloat it
1847 * now causes. Instead, we loop twice around performing a similar operation
1848 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
1849 * code to glue it all together.
1850 */
1851 static void noise4_sub( struct brw_wm_compile *c )
1852 {
1853 struct brw_compile *p = &c->func;
1854 struct brw_reg param[ 4 ],
1855 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1856 w0, /* noise for the w=0 cube */
1857 floors[ 2 ], /* integer coordinates of base corner of hypercube */
1858 interp[ 4 ], /* interpolation coefficients */
1859 t, tmp[ 8 ], /* float temporaries */
1860 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1861 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1862 int i, j;
1863 int mark = mark_tmps( c );
1864 GLuint loop, origin;
1865
1866 x0y0 = alloc_tmp( c );
1867 x0y1 = alloc_tmp( c );
1868 x1y0 = alloc_tmp( c );
1869 x1y1 = alloc_tmp( c );
1870 t = alloc_tmp( c );
1871 w0 = alloc_tmp( c );
1872 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1873 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1874
1875 for( i = 0; i < 4; i++ ) {
1876 param[ i ] = lookup_tmp( c, mark - 5 + i );
1877 interp[ i ] = alloc_tmp( c );
1878 }
1879
1880 for( i = 0; i < 8; i++ ) {
1881 tmp[ i ] = alloc_tmp( c );
1882 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1883 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1884 }
1885
1886 brw_set_access_mode( p, BRW_ALIGN_1 );
1887
1888 /* We only want 16 bits of precision from the integral part of each
1889 co-ordinate, but unfortunately the RNDD semantics would saturate
1890 at 16 bits if we performed the operation directly to a 16-bit
1891 destination. Therefore, we round to 32-bit temporaries where
1892 appropriate, and then store only the lower 16 bits. */
1893 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
1894 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
1895 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
1896 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
1897 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
1898 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
1899
1900 /* Modify the flag register here, because the side effect is useful
1901 later (see below). We know for certain that all flags will be
1902 cleared, since the FRC instruction cannot possibly generate
1903 negative results. Even for exceptional inputs (infinities, denormals,
1904 NaNs), the architecture guarantees that the L conditional is false. */
1905 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
1906 brw_FRC( p, param[ 0 ], param[ 0 ] );
1907 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1908 for( i = 1; i < 4; i++ )
1909 brw_FRC( p, param[ i ], param[ i ] );
1910
1911 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
1912 of all. */
1913 for( i = 0; i < 4; i++ )
1914 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
1915 for( i = 0; i < 4; i++ )
1916 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
1917 for( i = 0; i < 4; i++ )
1918 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1919 for( i = 0; i < 4; i++ )
1920 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
1921 for( j = 0; j < 3; j++ )
1922 for( i = 0; i < 4; i++ )
1923 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1924
1925 /* Mark the current address, as it will be a jump destination. The
1926 following code will be executed twice: first, with the flag
1927 register clear indicating the w=0 case, and second with flags
1928 set for w=1. */
1929 loop = p->nr_insn;
1930
1931 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1932 be hashed. Since we have only 16 bits of precision in the hash, we
1933 must be careful about thorough mixing to maintain entropy as we
1934 squash the input vector into a small scalar. */
1935 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
1936 brw_imm_uw( 0xBC8F ) );
1937 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
1938 brw_imm_uw( 0xD0BD ) );
1939 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
1940 brw_imm_uw( 0x9B93 ) );
1941 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
1942 brw_imm_uw( 0xA359 ) );
1943 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1944 brw_imm_uw( 0xBC8F ) );
1945
1946 /* Temporarily disable the execution mask while we work with ExecSize=16
1947 channels (the mask is set for ExecSize=8 and is probably incorrect).
1948 Although this might cause execution of unwanted channels, the code
1949 writes only to temporary registers and has no side effects, so
1950 disabling the mask is harmless. */
1951 brw_push_insn_state( p );
1952 brw_set_mask_control( p, BRW_MASK_DISABLE );
1953 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1954 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1955 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1956
1957 /* We're now ready to perform the hashing. The eight hashes are
1958 interleaved for performance. The hash function used is
1959 designed to rapidly achieve avalanche and require only 16x16
1960 bit multiplication, and 8-bit swizzles (which we get for
1961 free). */
1962 for( i = 0; i < 4; i++ )
1963 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1964 for( i = 0; i < 4; i++ )
1965 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1966 odd_bytes( wtmp[ i ] ) );
1967 for( i = 0; i < 4; i++ )
1968 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1969 for( i = 0; i < 4; i++ )
1970 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1971 odd_bytes( wtmp[ i ] ) );
1972 brw_pop_insn_state( p );
1973
1974 /* Now we want to initialise the four rear gradients based on the
1975 hashes. Format conversion from signed integer to float leaves
1976 everything scaled too high by a factor of pow( 2, 15 ), but
1977 we correct for that right at the end. */
1978 /* x component */
1979 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1980 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1981 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1982 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1983 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1984
1985 brw_push_insn_state( p );
1986 brw_set_mask_control( p, BRW_MASK_DISABLE );
1987 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1988 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1989 brw_pop_insn_state( p );
1990
1991 brw_MUL( p, x1y0, x1y0, t );
1992 brw_MUL( p, x1y1, x1y1, t );
1993 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1994 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1995 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1996
1997 /* y component */
1998 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1999 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2000 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2001 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2002
2003 brw_push_insn_state( p );
2004 brw_set_mask_control( p, BRW_MASK_DISABLE );
2005 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2006 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2007 brw_pop_insn_state( p );
2008
2009 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2010 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2011 /* prepare t for the w component (used below): w the first time through
2012 the loop; w - 1 the second time) */
2013 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2014 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2015 p->current->header.predicate_inverse = 1;
2016 brw_MOV( p, t, param[ 3 ] );
2017 p->current->header.predicate_inverse = 0;
2018 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2019 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2020 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2021
2022 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2023 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2024 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2025 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2026
2027 /* z component */
2028 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2029 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2030 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2031 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2032
2033 brw_push_insn_state( p );
2034 brw_set_mask_control( p, BRW_MASK_DISABLE );
2035 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2036 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2037 brw_pop_insn_state( p );
2038
2039 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2040 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2041 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2042 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2043
2044 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2045 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2046 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2047 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2048
2049 /* w component */
2050 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2051 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2052 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2053 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2054
2055 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2056 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2057 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2058 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2059 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2060
2061 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2062 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2063 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2064 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2065
2066 /* Here we interpolate in the y dimension... */
2067 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2068 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2069 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2070 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2071 brw_ADD( p, x0y0, x0y0, x0y1 );
2072 brw_ADD( p, x1y0, x1y0, x1y1 );
2073
2074 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2075 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2076 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2077 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2078
2079 /* Now do the same thing for the front four gradients... */
2080 /* x component */
2081 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2082 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2083 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2084 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2085
2086 brw_push_insn_state( p );
2087 brw_set_mask_control( p, BRW_MASK_DISABLE );
2088 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2089 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2090 brw_pop_insn_state( p );
2091
2092 brw_MUL( p, x1y0, x1y0, t );
2093 brw_MUL( p, x1y1, x1y1, t );
2094 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2095 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2096 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2097
2098 /* y component */
2099 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2100 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2101 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2102 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2103
2104 brw_push_insn_state( p );
2105 brw_set_mask_control( p, BRW_MASK_DISABLE );
2106 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2107 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2108 brw_pop_insn_state( p );
2109
2110 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2111 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2112 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2113 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2114 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2115
2116 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2117 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2118 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2119 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2120
2121 /* z component */
2122 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2123 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2124 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2125 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2126
2127 brw_push_insn_state( p );
2128 brw_set_mask_control( p, BRW_MASK_DISABLE );
2129 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2130 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2131 brw_pop_insn_state( p );
2132
2133 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2134 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2135 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2136 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2137 /* prepare t for the w component (used below): w the first time through
2138 the loop; w - 1 the second time) */
2139 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2140 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2141 p->current->header.predicate_inverse = 1;
2142 brw_MOV( p, t, param[ 3 ] );
2143 p->current->header.predicate_inverse = 0;
2144 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2145
2146 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2147 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2148 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2149 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2150
2151 /* w component */
2152 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2153 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2154 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2155 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2156
2157 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2158 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2159 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2160 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2161
2162 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2163 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2164 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2165 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2166
2167 /* Interpolate in the y dimension: */
2168 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2169 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2170 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2171 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2172 brw_ADD( p, x0y0, x0y0, x0y1 );
2173 brw_ADD( p, x1y0, x1y0, x1y1 );
2174
2175 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2176 time put the front face in tmp[ 1 ] and we're nearly there... */
2177 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2178 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2179 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2180
2181 /* Another interpolation, in the z dimension: */
2182 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2183 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2184 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2185
2186 /* Exit the loop if we've computed both cubes... */
2187 origin = p->nr_insn;
2188 brw_push_insn_state( p );
2189 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2190 brw_set_mask_control( p, BRW_MASK_DISABLE );
2191 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2192 brw_pop_insn_state( p );
2193
2194 /* Save the result for the w=0 case, and increment the w coordinate: */
2195 brw_MOV( p, w0, tmp[ 0 ] );
2196 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2197 brw_imm_uw( 1 ) );
2198
2199 /* Loop around for the other cube. Explicitly set the flag register
2200 (unfortunately we must spend an extra instruction to do this: we
2201 can't rely on a side effect of the previous MOV or ADD because
2202 conditional modifiers which are normally true might be false in
2203 exceptional circumstances, e.g. given a NaN input; the add to
2204 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2205 brw_push_insn_state( p );
2206 brw_set_mask_control( p, BRW_MASK_DISABLE );
2207 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2208 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2209 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2210 brw_pop_insn_state( p );
2211
2212 /* Patch the previous conditional branch now that we know the
2213 destination address. */
2214 brw_set_src1( p->store + origin,
2215 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2216
2217 /* The very last interpolation. */
2218 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2219 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2220 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2221
2222 /* scale by pow( 2, -15 ), as described above */
2223 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2224
2225 release_tmps( c, mark );
2226 }
2227
2228 static void emit_noise4( struct brw_wm_compile *c,
2229 const struct prog_instruction *inst )
2230 {
2231 struct brw_compile *p = &c->func;
2232 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2233 GLuint mask = inst->DstReg.WriteMask;
2234 int i;
2235 int mark = mark_tmps( c );
2236
2237 assert( mark == 0 );
2238
2239 src0 = get_src_reg( c, inst, 0, 0 );
2240 src1 = get_src_reg( c, inst, 0, 1 );
2241 src2 = get_src_reg( c, inst, 0, 2 );
2242 src3 = get_src_reg( c, inst, 0, 3 );
2243
2244 param0 = alloc_tmp( c );
2245 param1 = alloc_tmp( c );
2246 param2 = alloc_tmp( c );
2247 param3 = alloc_tmp( c );
2248
2249 brw_MOV( p, param0, src0 );
2250 brw_MOV( p, param1, src1 );
2251 brw_MOV( p, param2, src2 );
2252 brw_MOV( p, param3, src3 );
2253
2254 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2255
2256 /* Fill in the result: */
2257 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2258 for (i = 0 ; i < 4; i++) {
2259 if (mask & (1<<i)) {
2260 dst = get_dst_reg(c, inst, i);
2261 brw_MOV( p, dst, param0 );
2262 }
2263 }
2264 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2265 brw_set_saturate( p, 0 );
2266
2267 release_tmps( c, mark );
2268 }
2269
2270 static void emit_wpos_xy(struct brw_wm_compile *c,
2271 const struct prog_instruction *inst)
2272 {
2273 struct brw_compile *p = &c->func;
2274 GLuint mask = inst->DstReg.WriteMask;
2275 struct brw_reg src0[2], dst[2];
2276
2277 dst[0] = get_dst_reg(c, inst, 0);
2278 dst[1] = get_dst_reg(c, inst, 1);
2279
2280 src0[0] = get_src_reg(c, inst, 0, 0);
2281 src0[1] = get_src_reg(c, inst, 0, 1);
2282
2283 /* Calculate the pixel offset from window bottom left into destination
2284 * X and Y channels.
2285 */
2286 if (mask & WRITEMASK_X) {
2287 /* X' = X - origin_x */
2288 brw_ADD(p,
2289 dst[0],
2290 retype(src0[0], BRW_REGISTER_TYPE_W),
2291 brw_imm_d(0 - c->key.origin_x));
2292 }
2293
2294 if (mask & WRITEMASK_Y) {
2295 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2296 brw_ADD(p,
2297 dst[1],
2298 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2299 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2300 }
2301 }
2302
2303 /* TODO
2304 BIAS on SIMD8 not working yet...
2305 */
2306 static void emit_txb(struct brw_wm_compile *c,
2307 const struct prog_instruction *inst)
2308 {
2309 struct brw_compile *p = &c->func;
2310 struct brw_reg dst[4], src[4], payload_reg;
2311 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2312 const GLuint unit = inst->TexSrcUnit;
2313 GLuint i;
2314 GLuint msg_type;
2315
2316 assert(unit < BRW_MAX_TEX_UNIT);
2317
2318 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2319
2320 for (i = 0; i < 4; i++)
2321 dst[i] = get_dst_reg(c, inst, i);
2322 for (i = 0; i < 4; i++)
2323 src[i] = get_src_reg(c, inst, 0, i);
2324
2325 switch (inst->TexSrcTarget) {
2326 case TEXTURE_1D_INDEX:
2327 brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
2328 brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
2329 brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
2330 break;
2331 case TEXTURE_2D_INDEX:
2332 case TEXTURE_RECT_INDEX:
2333 brw_MOV(p, brw_message_reg(2), src[0]);
2334 brw_MOV(p, brw_message_reg(3), src[1]);
2335 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2336 break;
2337 case TEXTURE_3D_INDEX:
2338 case TEXTURE_CUBE_INDEX:
2339 brw_MOV(p, brw_message_reg(2), src[0]);
2340 brw_MOV(p, brw_message_reg(3), src[1]);
2341 brw_MOV(p, brw_message_reg(4), src[2]);
2342 break;
2343 default:
2344 /* invalid target */
2345 abort();
2346 }
2347 brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
2348 brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
2349
2350 if (BRW_IS_IGDNG(p->brw)) {
2351 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
2352 } else {
2353 /* Does it work well on SIMD8? */
2354 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2355 }
2356
2357 brw_SAMPLE(p,
2358 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2359 1, /* msg_reg_nr */
2360 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2361 SURF_INDEX_TEXTURE(unit),
2362 unit, /* sampler */
2363 inst->DstReg.WriteMask, /* writemask */
2364 msg_type, /* msg_type */
2365 4, /* response_length */
2366 4, /* msg_length */
2367 0, /* eot */
2368 1,
2369 BRW_SAMPLER_SIMD_MODE_SIMD8);
2370 }
2371
2372
2373 static void emit_tex(struct brw_wm_compile *c,
2374 const struct prog_instruction *inst)
2375 {
2376 struct brw_compile *p = &c->func;
2377 struct brw_reg dst[4], src[4], payload_reg;
2378 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2379 const GLuint unit = inst->TexSrcUnit;
2380 GLuint msg_len;
2381 GLuint i, nr;
2382 GLuint emit;
2383 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2384 GLuint msg_type;
2385
2386 assert(unit < BRW_MAX_TEX_UNIT);
2387
2388 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2389
2390 for (i = 0; i < 4; i++)
2391 dst[i] = get_dst_reg(c, inst, i);
2392 for (i = 0; i < 4; i++)
2393 src[i] = get_src_reg(c, inst, 0, i);
2394
2395 switch (inst->TexSrcTarget) {
2396 case TEXTURE_1D_INDEX:
2397 emit = WRITEMASK_X;
2398 nr = 1;
2399 break;
2400 case TEXTURE_2D_INDEX:
2401 case TEXTURE_RECT_INDEX:
2402 emit = WRITEMASK_XY;
2403 nr = 2;
2404 break;
2405 case TEXTURE_3D_INDEX:
2406 case TEXTURE_CUBE_INDEX:
2407 emit = WRITEMASK_XYZ;
2408 nr = 3;
2409 break;
2410 default:
2411 /* invalid target */
2412 abort();
2413 }
2414 msg_len = 1;
2415
2416 /* move/load S, T, R coords */
2417 for (i = 0; i < nr; i++) {
2418 static const GLuint swz[4] = {0,1,2,2};
2419 if (emit & (1<<i))
2420 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2421 else
2422 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2423 msg_len += 1;
2424 }
2425
2426 if (shadow) {
2427 brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2428 brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
2429 }
2430
2431 if (BRW_IS_IGDNG(p->brw)) {
2432 if (shadow)
2433 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
2434 else
2435 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
2436 } else {
2437 /* Does it work for shadow on SIMD8 ? */
2438 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2439 }
2440
2441 brw_SAMPLE(p,
2442 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2443 1, /* msg_reg_nr */
2444 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2445 SURF_INDEX_TEXTURE(unit),
2446 unit, /* sampler */
2447 inst->DstReg.WriteMask, /* writemask */
2448 msg_type, /* msg_type */
2449 4, /* response_length */
2450 shadow ? 6 : 4, /* msg_length */
2451 0, /* eot */
2452 1,
2453 BRW_SAMPLER_SIMD_MODE_SIMD8);
2454
2455 if (shadow)
2456 brw_MOV(p, dst[3], brw_imm_f(1.0));
2457 }
2458
2459
2460 /**
2461 * Resolve subroutine calls after code emit is done.
2462 */
2463 static void post_wm_emit( struct brw_wm_compile *c )
2464 {
2465 brw_resolve_cals(&c->func);
2466 }
2467
2468 static void
2469 get_argument_regs(struct brw_wm_compile *c,
2470 const struct prog_instruction *inst,
2471 int index,
2472 struct brw_reg *regs,
2473 int mask)
2474 {
2475 int i;
2476
2477 for (i = 0; i < 4; i++) {
2478 if (mask & (1 << i))
2479 regs[i] = get_src_reg(c, inst, index, i);
2480 }
2481 }
2482
2483 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2484 {
2485 #define MAX_IF_DEPTH 32
2486 #define MAX_LOOP_DEPTH 32
2487 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
2488 GLuint i, if_depth = 0, loop_depth = 0;
2489 struct brw_compile *p = &c->func;
2490 struct brw_indirect stack_index = brw_indirect(0, 0);
2491
2492 c->out_of_regs = GL_FALSE;
2493
2494 prealloc_reg(c);
2495 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2496 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2497
2498 for (i = 0; i < c->nr_fp_insns; i++) {
2499 const struct prog_instruction *inst = &c->prog_instructions[i];
2500 int dst_flags;
2501 struct brw_reg args[3][4], dst[4];
2502 int j;
2503
2504 c->cur_inst = i;
2505
2506 #if 0
2507 _mesa_printf("Inst %d: ", i);
2508 _mesa_print_instruction(inst);
2509 #endif
2510
2511 /* fetch any constants that this instruction needs */
2512 if (c->fp->use_const_buffer)
2513 fetch_constants(c, inst);
2514
2515 if (inst->Opcode != OPCODE_ARL) {
2516 for (j = 0; j < 4; j++) {
2517 if (inst->DstReg.WriteMask & (1 << j))
2518 dst[j] = get_dst_reg(c, inst, j);
2519 else
2520 dst[j] = brw_null_reg();
2521 }
2522 }
2523 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
2524 get_argument_regs(c, inst, j, args[j], WRITEMASK_XYZW);
2525
2526 dst_flags = inst->DstReg.WriteMask;
2527 if (inst->SaturateMode == SATURATE_ZERO_ONE)
2528 dst_flags |= SATURATE;
2529
2530 if (inst->CondUpdate)
2531 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2532 else
2533 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2534
2535 dst_flags = inst->DstReg.WriteMask;
2536 if (inst->SaturateMode == SATURATE_ZERO_ONE)
2537 dst_flags |= SATURATE;
2538
2539 switch (inst->Opcode) {
2540 case WM_PIXELXY:
2541 emit_pixel_xy(c, inst);
2542 break;
2543 case WM_DELTAXY:
2544 emit_delta_xy(c, inst);
2545 break;
2546 case WM_PIXELW:
2547 emit_pixel_w(c, inst);
2548 break;
2549 case WM_LINTERP:
2550 emit_linterp(c, inst);
2551 break;
2552 case WM_PINTERP:
2553 emit_pinterp(c, inst);
2554 break;
2555 case WM_CINTERP:
2556 emit_cinterp(c, inst);
2557 break;
2558 case WM_WPOSXY:
2559 emit_wpos_xy(c, inst);
2560 break;
2561 case WM_FB_WRITE:
2562 emit_fb_write(c, inst);
2563 break;
2564 case WM_FRONTFACING:
2565 emit_frontfacing(c, inst);
2566 break;
2567 case OPCODE_ADD:
2568 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
2569 break;
2570 case OPCODE_ARL:
2571 emit_arl(c, inst);
2572 break;
2573 case OPCODE_FRC:
2574 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
2575 break;
2576 case OPCODE_FLR:
2577 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
2578 break;
2579 case OPCODE_LRP:
2580 unalias3(c, emit_lrp,
2581 dst, dst_flags, args[0], args[1], args[2]);
2582 break;
2583 case OPCODE_TRUNC:
2584 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
2585 break;
2586 case OPCODE_MOV:
2587 case OPCODE_SWZ:
2588 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
2589 break;
2590 case OPCODE_DP3:
2591 emit_dp3(p, dst, dst_flags, args[0], args[1]);
2592 break;
2593 case OPCODE_DP4:
2594 emit_dp4(p, dst, dst_flags, args[0], args[1]);
2595 break;
2596 case OPCODE_XPD:
2597 emit_xpd(c, inst);
2598 break;
2599 case OPCODE_DPH:
2600 emit_dph(p, dst, dst_flags, args[0], args[1]);
2601 break;
2602 case OPCODE_RCP:
2603 emit_rcp(c, inst);
2604 break;
2605 case OPCODE_RSQ:
2606 emit_rsq(c, inst);
2607 break;
2608 case OPCODE_SIN:
2609 emit_sin(c, inst);
2610 break;
2611 case OPCODE_COS:
2612 emit_cos(c, inst);
2613 break;
2614 case OPCODE_EX2:
2615 emit_ex2(c, inst);
2616 break;
2617 case OPCODE_LG2:
2618 emit_lg2(c, inst);
2619 break;
2620 case OPCODE_MIN:
2621 case OPCODE_MAX:
2622 emit_min_max(c, inst);
2623 break;
2624 case OPCODE_DDX:
2625 case OPCODE_DDY:
2626 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
2627 args[0]);
2628 break;
2629 case OPCODE_SLT:
2630 emit_sop(p, dst, dst_flags,
2631 BRW_CONDITIONAL_L, args[0], args[1]);
2632 break;
2633 case OPCODE_SLE:
2634 emit_sop(p, dst, dst_flags,
2635 BRW_CONDITIONAL_LE, args[0], args[1]);
2636 break;
2637 case OPCODE_SGT:
2638 emit_sop(p, dst, dst_flags,
2639 BRW_CONDITIONAL_G, args[0], args[1]);
2640 break;
2641 case OPCODE_SGE:
2642 emit_sop(p, dst, dst_flags,
2643 BRW_CONDITIONAL_GE, args[0], args[1]);
2644 break;
2645 case OPCODE_SEQ:
2646 emit_sop(p, dst, dst_flags,
2647 BRW_CONDITIONAL_EQ, args[0], args[1]);
2648 break;
2649 case OPCODE_SNE:
2650 emit_sop(p, dst, dst_flags,
2651 BRW_CONDITIONAL_NEQ, args[0], args[1]);
2652 break;
2653 case OPCODE_MUL:
2654 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
2655 break;
2656 case OPCODE_POW:
2657 emit_pow(c, inst);
2658 break;
2659 case OPCODE_MAD:
2660 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
2661 break;
2662 case OPCODE_NOISE1:
2663 emit_noise1(c, inst);
2664 break;
2665 case OPCODE_NOISE2:
2666 emit_noise2(c, inst);
2667 break;
2668 case OPCODE_NOISE3:
2669 emit_noise3(c, inst);
2670 break;
2671 case OPCODE_NOISE4:
2672 emit_noise4(c, inst);
2673 break;
2674 case OPCODE_TEX:
2675 emit_tex(c, inst);
2676 break;
2677 case OPCODE_TXB:
2678 emit_txb(c, inst);
2679 break;
2680 case OPCODE_KIL_NV:
2681 emit_kil(c);
2682 break;
2683 case OPCODE_IF:
2684 assert(if_depth < MAX_IF_DEPTH);
2685 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2686 break;
2687 case OPCODE_ELSE:
2688 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
2689 break;
2690 case OPCODE_ENDIF:
2691 assert(if_depth > 0);
2692 brw_ENDIF(p, if_inst[--if_depth]);
2693 break;
2694 case OPCODE_BGNSUB:
2695 brw_save_label(p, inst->Comment, p->nr_insn);
2696 break;
2697 case OPCODE_ENDSUB:
2698 /* no-op */
2699 break;
2700 case OPCODE_CAL:
2701 brw_push_insn_state(p);
2702 brw_set_mask_control(p, BRW_MASK_DISABLE);
2703 brw_set_access_mode(p, BRW_ALIGN_1);
2704 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2705 brw_set_access_mode(p, BRW_ALIGN_16);
2706 brw_ADD(p, get_addr_reg(stack_index),
2707 get_addr_reg(stack_index), brw_imm_d(4));
2708 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2709 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2710 brw_pop_insn_state(p);
2711 break;
2712
2713 case OPCODE_RET:
2714 brw_push_insn_state(p);
2715 brw_set_mask_control(p, BRW_MASK_DISABLE);
2716 brw_ADD(p, get_addr_reg(stack_index),
2717 get_addr_reg(stack_index), brw_imm_d(-4));
2718 brw_set_access_mode(p, BRW_ALIGN_1);
2719 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2720 brw_set_access_mode(p, BRW_ALIGN_16);
2721 brw_pop_insn_state(p);
2722
2723 break;
2724 case OPCODE_BGNLOOP:
2725 /* XXX may need to invalidate the current_constant regs */
2726 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2727 break;
2728 case OPCODE_BRK:
2729 brw_BREAK(p);
2730 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2731 break;
2732 case OPCODE_CONT:
2733 brw_CONT(p);
2734 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2735 break;
2736 case OPCODE_ENDLOOP:
2737 {
2738 struct brw_instruction *inst0, *inst1;
2739 GLuint br = 1;
2740
2741 if (BRW_IS_IGDNG(brw))
2742 br = 2;
2743
2744 loop_depth--;
2745 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2746 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2747 while (inst0 > loop_inst[loop_depth]) {
2748 inst0--;
2749 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2750 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2751 inst0->bits3.if_else.pop_count = 0;
2752 }
2753 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2754 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2755 inst0->bits3.if_else.pop_count = 0;
2756 }
2757 }
2758 }
2759 break;
2760 default:
2761 _mesa_printf("unsupported IR in fragment shader %d\n",
2762 inst->Opcode);
2763 }
2764
2765 if (inst->CondUpdate)
2766 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2767 else
2768 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2769 }
2770 post_wm_emit(c);
2771
2772 if (INTEL_DEBUG & DEBUG_WM) {
2773 _mesa_printf("wm-native:\n");
2774 for (i = 0; i < p->nr_insn; i++)
2775 brw_disasm(stderr, &p->store[i]);
2776 _mesa_printf("\n");
2777 }
2778 }
2779
2780 /**
2781 * Do GPU code generation for shaders that use GLSL features such as
2782 * flow control. Other shaders will be compiled with the
2783 */
2784 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2785 {
2786 if (INTEL_DEBUG & DEBUG_WM) {
2787 _mesa_printf("brw_wm_glsl_emit:\n");
2788 }
2789
2790 /* initial instruction translation/simplification */
2791 brw_wm_pass_fp(c);
2792
2793 /* actual code generation */
2794 brw_wm_emit_glsl(brw, c);
2795
2796 if (INTEL_DEBUG & DEBUG_WM) {
2797 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2798 }
2799
2800 c->prog_data.total_grf = num_grf_used(c);
2801 c->prog_data.total_scratch = 0;
2802 }