Merge branch 'mesa_7_5_branch'
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/** Identifiers of the reusable instruction sequences (see invoke_subroutine()). */
enum _subroutine {
   SUB_NOISE1,
   SUB_NOISE2,
   SUB_NOISE3,
   SUB_NOISE4
};
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25 for (i = 0; i < fp->Base.NumInstructions; i++) {
26 const struct prog_instruction *inst = &fp->Base.Instructions[i];
27 switch (inst->Opcode) {
28 case OPCODE_ARL:
29 case OPCODE_IF:
30 case OPCODE_ENDIF:
31 case OPCODE_CAL:
32 case OPCODE_BRK:
33 case OPCODE_RET:
34 case OPCODE_DDX:
35 case OPCODE_DDY:
36 case OPCODE_NOISE1:
37 case OPCODE_NOISE2:
38 case OPCODE_NOISE3:
39 case OPCODE_NOISE4:
40 case OPCODE_BGNLOOP:
41 return GL_TRUE;
42 default:
43 break;
44 }
45 }
46 return GL_FALSE;
47 }
48
49
50
/* Forward declaration; alloc_grf() calls this when it finds no free GRF. */
static void reclaim_temps(struct brw_wm_compile *c);
53
54
55 /** Mark GRF register as used. */
56 static void
57 prealloc_grf(struct brw_wm_compile *c, int r)
58 {
59 c->used_grf[r] = GL_TRUE;
60 }
61
62
63 /** Mark given GRF register as not in use. */
64 static void
65 release_grf(struct brw_wm_compile *c, int r)
66 {
67 /*assert(c->used_grf[r]);*/
68 c->used_grf[r] = GL_FALSE;
69 c->first_free_grf = MIN2(c->first_free_grf, r);
70 }
71
72
73 /** Return index of a free GRF, mark it as used. */
74 static int
75 alloc_grf(struct brw_wm_compile *c)
76 {
77 GLuint r;
78 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
79 if (!c->used_grf[r]) {
80 c->used_grf[r] = GL_TRUE;
81 c->first_free_grf = r + 1; /* a guess */
82 return r;
83 }
84 }
85
86 /* no free temps, try to reclaim some */
87 reclaim_temps(c);
88 c->first_free_grf = 0;
89
90 /* try alloc again */
91 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
92 if (!c->used_grf[r]) {
93 c->used_grf[r] = GL_TRUE;
94 c->first_free_grf = r + 1; /* a guess */
95 return r;
96 }
97 }
98
99 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
100 assert(c->used_grf[r]);
101 }
102
103 /* really, no free GRF regs found */
104 if (!c->out_of_regs) {
105 /* print warning once per compilation */
106 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
107 c->out_of_regs = GL_TRUE;
108 }
109
110 return -1;
111 }
112
113
114 /** Return number of GRF registers used */
115 static int
116 num_grf_used(const struct brw_wm_compile *c)
117 {
118 int r;
119 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
120 if (c->used_grf[r])
121 return r + 1;
122 return 0;
123 }
124
125
126
127 /**
128 * Record the mapping of a Mesa register to a hardware register.
129 */
130 static void set_reg(struct brw_wm_compile *c, int file, int index,
131 int component, struct brw_reg reg)
132 {
133 c->wm_regs[file][index][component].reg = reg;
134 c->wm_regs[file][index][component].inited = GL_TRUE;
135 }
136
137 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
138 {
139 struct brw_reg reg;
140
141 /* if we need to allocate another temp, grow the tmp_regs[] array */
142 if (c->tmp_index == c->tmp_max) {
143 int r = alloc_grf(c);
144 if (r < 0) {
145 /*printf("Out of temps in %s\n", __FUNCTION__);*/
146 r = 50; /* XXX random register! */
147 }
148 c->tmp_regs[ c->tmp_max++ ] = r;
149 }
150
151 /* form the GRF register */
152 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
153 /*printf("alloc_temp %d\n", reg.nr);*/
154 assert(reg.nr < BRW_WM_MAX_GRF);
155 return reg;
156
157 }
158
159 /**
160 * Save current temp register info.
161 * There must be a matching call to release_tmps().
162 */
163 static int mark_tmps(struct brw_wm_compile *c)
164 {
165 return c->tmp_index;
166 }
167
168 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
169 {
170 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
171 }
172
173 static void release_tmps(struct brw_wm_compile *c, int mark)
174 {
175 c->tmp_index = mark;
176 }
177
178 /**
179 * Convert Mesa src register to brw register.
180 *
181 * Since we're running in SOA mode each Mesa register corresponds to four
182 * hardware registers. We allocate the hardware registers as needed here.
183 *
184 * \param file register file, one of PROGRAM_x
185 * \param index register number
186 * \param component src component (X=0, Y=1, Z=2, W=3)
187 * \param nr not used?!?
188 * \param neg negate value?
189 * \param abs take absolute value?
190 */
191 static struct brw_reg
192 get_reg(struct brw_wm_compile *c, int file, int index, int component,
193 int nr, GLuint neg, GLuint abs)
194 {
195 struct brw_reg reg;
196 switch (file) {
197 case PROGRAM_STATE_VAR:
198 case PROGRAM_CONSTANT:
199 case PROGRAM_UNIFORM:
200 file = PROGRAM_STATE_VAR;
201 break;
202 case PROGRAM_UNDEFINED:
203 return brw_null_reg();
204 case PROGRAM_TEMPORARY:
205 case PROGRAM_INPUT:
206 case PROGRAM_OUTPUT:
207 case PROGRAM_PAYLOAD:
208 break;
209 default:
210 _mesa_problem(NULL, "Unexpected file in get_reg()");
211 return brw_null_reg();
212 }
213
214 assert(index < 256);
215 assert(component < 4);
216
217 /* see if we've already allocated a HW register for this Mesa register */
218 if (c->wm_regs[file][index][component].inited) {
219 /* yes, re-use */
220 reg = c->wm_regs[file][index][component].reg;
221 }
222 else {
223 /* no, allocate new register */
224 int grf = alloc_grf(c);
225 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
226 if (grf < 0) {
227 /* totally out of temps */
228 grf = 51; /* XXX random register! */
229 }
230
231 reg = brw_vec8_grf(grf, 0);
232 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
233
234 set_reg(c, file, index, component, reg);
235 }
236
237 if (neg & (1 << component)) {
238 reg = negate(reg);
239 }
240 if (abs)
241 reg = brw_abs(reg);
242 return reg;
243 }
244
245
246
247 /**
248 * This is called if we run out of GRF registers. Examine the live intervals
249 * of temp regs in the program and free those which won't be used again.
250 */
251 static void
252 reclaim_temps(struct brw_wm_compile *c)
253 {
254 GLint intBegin[MAX_PROGRAM_TEMPS];
255 GLint intEnd[MAX_PROGRAM_TEMPS];
256 int index;
257
258 /*printf("Reclaim temps:\n");*/
259
260 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
261 intBegin, intEnd);
262
263 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
264 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
265 /* program temp[i] can be freed */
266 int component;
267 /*printf(" temp[%d] is dead\n", index);*/
268 for (component = 0; component < 4; component++) {
269 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
270 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
271 release_grf(c, r);
272 /*
273 printf(" Reclaim temp %d, reg %d at inst %d\n",
274 index, r, c->cur_inst);
275 */
276 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
277 }
278 }
279 }
280 }
281 }
282
283
284
285
286 /**
287 * Preallocate registers. This sets up the Mesa to hardware register
288 * mapping for certain registers, such as constants (uniforms/state vars)
289 * and shader inputs.
290 */
291 static void prealloc_reg(struct brw_wm_compile *c)
292 {
293 int i, j;
294 struct brw_reg reg;
295 int urb_read_length = 0;
296 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
297 GLuint reg_index = 0;
298
299 memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
300 c->first_free_grf = 0;
301
302 for (i = 0; i < 4; i++) {
303 if (i < c->key.nr_depth_regs)
304 reg = brw_vec8_grf(i * 2, 0);
305 else
306 reg = brw_vec8_grf(0, 0);
307 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
308 }
309 reg_index += 2 * c->key.nr_depth_regs;
310
311 /* constants */
312 {
313 const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
314 const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
315
316 /* use a real constant buffer, or just use a section of the GRF? */
317 /* XXX this heuristic may need adjustment... */
318 if ((nr_params + nr_temps) * 4 + reg_index > 80)
319 c->fp->use_const_buffer = GL_TRUE;
320 else
321 c->fp->use_const_buffer = GL_FALSE;
322 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
323
324 if (c->fp->use_const_buffer) {
325 /* We'll use a real constant buffer and fetch constants from
326 * it with a dataport read message.
327 */
328
329 /* number of float constants in CURBE */
330 c->prog_data.nr_params = 0;
331 }
332 else {
333 const struct gl_program_parameter_list *plist =
334 c->fp->program.Base.Parameters;
335 int index = 0;
336
337 /* number of float constants in CURBE */
338 c->prog_data.nr_params = 4 * nr_params;
339
340 /* loop over program constants (float[4]) */
341 for (i = 0; i < nr_params; i++) {
342 /* loop over XYZW channels */
343 for (j = 0; j < 4; j++, index++) {
344 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
345 /* Save pointer to parameter/constant value.
346 * Constants will be copied in prepare_constant_buffer()
347 */
348 c->prog_data.param[index] = &plist->ParameterValues[i][j];
349 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
350 }
351 }
352 /* number of constant regs used (each reg is float[8]) */
353 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
354 reg_index += c->nr_creg;
355 }
356 }
357
358 /* fragment shader inputs */
359 for (i = 0; i < VERT_RESULT_MAX; i++) {
360 int fp_input;
361
362 if (i >= VERT_RESULT_VAR0)
363 fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
364 else if (i <= VERT_RESULT_TEX7)
365 fp_input = i;
366 else
367 fp_input = -1;
368
369 if (fp_input >= 0 && inputs & (1 << fp_input)) {
370 urb_read_length = reg_index;
371 reg = brw_vec8_grf(reg_index, 0);
372 for (j = 0; j < 4; j++)
373 set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
374 }
375 if (c->key.vp_outputs_written & (1 << i)) {
376 reg_index += 2;
377 }
378 }
379
380 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
381 c->prog_data.urb_read_length = urb_read_length;
382 c->prog_data.curb_read_length = c->nr_creg;
383 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
384 reg_index++;
385 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
386 reg_index += 2;
387
388 /* mark GRF regs [0..reg_index-1] as in-use */
389 for (i = 0; i < reg_index; i++)
390 prealloc_grf(c, i);
391
392 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
393 prealloc_grf(c, 126);
394 prealloc_grf(c, 127);
395
396 for (i = 0; i < c->nr_fp_insns; i++) {
397 const struct prog_instruction *inst = &c->prog_instructions[i];
398 struct brw_reg dst[4];
399
400 switch (inst->Opcode) {
401 case OPCODE_TEX:
402 case OPCODE_TXB:
403 /* Allocate the channels of texture results contiguously,
404 * since they are written out that way by the sampler unit.
405 */
406 for (j = 0; j < 4; j++) {
407 dst[j] = get_dst_reg(c, inst, j);
408 if (j != 0)
409 assert(dst[j].nr == dst[j - 1].nr + 1);
410 }
411 break;
412 default:
413 break;
414 }
415 }
416
417 /* An instruction may reference up to three constants.
418 * They'll be found in these registers.
419 * XXX alloc these on demand!
420 */
421 if (c->fp->use_const_buffer) {
422 for (i = 0; i < 3; i++) {
423 c->current_const[i].index = -1;
424 c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
425 }
426 }
427 #if 0
428 printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
429 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
430 #endif
431 }
432
433
434 /**
435 * Check if any of the instruction's src registers are constants, uniforms,
436 * or statevars. If so, fetch any constants that we don't already have in
437 * the three GRF slots.
438 */
439 static void fetch_constants(struct brw_wm_compile *c,
440 const struct prog_instruction *inst)
441 {
442 struct brw_compile *p = &c->func;
443 GLuint i;
444
445 /* loop over instruction src regs */
446 for (i = 0; i < 3; i++) {
447 const struct prog_src_register *src = &inst->SrcReg[i];
448 if (src->File == PROGRAM_STATE_VAR ||
449 src->File == PROGRAM_CONSTANT ||
450 src->File == PROGRAM_UNIFORM) {
451 c->current_const[i].index = src->Index;
452
453 #if 0
454 printf(" fetch const[%d] for arg %d into reg %d\n",
455 src->Index, i, c->current_const[i].reg.nr);
456 #endif
457
458 /* need to fetch the constant now */
459 brw_dp_READ_4(p,
460 c->current_const[i].reg, /* writeback dest */
461 src->RelAddr, /* relative indexing? */
462 16 * src->Index, /* byte offset */
463 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
464 );
465 }
466 }
467 }
468
469
470 /**
471 * Convert Mesa dst register to brw register.
472 */
473 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
474 const struct prog_instruction *inst,
475 GLuint component)
476 {
477 const int nr = 1;
478 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
479 0, 0);
480 }
481
482
483 static struct brw_reg
484 get_src_reg_const(struct brw_wm_compile *c,
485 const struct prog_instruction *inst,
486 GLuint srcRegIndex, GLuint component)
487 {
488 /* We should have already fetched the constant from the constant
489 * buffer in fetch_constants(). Now we just have to return a
490 * register description that extracts the needed component and
491 * smears it across all eight vector components.
492 */
493 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
494 struct brw_reg const_reg;
495
496 assert(component < 4);
497 assert(srcRegIndex < 3);
498 assert(c->current_const[srcRegIndex].index != -1);
499 const_reg = c->current_const[srcRegIndex].reg;
500
501 /* extract desired float from the const_reg, and smear */
502 const_reg = stride(const_reg, 0, 1, 0);
503 const_reg.subnr = component * 4;
504
505 if (src->Negate & (1 << component))
506 const_reg = negate(const_reg);
507 if (src->Abs)
508 const_reg = brw_abs(const_reg);
509
510 #if 0
511 printf(" form const[%d].%d for arg %d, reg %d\n",
512 c->current_const[srcRegIndex].index,
513 component,
514 srcRegIndex,
515 const_reg.nr);
516 #endif
517
518 return const_reg;
519 }
520
521
522 /**
523 * Convert Mesa src register to brw register.
524 */
525 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
526 const struct prog_instruction *inst,
527 GLuint srcRegIndex, GLuint channel)
528 {
529 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
530 const GLuint nr = 1;
531 const GLuint component = GET_SWZ(src->Swizzle, channel);
532
533 /* Extended swizzle terms */
534 if (component == SWIZZLE_ZERO) {
535 return brw_imm_f(0.0F);
536 }
537 else if (component == SWIZZLE_ONE) {
538 return brw_imm_f(1.0F);
539 }
540
541 if (c->fp->use_const_buffer &&
542 (src->File == PROGRAM_STATE_VAR ||
543 src->File == PROGRAM_CONSTANT ||
544 src->File == PROGRAM_UNIFORM)) {
545 return get_src_reg_const(c, inst, srcRegIndex, component);
546 }
547 else {
548 /* other type of source register */
549 return get_reg(c, src->File, src->Index, component, nr,
550 src->Negate, src->Abs);
551 }
552 }
553
554
555 /**
556 * Same as \sa get_src_reg() but if the register is a literal, emit
557 * a brw_reg encoding the literal.
558 * Note that a brw instruction only allows one src operand to be a literal.
559 * For instructions with more than one operand, only the second can be a
560 * literal. This means that we treat some literals as constants/uniforms
561 * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
562 *
563 */
564 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
565 const struct prog_instruction *inst,
566 GLuint srcRegIndex, GLuint channel)
567 {
568 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
569 if (src->File == PROGRAM_CONSTANT) {
570 /* a literal */
571 const int component = GET_SWZ(src->Swizzle, channel);
572 const GLfloat *param =
573 c->fp->program.Base.Parameters->ParameterValues[src->Index];
574 GLfloat value = param[component];
575 if (src->Negate & (1 << channel))
576 value = -value;
577 if (src->Abs)
578 value = FABSF(value);
579 #if 0
580 printf(" form immed value %f for chan %d\n", value, channel);
581 #endif
582 return brw_imm_f(value);
583 }
584 else {
585 return get_src_reg(c, inst, srcRegIndex, channel);
586 }
587 }
588
589
590 /**
591 * Subroutines are minimal support for resusable instruction sequences.
592 * They are implemented as simply as possible to minimise overhead: there
593 * is no explicit support for communication between the caller and callee
594 * other than saving the return address in a temporary register, nor is
595 * there any automatic local storage. This implies that great care is
596 * required before attempting reentrancy or any kind of nested
597 * subroutine invocations.
598 */
599 static void invoke_subroutine( struct brw_wm_compile *c,
600 enum _subroutine subroutine,
601 void (*emit)( struct brw_wm_compile * ) )
602 {
603 struct brw_compile *p = &c->func;
604
605 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
606
607 if( c->subroutines[ subroutine ] ) {
608 /* subroutine previously emitted: reuse existing instructions */
609
610 int mark = mark_tmps( c );
611 struct brw_reg return_address = retype( alloc_tmp( c ),
612 BRW_REGISTER_TYPE_UD );
613 int here = p->nr_insn;
614
615 brw_push_insn_state(p);
616 brw_set_mask_control(p, BRW_MASK_DISABLE);
617 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
618
619 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
620 brw_imm_d( ( c->subroutines[ subroutine ] -
621 here - 1 ) << 4 ) );
622 brw_pop_insn_state(p);
623
624 release_tmps( c, mark );
625 } else {
626 /* previously unused subroutine: emit, and mark for later reuse */
627
628 int mark = mark_tmps( c );
629 struct brw_reg return_address = retype( alloc_tmp( c ),
630 BRW_REGISTER_TYPE_UD );
631 struct brw_instruction *calc;
632 int base = p->nr_insn;
633
634 brw_push_insn_state(p);
635 brw_set_mask_control(p, BRW_MASK_DISABLE);
636 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
637 brw_pop_insn_state(p);
638
639 c->subroutines[ subroutine ] = p->nr_insn;
640
641 emit( c );
642
643 brw_push_insn_state(p);
644 brw_set_mask_control(p, BRW_MASK_DISABLE);
645 brw_MOV( p, brw_ip_reg(), return_address );
646 brw_pop_insn_state(p);
647
648 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
649
650 release_tmps( c, mark );
651 }
652 }
653
654 static void emit_trunc( struct brw_wm_compile *c,
655 const struct prog_instruction *inst)
656 {
657 int i;
658 struct brw_compile *p = &c->func;
659 GLuint mask = inst->DstReg.WriteMask;
660 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
661 for (i = 0; i < 4; i++) {
662 if (mask & (1<<i)) {
663 struct brw_reg src, dst;
664 dst = get_dst_reg(c, inst, i);
665 src = get_src_reg(c, inst, 0, i);
666 brw_RNDZ(p, dst, src);
667 }
668 }
669 brw_set_saturate(p, 0);
670 }
671
672 static void emit_mov( struct brw_wm_compile *c,
673 const struct prog_instruction *inst)
674 {
675 int i;
676 struct brw_compile *p = &c->func;
677 GLuint mask = inst->DstReg.WriteMask;
678 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
679 for (i = 0; i < 4; i++) {
680 if (mask & (1<<i)) {
681 struct brw_reg src, dst;
682 dst = get_dst_reg(c, inst, i);
683 /* XXX some moves from immediate value don't work reliably!!! */
684 /*src = get_src_reg_imm(c, inst, 0, i);*/
685 src = get_src_reg(c, inst, 0, i);
686 brw_MOV(p, dst, src);
687 }
688 }
689 brw_set_saturate(p, 0);
690 }
691
692 static void emit_pixel_xy(struct brw_wm_compile *c,
693 const struct prog_instruction *inst)
694 {
695 struct brw_reg r1 = brw_vec1_grf(1, 0);
696 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
697
698 struct brw_reg dst0, dst1;
699 struct brw_compile *p = &c->func;
700 GLuint mask = inst->DstReg.WriteMask;
701
702 dst0 = get_dst_reg(c, inst, 0);
703 dst1 = get_dst_reg(c, inst, 1);
704 /* Calculate pixel centers by adding 1 or 0 to each of the
705 * micro-tile coordinates passed in r1.
706 */
707 if (mask & WRITEMASK_X) {
708 brw_ADD(p,
709 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
710 stride(suboffset(r1_uw, 4), 2, 4, 0),
711 brw_imm_v(0x10101010));
712 }
713
714 if (mask & WRITEMASK_Y) {
715 brw_ADD(p,
716 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
717 stride(suboffset(r1_uw, 5), 2, 4, 0),
718 brw_imm_v(0x11001100));
719 }
720 }
721
722 static void emit_delta_xy(struct brw_wm_compile *c,
723 const struct prog_instruction *inst)
724 {
725 struct brw_reg r1 = brw_vec1_grf(1, 0);
726 struct brw_reg dst0, dst1, src0, src1;
727 struct brw_compile *p = &c->func;
728 GLuint mask = inst->DstReg.WriteMask;
729
730 dst0 = get_dst_reg(c, inst, 0);
731 dst1 = get_dst_reg(c, inst, 1);
732 src0 = get_src_reg(c, inst, 0, 0);
733 src1 = get_src_reg(c, inst, 0, 1);
734 /* Calc delta X,Y by subtracting origin in r1 from the pixel
735 * centers.
736 */
737 if (mask & WRITEMASK_X) {
738 brw_ADD(p,
739 dst0,
740 retype(src0, BRW_REGISTER_TYPE_UW),
741 negate(r1));
742 }
743
744 if (mask & WRITEMASK_Y) {
745 brw_ADD(p,
746 dst1,
747 retype(src1, BRW_REGISTER_TYPE_UW),
748 negate(suboffset(r1,1)));
749
750 }
751 }
752
753 static void fire_fb_write( struct brw_wm_compile *c,
754 GLuint base_reg,
755 GLuint nr,
756 GLuint target,
757 GLuint eot)
758 {
759 struct brw_compile *p = &c->func;
760 /* Pass through control information:
761 */
762 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
763 {
764 brw_push_insn_state(p);
765 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
766 brw_MOV(p,
767 brw_message_reg(base_reg + 1),
768 brw_vec8_grf(1, 0));
769 brw_pop_insn_state(p);
770 }
771 /* Send framebuffer write message: */
772 brw_fb_WRITE(p,
773 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
774 base_reg,
775 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
776 target,
777 nr,
778 0,
779 eot);
780 }
781
782 static void emit_fb_write(struct brw_wm_compile *c,
783 const struct prog_instruction *inst)
784 {
785 struct brw_compile *p = &c->func;
786 int nr = 2;
787 int channel;
788 GLuint target, eot;
789 struct brw_reg src0;
790
791 /* Reserve a space for AA - may not be needed:
792 */
793 if (c->key.aa_dest_stencil_reg)
794 nr += 1;
795
796 brw_push_insn_state(p);
797 for (channel = 0; channel < 4; channel++) {
798 src0 = get_src_reg(c, inst, 0, channel);
799 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
800 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
801 brw_MOV(p, brw_message_reg(nr + channel), src0);
802 }
803 /* skip over the regs populated above: */
804 nr += 8;
805 brw_pop_insn_state(p);
806
807 if (c->key.source_depth_to_render_target) {
808 if (c->key.computes_depth) {
809 src0 = get_src_reg(c, inst, 2, 2);
810 brw_MOV(p, brw_message_reg(nr), src0);
811 }
812 else {
813 src0 = get_src_reg(c, inst, 1, 1);
814 brw_MOV(p, brw_message_reg(nr), src0);
815 }
816
817 nr += 2;
818 }
819
820 if (c->key.dest_depth_reg) {
821 const GLuint comp = c->key.dest_depth_reg / 2;
822 const GLuint off = c->key.dest_depth_reg % 2;
823
824 if (off != 0) {
825 /* XXX this code needs review/testing */
826 struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
827 struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
828
829 brw_push_insn_state(p);
830 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
831
832 brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
833 /* 2nd half? */
834 brw_MOV(p, brw_message_reg(nr+1), arg1_1);
835 brw_pop_insn_state(p);
836 }
837 else
838 {
839 struct brw_reg src = get_src_reg(c, inst, 1, 1);
840 brw_MOV(p, brw_message_reg(nr), src);
841 }
842 nr += 2;
843 }
844
845 target = inst->Aux >> 1;
846 eot = inst->Aux & 1;
847 fire_fb_write(c, 0, nr, target, eot);
848 }
849
850 static void emit_pixel_w( struct brw_wm_compile *c,
851 const struct prog_instruction *inst)
852 {
853 struct brw_compile *p = &c->func;
854 GLuint mask = inst->DstReg.WriteMask;
855 if (mask & WRITEMASK_W) {
856 struct brw_reg dst, src0, delta0, delta1;
857 struct brw_reg interp3;
858
859 dst = get_dst_reg(c, inst, 3);
860 src0 = get_src_reg(c, inst, 0, 0);
861 delta0 = get_src_reg(c, inst, 1, 0);
862 delta1 = get_src_reg(c, inst, 1, 1);
863
864 interp3 = brw_vec1_grf(src0.nr+1, 4);
865 /* Calc 1/w - just linterp wpos[3] optimized by putting the
866 * result straight into a message reg.
867 */
868 brw_LINE(p, brw_null_reg(), interp3, delta0);
869 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
870
871 /* Calc w */
872 brw_math_16( p, dst,
873 BRW_MATH_FUNCTION_INV,
874 BRW_MATH_SATURATE_NONE,
875 2, brw_null_reg(),
876 BRW_MATH_PRECISION_FULL);
877 }
878 }
879
880 static void emit_linterp(struct brw_wm_compile *c,
881 const struct prog_instruction *inst)
882 {
883 struct brw_compile *p = &c->func;
884 GLuint mask = inst->DstReg.WriteMask;
885 struct brw_reg interp[4];
886 struct brw_reg dst, delta0, delta1;
887 struct brw_reg src0;
888 GLuint nr, i;
889
890 src0 = get_src_reg(c, inst, 0, 0);
891 delta0 = get_src_reg(c, inst, 1, 0);
892 delta1 = get_src_reg(c, inst, 1, 1);
893 nr = src0.nr;
894
895 interp[0] = brw_vec1_grf(nr, 0);
896 interp[1] = brw_vec1_grf(nr, 4);
897 interp[2] = brw_vec1_grf(nr+1, 0);
898 interp[3] = brw_vec1_grf(nr+1, 4);
899
900 for(i = 0; i < 4; i++ ) {
901 if (mask & (1<<i)) {
902 dst = get_dst_reg(c, inst, i);
903 brw_LINE(p, brw_null_reg(), interp[i], delta0);
904 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
905 }
906 }
907 }
908
909 static void emit_cinterp(struct brw_wm_compile *c,
910 const struct prog_instruction *inst)
911 {
912 struct brw_compile *p = &c->func;
913 GLuint mask = inst->DstReg.WriteMask;
914
915 struct brw_reg interp[4];
916 struct brw_reg dst, src0;
917 GLuint nr, i;
918
919 src0 = get_src_reg(c, inst, 0, 0);
920 nr = src0.nr;
921
922 interp[0] = brw_vec1_grf(nr, 0);
923 interp[1] = brw_vec1_grf(nr, 4);
924 interp[2] = brw_vec1_grf(nr+1, 0);
925 interp[3] = brw_vec1_grf(nr+1, 4);
926
927 for(i = 0; i < 4; i++ ) {
928 if (mask & (1<<i)) {
929 dst = get_dst_reg(c, inst, i);
930 brw_MOV(p, dst, suboffset(interp[i],3));
931 }
932 }
933 }
934
935 static void emit_pinterp(struct brw_wm_compile *c,
936 const struct prog_instruction *inst)
937 {
938 struct brw_compile *p = &c->func;
939 GLuint mask = inst->DstReg.WriteMask;
940
941 struct brw_reg interp[4];
942 struct brw_reg dst, delta0, delta1;
943 struct brw_reg src0, w;
944 GLuint nr, i;
945
946 src0 = get_src_reg(c, inst, 0, 0);
947 delta0 = get_src_reg(c, inst, 1, 0);
948 delta1 = get_src_reg(c, inst, 1, 1);
949 w = get_src_reg(c, inst, 2, 3);
950 nr = src0.nr;
951
952 interp[0] = brw_vec1_grf(nr, 0);
953 interp[1] = brw_vec1_grf(nr, 4);
954 interp[2] = brw_vec1_grf(nr+1, 0);
955 interp[3] = brw_vec1_grf(nr+1, 4);
956
957 for(i = 0; i < 4; i++ ) {
958 if (mask & (1<<i)) {
959 dst = get_dst_reg(c, inst, i);
960 brw_LINE(p, brw_null_reg(), interp[i], delta0);
961 brw_MAC(p, dst, suboffset(interp[i],1),
962 delta1);
963 brw_MUL(p, dst, dst, w);
964 }
965 }
966 }
967
968 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
969 static void emit_frontfacing(struct brw_wm_compile *c,
970 const struct prog_instruction *inst)
971 {
972 struct brw_compile *p = &c->func;
973 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
974 struct brw_reg dst;
975 GLuint mask = inst->DstReg.WriteMask;
976 int i;
977
978 for (i = 0; i < 4; i++) {
979 if (mask & (1<<i)) {
980 dst = get_dst_reg(c, inst, i);
981 brw_MOV(p, dst, brw_imm_f(0.0));
982 }
983 }
984
985 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
986 * us front face
987 */
988 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
989 for (i = 0; i < 4; i++) {
990 if (mask & (1<<i)) {
991 dst = get_dst_reg(c, inst, i);
992 brw_MOV(p, dst, brw_imm_f(1.0));
993 }
994 }
995 brw_set_predicate_control_flag_value(p, 0xff);
996 }
997
998 static void emit_xpd(struct brw_wm_compile *c,
999 const struct prog_instruction *inst)
1000 {
1001 int i;
1002 struct brw_compile *p = &c->func;
1003 GLuint mask = inst->DstReg.WriteMask;
1004 for (i = 0; i < 4; i++) {
1005 GLuint i2 = (i+2)%3;
1006 GLuint i1 = (i+1)%3;
1007 if (mask & (1<<i)) {
1008 struct brw_reg src0, src1, dst;
1009 dst = get_dst_reg(c, inst, i);
1010 src0 = negate(get_src_reg(c, inst, 0, i2));
1011 src1 = get_src_reg_imm(c, inst, 1, i1);
1012 brw_MUL(p, brw_null_reg(), src0, src1);
1013 src0 = get_src_reg(c, inst, 0, i1);
1014 src1 = get_src_reg_imm(c, inst, 1, i2);
1015 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1016 brw_MAC(p, dst, src0, src1);
1017 brw_set_saturate(p, 0);
1018 }
1019 }
1020 brw_set_saturate(p, 0);
1021 }
1022
1023 static void emit_dp3(struct brw_wm_compile *c,
1024 const struct prog_instruction *inst)
1025 {
1026 struct brw_reg src0[3], src1[3], dst;
1027 int i;
1028 struct brw_compile *p = &c->func;
1029 GLuint mask = inst->DstReg.WriteMask;
1030 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1031
1032 if (!(mask & WRITEMASK_XYZW))
1033 return;
1034
1035 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1036
1037 for (i = 0; i < 3; i++) {
1038 src0[i] = get_src_reg(c, inst, 0, i);
1039 src1[i] = get_src_reg_imm(c, inst, 1, i);
1040 }
1041
1042 dst = get_dst_reg(c, inst, dst_chan);
1043 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1044 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1045 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1046 brw_MAC(p, dst, src0[2], src1[2]);
1047 brw_set_saturate(p, 0);
1048 }
1049
1050 static void emit_dp4(struct brw_wm_compile *c,
1051 const struct prog_instruction *inst)
1052 {
1053 struct brw_reg src0[4], src1[4], dst;
1054 int i;
1055 struct brw_compile *p = &c->func;
1056 GLuint mask = inst->DstReg.WriteMask;
1057 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1058
1059 if (!(mask & WRITEMASK_XYZW))
1060 return;
1061
1062 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1063
1064 for (i = 0; i < 4; i++) {
1065 src0[i] = get_src_reg(c, inst, 0, i);
1066 src1[i] = get_src_reg_imm(c, inst, 1, i);
1067 }
1068 dst = get_dst_reg(c, inst, dst_chan);
1069 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1070 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1071 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
1072 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1073 brw_MAC(p, dst, src0[3], src1[3]);
1074 brw_set_saturate(p, 0);
1075 }
1076
1077 static void emit_dph(struct brw_wm_compile *c,
1078 const struct prog_instruction *inst)
1079 {
1080 struct brw_reg src0[4], src1[4], dst;
1081 int i;
1082 struct brw_compile *p = &c->func;
1083 GLuint mask = inst->DstReg.WriteMask;
1084 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1085
1086 if (!(mask & WRITEMASK_XYZW))
1087 return;
1088
1089 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1090
1091 for (i = 0; i < 4; i++) {
1092 src0[i] = get_src_reg(c, inst, 0, i);
1093 src1[i] = get_src_reg_imm(c, inst, 1, i);
1094 }
1095 dst = get_dst_reg(c, inst, dst_chan);
1096 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1097 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1098 brw_MAC(p, dst, src0[2], src1[2]);
1099 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1100 brw_ADD(p, dst, dst, src1[3]);
1101 brw_set_saturate(p, 0);
1102 }
1103
1104 /**
1105 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1106 * Note that the result of the function is smeared across the dest
1107 * register's X, Y, Z and W channels (subject to writemasking of course).
1108 */
1109 static void emit_math1(struct brw_wm_compile *c,
1110 const struct prog_instruction *inst, GLuint func)
1111 {
1112 struct brw_compile *p = &c->func;
1113 struct brw_reg src0, dst;
1114 GLuint mask = inst->DstReg.WriteMask;
1115 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1116
1117 if (!(mask & WRITEMASK_XYZW))
1118 return;
1119
1120 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1121
1122 /* Get first component of source register */
1123 dst = get_dst_reg(c, inst, dst_chan);
1124 src0 = get_src_reg(c, inst, 0, 0);
1125
1126 brw_MOV(p, brw_message_reg(2), src0);
1127 brw_math(p,
1128 dst,
1129 func,
1130 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1131 2,
1132 brw_null_reg(),
1133 BRW_MATH_DATA_VECTOR,
1134 BRW_MATH_PRECISION_FULL);
1135 }
1136
1137 static void emit_rcp(struct brw_wm_compile *c,
1138 const struct prog_instruction *inst)
1139 {
1140 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1141 }
1142
1143 static void emit_rsq(struct brw_wm_compile *c,
1144 const struct prog_instruction *inst)
1145 {
1146 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1147 }
1148
1149 static void emit_sin(struct brw_wm_compile *c,
1150 const struct prog_instruction *inst)
1151 {
1152 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1153 }
1154
1155 static void emit_cos(struct brw_wm_compile *c,
1156 const struct prog_instruction *inst)
1157 {
1158 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1159 }
1160
1161 static void emit_ex2(struct brw_wm_compile *c,
1162 const struct prog_instruction *inst)
1163 {
1164 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1165 }
1166
1167 static void emit_lg2(struct brw_wm_compile *c,
1168 const struct prog_instruction *inst)
1169 {
1170 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1171 }
1172
1173 static void emit_add(struct brw_wm_compile *c,
1174 const struct prog_instruction *inst)
1175 {
1176 struct brw_compile *p = &c->func;
1177 struct brw_reg src0, src1, dst;
1178 GLuint mask = inst->DstReg.WriteMask;
1179 int i;
1180 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1181 for (i = 0 ; i < 4; i++) {
1182 if (mask & (1<<i)) {
1183 dst = get_dst_reg(c, inst, i);
1184 src0 = get_src_reg(c, inst, 0, i);
1185 src1 = get_src_reg_imm(c, inst, 1, i);
1186 brw_ADD(p, dst, src0, src1);
1187 }
1188 }
1189 brw_set_saturate(p, 0);
1190 }
1191
1192 static void emit_arl(struct brw_wm_compile *c,
1193 const struct prog_instruction *inst)
1194 {
1195 struct brw_compile *p = &c->func;
1196 struct brw_reg src0, addr_reg;
1197 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1198 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1199 BRW_ARF_ADDRESS, 0);
1200 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1201 brw_MOV(p, addr_reg, src0);
1202 brw_set_saturate(p, 0);
1203 }
1204
1205
1206 static void emit_mul(struct brw_wm_compile *c,
1207 const struct prog_instruction *inst)
1208 {
1209 struct brw_compile *p = &c->func;
1210 struct brw_reg src0, src1, dst;
1211 GLuint mask = inst->DstReg.WriteMask;
1212 int i;
1213 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1214 for (i = 0 ; i < 4; i++) {
1215 if (mask & (1<<i)) {
1216 dst = get_dst_reg(c, inst, i);
1217 src0 = get_src_reg(c, inst, 0, i);
1218 src1 = get_src_reg_imm(c, inst, 1, i);
1219 brw_MUL(p, dst, src0, src1);
1220 }
1221 }
1222 brw_set_saturate(p, 0);
1223 }
1224
1225 static void emit_frc(struct brw_wm_compile *c,
1226 const struct prog_instruction *inst)
1227 {
1228 struct brw_compile *p = &c->func;
1229 struct brw_reg src0, dst;
1230 GLuint mask = inst->DstReg.WriteMask;
1231 int i;
1232 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1233 for (i = 0 ; i < 4; i++) {
1234 if (mask & (1<<i)) {
1235 dst = get_dst_reg(c, inst, i);
1236 src0 = get_src_reg_imm(c, inst, 0, i);
1237 brw_FRC(p, dst, src0);
1238 }
1239 }
1240 if (inst->SaturateMode != SATURATE_OFF)
1241 brw_set_saturate(p, 0);
1242 }
1243
1244 static void emit_flr(struct brw_wm_compile *c,
1245 const struct prog_instruction *inst)
1246 {
1247 struct brw_compile *p = &c->func;
1248 struct brw_reg src0, dst;
1249 GLuint mask = inst->DstReg.WriteMask;
1250 int i;
1251 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1252 for (i = 0 ; i < 4; i++) {
1253 if (mask & (1<<i)) {
1254 dst = get_dst_reg(c, inst, i);
1255 src0 = get_src_reg_imm(c, inst, 0, i);
1256 brw_RNDD(p, dst, src0);
1257 }
1258 }
1259 brw_set_saturate(p, 0);
1260 }
1261
1262
1263 static void emit_min_max(struct brw_wm_compile *c,
1264 const struct prog_instruction *inst)
1265 {
1266 struct brw_compile *p = &c->func;
1267 const GLuint mask = inst->DstReg.WriteMask;
1268 const int mark = mark_tmps(c);
1269 int i;
1270 brw_push_insn_state(p);
1271 for (i = 0; i < 4; i++) {
1272 if (mask & (1<<i)) {
1273 struct brw_reg real_dst = get_dst_reg(c, inst, i);
1274 struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1275 struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1276 struct brw_reg dst;
1277 /* if dst==src0 or dst==src1 we need to use a temp reg */
1278 GLboolean use_temp = brw_same_reg(dst, src0) ||
1279 brw_same_reg(dst, src1);
1280 if (use_temp)
1281 dst = alloc_tmp(c);
1282 else
1283 dst = real_dst;
1284
1285 /*
1286 printf(" Min/max: dst %d src0 %d src1 %d\n",
1287 dst.nr, src0.nr, src1.nr);
1288 */
1289 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1290 brw_MOV(p, dst, src0);
1291 brw_set_saturate(p, 0);
1292
1293 if (inst->Opcode == OPCODE_MIN)
1294 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1295 else
1296 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1297
1298 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1299 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1300 brw_MOV(p, dst, src1);
1301 brw_set_saturate(p, 0);
1302 brw_set_predicate_control_flag_value(p, 0xff);
1303 if (use_temp)
1304 brw_MOV(p, real_dst, dst);
1305 }
1306 }
1307 brw_pop_insn_state(p);
1308 release_tmps(c, mark);
1309 }
1310
1311 static void emit_pow(struct brw_wm_compile *c,
1312 const struct prog_instruction *inst)
1313 {
1314 struct brw_compile *p = &c->func;
1315 struct brw_reg dst, src0, src1;
1316 GLuint mask = inst->DstReg.WriteMask;
1317 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1318
1319 if (!(mask & WRITEMASK_XYZW))
1320 return;
1321
1322 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1323
1324 dst = get_dst_reg(c, inst, dst_chan);
1325 src0 = get_src_reg_imm(c, inst, 0, 0);
1326 src1 = get_src_reg_imm(c, inst, 1, 0);
1327
1328 brw_MOV(p, brw_message_reg(2), src0);
1329 brw_MOV(p, brw_message_reg(3), src1);
1330
1331 brw_math(p,
1332 dst,
1333 BRW_MATH_FUNCTION_POW,
1334 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1335 2,
1336 brw_null_reg(),
1337 BRW_MATH_DATA_VECTOR,
1338 BRW_MATH_PRECISION_FULL);
1339 }
1340
1341 static void emit_lrp(struct brw_wm_compile *c,
1342 const struct prog_instruction *inst)
1343 {
1344 struct brw_compile *p = &c->func;
1345 GLuint mask = inst->DstReg.WriteMask;
1346 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1347 int i;
1348 int mark = mark_tmps(c);
1349 for (i = 0; i < 4; i++) {
1350 if (mask & (1<<i)) {
1351 dst = get_dst_reg(c, inst, i);
1352 src0 = get_src_reg(c, inst, 0, i);
1353
1354 src1 = get_src_reg_imm(c, inst, 1, i);
1355
1356 if (src1.nr == dst.nr) {
1357 tmp1 = alloc_tmp(c);
1358 brw_MOV(p, tmp1, src1);
1359 } else
1360 tmp1 = src1;
1361
1362 src2 = get_src_reg(c, inst, 2, i);
1363 if (src2.nr == dst.nr) {
1364 tmp2 = alloc_tmp(c);
1365 brw_MOV(p, tmp2, src2);
1366 } else
1367 tmp2 = src2;
1368
1369 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1370 brw_MUL(p, brw_null_reg(), dst, tmp2);
1371 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1372 brw_MAC(p, dst, src0, tmp1);
1373 brw_set_saturate(p, 0);
1374 }
1375 release_tmps(c, mark);
1376 }
1377 }
1378
1379 /**
1380 * For GLSL shaders, this KIL will be unconditional.
1381 * It may be contained inside an IF/ENDIF structure of course.
1382 */
1383 static void emit_kil(struct brw_wm_compile *c)
1384 {
1385 struct brw_compile *p = &c->func;
1386 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1387 brw_push_insn_state(p);
1388 brw_set_mask_control(p, BRW_MASK_DISABLE);
1389 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1390 brw_AND(p, depth, c->emit_mask_reg, depth);
1391 brw_pop_insn_state(p);
1392 }
1393
1394 static void emit_mad(struct brw_wm_compile *c,
1395 const struct prog_instruction *inst)
1396 {
1397 struct brw_compile *p = &c->func;
1398 GLuint mask = inst->DstReg.WriteMask;
1399 struct brw_reg dst, src0, src1, src2;
1400 int i;
1401
1402 for (i = 0; i < 4; i++) {
1403 if (mask & (1<<i)) {
1404 dst = get_dst_reg(c, inst, i);
1405 src0 = get_src_reg(c, inst, 0, i);
1406 src1 = get_src_reg_imm(c, inst, 1, i);
1407 src2 = get_src_reg_imm(c, inst, 2, i);
1408 brw_MUL(p, dst, src0, src1);
1409
1410 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1411 brw_ADD(p, dst, dst, src2);
1412 brw_set_saturate(p, 0);
1413 }
1414 }
1415 }
1416
1417 static void emit_sop(struct brw_wm_compile *c,
1418 const struct prog_instruction *inst, GLuint cond)
1419 {
1420 struct brw_compile *p = &c->func;
1421 GLuint mask = inst->DstReg.WriteMask;
1422 struct brw_reg dst, src0, src1;
1423 int i;
1424
1425 for (i = 0; i < 4; i++) {
1426 if (mask & (1<<i)) {
1427 dst = get_dst_reg(c, inst, i);
1428 src0 = get_src_reg(c, inst, 0, i);
1429 src1 = get_src_reg_imm(c, inst, 1, i);
1430 brw_push_insn_state(p);
1431 brw_CMP(p, brw_null_reg(), cond, src0, src1);
1432 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1433 brw_MOV(p, dst, brw_imm_f(0.0));
1434 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1435 brw_MOV(p, dst, brw_imm_f(1.0));
1436 brw_pop_insn_state(p);
1437 }
1438 }
1439 }
1440
1441 static void emit_slt(struct brw_wm_compile *c,
1442 const struct prog_instruction *inst)
1443 {
1444 emit_sop(c, inst, BRW_CONDITIONAL_L);
1445 }
1446
1447 static void emit_sle(struct brw_wm_compile *c,
1448 const struct prog_instruction *inst)
1449 {
1450 emit_sop(c, inst, BRW_CONDITIONAL_LE);
1451 }
1452
1453 static void emit_sgt(struct brw_wm_compile *c,
1454 const struct prog_instruction *inst)
1455 {
1456 emit_sop(c, inst, BRW_CONDITIONAL_G);
1457 }
1458
1459 static void emit_sge(struct brw_wm_compile *c,
1460 const struct prog_instruction *inst)
1461 {
1462 emit_sop(c, inst, BRW_CONDITIONAL_GE);
1463 }
1464
1465 static void emit_seq(struct brw_wm_compile *c,
1466 const struct prog_instruction *inst)
1467 {
1468 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1469 }
1470
1471 static void emit_sne(struct brw_wm_compile *c,
1472 const struct prog_instruction *inst)
1473 {
1474 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1475 }
1476
1477 static void emit_ddx(struct brw_wm_compile *c,
1478 const struct prog_instruction *inst)
1479 {
1480 struct brw_compile *p = &c->func;
1481 GLuint mask = inst->DstReg.WriteMask;
1482 struct brw_reg interp[4];
1483 struct brw_reg dst;
1484 struct brw_reg src0, w;
1485 GLuint nr, i;
1486 src0 = get_src_reg(c, inst, 0, 0);
1487 w = get_src_reg(c, inst, 1, 3);
1488 nr = src0.nr;
1489 interp[0] = brw_vec1_grf(nr, 0);
1490 interp[1] = brw_vec1_grf(nr, 4);
1491 interp[2] = brw_vec1_grf(nr+1, 0);
1492 interp[3] = brw_vec1_grf(nr+1, 4);
1493 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1494 for(i = 0; i < 4; i++ ) {
1495 if (mask & (1<<i)) {
1496 dst = get_dst_reg(c, inst, i);
1497 brw_MOV(p, dst, interp[i]);
1498 brw_MUL(p, dst, dst, w);
1499 }
1500 }
1501 brw_set_saturate(p, 0);
1502 }
1503
1504 static void emit_ddy(struct brw_wm_compile *c,
1505 const struct prog_instruction *inst)
1506 {
1507 struct brw_compile *p = &c->func;
1508 GLuint mask = inst->DstReg.WriteMask;
1509 struct brw_reg interp[4];
1510 struct brw_reg dst;
1511 struct brw_reg src0, w;
1512 GLuint nr, i;
1513
1514 src0 = get_src_reg(c, inst, 0, 0);
1515 nr = src0.nr;
1516 w = get_src_reg(c, inst, 1, 3);
1517 interp[0] = brw_vec1_grf(nr, 0);
1518 interp[1] = brw_vec1_grf(nr, 4);
1519 interp[2] = brw_vec1_grf(nr+1, 0);
1520 interp[3] = brw_vec1_grf(nr+1, 4);
1521 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1522 for(i = 0; i < 4; i++ ) {
1523 if (mask & (1<<i)) {
1524 dst = get_dst_reg(c, inst, i);
1525 brw_MOV(p, dst, suboffset(interp[i], 1));
1526 brw_MUL(p, dst, dst, w);
1527 }
1528 }
1529 brw_set_saturate(p, 0);
1530 }
1531
1532 static INLINE struct brw_reg high_words( struct brw_reg reg )
1533 {
1534 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1535 0, 8, 2 );
1536 }
1537
1538 static INLINE struct brw_reg low_words( struct brw_reg reg )
1539 {
1540 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1541 }
1542
1543 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1544 {
1545 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1546 }
1547
1548 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1549 {
1550 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1551 0, 16, 2 );
1552 }
1553
1554 /* One-, two- and three-dimensional Perlin noise, similar to the description
1555 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1556 static void noise1_sub( struct brw_wm_compile *c ) {
1557
1558 struct brw_compile *p = &c->func;
1559 struct brw_reg param,
1560 x0, x1, /* gradients at each end */
1561 t, tmp[ 2 ], /* float temporaries */
1562 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1563 int i;
1564 int mark = mark_tmps( c );
1565
1566 x0 = alloc_tmp( c );
1567 x1 = alloc_tmp( c );
1568 t = alloc_tmp( c );
1569 tmp[ 0 ] = alloc_tmp( c );
1570 tmp[ 1 ] = alloc_tmp( c );
1571 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1572 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1573 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1574 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1575 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1576
1577 param = lookup_tmp( c, mark - 2 );
1578
1579 brw_set_access_mode( p, BRW_ALIGN_1 );
1580
1581 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1582
1583 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1584 be hashed. Also compute the remainder (offset within the unit
1585 length), interleaved to reduce register dependency penalties. */
1586 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1587 brw_FRC( p, param, param );
1588 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1589 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1590 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1591
1592 /* We're now ready to perform the hashing. The two hashes are
1593 interleaved for performance. The hash function used is
1594 designed to rapidly achieve avalanche and require only 32x16
1595 bit multiplication, and 16-bit swizzles (which we get for
1596 free). We can't use immediate operands in the multiplies,
1597 because immediates are permitted only in src1 and the 16-bit
1598 factor is permitted only in src0. */
1599 for( i = 0; i < 2; i++ )
1600 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1601 for( i = 0; i < 2; i++ )
1602 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1603 high_words( itmp[ i ] ) );
1604 for( i = 0; i < 2; i++ )
1605 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1606 for( i = 0; i < 2; i++ )
1607 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1608 high_words( itmp[ i ] ) );
1609 for( i = 0; i < 2; i++ )
1610 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1611 for( i = 0; i < 2; i++ )
1612 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1613 high_words( itmp[ i ] ) );
1614
1615 /* Now we want to initialise the two gradients based on the
1616 hashes. Format conversion from signed integer to float leaves
1617 everything scaled too high by a factor of pow( 2, 31 ), but
1618 we correct for that right at the end. */
1619 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1620 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1621 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1622
1623 brw_MUL( p, x0, x0, param );
1624 brw_MUL( p, x1, x1, t );
1625
1626 /* We interpolate between the gradients using the polynomial
1627 6t^5 - 15t^4 + 10t^3 (Perlin). */
1628 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1629 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1630 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1631 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1632 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1633 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1634 pipeline */
1635 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1636 brw_MUL( p, param, tmp[ 0 ], param );
1637 brw_MUL( p, x1, x1, param );
1638 brw_ADD( p, x0, x0, x1 );
1639 /* scale by pow( 2, -30 ), to compensate for the format conversion
1640 above and an extra factor of 2 so that a single gradient covers
1641 the [-1,1] range */
1642 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1643
1644 release_tmps( c, mark );
1645 }
1646
1647 static void emit_noise1( struct brw_wm_compile *c,
1648 const struct prog_instruction *inst )
1649 {
1650 struct brw_compile *p = &c->func;
1651 struct brw_reg src, param, dst;
1652 GLuint mask = inst->DstReg.WriteMask;
1653 int i;
1654 int mark = mark_tmps( c );
1655
1656 assert( mark == 0 );
1657
1658 src = get_src_reg( c, inst, 0, 0 );
1659
1660 param = alloc_tmp( c );
1661
1662 brw_MOV( p, param, src );
1663
1664 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1665
1666 /* Fill in the result: */
1667 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1668 for (i = 0 ; i < 4; i++) {
1669 if (mask & (1<<i)) {
1670 dst = get_dst_reg(c, inst, i);
1671 brw_MOV( p, dst, param );
1672 }
1673 }
1674 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1675 brw_set_saturate( p, 0 );
1676
1677 release_tmps( c, mark );
1678 }
1679
1680 static void noise2_sub( struct brw_wm_compile *c ) {
1681
1682 struct brw_compile *p = &c->func;
1683 struct brw_reg param0, param1,
1684 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1685 t, tmp[ 4 ], /* float temporaries */
1686 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1687 int i;
1688 int mark = mark_tmps( c );
1689
1690 x0y0 = alloc_tmp( c );
1691 x0y1 = alloc_tmp( c );
1692 x1y0 = alloc_tmp( c );
1693 x1y1 = alloc_tmp( c );
1694 t = alloc_tmp( c );
1695 for( i = 0; i < 4; i++ ) {
1696 tmp[ i ] = alloc_tmp( c );
1697 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1698 }
1699 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1700 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1701 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1702
1703 param0 = lookup_tmp( c, mark - 3 );
1704 param1 = lookup_tmp( c, mark - 2 );
1705
1706 brw_set_access_mode( p, BRW_ALIGN_1 );
1707
1708 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1709 be hashed. Also compute the remainders (offsets within the unit
1710 square), interleaved to reduce register dependency penalties. */
1711 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1712 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1713 brw_FRC( p, param0, param0 );
1714 brw_FRC( p, param1, param1 );
1715 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1716 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1717 low_words( itmp[ 1 ] ) );
1718 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1719 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1720 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1721 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1722 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1723
1724 /* We're now ready to perform the hashing. The four hashes are
1725 interleaved for performance. The hash function used is
1726 designed to rapidly achieve avalanche and require only 32x16
1727 bit multiplication, and 16-bit swizzles (which we get for
1728 free). We can't use immediate operands in the multiplies,
1729 because immediates are permitted only in src1 and the 16-bit
1730 factor is permitted only in src0. */
1731 for( i = 0; i < 4; i++ )
1732 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1733 for( i = 0; i < 4; i++ )
1734 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1735 high_words( itmp[ i ] ) );
1736 for( i = 0; i < 4; i++ )
1737 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1738 for( i = 0; i < 4; i++ )
1739 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1740 high_words( itmp[ i ] ) );
1741 for( i = 0; i < 4; i++ )
1742 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1743 for( i = 0; i < 4; i++ )
1744 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1745 high_words( itmp[ i ] ) );
1746
1747 /* Now we want to initialise the four gradients based on the
1748 hashes. Format conversion from signed integer to float leaves
1749 everything scaled too high by a factor of pow( 2, 15 ), but
1750 we correct for that right at the end. */
1751 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1752 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1753 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1754 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1755 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1756
1757 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1758 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1759 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1760 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1761
1762 brw_MUL( p, x1y0, x1y0, t );
1763 brw_MUL( p, x1y1, x1y1, t );
1764 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1765 brw_MUL( p, x0y0, x0y0, param0 );
1766 brw_MUL( p, x0y1, x0y1, param0 );
1767
1768 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1769 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1770 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1771 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1772
1773 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1774 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1775 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1776 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1777
1778 /* We interpolate between the gradients using the polynomial
1779 6t^5 - 15t^4 + 10t^3 (Perlin). */
1780 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1781 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1782 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1783 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1784 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1785 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1786 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1787 pipeline */
1788 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1789 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1790 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1791 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1792 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1793 pipeline */
1794 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1795 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1796 brw_MUL( p, param0, tmp[ 0 ], param0 );
1797 brw_MUL( p, param1, tmp[ 1 ], param1 );
1798
1799 /* Here we interpolate in the y dimension... */
1800 brw_MUL( p, x0y1, x0y1, param1 );
1801 brw_MUL( p, x1y1, x1y1, param1 );
1802 brw_ADD( p, x0y0, x0y0, x0y1 );
1803 brw_ADD( p, x1y0, x1y0, x1y1 );
1804
1805 /* And now in x. There are horrible register dependencies here,
1806 but we have nothing else to do. */
1807 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1808 brw_MUL( p, x1y0, x1y0, param0 );
1809 brw_ADD( p, x0y0, x0y0, x1y0 );
1810
1811 /* scale by pow( 2, -15 ), as described above */
1812 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1813
1814 release_tmps( c, mark );
1815 }
1816
1817 static void emit_noise2( struct brw_wm_compile *c,
1818 const struct prog_instruction *inst )
1819 {
1820 struct brw_compile *p = &c->func;
1821 struct brw_reg src0, src1, param0, param1, dst;
1822 GLuint mask = inst->DstReg.WriteMask;
1823 int i;
1824 int mark = mark_tmps( c );
1825
1826 assert( mark == 0 );
1827
1828 src0 = get_src_reg( c, inst, 0, 0 );
1829 src1 = get_src_reg( c, inst, 0, 1 );
1830
1831 param0 = alloc_tmp( c );
1832 param1 = alloc_tmp( c );
1833
1834 brw_MOV( p, param0, src0 );
1835 brw_MOV( p, param1, src1 );
1836
1837 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1838
1839 /* Fill in the result: */
1840 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1841 for (i = 0 ; i < 4; i++) {
1842 if (mask & (1<<i)) {
1843 dst = get_dst_reg(c, inst, i);
1844 brw_MOV( p, dst, param0 );
1845 }
1846 }
1847 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1848 brw_set_saturate( p, 0 );
1849
1850 release_tmps( c, mark );
1851 }
1852
1853 /**
1854 * The three-dimensional case is much like the one- and two- versions above,
1855 * but since the number of corners is rapidly growing we now pack 16 16-bit
1856 * hashes into each register to extract more parallelism from the EUs.
1857 */
1858 static void noise3_sub( struct brw_wm_compile *c ) {
1859
1860 struct brw_compile *p = &c->func;
1861 struct brw_reg param0, param1, param2,
1862 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1863 xi, yi, zi, /* interpolation coefficients */
1864 t, tmp[ 8 ], /* float temporaries */
1865 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1866 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1867 int i;
1868 int mark = mark_tmps( c );
1869
1870 x0y0 = alloc_tmp( c );
1871 x0y1 = alloc_tmp( c );
1872 x1y0 = alloc_tmp( c );
1873 x1y1 = alloc_tmp( c );
1874 xi = alloc_tmp( c );
1875 yi = alloc_tmp( c );
1876 zi = alloc_tmp( c );
1877 t = alloc_tmp( c );
1878 for( i = 0; i < 8; i++ ) {
1879 tmp[ i ] = alloc_tmp( c );
1880 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1881 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1882 }
1883
1884 param0 = lookup_tmp( c, mark - 4 );
1885 param1 = lookup_tmp( c, mark - 3 );
1886 param2 = lookup_tmp( c, mark - 2 );
1887
1888 brw_set_access_mode( p, BRW_ALIGN_1 );
1889
1890 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1891 be hashed. Also compute the remainders (offsets within the unit
1892 cube), interleaved to reduce register dependency penalties. */
1893 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1894 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1895 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1896 brw_FRC( p, param0, param0 );
1897 brw_FRC( p, param1, param1 );
1898 brw_FRC( p, param2, param2 );
1899 /* Since we now have only 16 bits of precision in the hash, we must
1900 be more careful about thorough mixing to maintain entropy as we
1901 squash the input vector into a small scalar. */
1902 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1903 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1904 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1905 brw_imm_uw( 0x9B93 ) );
1906 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1907 brw_imm_uw( 0xBC8F ) );
1908
1909 /* Temporarily disable the execution mask while we work with ExecSize=16
1910 channels (the mask is set for ExecSize=8 and is probably incorrect).
1911 Although this might cause execution of unwanted channels, the code
1912 writes only to temporary registers and has no side effects, so
1913 disabling the mask is harmless. */
1914 brw_push_insn_state( p );
1915 brw_set_mask_control( p, BRW_MASK_DISABLE );
1916 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1917 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1918 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1919
1920 /* We're now ready to perform the hashing. The eight hashes are
1921 interleaved for performance. The hash function used is
1922 designed to rapidly achieve avalanche and require only 16x16
1923 bit multiplication, and 8-bit swizzles (which we get for
1924 free). */
1925 for( i = 0; i < 4; i++ )
1926 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1927 for( i = 0; i < 4; i++ )
1928 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1929 odd_bytes( wtmp[ i ] ) );
1930 for( i = 0; i < 4; i++ )
1931 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1932 for( i = 0; i < 4; i++ )
1933 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1934 odd_bytes( wtmp[ i ] ) );
1935 brw_pop_insn_state( p );
1936
1937 /* Now we want to initialise the four rear gradients based on the
1938 hashes. Format conversion from signed integer to float leaves
1939 everything scaled too high by a factor of pow( 2, 15 ), but
1940 we correct for that right at the end. */
1941 /* x component */
1942 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1943 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1944 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1945 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1946 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1947
1948 brw_push_insn_state( p );
1949 brw_set_mask_control( p, BRW_MASK_DISABLE );
1950 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1951 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1952 brw_pop_insn_state( p );
1953
1954 brw_MUL( p, x1y0, x1y0, t );
1955 brw_MUL( p, x1y1, x1y1, t );
1956 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1957 brw_MUL( p, x0y0, x0y0, param0 );
1958 brw_MUL( p, x0y1, x0y1, param0 );
1959
1960 /* y component */
1961 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1962 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1963 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1964 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1965
1966 brw_push_insn_state( p );
1967 brw_set_mask_control( p, BRW_MASK_DISABLE );
1968 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1969 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1970 brw_pop_insn_state( p );
1971
1972 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1973 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1974 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1975 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1976 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1977
1978 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1979 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1980 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1981 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1982
1983 /* z component */
1984 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1985 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1986 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1987 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1988
1989 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1990 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1991 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1992 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1993
1994 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1995 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1996 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1997 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1998
1999 /* We interpolate between the gradients using the polynomial
2000 6t^5 - 15t^4 + 10t^3 (Perlin). */
2001 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
2002 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
2003 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
2004 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
2005 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
2006 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
2007 brw_MUL( p, xi, xi, param0 );
2008 brw_MUL( p, yi, yi, param1 );
2009 brw_MUL( p, zi, zi, param2 );
2010 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
2011 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
2012 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
2013 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
2014 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
2015 brw_MUL( p, xi, xi, param0 );
2016 brw_MUL( p, yi, yi, param1 );
2017 brw_MUL( p, zi, zi, param2 );
2018 brw_MUL( p, xi, xi, param0 );
2019 brw_MUL( p, yi, yi, param1 );
2020 brw_MUL( p, zi, zi, param2 );
2021 brw_MUL( p, xi, xi, param0 );
2022 brw_MUL( p, yi, yi, param1 );
2023 brw_MUL( p, zi, zi, param2 );
2024
2025 /* Here we interpolate in the y dimension... */
2026 brw_MUL( p, x0y1, x0y1, yi );
2027 brw_MUL( p, x1y1, x1y1, yi );
2028 brw_ADD( p, x0y0, x0y0, x0y1 );
2029 brw_ADD( p, x1y0, x1y0, x1y1 );
2030
2031 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2032 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2033 brw_MUL( p, x1y0, x1y0, xi );
2034 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2035
2036 /* Now do the same thing for the front four gradients... */
2037 /* x component */
2038 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2039 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2040 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2041 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2042
2043 brw_push_insn_state( p );
2044 brw_set_mask_control( p, BRW_MASK_DISABLE );
2045 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2046 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2047 brw_pop_insn_state( p );
2048
2049 brw_MUL( p, x1y0, x1y0, t );
2050 brw_MUL( p, x1y1, x1y1, t );
2051 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
2052 brw_MUL( p, x0y0, x0y0, param0 );
2053 brw_MUL( p, x0y1, x0y1, param0 );
2054
2055 /* y component */
2056 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2057 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2058 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2059 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2060
2061 brw_push_insn_state( p );
2062 brw_set_mask_control( p, BRW_MASK_DISABLE );
2063 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2064 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2065 brw_pop_insn_state( p );
2066
2067 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2068 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2069 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
2070 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
2071 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
2072
2073 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2074 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2075 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2076 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2077
2078 /* z component */
2079 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2080 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2081 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2082 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2083
2084 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2085 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2086 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2087 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2088
2089 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2090 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2091 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2092 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2093
2094 /* The interpolation coefficients are still around from last time, so
2095 again interpolate in the y dimension... */
2096 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2097 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2098 brw_MUL( p, x0y1, x0y1, yi );
2099 brw_MUL( p, x1y1, x1y1, yi );
2100 brw_ADD( p, x0y0, x0y0, x0y1 );
2101 brw_ADD( p, x1y0, x1y0, x1y1 );
2102
2103 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2104 time put the front face in tmp[ 1 ] and we're nearly there... */
2105 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2106 brw_MUL( p, x1y0, x1y0, xi );
2107 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2108
2109 /* The final interpolation, in the z dimension: */
2110 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2111 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
2112 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2113
2114 /* scale by pow( 2, -15 ), as described above */
2115 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2116
2117 release_tmps( c, mark );
2118 }
2119
2120 static void emit_noise3( struct brw_wm_compile *c,
2121 const struct prog_instruction *inst )
2122 {
2123 struct brw_compile *p = &c->func;
2124 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
2125 GLuint mask = inst->DstReg.WriteMask;
2126 int i;
2127 int mark = mark_tmps( c );
2128
2129 assert( mark == 0 );
2130
2131 src0 = get_src_reg( c, inst, 0, 0 );
2132 src1 = get_src_reg( c, inst, 0, 1 );
2133 src2 = get_src_reg( c, inst, 0, 2 );
2134
2135 param0 = alloc_tmp( c );
2136 param1 = alloc_tmp( c );
2137 param2 = alloc_tmp( c );
2138
2139 brw_MOV( p, param0, src0 );
2140 brw_MOV( p, param1, src1 );
2141 brw_MOV( p, param2, src2 );
2142
2143 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
2144
2145 /* Fill in the result: */
2146 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2147 for (i = 0 ; i < 4; i++) {
2148 if (mask & (1<<i)) {
2149 dst = get_dst_reg(c, inst, i);
2150 brw_MOV( p, dst, param0 );
2151 }
2152 }
2153 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2154 brw_set_saturate( p, 0 );
2155
2156 release_tmps( c, mark );
2157 }
2158
2159 /**
2160 * For the four-dimensional case, the little micro-optimisation benefits
2161 * we obtain by unrolling all the loops aren't worth the massive bloat it
2162 * now causes. Instead, we loop twice around performing a similar operation
2163 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
2164 * code to glue it all together.
2165 */
2166 static void noise4_sub( struct brw_wm_compile *c )
2167 {
2168 struct brw_compile *p = &c->func;
2169 struct brw_reg param[ 4 ],
2170 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
2171 w0, /* noise for the w=0 cube */
2172 floors[ 2 ], /* integer coordinates of base corner of hypercube */
2173 interp[ 4 ], /* interpolation coefficients */
2174 t, tmp[ 8 ], /* float temporaries */
2175 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
2176 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
2177 int i, j;
2178 int mark = mark_tmps( c );
2179 GLuint loop, origin;
2180
2181 x0y0 = alloc_tmp( c );
2182 x0y1 = alloc_tmp( c );
2183 x1y0 = alloc_tmp( c );
2184 x1y1 = alloc_tmp( c );
2185 t = alloc_tmp( c );
2186 w0 = alloc_tmp( c );
2187 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2188 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2189
2190 for( i = 0; i < 4; i++ ) {
2191 param[ i ] = lookup_tmp( c, mark - 5 + i );
2192 interp[ i ] = alloc_tmp( c );
2193 }
2194
2195 for( i = 0; i < 8; i++ ) {
2196 tmp[ i ] = alloc_tmp( c );
2197 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
2198 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
2199 }
2200
2201 brw_set_access_mode( p, BRW_ALIGN_1 );
2202
2203 /* We only want 16 bits of precision from the integral part of each
2204 co-ordinate, but unfortunately the RNDD semantics would saturate
2205 at 16 bits if we performed the operation directly to a 16-bit
2206 destination. Therefore, we round to 32-bit temporaries where
2207 appropriate, and then store only the lower 16 bits. */
2208 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
2209 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
2210 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
2211 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
2212 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
2213 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
2214
2215 /* Modify the flag register here, because the side effect is useful
2216 later (see below). We know for certain that all flags will be
2217 cleared, since the FRC instruction cannot possibly generate
2218 negative results. Even for exceptional inputs (infinities, denormals,
2219 NaNs), the architecture guarantees that the L conditional is false. */
2220 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
2221 brw_FRC( p, param[ 0 ], param[ 0 ] );
2222 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2223 for( i = 1; i < 4; i++ )
2224 brw_FRC( p, param[ i ], param[ i ] );
2225
2226 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
2227 of all. */
2228 for( i = 0; i < 4; i++ )
2229 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
2230 for( i = 0; i < 4; i++ )
2231 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
2232 for( i = 0; i < 4; i++ )
2233 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2234 for( i = 0; i < 4; i++ )
2235 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
2236 for( j = 0; j < 3; j++ )
2237 for( i = 0; i < 4; i++ )
2238 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2239
2240 /* Mark the current address, as it will be a jump destination. The
2241 following code will be executed twice: first, with the flag
2242 register clear indicating the w=0 case, and second with flags
2243 set for w=1. */
2244 loop = p->nr_insn;
2245
2246 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
2247 be hashed. Since we have only 16 bits of precision in the hash, we
2248 must be careful about thorough mixing to maintain entropy as we
2249 squash the input vector into a small scalar. */
2250 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
2251 brw_imm_uw( 0xBC8F ) );
2252 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
2253 brw_imm_uw( 0xD0BD ) );
2254 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
2255 brw_imm_uw( 0x9B93 ) );
2256 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
2257 brw_imm_uw( 0xA359 ) );
2258 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
2259 brw_imm_uw( 0xBC8F ) );
2260
2261 /* Temporarily disable the execution mask while we work with ExecSize=16
2262 channels (the mask is set for ExecSize=8 and is probably incorrect).
2263 Although this might cause execution of unwanted channels, the code
2264 writes only to temporary registers and has no side effects, so
2265 disabling the mask is harmless. */
2266 brw_push_insn_state( p );
2267 brw_set_mask_control( p, BRW_MASK_DISABLE );
2268 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2269 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2270 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2271
2272 /* We're now ready to perform the hashing. The eight hashes are
2273 interleaved for performance. The hash function used is
2274 designed to rapidly achieve avalanche and require only 16x16
2275 bit multiplication, and 8-bit swizzles (which we get for
2276 free). */
2277 for( i = 0; i < 4; i++ )
2278 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2279 for( i = 0; i < 4; i++ )
2280 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2281 odd_bytes( wtmp[ i ] ) );
2282 for( i = 0; i < 4; i++ )
2283 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2284 for( i = 0; i < 4; i++ )
2285 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2286 odd_bytes( wtmp[ i ] ) );
2287 brw_pop_insn_state( p );
2288
2289 /* Now we want to initialise the four rear gradients based on the
2290 hashes. Format conversion from signed integer to float leaves
2291 everything scaled too high by a factor of pow( 2, 15 ), but
2292 we correct for that right at the end. */
2293 /* x component */
2294 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2295 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2296 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2297 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2298 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2299
2300 brw_push_insn_state( p );
2301 brw_set_mask_control( p, BRW_MASK_DISABLE );
2302 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2303 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2304 brw_pop_insn_state( p );
2305
2306 brw_MUL( p, x1y0, x1y0, t );
2307 brw_MUL( p, x1y1, x1y1, t );
2308 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2309 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2310 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2311
2312 /* y component */
2313 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2314 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2315 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2316 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2317
2318 brw_push_insn_state( p );
2319 brw_set_mask_control( p, BRW_MASK_DISABLE );
2320 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2321 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2322 brw_pop_insn_state( p );
2323
2324 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2325 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2326 /* prepare t for the w component (used below): w the first time through
2327 the loop; w - 1 the second time) */
2328 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2329 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2330 p->current->header.predicate_inverse = 1;
2331 brw_MOV( p, t, param[ 3 ] );
2332 p->current->header.predicate_inverse = 0;
2333 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2334 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2335 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2336
2337 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2338 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2339 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2340 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2341
2342 /* z component */
2343 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2344 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2345 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2346 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2347
2348 brw_push_insn_state( p );
2349 brw_set_mask_control( p, BRW_MASK_DISABLE );
2350 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2351 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2352 brw_pop_insn_state( p );
2353
2354 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2355 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2356 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2357 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2358
2359 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2360 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2361 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2362 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2363
2364 /* w component */
2365 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2366 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2367 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2368 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2369
2370 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2371 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2372 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2373 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2374 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2375
2376 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2377 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2378 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2379 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2380
2381 /* Here we interpolate in the y dimension... */
2382 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2383 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2384 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2385 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2386 brw_ADD( p, x0y0, x0y0, x0y1 );
2387 brw_ADD( p, x1y0, x1y0, x1y1 );
2388
2389 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2390 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2391 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2392 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2393
2394 /* Now do the same thing for the front four gradients... */
2395 /* x component */
2396 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2397 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2398 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2399 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2400
2401 brw_push_insn_state( p );
2402 brw_set_mask_control( p, BRW_MASK_DISABLE );
2403 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2404 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2405 brw_pop_insn_state( p );
2406
2407 brw_MUL( p, x1y0, x1y0, t );
2408 brw_MUL( p, x1y1, x1y1, t );
2409 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2410 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2411 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2412
2413 /* y component */
2414 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2415 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2416 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2417 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2418
2419 brw_push_insn_state( p );
2420 brw_set_mask_control( p, BRW_MASK_DISABLE );
2421 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2422 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2423 brw_pop_insn_state( p );
2424
2425 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2426 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2427 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2428 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2429 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2430
2431 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2432 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2433 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2434 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2435
2436 /* z component */
2437 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2438 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2439 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2440 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2441
2442 brw_push_insn_state( p );
2443 brw_set_mask_control( p, BRW_MASK_DISABLE );
2444 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2445 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2446 brw_pop_insn_state( p );
2447
2448 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2449 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2450 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2451 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2452 /* prepare t for the w component (used below): w the first time through
2453 the loop; w - 1 the second time) */
2454 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2455 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2456 p->current->header.predicate_inverse = 1;
2457 brw_MOV( p, t, param[ 3 ] );
2458 p->current->header.predicate_inverse = 0;
2459 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2460
2461 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2462 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2463 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2464 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2465
2466 /* w component */
2467 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2468 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2469 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2470 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2471
2472 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2473 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2474 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2475 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2476
2477 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2478 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2479 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2480 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2481
2482 /* Interpolate in the y dimension: */
2483 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2484 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2485 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2486 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2487 brw_ADD( p, x0y0, x0y0, x0y1 );
2488 brw_ADD( p, x1y0, x1y0, x1y1 );
2489
2490 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2491 time put the front face in tmp[ 1 ] and we're nearly there... */
2492 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2493 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2494 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2495
2496 /* Another interpolation, in the z dimension: */
2497 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2498 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2499 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2500
2501 /* Exit the loop if we've computed both cubes... */
2502 origin = p->nr_insn;
2503 brw_push_insn_state( p );
2504 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2505 brw_set_mask_control( p, BRW_MASK_DISABLE );
2506 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2507 brw_pop_insn_state( p );
2508
2509 /* Save the result for the w=0 case, and increment the w coordinate: */
2510 brw_MOV( p, w0, tmp[ 0 ] );
2511 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2512 brw_imm_uw( 1 ) );
2513
2514 /* Loop around for the other cube. Explicitly set the flag register
2515 (unfortunately we must spend an extra instruction to do this: we
2516 can't rely on a side effect of the previous MOV or ADD because
2517 conditional modifiers which are normally true might be false in
2518 exceptional circumstances, e.g. given a NaN input; the add to
2519 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2520 brw_push_insn_state( p );
2521 brw_set_mask_control( p, BRW_MASK_DISABLE );
2522 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2523 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2524 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2525 brw_pop_insn_state( p );
2526
2527 /* Patch the previous conditional branch now that we know the
2528 destination address. */
2529 brw_set_src1( p->store + origin,
2530 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2531
2532 /* The very last interpolation. */
2533 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2534 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2535 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2536
2537 /* scale by pow( 2, -15 ), as described above */
2538 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2539
2540 release_tmps( c, mark );
2541 }
2542
2543 static void emit_noise4( struct brw_wm_compile *c,
2544 const struct prog_instruction *inst )
2545 {
2546 struct brw_compile *p = &c->func;
2547 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2548 GLuint mask = inst->DstReg.WriteMask;
2549 int i;
2550 int mark = mark_tmps( c );
2551
2552 assert( mark == 0 );
2553
2554 src0 = get_src_reg( c, inst, 0, 0 );
2555 src1 = get_src_reg( c, inst, 0, 1 );
2556 src2 = get_src_reg( c, inst, 0, 2 );
2557 src3 = get_src_reg( c, inst, 0, 3 );
2558
2559 param0 = alloc_tmp( c );
2560 param1 = alloc_tmp( c );
2561 param2 = alloc_tmp( c );
2562 param3 = alloc_tmp( c );
2563
2564 brw_MOV( p, param0, src0 );
2565 brw_MOV( p, param1, src1 );
2566 brw_MOV( p, param2, src2 );
2567 brw_MOV( p, param3, src3 );
2568
2569 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2570
2571 /* Fill in the result: */
2572 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2573 for (i = 0 ; i < 4; i++) {
2574 if (mask & (1<<i)) {
2575 dst = get_dst_reg(c, inst, i);
2576 brw_MOV( p, dst, param0 );
2577 }
2578 }
2579 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2580 brw_set_saturate( p, 0 );
2581
2582 release_tmps( c, mark );
2583 }
2584
2585 static void emit_wpos_xy(struct brw_wm_compile *c,
2586 const struct prog_instruction *inst)
2587 {
2588 struct brw_compile *p = &c->func;
2589 GLuint mask = inst->DstReg.WriteMask;
2590 struct brw_reg src0[2], dst[2];
2591
2592 dst[0] = get_dst_reg(c, inst, 0);
2593 dst[1] = get_dst_reg(c, inst, 1);
2594
2595 src0[0] = get_src_reg(c, inst, 0, 0);
2596 src0[1] = get_src_reg(c, inst, 0, 1);
2597
2598 /* Calculate the pixel offset from window bottom left into destination
2599 * X and Y channels.
2600 */
2601 if (mask & WRITEMASK_X) {
2602 /* X' = X - origin_x */
2603 brw_ADD(p,
2604 dst[0],
2605 retype(src0[0], BRW_REGISTER_TYPE_W),
2606 brw_imm_d(0 - c->key.origin_x));
2607 }
2608
2609 if (mask & WRITEMASK_Y) {
2610 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2611 brw_ADD(p,
2612 dst[1],
2613 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2614 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2615 }
2616 }
2617
2618 /* TODO
2619 BIAS on SIMD8 not working yet...
2620 */
2621 static void emit_txb(struct brw_wm_compile *c,
2622 const struct prog_instruction *inst)
2623 {
2624 struct brw_compile *p = &c->func;
2625 struct brw_reg dst[4], src[4], payload_reg;
2626 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2627 const GLuint unit = inst->TexSrcUnit;
2628 GLuint i;
2629 GLuint msg_type;
2630
2631 assert(unit < BRW_MAX_TEX_UNIT);
2632
2633 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2634
2635 for (i = 0; i < 4; i++)
2636 dst[i] = get_dst_reg(c, inst, i);
2637 for (i = 0; i < 4; i++)
2638 src[i] = get_src_reg(c, inst, 0, i);
2639
2640 switch (inst->TexSrcTarget) {
2641 case TEXTURE_1D_INDEX:
2642 brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
2643 brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
2644 brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
2645 break;
2646 case TEXTURE_2D_INDEX:
2647 case TEXTURE_RECT_INDEX:
2648 brw_MOV(p, brw_message_reg(2), src[0]);
2649 brw_MOV(p, brw_message_reg(3), src[1]);
2650 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2651 break;
2652 case TEXTURE_3D_INDEX:
2653 case TEXTURE_CUBE_INDEX:
2654 brw_MOV(p, brw_message_reg(2), src[0]);
2655 brw_MOV(p, brw_message_reg(3), src[1]);
2656 brw_MOV(p, brw_message_reg(4), src[2]);
2657 break;
2658 default:
2659 /* invalid target */
2660 abort();
2661 }
2662 brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
2663 brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
2664
2665 if (BRW_IS_IGDNG(p->brw)) {
2666 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
2667 } else {
2668 /* Does it work well on SIMD8? */
2669 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2670 }
2671
2672 brw_SAMPLE(p,
2673 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2674 1, /* msg_reg_nr */
2675 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2676 SURF_INDEX_TEXTURE(unit),
2677 unit, /* sampler */
2678 inst->DstReg.WriteMask, /* writemask */
2679 msg_type, /* msg_type */
2680 4, /* response_length */
2681 4, /* msg_length */
2682 0, /* eot */
2683 1,
2684 BRW_SAMPLER_SIMD_MODE_SIMD8);
2685 }
2686
2687
2688 static void emit_tex(struct brw_wm_compile *c,
2689 const struct prog_instruction *inst)
2690 {
2691 struct brw_compile *p = &c->func;
2692 struct brw_reg dst[4], src[4], payload_reg;
2693 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2694 const GLuint unit = inst->TexSrcUnit;
2695 GLuint msg_len;
2696 GLuint i, nr;
2697 GLuint emit;
2698 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2699 GLuint msg_type;
2700
2701 assert(unit < BRW_MAX_TEX_UNIT);
2702
2703 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2704
2705 for (i = 0; i < 4; i++)
2706 dst[i] = get_dst_reg(c, inst, i);
2707 for (i = 0; i < 4; i++)
2708 src[i] = get_src_reg(c, inst, 0, i);
2709
2710 switch (inst->TexSrcTarget) {
2711 case TEXTURE_1D_INDEX:
2712 emit = WRITEMASK_X;
2713 nr = 1;
2714 break;
2715 case TEXTURE_2D_INDEX:
2716 case TEXTURE_RECT_INDEX:
2717 emit = WRITEMASK_XY;
2718 nr = 2;
2719 break;
2720 case TEXTURE_3D_INDEX:
2721 case TEXTURE_CUBE_INDEX:
2722 emit = WRITEMASK_XYZ;
2723 nr = 3;
2724 break;
2725 default:
2726 /* invalid target */
2727 abort();
2728 }
2729 msg_len = 1;
2730
2731 /* move/load S, T, R coords */
2732 for (i = 0; i < nr; i++) {
2733 static const GLuint swz[4] = {0,1,2,2};
2734 if (emit & (1<<i))
2735 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2736 else
2737 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2738 msg_len += 1;
2739 }
2740
2741 if (shadow) {
2742 brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2743 brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
2744 }
2745
2746 if (BRW_IS_IGDNG(p->brw)) {
2747 if (shadow)
2748 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
2749 else
2750 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
2751 } else {
2752 /* Does it work for shadow on SIMD8 ? */
2753 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2754 }
2755
2756 brw_SAMPLE(p,
2757 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2758 1, /* msg_reg_nr */
2759 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2760 SURF_INDEX_TEXTURE(unit),
2761 unit, /* sampler */
2762 inst->DstReg.WriteMask, /* writemask */
2763 msg_type, /* msg_type */
2764 4, /* response_length */
2765 shadow ? 6 : 4, /* msg_length */
2766 0, /* eot */
2767 1,
2768 BRW_SAMPLER_SIMD_MODE_SIMD8);
2769
2770 if (shadow)
2771 brw_MOV(p, dst[3], brw_imm_f(1.0));
2772 }
2773
2774
2775 /**
2776 * Resolve subroutine calls after code emit is done.
2777 */
2778 static void post_wm_emit( struct brw_wm_compile *c )
2779 {
2780 brw_resolve_cals(&c->func);
2781 }
2782
2783 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2784 {
2785 #define MAX_IF_DEPTH 32
2786 #define MAX_LOOP_DEPTH 32
2787 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
2788 GLuint i, if_depth = 0, loop_depth = 0;
2789 struct brw_compile *p = &c->func;
2790 struct brw_indirect stack_index = brw_indirect(0, 0);
2791
2792 c->out_of_regs = GL_FALSE;
2793
2794 prealloc_reg(c);
2795 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2796 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2797
2798 for (i = 0; i < c->nr_fp_insns; i++) {
2799 const struct prog_instruction *inst = &c->prog_instructions[i];
2800
2801 c->cur_inst = i;
2802
2803 #if 0
2804 _mesa_printf("Inst %d: ", i);
2805 _mesa_print_instruction(inst);
2806 #endif
2807
2808 /* fetch any constants that this instruction needs */
2809 if (c->fp->use_const_buffer)
2810 fetch_constants(c, inst);
2811
2812 if (inst->CondUpdate)
2813 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2814 else
2815 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2816
2817 switch (inst->Opcode) {
2818 case WM_PIXELXY:
2819 emit_pixel_xy(c, inst);
2820 break;
2821 case WM_DELTAXY:
2822 emit_delta_xy(c, inst);
2823 break;
2824 case WM_PIXELW:
2825 emit_pixel_w(c, inst);
2826 break;
2827 case WM_LINTERP:
2828 emit_linterp(c, inst);
2829 break;
2830 case WM_PINTERP:
2831 emit_pinterp(c, inst);
2832 break;
2833 case WM_CINTERP:
2834 emit_cinterp(c, inst);
2835 break;
2836 case WM_WPOSXY:
2837 emit_wpos_xy(c, inst);
2838 break;
2839 case WM_FB_WRITE:
2840 emit_fb_write(c, inst);
2841 break;
2842 case WM_FRONTFACING:
2843 emit_frontfacing(c, inst);
2844 break;
2845 case OPCODE_ADD:
2846 emit_add(c, inst);
2847 break;
2848 case OPCODE_ARL:
2849 emit_arl(c, inst);
2850 break;
2851 case OPCODE_FRC:
2852 emit_frc(c, inst);
2853 break;
2854 case OPCODE_FLR:
2855 emit_flr(c, inst);
2856 break;
2857 case OPCODE_LRP:
2858 emit_lrp(c, inst);
2859 break;
2860 case OPCODE_TRUNC:
2861 emit_trunc(c, inst);
2862 break;
2863 case OPCODE_MOV:
2864 case OPCODE_SWZ:
2865 emit_mov(c, inst);
2866 break;
2867 case OPCODE_DP3:
2868 emit_dp3(c, inst);
2869 break;
2870 case OPCODE_DP4:
2871 emit_dp4(c, inst);
2872 break;
2873 case OPCODE_XPD:
2874 emit_xpd(c, inst);
2875 break;
2876 case OPCODE_DPH:
2877 emit_dph(c, inst);
2878 break;
2879 case OPCODE_RCP:
2880 emit_rcp(c, inst);
2881 break;
2882 case OPCODE_RSQ:
2883 emit_rsq(c, inst);
2884 break;
2885 case OPCODE_SIN:
2886 emit_sin(c, inst);
2887 break;
2888 case OPCODE_COS:
2889 emit_cos(c, inst);
2890 break;
2891 case OPCODE_EX2:
2892 emit_ex2(c, inst);
2893 break;
2894 case OPCODE_LG2:
2895 emit_lg2(c, inst);
2896 break;
2897 case OPCODE_MIN:
2898 case OPCODE_MAX:
2899 emit_min_max(c, inst);
2900 break;
2901 case OPCODE_DDX:
2902 emit_ddx(c, inst);
2903 break;
2904 case OPCODE_DDY:
2905 emit_ddy(c, inst);
2906 break;
2907 case OPCODE_SLT:
2908 emit_slt(c, inst);
2909 break;
2910 case OPCODE_SLE:
2911 emit_sle(c, inst);
2912 break;
2913 case OPCODE_SGT:
2914 emit_sgt(c, inst);
2915 break;
2916 case OPCODE_SGE:
2917 emit_sge(c, inst);
2918 break;
2919 case OPCODE_SEQ:
2920 emit_seq(c, inst);
2921 break;
2922 case OPCODE_SNE:
2923 emit_sne(c, inst);
2924 break;
2925 case OPCODE_MUL:
2926 emit_mul(c, inst);
2927 break;
2928 case OPCODE_POW:
2929 emit_pow(c, inst);
2930 break;
2931 case OPCODE_MAD:
2932 emit_mad(c, inst);
2933 break;
2934 case OPCODE_NOISE1:
2935 emit_noise1(c, inst);
2936 break;
2937 case OPCODE_NOISE2:
2938 emit_noise2(c, inst);
2939 break;
2940 case OPCODE_NOISE3:
2941 emit_noise3(c, inst);
2942 break;
2943 case OPCODE_NOISE4:
2944 emit_noise4(c, inst);
2945 break;
2946 case OPCODE_TEX:
2947 emit_tex(c, inst);
2948 break;
2949 case OPCODE_TXB:
2950 emit_txb(c, inst);
2951 break;
2952 case OPCODE_KIL_NV:
2953 emit_kil(c);
2954 break;
2955 case OPCODE_IF:
2956 assert(if_depth < MAX_IF_DEPTH);
2957 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2958 break;
2959 case OPCODE_ELSE:
2960 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
2961 break;
2962 case OPCODE_ENDIF:
2963 assert(if_depth > 0);
2964 brw_ENDIF(p, if_inst[--if_depth]);
2965 break;
2966 case OPCODE_BGNSUB:
2967 brw_save_label(p, inst->Comment, p->nr_insn);
2968 break;
2969 case OPCODE_ENDSUB:
2970 /* no-op */
2971 break;
2972 case OPCODE_CAL:
2973 brw_push_insn_state(p);
2974 brw_set_mask_control(p, BRW_MASK_DISABLE);
2975 brw_set_access_mode(p, BRW_ALIGN_1);
2976 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2977 brw_set_access_mode(p, BRW_ALIGN_16);
2978 brw_ADD(p, get_addr_reg(stack_index),
2979 get_addr_reg(stack_index), brw_imm_d(4));
2980 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2981 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2982 brw_pop_insn_state(p);
2983 break;
2984
2985 case OPCODE_RET:
2986 brw_push_insn_state(p);
2987 brw_set_mask_control(p, BRW_MASK_DISABLE);
2988 brw_ADD(p, get_addr_reg(stack_index),
2989 get_addr_reg(stack_index), brw_imm_d(-4));
2990 brw_set_access_mode(p, BRW_ALIGN_1);
2991 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2992 brw_set_access_mode(p, BRW_ALIGN_16);
2993 brw_pop_insn_state(p);
2994
2995 break;
2996 case OPCODE_BGNLOOP:
2997 /* XXX may need to invalidate the current_constant regs */
2998 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2999 break;
3000 case OPCODE_BRK:
3001 brw_BREAK(p);
3002 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3003 break;
3004 case OPCODE_CONT:
3005 brw_CONT(p);
3006 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3007 break;
3008 case OPCODE_ENDLOOP:
3009 {
3010 struct brw_instruction *inst0, *inst1;
3011 GLuint br = 1;
3012
3013 if (BRW_IS_IGDNG(brw))
3014 br = 2;
3015
3016 loop_depth--;
3017 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
3018 /* patch all the BREAK/CONT instructions from last BGNLOOP */
3019 while (inst0 > loop_inst[loop_depth]) {
3020 inst0--;
3021 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
3022 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
3023 inst0->bits3.if_else.pop_count = 0;
3024 }
3025 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
3026 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3027 inst0->bits3.if_else.pop_count = 0;
3028 }
3029 }
3030 }
3031 break;
3032 default:
3033 _mesa_printf("unsupported IR in fragment shader %d\n",
3034 inst->Opcode);
3035 }
3036
3037 if (inst->CondUpdate)
3038 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
3039 else
3040 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3041 }
3042 post_wm_emit(c);
3043
3044 if (INTEL_DEBUG & DEBUG_WM) {
3045 _mesa_printf("wm-native:\n");
3046 for (i = 0; i < p->nr_insn; i++)
3047 brw_disasm(stderr, &p->store[i]);
3048 _mesa_printf("\n");
3049 }
3050 }
3051
3052 /**
3053 * Do GPU code generation for shaders that use GLSL features such as
3054 * flow control. Other shaders will be compiled with the
3055 */
3056 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
3057 {
3058 if (INTEL_DEBUG & DEBUG_WM) {
3059 _mesa_printf("brw_wm_glsl_emit:\n");
3060 }
3061
3062 /* initial instruction translation/simplification */
3063 brw_wm_pass_fp(c);
3064
3065 /* actual code generation */
3066 brw_wm_emit_glsl(brw, c);
3067
3068 if (INTEL_DEBUG & DEBUG_WM) {
3069 brw_wm_print_program(c, "brw_wm_glsl_emit done");
3070 }
3071
3072 c->prog_data.total_grf = num_grf_used(c);
3073 c->prog_data.total_scratch = 0;
3074 }