r300g: Use radeon compiler for fragment programs
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/** Indices used to select an entry of the per-compile subroutines[] table. */
enum _subroutine {
   SUB_NOISE1,
   SUB_NOISE2,
   SUB_NOISE3,
   SUB_NOISE4
};
12
13
14 /**
15 * Determine if the given fragment program uses GLSL features such
16 * as flow conditionals, loops, subroutines.
17 * Some GLSL shaders may use these features, others might not.
18 */
19 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
20 {
21 int i;
22 for (i = 0; i < fp->Base.NumInstructions; i++) {
23 const struct prog_instruction *inst = &fp->Base.Instructions[i];
24 switch (inst->Opcode) {
25 case OPCODE_IF:
26 case OPCODE_ENDIF:
27 case OPCODE_CAL:
28 case OPCODE_BRK:
29 case OPCODE_RET:
30 case OPCODE_DDX:
31 case OPCODE_DDY:
32 case OPCODE_NOISE1:
33 case OPCODE_NOISE2:
34 case OPCODE_NOISE3:
35 case OPCODE_NOISE4:
36 case OPCODE_BGNLOOP:
37 return GL_TRUE;
38 default:
39 break;
40 }
41 }
42 return GL_FALSE;
43 }
44
45
46
/* Forward declaration: alloc_grf() calls this ahead of its definition. */
static void reclaim_temps(struct brw_wm_compile *c);
49
50
51 /** Mark GRF register as used. */
52 static void
53 prealloc_grf(struct brw_wm_compile *c, int r)
54 {
55 c->used_grf[r] = GL_TRUE;
56 }
57
58
59 /** Mark given GRF register as not in use. */
60 static void
61 release_grf(struct brw_wm_compile *c, int r)
62 {
63 /*assert(c->used_grf[r]);*/
64 c->used_grf[r] = GL_FALSE;
65 c->first_free_grf = MIN2(c->first_free_grf, r);
66 }
67
68
69 /** Return index of a free GRF, mark it as used. */
70 static int
71 alloc_grf(struct brw_wm_compile *c)
72 {
73 GLuint r;
74 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
75 if (!c->used_grf[r]) {
76 c->used_grf[r] = GL_TRUE;
77 c->first_free_grf = r + 1; /* a guess */
78 return r;
79 }
80 }
81
82 /* no free temps, try to reclaim some */
83 reclaim_temps(c);
84 c->first_free_grf = 0;
85
86 /* try alloc again */
87 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
88 if (!c->used_grf[r]) {
89 c->used_grf[r] = GL_TRUE;
90 c->first_free_grf = r + 1; /* a guess */
91 return r;
92 }
93 }
94
95 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
96 assert(c->used_grf[r]);
97 }
98
99 /* really, no free GRF regs found */
100 if (!c->out_of_regs) {
101 /* print warning once per compilation */
102 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
103 c->out_of_regs = GL_TRUE;
104 }
105
106 return -1;
107 }
108
109
110 /** Return number of GRF registers used */
111 static int
112 num_grf_used(const struct brw_wm_compile *c)
113 {
114 int r;
115 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
116 if (c->used_grf[r])
117 return r + 1;
118 return 0;
119 }
120
121
122
123 /**
124 * Record the mapping of a Mesa register to a hardware register.
125 */
126 static void set_reg(struct brw_wm_compile *c, int file, int index,
127 int component, struct brw_reg reg)
128 {
129 c->wm_regs[file][index][component].reg = reg;
130 c->wm_regs[file][index][component].inited = GL_TRUE;
131 }
132
133 /**
134 * Examine instruction's write mask to find index of first component
135 * enabled for writing.
136 */
137 static int get_scalar_dst_index(const struct prog_instruction *inst)
138 {
139 int i;
140 for (i = 0; i < 4; i++)
141 if (inst->DstReg.WriteMask & (1<<i))
142 break;
143 return i;
144 }
145
146 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
147 {
148 struct brw_reg reg;
149
150 /* if we need to allocate another temp, grow the tmp_regs[] array */
151 if (c->tmp_index == c->tmp_max) {
152 int r = alloc_grf(c);
153 if (r < 0) {
154 /*printf("Out of temps in %s\n", __FUNCTION__);*/
155 r = 50; /* XXX random register! */
156 }
157 c->tmp_regs[ c->tmp_max++ ] = r;
158 }
159
160 /* form the GRF register */
161 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
162 /*printf("alloc_temp %d\n", reg.nr);*/
163 assert(reg.nr < BRW_WM_MAX_GRF);
164 return reg;
165
166 }
167
168 /**
169 * Save current temp register info.
170 * There must be a matching call to release_tmps().
171 */
172 static int mark_tmps(struct brw_wm_compile *c)
173 {
174 return c->tmp_index;
175 }
176
177 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
178 {
179 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
180 }
181
182 static void release_tmps(struct brw_wm_compile *c, int mark)
183 {
184 c->tmp_index = mark;
185 }
186
187 /**
188 * Convert Mesa src register to brw register.
189 *
190 * Since we're running in SOA mode each Mesa register corresponds to four
191 * hardware registers. We allocate the hardware registers as needed here.
192 *
193 * \param file register file, one of PROGRAM_x
194 * \param index register number
195 * \param component src component (X=0, Y=1, Z=2, W=3)
196 * \param nr not used?!?
197 * \param neg negate value?
198 * \param abs take absolute value?
199 */
200 static struct brw_reg
201 get_reg(struct brw_wm_compile *c, int file, int index, int component,
202 int nr, GLuint neg, GLuint abs)
203 {
204 struct brw_reg reg;
205 switch (file) {
206 case PROGRAM_STATE_VAR:
207 case PROGRAM_CONSTANT:
208 case PROGRAM_UNIFORM:
209 file = PROGRAM_STATE_VAR;
210 break;
211 case PROGRAM_UNDEFINED:
212 return brw_null_reg();
213 case PROGRAM_TEMPORARY:
214 case PROGRAM_INPUT:
215 case PROGRAM_OUTPUT:
216 case PROGRAM_PAYLOAD:
217 break;
218 default:
219 _mesa_problem(NULL, "Unexpected file in get_reg()");
220 return brw_null_reg();
221 }
222
223 assert(index < 256);
224 assert(component < 4);
225
226 /* see if we've already allocated a HW register for this Mesa register */
227 if (c->wm_regs[file][index][component].inited) {
228 /* yes, re-use */
229 reg = c->wm_regs[file][index][component].reg;
230 }
231 else {
232 /* no, allocate new register */
233 int grf = alloc_grf(c);
234 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
235 if (grf < 0) {
236 /* totally out of temps */
237 grf = 51; /* XXX random register! */
238 }
239
240 reg = brw_vec8_grf(grf, 0);
241 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
242
243 set_reg(c, file, index, component, reg);
244 }
245
246 if (neg & (1 << component)) {
247 reg = negate(reg);
248 }
249 if (abs)
250 reg = brw_abs(reg);
251 return reg;
252 }
253
254
255
256 /**
257 * This is called if we run out of GRF registers. Examine the live intervals
258 * of temp regs in the program and free those which won't be used again.
259 */
260 static void
261 reclaim_temps(struct brw_wm_compile *c)
262 {
263 GLint intBegin[MAX_PROGRAM_TEMPS];
264 GLint intEnd[MAX_PROGRAM_TEMPS];
265 int index;
266
267 /*printf("Reclaim temps:\n");*/
268
269 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
270 intBegin, intEnd);
271
272 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
273 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
274 /* program temp[i] can be freed */
275 int component;
276 /*printf(" temp[%d] is dead\n", index);*/
277 for (component = 0; component < 4; component++) {
278 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
279 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
280 release_grf(c, r);
281 /*
282 printf(" Reclaim temp %d, reg %d at inst %d\n",
283 index, r, c->cur_inst);
284 */
285 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
286 }
287 }
288 }
289 }
290 }
291
292
293
294
295 /**
296 * Preallocate registers. This sets up the Mesa to hardware register
297 * mapping for certain registers, such as constants (uniforms/state vars)
298 * and shader inputs.
299 */
300 static void prealloc_reg(struct brw_wm_compile *c)
301 {
302 int i, j;
303 struct brw_reg reg;
304 int urb_read_length = 0;
305 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
306 GLuint reg_index = 0;
307
308 memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
309 c->first_free_grf = 0;
310
311 for (i = 0; i < 4; i++) {
312 if (i < c->key.nr_depth_regs)
313 reg = brw_vec8_grf(i * 2, 0);
314 else
315 reg = brw_vec8_grf(0, 0);
316 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
317 }
318 reg_index += 2 * c->key.nr_depth_regs;
319
320 /* constants */
321 {
322 const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
323 const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
324
325 /* use a real constant buffer, or just use a section of the GRF? */
326 /* XXX this heuristic may need adjustment... */
327 if ((nr_params + nr_temps) * 4 + reg_index > 80)
328 c->fp->use_const_buffer = GL_TRUE;
329 else
330 c->fp->use_const_buffer = GL_FALSE;
331 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
332
333 if (c->fp->use_const_buffer) {
334 /* We'll use a real constant buffer and fetch constants from
335 * it with a dataport read message.
336 */
337
338 /* number of float constants in CURBE */
339 c->prog_data.nr_params = 0;
340 }
341 else {
342 const struct gl_program_parameter_list *plist =
343 c->fp->program.Base.Parameters;
344 int index = 0;
345
346 /* number of float constants in CURBE */
347 c->prog_data.nr_params = 4 * nr_params;
348
349 /* loop over program constants (float[4]) */
350 for (i = 0; i < nr_params; i++) {
351 /* loop over XYZW channels */
352 for (j = 0; j < 4; j++, index++) {
353 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
354 /* Save pointer to parameter/constant value.
355 * Constants will be copied in prepare_constant_buffer()
356 */
357 c->prog_data.param[index] = &plist->ParameterValues[i][j];
358 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
359 }
360 }
361 /* number of constant regs used (each reg is float[8]) */
362 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
363 reg_index += c->nr_creg;
364 }
365 }
366
367 /* fragment shader inputs */
368 for (i = 0; i < VERT_RESULT_MAX; i++) {
369 int fp_input;
370
371 if (i >= VERT_RESULT_VAR0)
372 fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
373 else if (i <= VERT_RESULT_TEX7)
374 fp_input = i;
375 else
376 fp_input = -1;
377
378 if (fp_input >= 0 && inputs & (1 << fp_input)) {
379 urb_read_length = reg_index;
380 reg = brw_vec8_grf(reg_index, 0);
381 for (j = 0; j < 4; j++)
382 set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
383 }
384 if (c->key.vp_outputs_written & (1 << i)) {
385 reg_index += 2;
386 }
387 }
388
389 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
390 c->prog_data.urb_read_length = urb_read_length;
391 c->prog_data.curb_read_length = c->nr_creg;
392 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
393 reg_index++;
394 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
395 reg_index += 2;
396
397 /* mark GRF regs [0..reg_index-1] as in-use */
398 for (i = 0; i < reg_index; i++)
399 prealloc_grf(c, i);
400
401 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
402 prealloc_grf(c, 126);
403 prealloc_grf(c, 127);
404
405 /* An instruction may reference up to three constants.
406 * They'll be found in these registers.
407 * XXX alloc these on demand!
408 */
409 if (c->fp->use_const_buffer) {
410 for (i = 0; i < 3; i++) {
411 c->current_const[i].index = -1;
412 c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
413 }
414 }
415 #if 0
416 printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
417 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
418 #endif
419 }
420
421
422 /**
423 * Check if any of the instruction's src registers are constants, uniforms,
424 * or statevars. If so, fetch any constants that we don't already have in
425 * the three GRF slots.
426 */
427 static void fetch_constants(struct brw_wm_compile *c,
428 const struct prog_instruction *inst)
429 {
430 struct brw_compile *p = &c->func;
431 GLuint i;
432
433 /* loop over instruction src regs */
434 for (i = 0; i < 3; i++) {
435 const struct prog_src_register *src = &inst->SrcReg[i];
436 if (src->File == PROGRAM_STATE_VAR ||
437 src->File == PROGRAM_CONSTANT ||
438 src->File == PROGRAM_UNIFORM) {
439 c->current_const[i].index = src->Index;
440
441 #if 0
442 printf(" fetch const[%d] for arg %d into reg %d\n",
443 src->Index, i, c->current_const[i].reg.nr);
444 #endif
445
446 /* need to fetch the constant now */
447 brw_dp_READ_4(p,
448 c->current_const[i].reg, /* writeback dest */
449 src->RelAddr, /* relative indexing? */
450 16 * src->Index, /* byte offset */
451 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
452 );
453 }
454 }
455 }
456
457
458 /**
459 * Convert Mesa dst register to brw register.
460 */
461 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
462 const struct prog_instruction *inst,
463 GLuint component)
464 {
465 const int nr = 1;
466 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
467 0, 0);
468 }
469
470
471 static struct brw_reg
472 get_src_reg_const(struct brw_wm_compile *c,
473 const struct prog_instruction *inst,
474 GLuint srcRegIndex, GLuint component)
475 {
476 /* We should have already fetched the constant from the constant
477 * buffer in fetch_constants(). Now we just have to return a
478 * register description that extracts the needed component and
479 * smears it across all eight vector components.
480 */
481 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
482 struct brw_reg const_reg;
483
484 assert(component < 4);
485 assert(srcRegIndex < 3);
486 assert(c->current_const[srcRegIndex].index != -1);
487 const_reg = c->current_const[srcRegIndex].reg;
488
489 /* extract desired float from the const_reg, and smear */
490 const_reg = stride(const_reg, 0, 1, 0);
491 const_reg.subnr = component * 4;
492
493 if (src->Negate & (1 << component))
494 const_reg = negate(const_reg);
495 if (src->Abs)
496 const_reg = brw_abs(const_reg);
497
498 #if 0
499 printf(" form const[%d].%d for arg %d, reg %d\n",
500 c->current_const[srcRegIndex].index,
501 component,
502 srcRegIndex,
503 const_reg.nr);
504 #endif
505
506 return const_reg;
507 }
508
509
510 /**
511 * Convert Mesa src register to brw register.
512 */
513 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
514 const struct prog_instruction *inst,
515 GLuint srcRegIndex, GLuint channel)
516 {
517 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
518 const GLuint nr = 1;
519 const GLuint component = GET_SWZ(src->Swizzle, channel);
520
521 /* Extended swizzle terms */
522 if (component == SWIZZLE_ZERO) {
523 return brw_imm_f(0.0F);
524 }
525 else if (component == SWIZZLE_ONE) {
526 return brw_imm_f(1.0F);
527 }
528
529 if (c->fp->use_const_buffer &&
530 (src->File == PROGRAM_STATE_VAR ||
531 src->File == PROGRAM_CONSTANT ||
532 src->File == PROGRAM_UNIFORM)) {
533 return get_src_reg_const(c, inst, srcRegIndex, component);
534 }
535 else {
536 /* other type of source register */
537 return get_reg(c, src->File, src->Index, component, nr,
538 src->Negate, src->Abs);
539 }
540 }
541
542
543 /**
544 * Same as \sa get_src_reg() but if the register is a literal, emit
545 * a brw_reg encoding the literal.
546 * Note that a brw instruction only allows one src operand to be a literal.
547 * For instructions with more than one operand, only the second can be a
548 * literal. This means that we treat some literals as constants/uniforms
549 * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
550 *
551 */
552 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
553 const struct prog_instruction *inst,
554 GLuint srcRegIndex, GLuint channel)
555 {
556 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
557 if (src->File == PROGRAM_CONSTANT) {
558 /* a literal */
559 const int component = GET_SWZ(src->Swizzle, channel);
560 const GLfloat *param =
561 c->fp->program.Base.Parameters->ParameterValues[src->Index];
562 GLfloat value = param[component];
563 if (src->Negate & (1 << channel))
564 value = -value;
565 if (src->Abs)
566 value = FABSF(value);
567 #if 0
568 printf(" form immed value %f for chan %d\n", value, channel);
569 #endif
570 return brw_imm_f(value);
571 }
572 else {
573 return get_src_reg(c, inst, srcRegIndex, channel);
574 }
575 }
576
577
578 /**
579 * Subroutines are minimal support for resusable instruction sequences.
580 * They are implemented as simply as possible to minimise overhead: there
581 * is no explicit support for communication between the caller and callee
582 * other than saving the return address in a temporary register, nor is
583 * there any automatic local storage. This implies that great care is
584 * required before attempting reentrancy or any kind of nested
585 * subroutine invocations.
586 */
587 static void invoke_subroutine( struct brw_wm_compile *c,
588 enum _subroutine subroutine,
589 void (*emit)( struct brw_wm_compile * ) )
590 {
591 struct brw_compile *p = &c->func;
592
593 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
594
595 if( c->subroutines[ subroutine ] ) {
596 /* subroutine previously emitted: reuse existing instructions */
597
598 int mark = mark_tmps( c );
599 struct brw_reg return_address = retype( alloc_tmp( c ),
600 BRW_REGISTER_TYPE_UD );
601 int here = p->nr_insn;
602
603 brw_push_insn_state(p);
604 brw_set_mask_control(p, BRW_MASK_DISABLE);
605 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
606
607 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
608 brw_imm_d( ( c->subroutines[ subroutine ] -
609 here - 1 ) << 4 ) );
610 brw_pop_insn_state(p);
611
612 release_tmps( c, mark );
613 } else {
614 /* previously unused subroutine: emit, and mark for later reuse */
615
616 int mark = mark_tmps( c );
617 struct brw_reg return_address = retype( alloc_tmp( c ),
618 BRW_REGISTER_TYPE_UD );
619 struct brw_instruction *calc;
620 int base = p->nr_insn;
621
622 brw_push_insn_state(p);
623 brw_set_mask_control(p, BRW_MASK_DISABLE);
624 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
625 brw_pop_insn_state(p);
626
627 c->subroutines[ subroutine ] = p->nr_insn;
628
629 emit( c );
630
631 brw_push_insn_state(p);
632 brw_set_mask_control(p, BRW_MASK_DISABLE);
633 brw_MOV( p, brw_ip_reg(), return_address );
634 brw_pop_insn_state(p);
635
636 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
637
638 release_tmps( c, mark );
639 }
640 }
641
642 static void emit_abs( struct brw_wm_compile *c,
643 const struct prog_instruction *inst)
644 {
645 int i;
646 struct brw_compile *p = &c->func;
647 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
648 for (i = 0; i < 4; i++) {
649 if (inst->DstReg.WriteMask & (1<<i)) {
650 struct brw_reg src, dst;
651 dst = get_dst_reg(c, inst, i);
652 src = get_src_reg(c, inst, 0, i);
653 brw_MOV(p, dst, brw_abs(src));
654 }
655 }
656 brw_set_saturate(p, 0);
657 }
658
659 static void emit_trunc( struct brw_wm_compile *c,
660 const struct prog_instruction *inst)
661 {
662 int i;
663 struct brw_compile *p = &c->func;
664 GLuint mask = inst->DstReg.WriteMask;
665 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
666 for (i = 0; i < 4; i++) {
667 if (mask & (1<<i)) {
668 struct brw_reg src, dst;
669 dst = get_dst_reg(c, inst, i);
670 src = get_src_reg(c, inst, 0, i);
671 brw_RNDZ(p, dst, src);
672 }
673 }
674 brw_set_saturate(p, 0);
675 }
676
677 static void emit_mov( struct brw_wm_compile *c,
678 const struct prog_instruction *inst)
679 {
680 int i;
681 struct brw_compile *p = &c->func;
682 GLuint mask = inst->DstReg.WriteMask;
683 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
684 for (i = 0; i < 4; i++) {
685 if (mask & (1<<i)) {
686 struct brw_reg src, dst;
687 dst = get_dst_reg(c, inst, i);
688 /* XXX some moves from immediate value don't work reliably!!! */
689 /*src = get_src_reg_imm(c, inst, 0, i);*/
690 src = get_src_reg(c, inst, 0, i);
691 brw_MOV(p, dst, src);
692 }
693 }
694 brw_set_saturate(p, 0);
695 }
696
697 static void emit_pixel_xy(struct brw_wm_compile *c,
698 const struct prog_instruction *inst)
699 {
700 struct brw_reg r1 = brw_vec1_grf(1, 0);
701 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
702
703 struct brw_reg dst0, dst1;
704 struct brw_compile *p = &c->func;
705 GLuint mask = inst->DstReg.WriteMask;
706
707 dst0 = get_dst_reg(c, inst, 0);
708 dst1 = get_dst_reg(c, inst, 1);
709 /* Calculate pixel centers by adding 1 or 0 to each of the
710 * micro-tile coordinates passed in r1.
711 */
712 if (mask & WRITEMASK_X) {
713 brw_ADD(p,
714 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
715 stride(suboffset(r1_uw, 4), 2, 4, 0),
716 brw_imm_v(0x10101010));
717 }
718
719 if (mask & WRITEMASK_Y) {
720 brw_ADD(p,
721 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
722 stride(suboffset(r1_uw, 5), 2, 4, 0),
723 brw_imm_v(0x11001100));
724 }
725 }
726
727 static void emit_delta_xy(struct brw_wm_compile *c,
728 const struct prog_instruction *inst)
729 {
730 struct brw_reg r1 = brw_vec1_grf(1, 0);
731 struct brw_reg dst0, dst1, src0, src1;
732 struct brw_compile *p = &c->func;
733 GLuint mask = inst->DstReg.WriteMask;
734
735 dst0 = get_dst_reg(c, inst, 0);
736 dst1 = get_dst_reg(c, inst, 1);
737 src0 = get_src_reg(c, inst, 0, 0);
738 src1 = get_src_reg(c, inst, 0, 1);
739 /* Calc delta X,Y by subtracting origin in r1 from the pixel
740 * centers.
741 */
742 if (mask & WRITEMASK_X) {
743 brw_ADD(p,
744 dst0,
745 retype(src0, BRW_REGISTER_TYPE_UW),
746 negate(r1));
747 }
748
749 if (mask & WRITEMASK_Y) {
750 brw_ADD(p,
751 dst1,
752 retype(src1, BRW_REGISTER_TYPE_UW),
753 negate(suboffset(r1,1)));
754
755 }
756 }
757
758 static void fire_fb_write( struct brw_wm_compile *c,
759 GLuint base_reg,
760 GLuint nr,
761 GLuint target,
762 GLuint eot)
763 {
764 struct brw_compile *p = &c->func;
765 /* Pass through control information:
766 */
767 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
768 {
769 brw_push_insn_state(p);
770 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
771 brw_MOV(p,
772 brw_message_reg(base_reg + 1),
773 brw_vec8_grf(1, 0));
774 brw_pop_insn_state(p);
775 }
776 /* Send framebuffer write message: */
777 brw_fb_WRITE(p,
778 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
779 base_reg,
780 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
781 target,
782 nr,
783 0,
784 eot);
785 }
786
787 static void emit_fb_write(struct brw_wm_compile *c,
788 const struct prog_instruction *inst)
789 {
790 struct brw_compile *p = &c->func;
791 int nr = 2;
792 int channel;
793 GLuint target, eot;
794 struct brw_reg src0;
795
796 /* Reserve a space for AA - may not be needed:
797 */
798 if (c->key.aa_dest_stencil_reg)
799 nr += 1;
800
801 brw_push_insn_state(p);
802 for (channel = 0; channel < 4; channel++) {
803 src0 = get_src_reg(c, inst, 0, channel);
804 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
805 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
806 brw_MOV(p, brw_message_reg(nr + channel), src0);
807 }
808 /* skip over the regs populated above: */
809 nr += 8;
810 brw_pop_insn_state(p);
811
812 if (c->key.source_depth_to_render_target) {
813 if (c->key.computes_depth) {
814 src0 = get_src_reg(c, inst, 2, 2);
815 brw_MOV(p, brw_message_reg(nr), src0);
816 }
817 else {
818 src0 = get_src_reg(c, inst, 1, 1);
819 brw_MOV(p, brw_message_reg(nr), src0);
820 }
821
822 nr += 2;
823 }
824
825 if (c->key.dest_depth_reg) {
826 const GLuint comp = c->key.dest_depth_reg / 2;
827 const GLuint off = c->key.dest_depth_reg % 2;
828
829 if (off != 0) {
830 /* XXX this code needs review/testing */
831 struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
832 struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
833
834 brw_push_insn_state(p);
835 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
836
837 brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
838 /* 2nd half? */
839 brw_MOV(p, brw_message_reg(nr+1), arg1_1);
840 brw_pop_insn_state(p);
841 }
842 else
843 {
844 struct brw_reg src = get_src_reg(c, inst, 1, 1);
845 brw_MOV(p, brw_message_reg(nr), src);
846 }
847 nr += 2;
848 }
849
850 target = inst->Aux >> 1;
851 eot = inst->Aux & 1;
852 fire_fb_write(c, 0, nr, target, eot);
853 }
854
855 static void emit_pixel_w( struct brw_wm_compile *c,
856 const struct prog_instruction *inst)
857 {
858 struct brw_compile *p = &c->func;
859 GLuint mask = inst->DstReg.WriteMask;
860 if (mask & WRITEMASK_W) {
861 struct brw_reg dst, src0, delta0, delta1;
862 struct brw_reg interp3;
863
864 dst = get_dst_reg(c, inst, 3);
865 src0 = get_src_reg(c, inst, 0, 0);
866 delta0 = get_src_reg(c, inst, 1, 0);
867 delta1 = get_src_reg(c, inst, 1, 1);
868
869 interp3 = brw_vec1_grf(src0.nr+1, 4);
870 /* Calc 1/w - just linterp wpos[3] optimized by putting the
871 * result straight into a message reg.
872 */
873 brw_LINE(p, brw_null_reg(), interp3, delta0);
874 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
875
876 /* Calc w */
877 brw_math_16( p, dst,
878 BRW_MATH_FUNCTION_INV,
879 BRW_MATH_SATURATE_NONE,
880 2, brw_null_reg(),
881 BRW_MATH_PRECISION_FULL);
882 }
883 }
884
885 static void emit_linterp(struct brw_wm_compile *c,
886 const struct prog_instruction *inst)
887 {
888 struct brw_compile *p = &c->func;
889 GLuint mask = inst->DstReg.WriteMask;
890 struct brw_reg interp[4];
891 struct brw_reg dst, delta0, delta1;
892 struct brw_reg src0;
893 GLuint nr, i;
894
895 src0 = get_src_reg(c, inst, 0, 0);
896 delta0 = get_src_reg(c, inst, 1, 0);
897 delta1 = get_src_reg(c, inst, 1, 1);
898 nr = src0.nr;
899
900 interp[0] = brw_vec1_grf(nr, 0);
901 interp[1] = brw_vec1_grf(nr, 4);
902 interp[2] = brw_vec1_grf(nr+1, 0);
903 interp[3] = brw_vec1_grf(nr+1, 4);
904
905 for(i = 0; i < 4; i++ ) {
906 if (mask & (1<<i)) {
907 dst = get_dst_reg(c, inst, i);
908 brw_LINE(p, brw_null_reg(), interp[i], delta0);
909 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
910 }
911 }
912 }
913
914 static void emit_cinterp(struct brw_wm_compile *c,
915 const struct prog_instruction *inst)
916 {
917 struct brw_compile *p = &c->func;
918 GLuint mask = inst->DstReg.WriteMask;
919
920 struct brw_reg interp[4];
921 struct brw_reg dst, src0;
922 GLuint nr, i;
923
924 src0 = get_src_reg(c, inst, 0, 0);
925 nr = src0.nr;
926
927 interp[0] = brw_vec1_grf(nr, 0);
928 interp[1] = brw_vec1_grf(nr, 4);
929 interp[2] = brw_vec1_grf(nr+1, 0);
930 interp[3] = brw_vec1_grf(nr+1, 4);
931
932 for(i = 0; i < 4; i++ ) {
933 if (mask & (1<<i)) {
934 dst = get_dst_reg(c, inst, i);
935 brw_MOV(p, dst, suboffset(interp[i],3));
936 }
937 }
938 }
939
940 static void emit_pinterp(struct brw_wm_compile *c,
941 const struct prog_instruction *inst)
942 {
943 struct brw_compile *p = &c->func;
944 GLuint mask = inst->DstReg.WriteMask;
945
946 struct brw_reg interp[4];
947 struct brw_reg dst, delta0, delta1;
948 struct brw_reg src0, w;
949 GLuint nr, i;
950
951 src0 = get_src_reg(c, inst, 0, 0);
952 delta0 = get_src_reg(c, inst, 1, 0);
953 delta1 = get_src_reg(c, inst, 1, 1);
954 w = get_src_reg(c, inst, 2, 3);
955 nr = src0.nr;
956
957 interp[0] = brw_vec1_grf(nr, 0);
958 interp[1] = brw_vec1_grf(nr, 4);
959 interp[2] = brw_vec1_grf(nr+1, 0);
960 interp[3] = brw_vec1_grf(nr+1, 4);
961
962 for(i = 0; i < 4; i++ ) {
963 if (mask & (1<<i)) {
964 dst = get_dst_reg(c, inst, i);
965 brw_LINE(p, brw_null_reg(), interp[i], delta0);
966 brw_MAC(p, dst, suboffset(interp[i],1),
967 delta1);
968 brw_MUL(p, dst, dst, w);
969 }
970 }
971 }
972
973 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
974 static void emit_frontfacing(struct brw_wm_compile *c,
975 const struct prog_instruction *inst)
976 {
977 struct brw_compile *p = &c->func;
978 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
979 struct brw_reg dst;
980 GLuint mask = inst->DstReg.WriteMask;
981 int i;
982
983 for (i = 0; i < 4; i++) {
984 if (mask & (1<<i)) {
985 dst = get_dst_reg(c, inst, i);
986 brw_MOV(p, dst, brw_imm_f(0.0));
987 }
988 }
989
990 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
991 * us front face
992 */
993 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
994 for (i = 0; i < 4; i++) {
995 if (mask & (1<<i)) {
996 dst = get_dst_reg(c, inst, i);
997 brw_MOV(p, dst, brw_imm_f(1.0));
998 }
999 }
1000 brw_set_predicate_control_flag_value(p, 0xff);
1001 }
1002
1003 static void emit_xpd(struct brw_wm_compile *c,
1004 const struct prog_instruction *inst)
1005 {
1006 int i;
1007 struct brw_compile *p = &c->func;
1008 GLuint mask = inst->DstReg.WriteMask;
1009 for (i = 0; i < 4; i++) {
1010 GLuint i2 = (i+2)%3;
1011 GLuint i1 = (i+1)%3;
1012 if (mask & (1<<i)) {
1013 struct brw_reg src0, src1, dst;
1014 dst = get_dst_reg(c, inst, i);
1015 src0 = negate(get_src_reg(c, inst, 0, i2));
1016 src1 = get_src_reg_imm(c, inst, 1, i1);
1017 brw_MUL(p, brw_null_reg(), src0, src1);
1018 src0 = get_src_reg(c, inst, 0, i1);
1019 src1 = get_src_reg_imm(c, inst, 1, i2);
1020 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1021 brw_MAC(p, dst, src0, src1);
1022 brw_set_saturate(p, 0);
1023 }
1024 }
1025 brw_set_saturate(p, 0);
1026 }
1027
1028 static void emit_dp3(struct brw_wm_compile *c,
1029 const struct prog_instruction *inst)
1030 {
1031 struct brw_reg src0[3], src1[3], dst;
1032 int i;
1033 struct brw_compile *p = &c->func;
1034 for (i = 0; i < 3; i++) {
1035 src0[i] = get_src_reg(c, inst, 0, i);
1036 src1[i] = get_src_reg_imm(c, inst, 1, i);
1037 }
1038
1039 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1040 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1041 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1042 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1043 brw_MAC(p, dst, src0[2], src1[2]);
1044 brw_set_saturate(p, 0);
1045 }
1046
1047 static void emit_dp4(struct brw_wm_compile *c,
1048 const struct prog_instruction *inst)
1049 {
1050 struct brw_reg src0[4], src1[4], dst;
1051 int i;
1052 struct brw_compile *p = &c->func;
1053 for (i = 0; i < 4; i++) {
1054 src0[i] = get_src_reg(c, inst, 0, i);
1055 src1[i] = get_src_reg_imm(c, inst, 1, i);
1056 }
1057 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1058 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1059 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1060 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
1061 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1062 brw_MAC(p, dst, src0[3], src1[3]);
1063 brw_set_saturate(p, 0);
1064 }
1065
1066 static void emit_dph(struct brw_wm_compile *c,
1067 const struct prog_instruction *inst)
1068 {
1069 struct brw_reg src0[4], src1[4], dst;
1070 int i;
1071 struct brw_compile *p = &c->func;
1072 for (i = 0; i < 4; i++) {
1073 src0[i] = get_src_reg(c, inst, 0, i);
1074 src1[i] = get_src_reg_imm(c, inst, 1, i);
1075 }
1076 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1077 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1078 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1079 brw_MAC(p, dst, src0[2], src1[2]);
1080 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1081 brw_ADD(p, dst, dst, src1[3]);
1082 brw_set_saturate(p, 0);
1083 }
1084
1085 /**
1086 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1087 * Note that the result of the function is smeared across the dest
1088 * register's X, Y, Z and W channels (subject to writemasking of course).
1089 */
1090 static void emit_math1(struct brw_wm_compile *c,
1091 const struct prog_instruction *inst, GLuint func)
1092 {
1093 struct brw_compile *p = &c->func;
1094 struct brw_reg src0, dst, tmp;
1095 const int mark = mark_tmps( c );
1096 int i;
1097
1098 tmp = alloc_tmp(c);
1099
1100 /* Get first component of source register */
1101 src0 = get_src_reg(c, inst, 0, 0);
1102
1103 /* tmp = func(src0) */
1104 brw_MOV(p, brw_message_reg(2), src0);
1105 brw_math(p,
1106 tmp,
1107 func,
1108 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1109 2,
1110 brw_null_reg(),
1111 BRW_MATH_DATA_VECTOR,
1112 BRW_MATH_PRECISION_FULL);
1113
1114 /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
1115
1116 /* replicate tmp value across enabled dest channels */
1117 for (i = 0; i < 4; i++) {
1118 if (inst->DstReg.WriteMask & (1 << i)) {
1119 dst = get_dst_reg(c, inst, i);
1120 brw_MOV(p, dst, tmp);
1121 }
1122 }
1123
1124 release_tmps(c, mark);
1125 }
1126
1127 static void emit_rcp(struct brw_wm_compile *c,
1128 const struct prog_instruction *inst)
1129 {
1130 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1131 }
1132
1133 static void emit_rsq(struct brw_wm_compile *c,
1134 const struct prog_instruction *inst)
1135 {
1136 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1137 }
1138
1139 static void emit_sin(struct brw_wm_compile *c,
1140 const struct prog_instruction *inst)
1141 {
1142 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1143 }
1144
1145 static void emit_cos(struct brw_wm_compile *c,
1146 const struct prog_instruction *inst)
1147 {
1148 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1149 }
1150
1151 static void emit_ex2(struct brw_wm_compile *c,
1152 const struct prog_instruction *inst)
1153 {
1154 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1155 }
1156
1157 static void emit_lg2(struct brw_wm_compile *c,
1158 const struct prog_instruction *inst)
1159 {
1160 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1161 }
1162
1163 static void emit_add(struct brw_wm_compile *c,
1164 const struct prog_instruction *inst)
1165 {
1166 struct brw_compile *p = &c->func;
1167 struct brw_reg src0, src1, dst;
1168 GLuint mask = inst->DstReg.WriteMask;
1169 int i;
1170 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1171 for (i = 0 ; i < 4; i++) {
1172 if (mask & (1<<i)) {
1173 dst = get_dst_reg(c, inst, i);
1174 src0 = get_src_reg(c, inst, 0, i);
1175 src1 = get_src_reg_imm(c, inst, 1, i);
1176 brw_ADD(p, dst, src0, src1);
1177 }
1178 }
1179 brw_set_saturate(p, 0);
1180 }
1181
1182 static void emit_arl(struct brw_wm_compile *c,
1183 const struct prog_instruction *inst)
1184 {
1185 struct brw_compile *p = &c->func;
1186 struct brw_reg src0, addr_reg;
1187 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1188 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1189 BRW_ARF_ADDRESS, 0);
1190 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1191 brw_MOV(p, addr_reg, src0);
1192 brw_set_saturate(p, 0);
1193 }
1194
1195 static void emit_sub(struct brw_wm_compile *c,
1196 const struct prog_instruction *inst)
1197 {
1198 struct brw_compile *p = &c->func;
1199 struct brw_reg src0, src1, dst;
1200 GLuint mask = inst->DstReg.WriteMask;
1201 int i;
1202 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1203 for (i = 0 ; i < 4; i++) {
1204 if (mask & (1<<i)) {
1205 dst = get_dst_reg(c, inst, i);
1206 src0 = get_src_reg(c, inst, 0, i);
1207 src1 = get_src_reg_imm(c, inst, 1, i);
1208 brw_ADD(p, dst, src0, negate(src1));
1209 }
1210 }
1211 brw_set_saturate(p, 0);
1212 }
1213
1214 static void emit_mul(struct brw_wm_compile *c,
1215 const struct prog_instruction *inst)
1216 {
1217 struct brw_compile *p = &c->func;
1218 struct brw_reg src0, src1, dst;
1219 GLuint mask = inst->DstReg.WriteMask;
1220 int i;
1221 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1222 for (i = 0 ; i < 4; i++) {
1223 if (mask & (1<<i)) {
1224 dst = get_dst_reg(c, inst, i);
1225 src0 = get_src_reg(c, inst, 0, i);
1226 src1 = get_src_reg_imm(c, inst, 1, i);
1227 brw_MUL(p, dst, src0, src1);
1228 }
1229 }
1230 brw_set_saturate(p, 0);
1231 }
1232
1233 static void emit_frc(struct brw_wm_compile *c,
1234 const struct prog_instruction *inst)
1235 {
1236 struct brw_compile *p = &c->func;
1237 struct brw_reg src0, dst;
1238 GLuint mask = inst->DstReg.WriteMask;
1239 int i;
1240 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1241 for (i = 0 ; i < 4; i++) {
1242 if (mask & (1<<i)) {
1243 dst = get_dst_reg(c, inst, i);
1244 src0 = get_src_reg_imm(c, inst, 0, i);
1245 brw_FRC(p, dst, src0);
1246 }
1247 }
1248 if (inst->SaturateMode != SATURATE_OFF)
1249 brw_set_saturate(p, 0);
1250 }
1251
1252 static void emit_flr(struct brw_wm_compile *c,
1253 const struct prog_instruction *inst)
1254 {
1255 struct brw_compile *p = &c->func;
1256 struct brw_reg src0, dst;
1257 GLuint mask = inst->DstReg.WriteMask;
1258 int i;
1259 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1260 for (i = 0 ; i < 4; i++) {
1261 if (mask & (1<<i)) {
1262 dst = get_dst_reg(c, inst, i);
1263 src0 = get_src_reg_imm(c, inst, 0, i);
1264 brw_RNDD(p, dst, src0);
1265 }
1266 }
1267 brw_set_saturate(p, 0);
1268 }
1269
1270
1271 static void emit_min_max(struct brw_wm_compile *c,
1272 const struct prog_instruction *inst)
1273 {
1274 struct brw_compile *p = &c->func;
1275 const GLuint mask = inst->DstReg.WriteMask;
1276 const int mark = mark_tmps(c);
1277 int i;
1278 brw_push_insn_state(p);
1279 for (i = 0; i < 4; i++) {
1280 if (mask & (1<<i)) {
1281 struct brw_reg real_dst = get_dst_reg(c, inst, i);
1282 struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1283 struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1284 struct brw_reg dst;
1285 /* if dst==src0 or dst==src1 we need to use a temp reg */
1286 GLboolean use_temp = brw_same_reg(dst, src0) ||
1287 brw_same_reg(dst, src1);
1288 if (use_temp)
1289 dst = alloc_tmp(c);
1290 else
1291 dst = real_dst;
1292
1293 /*
1294 printf(" Min/max: dst %d src0 %d src1 %d\n",
1295 dst.nr, src0.nr, src1.nr);
1296 */
1297 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1298 brw_MOV(p, dst, src0);
1299 brw_set_saturate(p, 0);
1300
1301 if (inst->Opcode == OPCODE_MIN)
1302 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1303 else
1304 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1305
1306 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1307 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1308 brw_MOV(p, dst, src1);
1309 brw_set_saturate(p, 0);
1310 brw_set_predicate_control_flag_value(p, 0xff);
1311 if (use_temp)
1312 brw_MOV(p, real_dst, dst);
1313 }
1314 }
1315 brw_pop_insn_state(p);
1316 release_tmps(c, mark);
1317 }
1318
1319 static void emit_pow(struct brw_wm_compile *c,
1320 const struct prog_instruction *inst)
1321 {
1322 struct brw_compile *p = &c->func;
1323 struct brw_reg dst, src0, src1;
1324 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1325 src0 = get_src_reg_imm(c, inst, 0, 0);
1326 src1 = get_src_reg_imm(c, inst, 1, 0);
1327
1328 brw_MOV(p, brw_message_reg(2), src0);
1329 brw_MOV(p, brw_message_reg(3), src1);
1330
1331 brw_math(p,
1332 dst,
1333 BRW_MATH_FUNCTION_POW,
1334 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1335 2,
1336 brw_null_reg(),
1337 BRW_MATH_DATA_VECTOR,
1338 BRW_MATH_PRECISION_FULL);
1339 }
1340
1341 static void emit_lrp(struct brw_wm_compile *c,
1342 const struct prog_instruction *inst)
1343 {
1344 struct brw_compile *p = &c->func;
1345 GLuint mask = inst->DstReg.WriteMask;
1346 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1347 int i;
1348 int mark = mark_tmps(c);
1349 for (i = 0; i < 4; i++) {
1350 if (mask & (1<<i)) {
1351 dst = get_dst_reg(c, inst, i);
1352 src0 = get_src_reg(c, inst, 0, i);
1353
1354 src1 = get_src_reg_imm(c, inst, 1, i);
1355
1356 if (src1.nr == dst.nr) {
1357 tmp1 = alloc_tmp(c);
1358 brw_MOV(p, tmp1, src1);
1359 } else
1360 tmp1 = src1;
1361
1362 src2 = get_src_reg(c, inst, 2, i);
1363 if (src2.nr == dst.nr) {
1364 tmp2 = alloc_tmp(c);
1365 brw_MOV(p, tmp2, src2);
1366 } else
1367 tmp2 = src2;
1368
1369 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1370 brw_MUL(p, brw_null_reg(), dst, tmp2);
1371 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1372 brw_MAC(p, dst, src0, tmp1);
1373 brw_set_saturate(p, 0);
1374 }
1375 release_tmps(c, mark);
1376 }
1377 }
1378
1379 /**
1380 * For GLSL shaders, this KIL will be unconditional.
1381 * It may be contained inside an IF/ENDIF structure of course.
1382 */
1383 static void emit_kil(struct brw_wm_compile *c)
1384 {
1385 struct brw_compile *p = &c->func;
1386 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1387 brw_push_insn_state(p);
1388 brw_set_mask_control(p, BRW_MASK_DISABLE);
1389 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1390 brw_AND(p, depth, c->emit_mask_reg, depth);
1391 brw_pop_insn_state(p);
1392 }
1393
1394 static void emit_mad(struct brw_wm_compile *c,
1395 const struct prog_instruction *inst)
1396 {
1397 struct brw_compile *p = &c->func;
1398 GLuint mask = inst->DstReg.WriteMask;
1399 struct brw_reg dst, src0, src1, src2;
1400 int i;
1401
1402 for (i = 0; i < 4; i++) {
1403 if (mask & (1<<i)) {
1404 dst = get_dst_reg(c, inst, i);
1405 src0 = get_src_reg(c, inst, 0, i);
1406 src1 = get_src_reg_imm(c, inst, 1, i);
1407 src2 = get_src_reg_imm(c, inst, 2, i);
1408 brw_MUL(p, dst, src0, src1);
1409
1410 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1411 brw_ADD(p, dst, dst, src2);
1412 brw_set_saturate(p, 0);
1413 }
1414 }
1415 }
1416
1417 static void emit_sop(struct brw_wm_compile *c,
1418 const struct prog_instruction *inst, GLuint cond)
1419 {
1420 struct brw_compile *p = &c->func;
1421 GLuint mask = inst->DstReg.WriteMask;
1422 struct brw_reg dst, src0, src1;
1423 int i;
1424
1425 for (i = 0; i < 4; i++) {
1426 if (mask & (1<<i)) {
1427 dst = get_dst_reg(c, inst, i);
1428 src0 = get_src_reg(c, inst, 0, i);
1429 src1 = get_src_reg_imm(c, inst, 1, i);
1430 brw_push_insn_state(p);
1431 brw_CMP(p, brw_null_reg(), cond, src0, src1);
1432 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1433 brw_MOV(p, dst, brw_imm_f(0.0));
1434 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1435 brw_MOV(p, dst, brw_imm_f(1.0));
1436 brw_pop_insn_state(p);
1437 }
1438 }
1439 }
1440
1441 static void emit_slt(struct brw_wm_compile *c,
1442 const struct prog_instruction *inst)
1443 {
1444 emit_sop(c, inst, BRW_CONDITIONAL_L);
1445 }
1446
1447 static void emit_sle(struct brw_wm_compile *c,
1448 const struct prog_instruction *inst)
1449 {
1450 emit_sop(c, inst, BRW_CONDITIONAL_LE);
1451 }
1452
1453 static void emit_sgt(struct brw_wm_compile *c,
1454 const struct prog_instruction *inst)
1455 {
1456 emit_sop(c, inst, BRW_CONDITIONAL_G);
1457 }
1458
1459 static void emit_sge(struct brw_wm_compile *c,
1460 const struct prog_instruction *inst)
1461 {
1462 emit_sop(c, inst, BRW_CONDITIONAL_GE);
1463 }
1464
1465 static void emit_seq(struct brw_wm_compile *c,
1466 const struct prog_instruction *inst)
1467 {
1468 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1469 }
1470
1471 static void emit_sne(struct brw_wm_compile *c,
1472 const struct prog_instruction *inst)
1473 {
1474 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1475 }
1476
1477 static void emit_ddx(struct brw_wm_compile *c,
1478 const struct prog_instruction *inst)
1479 {
1480 struct brw_compile *p = &c->func;
1481 GLuint mask = inst->DstReg.WriteMask;
1482 struct brw_reg interp[4];
1483 struct brw_reg dst;
1484 struct brw_reg src0, w;
1485 GLuint nr, i;
1486 src0 = get_src_reg(c, inst, 0, 0);
1487 w = get_src_reg(c, inst, 1, 3);
1488 nr = src0.nr;
1489 interp[0] = brw_vec1_grf(nr, 0);
1490 interp[1] = brw_vec1_grf(nr, 4);
1491 interp[2] = brw_vec1_grf(nr+1, 0);
1492 interp[3] = brw_vec1_grf(nr+1, 4);
1493 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1494 for(i = 0; i < 4; i++ ) {
1495 if (mask & (1<<i)) {
1496 dst = get_dst_reg(c, inst, i);
1497 brw_MOV(p, dst, interp[i]);
1498 brw_MUL(p, dst, dst, w);
1499 }
1500 }
1501 brw_set_saturate(p, 0);
1502 }
1503
1504 static void emit_ddy(struct brw_wm_compile *c,
1505 const struct prog_instruction *inst)
1506 {
1507 struct brw_compile *p = &c->func;
1508 GLuint mask = inst->DstReg.WriteMask;
1509 struct brw_reg interp[4];
1510 struct brw_reg dst;
1511 struct brw_reg src0, w;
1512 GLuint nr, i;
1513
1514 src0 = get_src_reg(c, inst, 0, 0);
1515 nr = src0.nr;
1516 w = get_src_reg(c, inst, 1, 3);
1517 interp[0] = brw_vec1_grf(nr, 0);
1518 interp[1] = brw_vec1_grf(nr, 4);
1519 interp[2] = brw_vec1_grf(nr+1, 0);
1520 interp[3] = brw_vec1_grf(nr+1, 4);
1521 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1522 for(i = 0; i < 4; i++ ) {
1523 if (mask & (1<<i)) {
1524 dst = get_dst_reg(c, inst, i);
1525 brw_MOV(p, dst, suboffset(interp[i], 1));
1526 brw_MUL(p, dst, dst, w);
1527 }
1528 }
1529 brw_set_saturate(p, 0);
1530 }
1531
1532 static INLINE struct brw_reg high_words( struct brw_reg reg )
1533 {
1534 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1535 0, 8, 2 );
1536 }
1537
1538 static INLINE struct brw_reg low_words( struct brw_reg reg )
1539 {
1540 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1541 }
1542
1543 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1544 {
1545 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1546 }
1547
1548 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1549 {
1550 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1551 0, 16, 2 );
1552 }
1553
1554 /* One-, two- and three-dimensional Perlin noise, similar to the description
1555 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1556 static void noise1_sub( struct brw_wm_compile *c ) {
1557
1558 struct brw_compile *p = &c->func;
1559 struct brw_reg param,
1560 x0, x1, /* gradients at each end */
1561 t, tmp[ 2 ], /* float temporaries */
1562 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1563 int i;
1564 int mark = mark_tmps( c );
1565
1566 x0 = alloc_tmp( c );
1567 x1 = alloc_tmp( c );
1568 t = alloc_tmp( c );
1569 tmp[ 0 ] = alloc_tmp( c );
1570 tmp[ 1 ] = alloc_tmp( c );
1571 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1572 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1573 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1574 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1575 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1576
1577 param = lookup_tmp( c, mark - 2 );
1578
1579 brw_set_access_mode( p, BRW_ALIGN_1 );
1580
1581 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1582
1583 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1584 be hashed. Also compute the remainder (offset within the unit
1585 length), interleaved to reduce register dependency penalties. */
1586 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1587 brw_FRC( p, param, param );
1588 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1589 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1590 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1591
1592 /* We're now ready to perform the hashing. The two hashes are
1593 interleaved for performance. The hash function used is
1594 designed to rapidly achieve avalanche and require only 32x16
1595 bit multiplication, and 16-bit swizzles (which we get for
1596 free). We can't use immediate operands in the multiplies,
1597 because immediates are permitted only in src1 and the 16-bit
1598 factor is permitted only in src0. */
1599 for( i = 0; i < 2; i++ )
1600 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1601 for( i = 0; i < 2; i++ )
1602 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1603 high_words( itmp[ i ] ) );
1604 for( i = 0; i < 2; i++ )
1605 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1606 for( i = 0; i < 2; i++ )
1607 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1608 high_words( itmp[ i ] ) );
1609 for( i = 0; i < 2; i++ )
1610 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1611 for( i = 0; i < 2; i++ )
1612 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1613 high_words( itmp[ i ] ) );
1614
1615 /* Now we want to initialise the two gradients based on the
1616 hashes. Format conversion from signed integer to float leaves
1617 everything scaled too high by a factor of pow( 2, 31 ), but
1618 we correct for that right at the end. */
1619 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1620 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1621 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1622
1623 brw_MUL( p, x0, x0, param );
1624 brw_MUL( p, x1, x1, t );
1625
1626 /* We interpolate between the gradients using the polynomial
1627 6t^5 - 15t^4 + 10t^3 (Perlin). */
1628 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1629 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1630 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1631 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1632 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1633 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1634 pipeline */
1635 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1636 brw_MUL( p, param, tmp[ 0 ], param );
1637 brw_MUL( p, x1, x1, param );
1638 brw_ADD( p, x0, x0, x1 );
1639 /* scale by pow( 2, -30 ), to compensate for the format conversion
1640 above and an extra factor of 2 so that a single gradient covers
1641 the [-1,1] range */
1642 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1643
1644 release_tmps( c, mark );
1645 }
1646
1647 static void emit_noise1( struct brw_wm_compile *c,
1648 const struct prog_instruction *inst )
1649 {
1650 struct brw_compile *p = &c->func;
1651 struct brw_reg src, param, dst;
1652 GLuint mask = inst->DstReg.WriteMask;
1653 int i;
1654 int mark = mark_tmps( c );
1655
1656 assert( mark == 0 );
1657
1658 src = get_src_reg( c, inst, 0, 0 );
1659
1660 param = alloc_tmp( c );
1661
1662 brw_MOV( p, param, src );
1663
1664 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1665
1666 /* Fill in the result: */
1667 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1668 for (i = 0 ; i < 4; i++) {
1669 if (mask & (1<<i)) {
1670 dst = get_dst_reg(c, inst, i);
1671 brw_MOV( p, dst, param );
1672 }
1673 }
1674 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1675 brw_set_saturate( p, 0 );
1676
1677 release_tmps( c, mark );
1678 }
1679
1680 static void noise2_sub( struct brw_wm_compile *c ) {
1681
1682 struct brw_compile *p = &c->func;
1683 struct brw_reg param0, param1,
1684 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1685 t, tmp[ 4 ], /* float temporaries */
1686 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1687 int i;
1688 int mark = mark_tmps( c );
1689
1690 x0y0 = alloc_tmp( c );
1691 x0y1 = alloc_tmp( c );
1692 x1y0 = alloc_tmp( c );
1693 x1y1 = alloc_tmp( c );
1694 t = alloc_tmp( c );
1695 for( i = 0; i < 4; i++ ) {
1696 tmp[ i ] = alloc_tmp( c );
1697 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1698 }
1699 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1700 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1701 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1702
1703 param0 = lookup_tmp( c, mark - 3 );
1704 param1 = lookup_tmp( c, mark - 2 );
1705
1706 brw_set_access_mode( p, BRW_ALIGN_1 );
1707
1708 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1709 be hashed. Also compute the remainders (offsets within the unit
1710 square), interleaved to reduce register dependency penalties. */
1711 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1712 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1713 brw_FRC( p, param0, param0 );
1714 brw_FRC( p, param1, param1 );
1715 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1716 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1717 low_words( itmp[ 1 ] ) );
1718 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1719 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1720 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1721 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1722 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1723
1724 /* We're now ready to perform the hashing. The four hashes are
1725 interleaved for performance. The hash function used is
1726 designed to rapidly achieve avalanche and require only 32x16
1727 bit multiplication, and 16-bit swizzles (which we get for
1728 free). We can't use immediate operands in the multiplies,
1729 because immediates are permitted only in src1 and the 16-bit
1730 factor is permitted only in src0. */
1731 for( i = 0; i < 4; i++ )
1732 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1733 for( i = 0; i < 4; i++ )
1734 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1735 high_words( itmp[ i ] ) );
1736 for( i = 0; i < 4; i++ )
1737 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1738 for( i = 0; i < 4; i++ )
1739 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1740 high_words( itmp[ i ] ) );
1741 for( i = 0; i < 4; i++ )
1742 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1743 for( i = 0; i < 4; i++ )
1744 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1745 high_words( itmp[ i ] ) );
1746
1747 /* Now we want to initialise the four gradients based on the
1748 hashes. Format conversion from signed integer to float leaves
1749 everything scaled too high by a factor of pow( 2, 15 ), but
1750 we correct for that right at the end. */
1751 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1752 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1753 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1754 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1755 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1756
1757 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1758 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1759 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1760 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1761
1762 brw_MUL( p, x1y0, x1y0, t );
1763 brw_MUL( p, x1y1, x1y1, t );
1764 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1765 brw_MUL( p, x0y0, x0y0, param0 );
1766 brw_MUL( p, x0y1, x0y1, param0 );
1767
1768 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1769 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1770 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1771 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1772
1773 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1774 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1775 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1776 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1777
1778 /* We interpolate between the gradients using the polynomial
1779 6t^5 - 15t^4 + 10t^3 (Perlin). */
1780 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1781 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1782 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1783 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1784 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1785 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1786 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1787 pipeline */
1788 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1789 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1790 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1791 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1792 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1793 pipeline */
1794 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1795 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1796 brw_MUL( p, param0, tmp[ 0 ], param0 );
1797 brw_MUL( p, param1, tmp[ 1 ], param1 );
1798
1799 /* Here we interpolate in the y dimension... */
1800 brw_MUL( p, x0y1, x0y1, param1 );
1801 brw_MUL( p, x1y1, x1y1, param1 );
1802 brw_ADD( p, x0y0, x0y0, x0y1 );
1803 brw_ADD( p, x1y0, x1y0, x1y1 );
1804
1805 /* And now in x. There are horrible register dependencies here,
1806 but we have nothing else to do. */
1807 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1808 brw_MUL( p, x1y0, x1y0, param0 );
1809 brw_ADD( p, x0y0, x0y0, x1y0 );
1810
1811 /* scale by pow( 2, -15 ), as described above */
1812 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1813
1814 release_tmps( c, mark );
1815 }
1816
1817 static void emit_noise2( struct brw_wm_compile *c,
1818 const struct prog_instruction *inst )
1819 {
1820 struct brw_compile *p = &c->func;
1821 struct brw_reg src0, src1, param0, param1, dst;
1822 GLuint mask = inst->DstReg.WriteMask;
1823 int i;
1824 int mark = mark_tmps( c );
1825
1826 assert( mark == 0 );
1827
1828 src0 = get_src_reg( c, inst, 0, 0 );
1829 src1 = get_src_reg( c, inst, 0, 1 );
1830
1831 param0 = alloc_tmp( c );
1832 param1 = alloc_tmp( c );
1833
1834 brw_MOV( p, param0, src0 );
1835 brw_MOV( p, param1, src1 );
1836
1837 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1838
1839 /* Fill in the result: */
1840 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1841 for (i = 0 ; i < 4; i++) {
1842 if (mask & (1<<i)) {
1843 dst = get_dst_reg(c, inst, i);
1844 brw_MOV( p, dst, param0 );
1845 }
1846 }
1847 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1848 brw_set_saturate( p, 0 );
1849
1850 release_tmps( c, mark );
1851 }
1852
1853 /**
1854 * The three-dimensional case is much like the one- and two- versions above,
1855 * but since the number of corners is rapidly growing we now pack 16 16-bit
1856 * hashes into each register to extract more parallelism from the EUs.
1857 */
1858 static void noise3_sub( struct brw_wm_compile *c ) {
1859
1860 struct brw_compile *p = &c->func;
1861 struct brw_reg param0, param1, param2,
1862 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1863 xi, yi, zi, /* interpolation coefficients */
1864 t, tmp[ 8 ], /* float temporaries */
1865 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1866 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1867 int i;
1868 int mark = mark_tmps( c );
1869
1870 x0y0 = alloc_tmp( c );
1871 x0y1 = alloc_tmp( c );
1872 x1y0 = alloc_tmp( c );
1873 x1y1 = alloc_tmp( c );
1874 xi = alloc_tmp( c );
1875 yi = alloc_tmp( c );
1876 zi = alloc_tmp( c );
1877 t = alloc_tmp( c );
1878 for( i = 0; i < 8; i++ ) {
1879 tmp[ i ] = alloc_tmp( c );
1880 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1881 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1882 }
1883
1884 param0 = lookup_tmp( c, mark - 4 );
1885 param1 = lookup_tmp( c, mark - 3 );
1886 param2 = lookup_tmp( c, mark - 2 );
1887
1888 brw_set_access_mode( p, BRW_ALIGN_1 );
1889
1890 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1891 be hashed. Also compute the remainders (offsets within the unit
1892 cube), interleaved to reduce register dependency penalties. */
1893 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1894 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1895 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1896 brw_FRC( p, param0, param0 );
1897 brw_FRC( p, param1, param1 );
1898 brw_FRC( p, param2, param2 );
1899 /* Since we now have only 16 bits of precision in the hash, we must
1900 be more careful about thorough mixing to maintain entropy as we
1901 squash the input vector into a small scalar. */
1902 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1903 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1904 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1905 brw_imm_uw( 0x9B93 ) );
1906 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1907 brw_imm_uw( 0xBC8F ) );
1908
1909 /* Temporarily disable the execution mask while we work with ExecSize=16
1910 channels (the mask is set for ExecSize=8 and is probably incorrect).
1911 Although this might cause execution of unwanted channels, the code
1912 writes only to temporary registers and has no side effects, so
1913 disabling the mask is harmless. */
1914 brw_push_insn_state( p );
1915 brw_set_mask_control( p, BRW_MASK_DISABLE );
1916 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1917 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1918 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1919
1920 /* We're now ready to perform the hashing. The eight hashes are
1921 interleaved for performance. The hash function used is
1922 designed to rapidly achieve avalanche and require only 16x16
1923 bit multiplication, and 8-bit swizzles (which we get for
1924 free). */
1925 for( i = 0; i < 4; i++ )
1926 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1927 for( i = 0; i < 4; i++ )
1928 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1929 odd_bytes( wtmp[ i ] ) );
1930 for( i = 0; i < 4; i++ )
1931 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1932 for( i = 0; i < 4; i++ )
1933 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1934 odd_bytes( wtmp[ i ] ) );
1935 brw_pop_insn_state( p );
1936
1937 /* Now we want to initialise the four rear gradients based on the
1938 hashes. Format conversion from signed integer to float leaves
1939 everything scaled too high by a factor of pow( 2, 15 ), but
1940 we correct for that right at the end. */
1941 /* x component */
1942 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1943 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1944 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1945 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1946 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1947
1948 brw_push_insn_state( p );
1949 brw_set_mask_control( p, BRW_MASK_DISABLE );
1950 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1951 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1952 brw_pop_insn_state( p );
1953
1954 brw_MUL( p, x1y0, x1y0, t );
1955 brw_MUL( p, x1y1, x1y1, t );
1956 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1957 brw_MUL( p, x0y0, x0y0, param0 );
1958 brw_MUL( p, x0y1, x0y1, param0 );
1959
1960 /* y component */
1961 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1962 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1963 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1964 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1965
1966 brw_push_insn_state( p );
1967 brw_set_mask_control( p, BRW_MASK_DISABLE );
1968 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1969 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1970 brw_pop_insn_state( p );
1971
1972 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1973 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1974 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1975 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1976 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1977
1978 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1979 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1980 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1981 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1982
1983 /* z component */
1984 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1985 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1986 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1987 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1988
1989 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1990 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1991 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1992 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1993
1994 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1995 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1996 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1997 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1998
1999 /* We interpolate between the gradients using the polynomial
2000 6t^5 - 15t^4 + 10t^3 (Perlin). */
2001 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
2002 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
2003 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
2004 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
2005 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
2006 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
2007 brw_MUL( p, xi, xi, param0 );
2008 brw_MUL( p, yi, yi, param1 );
2009 brw_MUL( p, zi, zi, param2 );
2010 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
2011 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
2012 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
2013 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
2014 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
2015 brw_MUL( p, xi, xi, param0 );
2016 brw_MUL( p, yi, yi, param1 );
2017 brw_MUL( p, zi, zi, param2 );
2018 brw_MUL( p, xi, xi, param0 );
2019 brw_MUL( p, yi, yi, param1 );
2020 brw_MUL( p, zi, zi, param2 );
2021 brw_MUL( p, xi, xi, param0 );
2022 brw_MUL( p, yi, yi, param1 );
2023 brw_MUL( p, zi, zi, param2 );
2024
2025 /* Here we interpolate in the y dimension... */
2026 brw_MUL( p, x0y1, x0y1, yi );
2027 brw_MUL( p, x1y1, x1y1, yi );
2028 brw_ADD( p, x0y0, x0y0, x0y1 );
2029 brw_ADD( p, x1y0, x1y0, x1y1 );
2030
2031 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2032 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2033 brw_MUL( p, x1y0, x1y0, xi );
2034 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2035
2036 /* Now do the same thing for the front four gradients... */
2037 /* x component */
2038 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2039 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2040 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2041 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2042
2043 brw_push_insn_state( p );
2044 brw_set_mask_control( p, BRW_MASK_DISABLE );
2045 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2046 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2047 brw_pop_insn_state( p );
2048
2049 brw_MUL( p, x1y0, x1y0, t );
2050 brw_MUL( p, x1y1, x1y1, t );
2051 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
2052 brw_MUL( p, x0y0, x0y0, param0 );
2053 brw_MUL( p, x0y1, x0y1, param0 );
2054
2055 /* y component */
2056 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2057 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2058 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2059 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2060
2061 brw_push_insn_state( p );
2062 brw_set_mask_control( p, BRW_MASK_DISABLE );
2063 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2064 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2065 brw_pop_insn_state( p );
2066
2067 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2068 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2069 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
2070 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
2071 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
2072
2073 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2074 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2075 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2076 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2077
2078 /* z component */
2079 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2080 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2081 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2082 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2083
2084 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2085 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2086 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2087 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2088
2089 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2090 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2091 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2092 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2093
2094 /* The interpolation coefficients are still around from last time, so
2095 again interpolate in the y dimension... */
2096 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2097 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2098 brw_MUL( p, x0y1, x0y1, yi );
2099 brw_MUL( p, x1y1, x1y1, yi );
2100 brw_ADD( p, x0y0, x0y0, x0y1 );
2101 brw_ADD( p, x1y0, x1y0, x1y1 );
2102
2103 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2104 time put the front face in tmp[ 1 ] and we're nearly there... */
2105 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2106 brw_MUL( p, x1y0, x1y0, xi );
2107 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2108
2109 /* The final interpolation, in the z dimension: */
2110 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2111 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
2112 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2113
2114 /* scale by pow( 2, -15 ), as described above */
2115 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2116
2117 release_tmps( c, mark );
2118 }
2119
2120 static void emit_noise3( struct brw_wm_compile *c,
2121 const struct prog_instruction *inst )
2122 {
2123 struct brw_compile *p = &c->func;
2124 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
2125 GLuint mask = inst->DstReg.WriteMask;
2126 int i;
2127 int mark = mark_tmps( c );
2128
2129 assert( mark == 0 );
2130
2131 src0 = get_src_reg( c, inst, 0, 0 );
2132 src1 = get_src_reg( c, inst, 0, 1 );
2133 src2 = get_src_reg( c, inst, 0, 2 );
2134
2135 param0 = alloc_tmp( c );
2136 param1 = alloc_tmp( c );
2137 param2 = alloc_tmp( c );
2138
2139 brw_MOV( p, param0, src0 );
2140 brw_MOV( p, param1, src1 );
2141 brw_MOV( p, param2, src2 );
2142
2143 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
2144
2145 /* Fill in the result: */
2146 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2147 for (i = 0 ; i < 4; i++) {
2148 if (mask & (1<<i)) {
2149 dst = get_dst_reg(c, inst, i);
2150 brw_MOV( p, dst, param0 );
2151 }
2152 }
2153 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2154 brw_set_saturate( p, 0 );
2155
2156 release_tmps( c, mark );
2157 }
2158
2159 /**
2160 * For the four-dimensional case, the little micro-optimisation benefits
2161 * we obtain by unrolling all the loops aren't worth the massive bloat it
2162 * now causes. Instead, we loop twice around performing a similar operation
2163 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
2164 * code to glue it all together.
2165 */
2166 static void noise4_sub( struct brw_wm_compile *c )
2167 {
2168 struct brw_compile *p = &c->func;
2169 struct brw_reg param[ 4 ],
2170 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
2171 w0, /* noise for the w=0 cube */
2172 floors[ 2 ], /* integer coordinates of base corner of hypercube */
2173 interp[ 4 ], /* interpolation coefficients */
2174 t, tmp[ 8 ], /* float temporaries */
2175 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
2176 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
2177 int i, j;
2178 int mark = mark_tmps( c );
2179 GLuint loop, origin;
2180
2181 x0y0 = alloc_tmp( c );
2182 x0y1 = alloc_tmp( c );
2183 x1y0 = alloc_tmp( c );
2184 x1y1 = alloc_tmp( c );
2185 t = alloc_tmp( c );
2186 w0 = alloc_tmp( c );
2187 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2188 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2189
2190 for( i = 0; i < 4; i++ ) {
2191 param[ i ] = lookup_tmp( c, mark - 5 + i );
2192 interp[ i ] = alloc_tmp( c );
2193 }
2194
2195 for( i = 0; i < 8; i++ ) {
2196 tmp[ i ] = alloc_tmp( c );
2197 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
2198 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
2199 }
2200
2201 brw_set_access_mode( p, BRW_ALIGN_1 );
2202
2203 /* We only want 16 bits of precision from the integral part of each
2204 co-ordinate, but unfortunately the RNDD semantics would saturate
2205 at 16 bits if we performed the operation directly to a 16-bit
2206 destination. Therefore, we round to 32-bit temporaries where
2207 appropriate, and then store only the lower 16 bits. */
2208 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
2209 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
2210 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
2211 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
2212 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
2213 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
2214
2215 /* Modify the flag register here, because the side effect is useful
2216 later (see below). We know for certain that all flags will be
2217 cleared, since the FRC instruction cannot possibly generate
2218 negative results. Even for exceptional inputs (infinities, denormals,
2219 NaNs), the architecture guarantees that the L conditional is false. */
2220 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
2221 brw_FRC( p, param[ 0 ], param[ 0 ] );
2222 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2223 for( i = 1; i < 4; i++ )
2224 brw_FRC( p, param[ i ], param[ i ] );
2225
2226 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
2227 of all. */
2228 for( i = 0; i < 4; i++ )
2229 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
2230 for( i = 0; i < 4; i++ )
2231 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
2232 for( i = 0; i < 4; i++ )
2233 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2234 for( i = 0; i < 4; i++ )
2235 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
2236 for( j = 0; j < 3; j++ )
2237 for( i = 0; i < 4; i++ )
2238 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2239
2240 /* Mark the current address, as it will be a jump destination. The
2241 following code will be executed twice: first, with the flag
2242 register clear indicating the w=0 case, and second with flags
2243 set for w=1. */
2244 loop = p->nr_insn;
2245
2246 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
2247 be hashed. Since we have only 16 bits of precision in the hash, we
2248 must be careful about thorough mixing to maintain entropy as we
2249 squash the input vector into a small scalar. */
2250 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
2251 brw_imm_uw( 0xBC8F ) );
2252 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
2253 brw_imm_uw( 0xD0BD ) );
2254 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
2255 brw_imm_uw( 0x9B93 ) );
2256 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
2257 brw_imm_uw( 0xA359 ) );
2258 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
2259 brw_imm_uw( 0xBC8F ) );
2260
2261 /* Temporarily disable the execution mask while we work with ExecSize=16
2262 channels (the mask is set for ExecSize=8 and is probably incorrect).
2263 Although this might cause execution of unwanted channels, the code
2264 writes only to temporary registers and has no side effects, so
2265 disabling the mask is harmless. */
2266 brw_push_insn_state( p );
2267 brw_set_mask_control( p, BRW_MASK_DISABLE );
2268 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2269 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2270 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2271
2272 /* We're now ready to perform the hashing. The eight hashes are
2273 interleaved for performance. The hash function used is
2274 designed to rapidly achieve avalanche and require only 16x16
2275 bit multiplication, and 8-bit swizzles (which we get for
2276 free). */
2277 for( i = 0; i < 4; i++ )
2278 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2279 for( i = 0; i < 4; i++ )
2280 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2281 odd_bytes( wtmp[ i ] ) );
2282 for( i = 0; i < 4; i++ )
2283 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2284 for( i = 0; i < 4; i++ )
2285 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2286 odd_bytes( wtmp[ i ] ) );
2287 brw_pop_insn_state( p );
2288
2289 /* Now we want to initialise the four rear gradients based on the
2290 hashes. Format conversion from signed integer to float leaves
2291 everything scaled too high by a factor of pow( 2, 15 ), but
2292 we correct for that right at the end. */
2293 /* x component */
2294 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2295 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2296 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2297 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2298 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2299
2300 brw_push_insn_state( p );
2301 brw_set_mask_control( p, BRW_MASK_DISABLE );
2302 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2303 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2304 brw_pop_insn_state( p );
2305
2306 brw_MUL( p, x1y0, x1y0, t );
2307 brw_MUL( p, x1y1, x1y1, t );
2308 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2309 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2310 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2311
2312 /* y component */
2313 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2314 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2315 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2316 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2317
2318 brw_push_insn_state( p );
2319 brw_set_mask_control( p, BRW_MASK_DISABLE );
2320 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2321 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2322 brw_pop_insn_state( p );
2323
2324 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2325 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2326 /* prepare t for the w component (used below): w the first time through
2327 the loop; w - 1 the second time) */
2328 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2329 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2330 p->current->header.predicate_inverse = 1;
2331 brw_MOV( p, t, param[ 3 ] );
2332 p->current->header.predicate_inverse = 0;
2333 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2334 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2335 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2336
2337 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2338 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2339 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2340 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2341
2342 /* z component */
2343 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2344 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2345 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2346 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2347
2348 brw_push_insn_state( p );
2349 brw_set_mask_control( p, BRW_MASK_DISABLE );
2350 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2351 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2352 brw_pop_insn_state( p );
2353
2354 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2355 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2356 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2357 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2358
2359 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2360 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2361 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2362 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2363
2364 /* w component */
2365 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2366 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2367 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2368 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2369
2370 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2371 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2372 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2373 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2374 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2375
2376 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2377 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2378 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2379 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2380
2381 /* Here we interpolate in the y dimension... */
2382 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2383 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2384 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2385 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2386 brw_ADD( p, x0y0, x0y0, x0y1 );
2387 brw_ADD( p, x1y0, x1y0, x1y1 );
2388
2389 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2390 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2391 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2392 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2393
2394 /* Now do the same thing for the front four gradients... */
2395 /* x component */
2396 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2397 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2398 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2399 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2400
2401 brw_push_insn_state( p );
2402 brw_set_mask_control( p, BRW_MASK_DISABLE );
2403 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2404 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2405 brw_pop_insn_state( p );
2406
2407 brw_MUL( p, x1y0, x1y0, t );
2408 brw_MUL( p, x1y1, x1y1, t );
2409 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2410 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2411 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2412
2413 /* y component */
2414 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2415 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2416 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2417 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2418
2419 brw_push_insn_state( p );
2420 brw_set_mask_control( p, BRW_MASK_DISABLE );
2421 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2422 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2423 brw_pop_insn_state( p );
2424
2425 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2426 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2427 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2428 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2429 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2430
2431 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2432 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2433 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2434 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2435
2436 /* z component */
2437 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2438 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2439 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2440 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2441
2442 brw_push_insn_state( p );
2443 brw_set_mask_control( p, BRW_MASK_DISABLE );
2444 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2445 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2446 brw_pop_insn_state( p );
2447
2448 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2449 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2450 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2451 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2452 /* prepare t for the w component (used below): w the first time through
2453 the loop; w - 1 the second time) */
2454 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2455 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2456 p->current->header.predicate_inverse = 1;
2457 brw_MOV( p, t, param[ 3 ] );
2458 p->current->header.predicate_inverse = 0;
2459 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2460
2461 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2462 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2463 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2464 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2465
2466 /* w component */
2467 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2468 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2469 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2470 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2471
2472 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2473 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2474 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2475 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2476
2477 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2478 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2479 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2480 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2481
2482 /* Interpolate in the y dimension: */
2483 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2484 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2485 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2486 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2487 brw_ADD( p, x0y0, x0y0, x0y1 );
2488 brw_ADD( p, x1y0, x1y0, x1y1 );
2489
2490 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2491 time put the front face in tmp[ 1 ] and we're nearly there... */
2492 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2493 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2494 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2495
2496 /* Another interpolation, in the z dimension: */
2497 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2498 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2499 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2500
2501 /* Exit the loop if we've computed both cubes... */
2502 origin = p->nr_insn;
2503 brw_push_insn_state( p );
2504 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2505 brw_set_mask_control( p, BRW_MASK_DISABLE );
2506 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2507 brw_pop_insn_state( p );
2508
2509 /* Save the result for the w=0 case, and increment the w coordinate: */
2510 brw_MOV( p, w0, tmp[ 0 ] );
2511 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2512 brw_imm_uw( 1 ) );
2513
2514 /* Loop around for the other cube. Explicitly set the flag register
2515 (unfortunately we must spend an extra instruction to do this: we
2516 can't rely on a side effect of the previous MOV or ADD because
2517 conditional modifiers which are normally true might be false in
2518 exceptional circumstances, e.g. given a NaN input; the add to
2519 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2520 brw_push_insn_state( p );
2521 brw_set_mask_control( p, BRW_MASK_DISABLE );
2522 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2523 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2524 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2525 brw_pop_insn_state( p );
2526
2527 /* Patch the previous conditional branch now that we know the
2528 destination address. */
2529 brw_set_src1( p->store + origin,
2530 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2531
2532 /* The very last interpolation. */
2533 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2534 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2535 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2536
2537 /* scale by pow( 2, -15 ), as described above */
2538 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2539
2540 release_tmps( c, mark );
2541 }
2542
2543 static void emit_noise4( struct brw_wm_compile *c,
2544 const struct prog_instruction *inst )
2545 {
2546 struct brw_compile *p = &c->func;
2547 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2548 GLuint mask = inst->DstReg.WriteMask;
2549 int i;
2550 int mark = mark_tmps( c );
2551
2552 assert( mark == 0 );
2553
2554 src0 = get_src_reg( c, inst, 0, 0 );
2555 src1 = get_src_reg( c, inst, 0, 1 );
2556 src2 = get_src_reg( c, inst, 0, 2 );
2557 src3 = get_src_reg( c, inst, 0, 3 );
2558
2559 param0 = alloc_tmp( c );
2560 param1 = alloc_tmp( c );
2561 param2 = alloc_tmp( c );
2562 param3 = alloc_tmp( c );
2563
2564 brw_MOV( p, param0, src0 );
2565 brw_MOV( p, param1, src1 );
2566 brw_MOV( p, param2, src2 );
2567 brw_MOV( p, param3, src3 );
2568
2569 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2570
2571 /* Fill in the result: */
2572 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2573 for (i = 0 ; i < 4; i++) {
2574 if (mask & (1<<i)) {
2575 dst = get_dst_reg(c, inst, i);
2576 brw_MOV( p, dst, param0 );
2577 }
2578 }
2579 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2580 brw_set_saturate( p, 0 );
2581
2582 release_tmps( c, mark );
2583 }
2584
2585 static void emit_wpos_xy(struct brw_wm_compile *c,
2586 const struct prog_instruction *inst)
2587 {
2588 struct brw_compile *p = &c->func;
2589 GLuint mask = inst->DstReg.WriteMask;
2590 struct brw_reg src0[2], dst[2];
2591
2592 dst[0] = get_dst_reg(c, inst, 0);
2593 dst[1] = get_dst_reg(c, inst, 1);
2594
2595 src0[0] = get_src_reg(c, inst, 0, 0);
2596 src0[1] = get_src_reg(c, inst, 0, 1);
2597
2598 /* Calculate the pixel offset from window bottom left into destination
2599 * X and Y channels.
2600 */
2601 if (mask & WRITEMASK_X) {
2602 /* X' = X - origin_x */
2603 brw_ADD(p,
2604 dst[0],
2605 retype(src0[0], BRW_REGISTER_TYPE_W),
2606 brw_imm_d(0 - c->key.origin_x));
2607 }
2608
2609 if (mask & WRITEMASK_Y) {
2610 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2611 brw_ADD(p,
2612 dst[1],
2613 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2614 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2615 }
2616 }
2617
2618 /* TODO
2619 BIAS on SIMD8 not working yet...
2620 */
2621 static void emit_txb(struct brw_wm_compile *c,
2622 const struct prog_instruction *inst)
2623 {
2624 struct brw_compile *p = &c->func;
2625 struct brw_reg dst[4], src[4], payload_reg;
2626 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2627 GLuint i;
2628 GLuint msg_type;
2629
2630 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2631
2632 for (i = 0; i < 4; i++)
2633 dst[i] = get_dst_reg(c, inst, i);
2634 for (i = 0; i < 4; i++)
2635 src[i] = get_src_reg(c, inst, 0, i);
2636
2637 switch (inst->TexSrcTarget) {
2638 case TEXTURE_1D_INDEX:
2639 brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
2640 brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
2641 brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
2642 break;
2643 case TEXTURE_2D_INDEX:
2644 case TEXTURE_RECT_INDEX:
2645 brw_MOV(p, brw_message_reg(2), src[0]);
2646 brw_MOV(p, brw_message_reg(3), src[1]);
2647 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2648 break;
2649 default:
2650 brw_MOV(p, brw_message_reg(2), src[0]);
2651 brw_MOV(p, brw_message_reg(3), src[1]);
2652 brw_MOV(p, brw_message_reg(4), src[2]);
2653 break;
2654 }
2655 brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
2656 brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
2657
2658 if (BRW_IS_IGDNG(p->brw)) {
2659 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
2660 } else {
2661 /* Does it work well on SIMD8? */
2662 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2663 }
2664
2665 brw_SAMPLE(p,
2666 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2667 1, /* msg_reg_nr */
2668 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2669 SURF_INDEX_TEXTURE(unit),
2670 unit, /* sampler */
2671 inst->DstReg.WriteMask, /* writemask */
2672 msg_type, /* msg_type */
2673 4, /* response_length */
2674 4, /* msg_length */
2675 0, /* eot */
2676 1,
2677 BRW_SAMPLER_SIMD_MODE_SIMD8);
2678 }
2679
2680
2681 static void emit_tex(struct brw_wm_compile *c,
2682 const struct prog_instruction *inst)
2683 {
2684 struct brw_compile *p = &c->func;
2685 struct brw_reg dst[4], src[4], payload_reg;
2686 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2687 GLuint msg_len;
2688 GLuint i, nr;
2689 GLuint emit;
2690 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2691 GLuint msg_type;
2692
2693 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2694
2695 for (i = 0; i < 4; i++)
2696 dst[i] = get_dst_reg(c, inst, i);
2697 for (i = 0; i < 4; i++)
2698 src[i] = get_src_reg(c, inst, 0, i);
2699
2700 switch (inst->TexSrcTarget) {
2701 case TEXTURE_1D_INDEX:
2702 emit = WRITEMASK_X;
2703 nr = 1;
2704 break;
2705 case TEXTURE_2D_INDEX:
2706 case TEXTURE_RECT_INDEX:
2707 emit = WRITEMASK_XY;
2708 nr = 2;
2709 break;
2710 default:
2711 emit = WRITEMASK_XYZ;
2712 nr = 3;
2713 break;
2714 }
2715 msg_len = 1;
2716
2717 /* move/load S, T, R coords */
2718 for (i = 0; i < nr; i++) {
2719 static const GLuint swz[4] = {0,1,2,2};
2720 if (emit & (1<<i))
2721 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2722 else
2723 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2724 msg_len += 1;
2725 }
2726
2727 if (shadow) {
2728 brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2729 brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
2730 }
2731
2732 if (BRW_IS_IGDNG(p->brw)) {
2733 if (shadow)
2734 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
2735 else
2736 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
2737 } else {
2738 /* Does it work for shadow on SIMD8 ? */
2739 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2740 }
2741
2742 brw_SAMPLE(p,
2743 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2744 1, /* msg_reg_nr */
2745 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2746 SURF_INDEX_TEXTURE(unit),
2747 unit, /* sampler */
2748 inst->DstReg.WriteMask, /* writemask */
2749 msg_type, /* msg_type */
2750 4, /* response_length */
2751 shadow ? 6 : 4, /* msg_length */
2752 0, /* eot */
2753 1,
2754 BRW_SAMPLER_SIMD_MODE_SIMD8);
2755
2756 if (shadow)
2757 brw_MOV(p, dst[3], brw_imm_f(1.0));
2758 }
2759
2760
2761 /**
2762 * Resolve subroutine calls after code emit is done.
2763 */
2764 static void post_wm_emit( struct brw_wm_compile *c )
2765 {
2766 brw_resolve_cals(&c->func);
2767 }
2768
2769 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2770 {
2771 #define MAX_IF_DEPTH 32
2772 #define MAX_LOOP_DEPTH 32
2773 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
2774 GLuint i, if_depth = 0, loop_depth = 0;
2775 struct brw_compile *p = &c->func;
2776 struct brw_indirect stack_index = brw_indirect(0, 0);
2777
2778 c->out_of_regs = GL_FALSE;
2779
2780 prealloc_reg(c);
2781 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2782 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2783
2784 for (i = 0; i < c->nr_fp_insns; i++) {
2785 const struct prog_instruction *inst = &c->prog_instructions[i];
2786
2787 c->cur_inst = i;
2788
2789 #if 0
2790 _mesa_printf("Inst %d: ", i);
2791 _mesa_print_instruction(inst);
2792 #endif
2793
2794 /* fetch any constants that this instruction needs */
2795 if (c->fp->use_const_buffer)
2796 fetch_constants(c, inst);
2797
2798 if (inst->CondUpdate)
2799 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2800 else
2801 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2802
2803 switch (inst->Opcode) {
2804 case WM_PIXELXY:
2805 emit_pixel_xy(c, inst);
2806 break;
2807 case WM_DELTAXY:
2808 emit_delta_xy(c, inst);
2809 break;
2810 case WM_PIXELW:
2811 emit_pixel_w(c, inst);
2812 break;
2813 case WM_LINTERP:
2814 emit_linterp(c, inst);
2815 break;
2816 case WM_PINTERP:
2817 emit_pinterp(c, inst);
2818 break;
2819 case WM_CINTERP:
2820 emit_cinterp(c, inst);
2821 break;
2822 case WM_WPOSXY:
2823 emit_wpos_xy(c, inst);
2824 break;
2825 case WM_FB_WRITE:
2826 emit_fb_write(c, inst);
2827 break;
2828 case WM_FRONTFACING:
2829 emit_frontfacing(c, inst);
2830 break;
2831 case OPCODE_ABS:
2832 emit_abs(c, inst);
2833 break;
2834 case OPCODE_ADD:
2835 emit_add(c, inst);
2836 break;
2837 case OPCODE_ARL:
2838 emit_arl(c, inst);
2839 break;
2840 case OPCODE_SUB:
2841 emit_sub(c, inst);
2842 break;
2843 case OPCODE_FRC:
2844 emit_frc(c, inst);
2845 break;
2846 case OPCODE_FLR:
2847 emit_flr(c, inst);
2848 break;
2849 case OPCODE_LRP:
2850 emit_lrp(c, inst);
2851 break;
2852 case OPCODE_TRUNC:
2853 emit_trunc(c, inst);
2854 break;
2855 case OPCODE_MOV:
2856 case OPCODE_SWZ:
2857 emit_mov(c, inst);
2858 break;
2859 case OPCODE_DP3:
2860 emit_dp3(c, inst);
2861 break;
2862 case OPCODE_DP4:
2863 emit_dp4(c, inst);
2864 break;
2865 case OPCODE_XPD:
2866 emit_xpd(c, inst);
2867 break;
2868 case OPCODE_DPH:
2869 emit_dph(c, inst);
2870 break;
2871 case OPCODE_RCP:
2872 emit_rcp(c, inst);
2873 break;
2874 case OPCODE_RSQ:
2875 emit_rsq(c, inst);
2876 break;
2877 case OPCODE_SIN:
2878 emit_sin(c, inst);
2879 break;
2880 case OPCODE_COS:
2881 emit_cos(c, inst);
2882 break;
2883 case OPCODE_EX2:
2884 emit_ex2(c, inst);
2885 break;
2886 case OPCODE_LG2:
2887 emit_lg2(c, inst);
2888 break;
2889 case OPCODE_MIN:
2890 case OPCODE_MAX:
2891 emit_min_max(c, inst);
2892 break;
2893 case OPCODE_DDX:
2894 emit_ddx(c, inst);
2895 break;
2896 case OPCODE_DDY:
2897 emit_ddy(c, inst);
2898 break;
2899 case OPCODE_SLT:
2900 emit_slt(c, inst);
2901 break;
2902 case OPCODE_SLE:
2903 emit_sle(c, inst);
2904 break;
2905 case OPCODE_SGT:
2906 emit_sgt(c, inst);
2907 break;
2908 case OPCODE_SGE:
2909 emit_sge(c, inst);
2910 break;
2911 case OPCODE_SEQ:
2912 emit_seq(c, inst);
2913 break;
2914 case OPCODE_SNE:
2915 emit_sne(c, inst);
2916 break;
2917 case OPCODE_MUL:
2918 emit_mul(c, inst);
2919 break;
2920 case OPCODE_POW:
2921 emit_pow(c, inst);
2922 break;
2923 case OPCODE_MAD:
2924 emit_mad(c, inst);
2925 break;
2926 case OPCODE_NOISE1:
2927 emit_noise1(c, inst);
2928 break;
2929 case OPCODE_NOISE2:
2930 emit_noise2(c, inst);
2931 break;
2932 case OPCODE_NOISE3:
2933 emit_noise3(c, inst);
2934 break;
2935 case OPCODE_NOISE4:
2936 emit_noise4(c, inst);
2937 break;
2938 case OPCODE_TEX:
2939 emit_tex(c, inst);
2940 break;
2941 case OPCODE_TXB:
2942 emit_txb(c, inst);
2943 break;
2944 case OPCODE_KIL_NV:
2945 emit_kil(c);
2946 break;
2947 case OPCODE_IF:
2948 assert(if_depth < MAX_IF_DEPTH);
2949 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2950 break;
2951 case OPCODE_ELSE:
2952 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
2953 break;
2954 case OPCODE_ENDIF:
2955 assert(if_depth > 0);
2956 brw_ENDIF(p, if_inst[--if_depth]);
2957 break;
2958 case OPCODE_BGNSUB:
2959 brw_save_label(p, inst->Comment, p->nr_insn);
2960 break;
2961 case OPCODE_ENDSUB:
2962 /* no-op */
2963 break;
2964 case OPCODE_CAL:
2965 brw_push_insn_state(p);
2966 brw_set_mask_control(p, BRW_MASK_DISABLE);
2967 brw_set_access_mode(p, BRW_ALIGN_1);
2968 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2969 brw_set_access_mode(p, BRW_ALIGN_16);
2970 brw_ADD(p, get_addr_reg(stack_index),
2971 get_addr_reg(stack_index), brw_imm_d(4));
2972 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2973 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2974 brw_pop_insn_state(p);
2975 break;
2976
2977 case OPCODE_RET:
2978 brw_push_insn_state(p);
2979 brw_set_mask_control(p, BRW_MASK_DISABLE);
2980 brw_ADD(p, get_addr_reg(stack_index),
2981 get_addr_reg(stack_index), brw_imm_d(-4));
2982 brw_set_access_mode(p, BRW_ALIGN_1);
2983 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2984 brw_set_access_mode(p, BRW_ALIGN_16);
2985 brw_pop_insn_state(p);
2986
2987 break;
2988 case OPCODE_BGNLOOP:
2989 /* XXX may need to invalidate the current_constant regs */
2990 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2991 break;
2992 case OPCODE_BRK:
2993 brw_BREAK(p);
2994 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2995 break;
2996 case OPCODE_CONT:
2997 brw_CONT(p);
2998 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2999 break;
3000 case OPCODE_ENDLOOP:
3001 {
3002 struct brw_instruction *inst0, *inst1;
3003 GLuint br = 1;
3004
3005 if (BRW_IS_IGDNG(brw))
3006 br = 2;
3007
3008 loop_depth--;
3009 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
3010 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
3011 while (inst0 > loop_inst[loop_depth]) {
3012 inst0--;
3013 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
3014 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
3015 inst0->bits3.if_else.pop_count = 0;
3016 }
3017 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
3018 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3019 inst0->bits3.if_else.pop_count = 0;
3020 }
3021 }
3022 }
3023 break;
3024 default:
3025 _mesa_printf("unsupported IR in fragment shader %d\n",
3026 inst->Opcode);
3027 }
3028
3029 if (inst->CondUpdate)
3030 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
3031 else
3032 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3033 }
3034 post_wm_emit(c);
3035 }
3036
3037
3038 /**
3039 * Do GPU code generation for shaders that use GLSL features such as
3040 * flow control. Other shaders will be compiled with the
3041 */
3042 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
3043 {
3044 if (INTEL_DEBUG & DEBUG_WM) {
3045 _mesa_printf("brw_wm_glsl_emit:\n");
3046 }
3047
3048 /* initial instruction translation/simplification */
3049 brw_wm_pass_fp(c);
3050
3051 /* actual code generation */
3052 brw_wm_emit_glsl(brw, c);
3053
3054 if (INTEL_DEBUG & DEBUG_WM) {
3055 brw_wm_print_program(c, "brw_wm_glsl_emit done");
3056 }
3057
3058 c->prog_data.total_grf = num_grf_used(c);
3059 c->prog_data.total_scratch = 0;
3060 }