Merge branch 'mesa_7_6_branch'
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/**
 * Identifiers for the shared instruction sequences that
 * invoke_subroutine() can emit once and then jump to again.
 */
enum _subroutine {
   SUB_NOISE1,
   SUB_NOISE2,
   SUB_NOISE3,
   SUB_NOISE4
};
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 for (i = 0; i < fp->Base.NumInstructions; i++) {
27 const struct prog_instruction *inst = &fp->Base.Instructions[i];
28 switch (inst->Opcode) {
29 case OPCODE_ARL:
30 case OPCODE_IF:
31 case OPCODE_ENDIF:
32 case OPCODE_CAL:
33 case OPCODE_BRK:
34 case OPCODE_RET:
35 case OPCODE_NOISE1:
36 case OPCODE_NOISE2:
37 case OPCODE_NOISE3:
38 case OPCODE_NOISE4:
39 case OPCODE_BGNLOOP:
40 return GL_TRUE;
41 default:
42 break;
43 }
44 }
45 return GL_FALSE;
46 }
47
48
49
50 static void
51 reclaim_temps(struct brw_wm_compile *c);
52
53
54 /** Mark GRF register as used. */
55 static void
56 prealloc_grf(struct brw_wm_compile *c, int r)
57 {
58 c->used_grf[r] = GL_TRUE;
59 }
60
61
62 /** Mark given GRF register as not in use. */
63 static void
64 release_grf(struct brw_wm_compile *c, int r)
65 {
66 /*assert(c->used_grf[r]);*/
67 c->used_grf[r] = GL_FALSE;
68 c->first_free_grf = MIN2(c->first_free_grf, r);
69 }
70
71
72 /** Return index of a free GRF, mark it as used. */
73 static int
74 alloc_grf(struct brw_wm_compile *c)
75 {
76 GLuint r;
77 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
78 if (!c->used_grf[r]) {
79 c->used_grf[r] = GL_TRUE;
80 c->first_free_grf = r + 1; /* a guess */
81 return r;
82 }
83 }
84
85 /* no free temps, try to reclaim some */
86 reclaim_temps(c);
87 c->first_free_grf = 0;
88
89 /* try alloc again */
90 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
91 if (!c->used_grf[r]) {
92 c->used_grf[r] = GL_TRUE;
93 c->first_free_grf = r + 1; /* a guess */
94 return r;
95 }
96 }
97
98 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
99 assert(c->used_grf[r]);
100 }
101
102 /* really, no free GRF regs found */
103 if (!c->out_of_regs) {
104 /* print warning once per compilation */
105 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
106 c->out_of_regs = GL_TRUE;
107 }
108
109 return -1;
110 }
111
112
113 /** Return number of GRF registers used */
114 static int
115 num_grf_used(const struct brw_wm_compile *c)
116 {
117 int r;
118 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
119 if (c->used_grf[r])
120 return r + 1;
121 return 0;
122 }
123
124
125
126 /**
127 * Record the mapping of a Mesa register to a hardware register.
128 */
129 static void set_reg(struct brw_wm_compile *c, int file, int index,
130 int component, struct brw_reg reg)
131 {
132 c->wm_regs[file][index][component].reg = reg;
133 c->wm_regs[file][index][component].inited = GL_TRUE;
134 }
135
136 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
137 {
138 struct brw_reg reg;
139
140 /* if we need to allocate another temp, grow the tmp_regs[] array */
141 if (c->tmp_index == c->tmp_max) {
142 int r = alloc_grf(c);
143 if (r < 0) {
144 /*printf("Out of temps in %s\n", __FUNCTION__);*/
145 r = 50; /* XXX random register! */
146 }
147 c->tmp_regs[ c->tmp_max++ ] = r;
148 }
149
150 /* form the GRF register */
151 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
152 /*printf("alloc_temp %d\n", reg.nr);*/
153 assert(reg.nr < BRW_WM_MAX_GRF);
154 return reg;
155
156 }
157
158 /**
159 * Save current temp register info.
160 * There must be a matching call to release_tmps().
161 */
162 static int mark_tmps(struct brw_wm_compile *c)
163 {
164 return c->tmp_index;
165 }
166
167 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
168 {
169 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
170 }
171
172 static void release_tmps(struct brw_wm_compile *c, int mark)
173 {
174 c->tmp_index = mark;
175 }
176
177 /**
178 * Convert Mesa src register to brw register.
179 *
180 * Since we're running in SOA mode each Mesa register corresponds to four
181 * hardware registers. We allocate the hardware registers as needed here.
182 *
183 * \param file register file, one of PROGRAM_x
184 * \param index register number
185 * \param component src component (X=0, Y=1, Z=2, W=3)
186 * \param nr not used?!?
187 * \param neg negate value?
188 * \param abs take absolute value?
189 */
190 static struct brw_reg
191 get_reg(struct brw_wm_compile *c, int file, int index, int component,
192 int nr, GLuint neg, GLuint abs)
193 {
194 struct brw_reg reg;
195 switch (file) {
196 case PROGRAM_STATE_VAR:
197 case PROGRAM_CONSTANT:
198 case PROGRAM_UNIFORM:
199 file = PROGRAM_STATE_VAR;
200 break;
201 case PROGRAM_UNDEFINED:
202 return brw_null_reg();
203 case PROGRAM_TEMPORARY:
204 case PROGRAM_INPUT:
205 case PROGRAM_OUTPUT:
206 case PROGRAM_PAYLOAD:
207 break;
208 default:
209 _mesa_problem(NULL, "Unexpected file in get_reg()");
210 return brw_null_reg();
211 }
212
213 assert(index < 256);
214 assert(component < 4);
215
216 /* see if we've already allocated a HW register for this Mesa register */
217 if (c->wm_regs[file][index][component].inited) {
218 /* yes, re-use */
219 reg = c->wm_regs[file][index][component].reg;
220 }
221 else {
222 /* no, allocate new register */
223 int grf = alloc_grf(c);
224 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
225 if (grf < 0) {
226 /* totally out of temps */
227 grf = 51; /* XXX random register! */
228 }
229
230 reg = brw_vec8_grf(grf, 0);
231 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
232
233 set_reg(c, file, index, component, reg);
234 }
235
236 if (neg & (1 << component)) {
237 reg = negate(reg);
238 }
239 if (abs)
240 reg = brw_abs(reg);
241 return reg;
242 }
243
244
245
246 /**
247 * This is called if we run out of GRF registers. Examine the live intervals
248 * of temp regs in the program and free those which won't be used again.
249 */
250 static void
251 reclaim_temps(struct brw_wm_compile *c)
252 {
253 GLint intBegin[MAX_PROGRAM_TEMPS];
254 GLint intEnd[MAX_PROGRAM_TEMPS];
255 int index;
256
257 /*printf("Reclaim temps:\n");*/
258
259 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
260 intBegin, intEnd);
261
262 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
263 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
264 /* program temp[i] can be freed */
265 int component;
266 /*printf(" temp[%d] is dead\n", index);*/
267 for (component = 0; component < 4; component++) {
268 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
269 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
270 release_grf(c, r);
271 /*
272 printf(" Reclaim temp %d, reg %d at inst %d\n",
273 index, r, c->cur_inst);
274 */
275 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
276 }
277 }
278 }
279 }
280 }
281
282
283
284
285 /**
286 * Preallocate registers. This sets up the Mesa to hardware register
287 * mapping for certain registers, such as constants (uniforms/state vars)
288 * and shader inputs.
289 */
290 static void prealloc_reg(struct brw_wm_compile *c)
291 {
292 int i, j;
293 struct brw_reg reg;
294 int urb_read_length = 0;
295 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
296 GLuint reg_index = 0;
297
298 memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
299 c->first_free_grf = 0;
300
301 for (i = 0; i < 4; i++) {
302 if (i < c->key.nr_depth_regs)
303 reg = brw_vec8_grf(i * 2, 0);
304 else
305 reg = brw_vec8_grf(0, 0);
306 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
307 }
308 reg_index += 2 * c->key.nr_depth_regs;
309
310 /* constants */
311 {
312 const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
313 const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
314
315 /* use a real constant buffer, or just use a section of the GRF? */
316 /* XXX this heuristic may need adjustment... */
317 if ((nr_params + nr_temps) * 4 + reg_index > 80)
318 c->fp->use_const_buffer = GL_TRUE;
319 else
320 c->fp->use_const_buffer = GL_FALSE;
321 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
322
323 if (c->fp->use_const_buffer) {
324 /* We'll use a real constant buffer and fetch constants from
325 * it with a dataport read message.
326 */
327
328 /* number of float constants in CURBE */
329 c->prog_data.nr_params = 0;
330 }
331 else {
332 const struct gl_program_parameter_list *plist =
333 c->fp->program.Base.Parameters;
334 int index = 0;
335
336 /* number of float constants in CURBE */
337 c->prog_data.nr_params = 4 * nr_params;
338
339 /* loop over program constants (float[4]) */
340 for (i = 0; i < nr_params; i++) {
341 /* loop over XYZW channels */
342 for (j = 0; j < 4; j++, index++) {
343 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
344 /* Save pointer to parameter/constant value.
345 * Constants will be copied in prepare_constant_buffer()
346 */
347 c->prog_data.param[index] = &plist->ParameterValues[i][j];
348 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
349 }
350 }
351 /* number of constant regs used (each reg is float[8]) */
352 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
353 reg_index += c->nr_creg;
354 }
355 }
356
357 /* fragment shader inputs */
358 for (i = 0; i < VERT_RESULT_MAX; i++) {
359 int fp_input;
360
361 if (i >= VERT_RESULT_VAR0)
362 fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
363 else if (i <= VERT_RESULT_TEX7)
364 fp_input = i;
365 else
366 fp_input = -1;
367
368 if (fp_input >= 0 && inputs & (1 << fp_input)) {
369 urb_read_length = reg_index;
370 reg = brw_vec8_grf(reg_index, 0);
371 for (j = 0; j < 4; j++)
372 set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
373 }
374 if (c->key.vp_outputs_written & (1 << i)) {
375 reg_index += 2;
376 }
377 }
378
379 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
380 c->prog_data.urb_read_length = urb_read_length;
381 c->prog_data.curb_read_length = c->nr_creg;
382 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
383 reg_index++;
384 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
385 reg_index += 2;
386
387 /* mark GRF regs [0..reg_index-1] as in-use */
388 for (i = 0; i < reg_index; i++)
389 prealloc_grf(c, i);
390
391 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
392 prealloc_grf(c, 126);
393 prealloc_grf(c, 127);
394
395 for (i = 0; i < c->nr_fp_insns; i++) {
396 const struct prog_instruction *inst = &c->prog_instructions[i];
397 struct brw_reg dst[4];
398
399 switch (inst->Opcode) {
400 case OPCODE_TEX:
401 case OPCODE_TXB:
402 /* Allocate the channels of texture results contiguously,
403 * since they are written out that way by the sampler unit.
404 */
405 for (j = 0; j < 4; j++) {
406 dst[j] = get_dst_reg(c, inst, j);
407 if (j != 0)
408 assert(dst[j].nr == dst[j - 1].nr + 1);
409 }
410 break;
411 default:
412 break;
413 }
414 }
415
416 /* An instruction may reference up to three constants.
417 * They'll be found in these registers.
418 * XXX alloc these on demand!
419 */
420 if (c->fp->use_const_buffer) {
421 for (i = 0; i < 3; i++) {
422 c->current_const[i].index = -1;
423 c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
424 }
425 }
426 #if 0
427 printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
428 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
429 #endif
430 }
431
432
433 /**
434 * Check if any of the instruction's src registers are constants, uniforms,
435 * or statevars. If so, fetch any constants that we don't already have in
436 * the three GRF slots.
437 */
438 static void fetch_constants(struct brw_wm_compile *c,
439 const struct prog_instruction *inst)
440 {
441 struct brw_compile *p = &c->func;
442 GLuint i;
443
444 /* loop over instruction src regs */
445 for (i = 0; i < 3; i++) {
446 const struct prog_src_register *src = &inst->SrcReg[i];
447 if (src->File == PROGRAM_STATE_VAR ||
448 src->File == PROGRAM_CONSTANT ||
449 src->File == PROGRAM_UNIFORM) {
450 c->current_const[i].index = src->Index;
451
452 #if 0
453 printf(" fetch const[%d] for arg %d into reg %d\n",
454 src->Index, i, c->current_const[i].reg.nr);
455 #endif
456
457 /* need to fetch the constant now */
458 brw_dp_READ_4(p,
459 c->current_const[i].reg, /* writeback dest */
460 src->RelAddr, /* relative indexing? */
461 16 * src->Index, /* byte offset */
462 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
463 );
464 }
465 }
466 }
467
468
469 /**
470 * Convert Mesa dst register to brw register.
471 */
472 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
473 const struct prog_instruction *inst,
474 GLuint component)
475 {
476 const int nr = 1;
477 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
478 0, 0);
479 }
480
481
482 static struct brw_reg
483 get_src_reg_const(struct brw_wm_compile *c,
484 const struct prog_instruction *inst,
485 GLuint srcRegIndex, GLuint component)
486 {
487 /* We should have already fetched the constant from the constant
488 * buffer in fetch_constants(). Now we just have to return a
489 * register description that extracts the needed component and
490 * smears it across all eight vector components.
491 */
492 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
493 struct brw_reg const_reg;
494
495 assert(component < 4);
496 assert(srcRegIndex < 3);
497 assert(c->current_const[srcRegIndex].index != -1);
498 const_reg = c->current_const[srcRegIndex].reg;
499
500 /* extract desired float from the const_reg, and smear */
501 const_reg = stride(const_reg, 0, 1, 0);
502 const_reg.subnr = component * 4;
503
504 if (src->Negate & (1 << component))
505 const_reg = negate(const_reg);
506 if (src->Abs)
507 const_reg = brw_abs(const_reg);
508
509 #if 0
510 printf(" form const[%d].%d for arg %d, reg %d\n",
511 c->current_const[srcRegIndex].index,
512 component,
513 srcRegIndex,
514 const_reg.nr);
515 #endif
516
517 return const_reg;
518 }
519
520
521 /**
522 * Convert Mesa src register to brw register.
523 */
524 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
525 const struct prog_instruction *inst,
526 GLuint srcRegIndex, GLuint channel)
527 {
528 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
529 const GLuint nr = 1;
530 const GLuint component = GET_SWZ(src->Swizzle, channel);
531
532 /* Extended swizzle terms */
533 if (component == SWIZZLE_ZERO) {
534 return brw_imm_f(0.0F);
535 }
536 else if (component == SWIZZLE_ONE) {
537 return brw_imm_f(1.0F);
538 }
539
540 if (c->fp->use_const_buffer &&
541 (src->File == PROGRAM_STATE_VAR ||
542 src->File == PROGRAM_CONSTANT ||
543 src->File == PROGRAM_UNIFORM)) {
544 return get_src_reg_const(c, inst, srcRegIndex, component);
545 }
546 else {
547 /* other type of source register */
548 return get_reg(c, src->File, src->Index, component, nr,
549 src->Negate, src->Abs);
550 }
551 }
552
553
554 /**
555 * Same as \sa get_src_reg() but if the register is a literal, emit
556 * a brw_reg encoding the literal.
557 * Note that a brw instruction only allows one src operand to be a literal.
558 * For instructions with more than one operand, only the second can be a
559 * literal. This means that we treat some literals as constants/uniforms
560 * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
561 *
562 */
563 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
564 const struct prog_instruction *inst,
565 GLuint srcRegIndex, GLuint channel)
566 {
567 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
568 if (src->File == PROGRAM_CONSTANT) {
569 /* a literal */
570 const int component = GET_SWZ(src->Swizzle, channel);
571 const GLfloat *param =
572 c->fp->program.Base.Parameters->ParameterValues[src->Index];
573 GLfloat value = param[component];
574 if (src->Negate & (1 << channel))
575 value = -value;
576 if (src->Abs)
577 value = FABSF(value);
578 #if 0
579 printf(" form immed value %f for chan %d\n", value, channel);
580 #endif
581 return brw_imm_f(value);
582 }
583 else {
584 return get_src_reg(c, inst, srcRegIndex, channel);
585 }
586 }
587
588
589 /**
590 * Subroutines are minimal support for resusable instruction sequences.
591 * They are implemented as simply as possible to minimise overhead: there
592 * is no explicit support for communication between the caller and callee
593 * other than saving the return address in a temporary register, nor is
594 * there any automatic local storage. This implies that great care is
595 * required before attempting reentrancy or any kind of nested
596 * subroutine invocations.
597 */
598 static void invoke_subroutine( struct brw_wm_compile *c,
599 enum _subroutine subroutine,
600 void (*emit)( struct brw_wm_compile * ) )
601 {
602 struct brw_compile *p = &c->func;
603
604 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
605
606 if( c->subroutines[ subroutine ] ) {
607 /* subroutine previously emitted: reuse existing instructions */
608
609 int mark = mark_tmps( c );
610 struct brw_reg return_address = retype( alloc_tmp( c ),
611 BRW_REGISTER_TYPE_UD );
612 int here = p->nr_insn;
613
614 brw_push_insn_state(p);
615 brw_set_mask_control(p, BRW_MASK_DISABLE);
616 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
617
618 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
619 brw_imm_d( ( c->subroutines[ subroutine ] -
620 here - 1 ) << 4 ) );
621 brw_pop_insn_state(p);
622
623 release_tmps( c, mark );
624 } else {
625 /* previously unused subroutine: emit, and mark for later reuse */
626
627 int mark = mark_tmps( c );
628 struct brw_reg return_address = retype( alloc_tmp( c ),
629 BRW_REGISTER_TYPE_UD );
630 struct brw_instruction *calc;
631 int base = p->nr_insn;
632
633 brw_push_insn_state(p);
634 brw_set_mask_control(p, BRW_MASK_DISABLE);
635 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
636 brw_pop_insn_state(p);
637
638 c->subroutines[ subroutine ] = p->nr_insn;
639
640 emit( c );
641
642 brw_push_insn_state(p);
643 brw_set_mask_control(p, BRW_MASK_DISABLE);
644 brw_MOV( p, brw_ip_reg(), return_address );
645 brw_pop_insn_state(p);
646
647 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
648
649 release_tmps( c, mark );
650 }
651 }
652
653 static void emit_trunc( struct brw_wm_compile *c,
654 const struct prog_instruction *inst)
655 {
656 int i;
657 struct brw_compile *p = &c->func;
658 GLuint mask = inst->DstReg.WriteMask;
659 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
660 for (i = 0; i < 4; i++) {
661 if (mask & (1<<i)) {
662 struct brw_reg src, dst;
663 dst = get_dst_reg(c, inst, i);
664 src = get_src_reg(c, inst, 0, i);
665 brw_RNDZ(p, dst, src);
666 }
667 }
668 brw_set_saturate(p, 0);
669 }
670
671 static void emit_mov( struct brw_wm_compile *c,
672 const struct prog_instruction *inst)
673 {
674 int i;
675 struct brw_compile *p = &c->func;
676 GLuint mask = inst->DstReg.WriteMask;
677 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
678 for (i = 0; i < 4; i++) {
679 if (mask & (1<<i)) {
680 struct brw_reg src, dst;
681 dst = get_dst_reg(c, inst, i);
682 /* XXX some moves from immediate value don't work reliably!!! */
683 /*src = get_src_reg_imm(c, inst, 0, i);*/
684 src = get_src_reg(c, inst, 0, i);
685 brw_MOV(p, dst, src);
686 }
687 }
688 brw_set_saturate(p, 0);
689 }
690
691 static void emit_pixel_xy(struct brw_wm_compile *c,
692 const struct prog_instruction *inst)
693 {
694 struct brw_reg r1 = brw_vec1_grf(1, 0);
695 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
696
697 struct brw_reg dst0, dst1;
698 struct brw_compile *p = &c->func;
699 GLuint mask = inst->DstReg.WriteMask;
700
701 dst0 = get_dst_reg(c, inst, 0);
702 dst1 = get_dst_reg(c, inst, 1);
703 /* Calculate pixel centers by adding 1 or 0 to each of the
704 * micro-tile coordinates passed in r1.
705 */
706 if (mask & WRITEMASK_X) {
707 brw_ADD(p,
708 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
709 stride(suboffset(r1_uw, 4), 2, 4, 0),
710 brw_imm_v(0x10101010));
711 }
712
713 if (mask & WRITEMASK_Y) {
714 brw_ADD(p,
715 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
716 stride(suboffset(r1_uw, 5), 2, 4, 0),
717 brw_imm_v(0x11001100));
718 }
719 }
720
721 static void emit_delta_xy(struct brw_wm_compile *c,
722 const struct prog_instruction *inst)
723 {
724 struct brw_reg r1 = brw_vec1_grf(1, 0);
725 struct brw_reg dst0, dst1, src0, src1;
726 struct brw_compile *p = &c->func;
727 GLuint mask = inst->DstReg.WriteMask;
728
729 dst0 = get_dst_reg(c, inst, 0);
730 dst1 = get_dst_reg(c, inst, 1);
731 src0 = get_src_reg(c, inst, 0, 0);
732 src1 = get_src_reg(c, inst, 0, 1);
733 /* Calc delta X,Y by subtracting origin in r1 from the pixel
734 * centers.
735 */
736 if (mask & WRITEMASK_X) {
737 brw_ADD(p,
738 dst0,
739 retype(src0, BRW_REGISTER_TYPE_UW),
740 negate(r1));
741 }
742
743 if (mask & WRITEMASK_Y) {
744 brw_ADD(p,
745 dst1,
746 retype(src1, BRW_REGISTER_TYPE_UW),
747 negate(suboffset(r1,1)));
748
749 }
750 }
751
752 static void fire_fb_write( struct brw_wm_compile *c,
753 GLuint base_reg,
754 GLuint nr,
755 GLuint target,
756 GLuint eot)
757 {
758 struct brw_compile *p = &c->func;
759 /* Pass through control information:
760 */
761 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
762 {
763 brw_push_insn_state(p);
764 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
765 brw_MOV(p,
766 brw_message_reg(base_reg + 1),
767 brw_vec8_grf(1, 0));
768 brw_pop_insn_state(p);
769 }
770 /* Send framebuffer write message: */
771 brw_fb_WRITE(p,
772 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
773 base_reg,
774 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
775 target,
776 nr,
777 0,
778 eot);
779 }
780
781 static void emit_fb_write(struct brw_wm_compile *c,
782 const struct prog_instruction *inst)
783 {
784 struct brw_compile *p = &c->func;
785 int nr = 2;
786 int channel;
787 GLuint target, eot;
788 struct brw_reg src0;
789
790 /* Reserve a space for AA - may not be needed:
791 */
792 if (c->key.aa_dest_stencil_reg)
793 nr += 1;
794
795 brw_push_insn_state(p);
796 for (channel = 0; channel < 4; channel++) {
797 src0 = get_src_reg(c, inst, 0, channel);
798 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
799 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
800 brw_MOV(p, brw_message_reg(nr + channel), src0);
801 }
802 /* skip over the regs populated above: */
803 nr += 8;
804 brw_pop_insn_state(p);
805
806 if (c->key.source_depth_to_render_target) {
807 if (c->key.computes_depth) {
808 src0 = get_src_reg(c, inst, 2, 2);
809 brw_MOV(p, brw_message_reg(nr), src0);
810 }
811 else {
812 src0 = get_src_reg(c, inst, 1, 1);
813 brw_MOV(p, brw_message_reg(nr), src0);
814 }
815
816 nr += 2;
817 }
818
819 if (c->key.dest_depth_reg) {
820 const GLuint comp = c->key.dest_depth_reg / 2;
821 const GLuint off = c->key.dest_depth_reg % 2;
822
823 if (off != 0) {
824 /* XXX this code needs review/testing */
825 struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
826 struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
827
828 brw_push_insn_state(p);
829 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
830
831 brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
832 /* 2nd half? */
833 brw_MOV(p, brw_message_reg(nr+1), arg1_1);
834 brw_pop_insn_state(p);
835 }
836 else
837 {
838 struct brw_reg src = get_src_reg(c, inst, 1, 1);
839 brw_MOV(p, brw_message_reg(nr), src);
840 }
841 nr += 2;
842 }
843
844 target = inst->Aux >> 1;
845 eot = inst->Aux & 1;
846 fire_fb_write(c, 0, nr, target, eot);
847 }
848
849 static void emit_pixel_w( struct brw_wm_compile *c,
850 const struct prog_instruction *inst)
851 {
852 struct brw_compile *p = &c->func;
853 GLuint mask = inst->DstReg.WriteMask;
854 if (mask & WRITEMASK_W) {
855 struct brw_reg dst, src0, delta0, delta1;
856 struct brw_reg interp3;
857
858 dst = get_dst_reg(c, inst, 3);
859 src0 = get_src_reg(c, inst, 0, 0);
860 delta0 = get_src_reg(c, inst, 1, 0);
861 delta1 = get_src_reg(c, inst, 1, 1);
862
863 interp3 = brw_vec1_grf(src0.nr+1, 4);
864 /* Calc 1/w - just linterp wpos[3] optimized by putting the
865 * result straight into a message reg.
866 */
867 brw_LINE(p, brw_null_reg(), interp3, delta0);
868 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
869
870 /* Calc w */
871 brw_math_16( p, dst,
872 BRW_MATH_FUNCTION_INV,
873 BRW_MATH_SATURATE_NONE,
874 2, brw_null_reg(),
875 BRW_MATH_PRECISION_FULL);
876 }
877 }
878
879 static void emit_linterp(struct brw_wm_compile *c,
880 const struct prog_instruction *inst)
881 {
882 struct brw_compile *p = &c->func;
883 GLuint mask = inst->DstReg.WriteMask;
884 struct brw_reg interp[4];
885 struct brw_reg dst, delta0, delta1;
886 struct brw_reg src0;
887 GLuint nr, i;
888
889 src0 = get_src_reg(c, inst, 0, 0);
890 delta0 = get_src_reg(c, inst, 1, 0);
891 delta1 = get_src_reg(c, inst, 1, 1);
892 nr = src0.nr;
893
894 interp[0] = brw_vec1_grf(nr, 0);
895 interp[1] = brw_vec1_grf(nr, 4);
896 interp[2] = brw_vec1_grf(nr+1, 0);
897 interp[3] = brw_vec1_grf(nr+1, 4);
898
899 for(i = 0; i < 4; i++ ) {
900 if (mask & (1<<i)) {
901 dst = get_dst_reg(c, inst, i);
902 brw_LINE(p, brw_null_reg(), interp[i], delta0);
903 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
904 }
905 }
906 }
907
908 static void emit_cinterp(struct brw_wm_compile *c,
909 const struct prog_instruction *inst)
910 {
911 struct brw_compile *p = &c->func;
912 GLuint mask = inst->DstReg.WriteMask;
913
914 struct brw_reg interp[4];
915 struct brw_reg dst, src0;
916 GLuint nr, i;
917
918 src0 = get_src_reg(c, inst, 0, 0);
919 nr = src0.nr;
920
921 interp[0] = brw_vec1_grf(nr, 0);
922 interp[1] = brw_vec1_grf(nr, 4);
923 interp[2] = brw_vec1_grf(nr+1, 0);
924 interp[3] = brw_vec1_grf(nr+1, 4);
925
926 for(i = 0; i < 4; i++ ) {
927 if (mask & (1<<i)) {
928 dst = get_dst_reg(c, inst, i);
929 brw_MOV(p, dst, suboffset(interp[i],3));
930 }
931 }
932 }
933
934 static void emit_pinterp(struct brw_wm_compile *c,
935 const struct prog_instruction *inst)
936 {
937 struct brw_compile *p = &c->func;
938 GLuint mask = inst->DstReg.WriteMask;
939
940 struct brw_reg interp[4];
941 struct brw_reg dst, delta0, delta1;
942 struct brw_reg src0, w;
943 GLuint nr, i;
944
945 src0 = get_src_reg(c, inst, 0, 0);
946 delta0 = get_src_reg(c, inst, 1, 0);
947 delta1 = get_src_reg(c, inst, 1, 1);
948 w = get_src_reg(c, inst, 2, 3);
949 nr = src0.nr;
950
951 interp[0] = brw_vec1_grf(nr, 0);
952 interp[1] = brw_vec1_grf(nr, 4);
953 interp[2] = brw_vec1_grf(nr+1, 0);
954 interp[3] = brw_vec1_grf(nr+1, 4);
955
956 for(i = 0; i < 4; i++ ) {
957 if (mask & (1<<i)) {
958 dst = get_dst_reg(c, inst, i);
959 brw_LINE(p, brw_null_reg(), interp[i], delta0);
960 brw_MAC(p, dst, suboffset(interp[i],1),
961 delta1);
962 brw_MUL(p, dst, dst, w);
963 }
964 }
965 }
966
967 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
968 static void emit_frontfacing(struct brw_wm_compile *c,
969 const struct prog_instruction *inst)
970 {
971 struct brw_compile *p = &c->func;
972 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
973 struct brw_reg dst;
974 GLuint mask = inst->DstReg.WriteMask;
975 int i;
976
977 for (i = 0; i < 4; i++) {
978 if (mask & (1<<i)) {
979 dst = get_dst_reg(c, inst, i);
980 brw_MOV(p, dst, brw_imm_f(0.0));
981 }
982 }
983
984 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
985 * us front face
986 */
987 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
988 for (i = 0; i < 4; i++) {
989 if (mask & (1<<i)) {
990 dst = get_dst_reg(c, inst, i);
991 brw_MOV(p, dst, brw_imm_f(1.0));
992 }
993 }
994 brw_set_predicate_control_flag_value(p, 0xff);
995 }
996
997 static void emit_xpd(struct brw_wm_compile *c,
998 const struct prog_instruction *inst)
999 {
1000 int i;
1001 struct brw_compile *p = &c->func;
1002 GLuint mask = inst->DstReg.WriteMask;
1003 for (i = 0; i < 4; i++) {
1004 GLuint i2 = (i+2)%3;
1005 GLuint i1 = (i+1)%3;
1006 if (mask & (1<<i)) {
1007 struct brw_reg src0, src1, dst;
1008 dst = get_dst_reg(c, inst, i);
1009 src0 = negate(get_src_reg(c, inst, 0, i2));
1010 src1 = get_src_reg_imm(c, inst, 1, i1);
1011 brw_MUL(p, brw_null_reg(), src0, src1);
1012 src0 = get_src_reg(c, inst, 0, i1);
1013 src1 = get_src_reg_imm(c, inst, 1, i2);
1014 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1015 brw_MAC(p, dst, src0, src1);
1016 brw_set_saturate(p, 0);
1017 }
1018 }
1019 brw_set_saturate(p, 0);
1020 }
1021
1022 static void emit_dp3(struct brw_wm_compile *c,
1023 const struct prog_instruction *inst)
1024 {
1025 struct brw_reg src0[3], src1[3], dst;
1026 int i;
1027 struct brw_compile *p = &c->func;
1028 GLuint mask = inst->DstReg.WriteMask;
1029 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1030
1031 if (!(mask & WRITEMASK_XYZW))
1032 return;
1033
1034 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1035
1036 for (i = 0; i < 3; i++) {
1037 src0[i] = get_src_reg(c, inst, 0, i);
1038 src1[i] = get_src_reg_imm(c, inst, 1, i);
1039 }
1040
1041 dst = get_dst_reg(c, inst, dst_chan);
1042 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1043 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1044 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1045 brw_MAC(p, dst, src0[2], src1[2]);
1046 brw_set_saturate(p, 0);
1047 }
1048
1049 static void emit_dp4(struct brw_wm_compile *c,
1050 const struct prog_instruction *inst)
1051 {
1052 struct brw_reg src0[4], src1[4], dst;
1053 int i;
1054 struct brw_compile *p = &c->func;
1055 GLuint mask = inst->DstReg.WriteMask;
1056 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1057
1058 if (!(mask & WRITEMASK_XYZW))
1059 return;
1060
1061 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1062
1063 for (i = 0; i < 4; i++) {
1064 src0[i] = get_src_reg(c, inst, 0, i);
1065 src1[i] = get_src_reg_imm(c, inst, 1, i);
1066 }
1067 dst = get_dst_reg(c, inst, dst_chan);
1068 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1069 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1070 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
1071 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1072 brw_MAC(p, dst, src0[3], src1[3]);
1073 brw_set_saturate(p, 0);
1074 }
1075
1076 static void emit_dph(struct brw_wm_compile *c,
1077 const struct prog_instruction *inst)
1078 {
1079 struct brw_reg src0[4], src1[4], dst;
1080 int i;
1081 struct brw_compile *p = &c->func;
1082 GLuint mask = inst->DstReg.WriteMask;
1083 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1084
1085 if (!(mask & WRITEMASK_XYZW))
1086 return;
1087
1088 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1089
1090 for (i = 0; i < 4; i++) {
1091 src0[i] = get_src_reg(c, inst, 0, i);
1092 src1[i] = get_src_reg_imm(c, inst, 1, i);
1093 }
1094 dst = get_dst_reg(c, inst, dst_chan);
1095 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1096 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1097 brw_MAC(p, dst, src0[2], src1[2]);
1098 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1099 brw_ADD(p, dst, dst, src1[3]);
1100 brw_set_saturate(p, 0);
1101 }
1102
1103 /**
1104 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1105 * Note that the result of the function is smeared across the dest
1106 * register's X, Y, Z and W channels (subject to writemasking of course).
1107 */
1108 static void emit_math1(struct brw_wm_compile *c,
1109 const struct prog_instruction *inst, GLuint func)
1110 {
1111 struct brw_compile *p = &c->func;
1112 struct brw_reg src0, dst;
1113 GLuint mask = inst->DstReg.WriteMask;
1114 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1115
1116 if (!(mask & WRITEMASK_XYZW))
1117 return;
1118
1119 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1120
1121 /* Get first component of source register */
1122 dst = get_dst_reg(c, inst, dst_chan);
1123 src0 = get_src_reg(c, inst, 0, 0);
1124
1125 brw_MOV(p, brw_message_reg(2), src0);
1126 brw_math(p,
1127 dst,
1128 func,
1129 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1130 2,
1131 brw_null_reg(),
1132 BRW_MATH_DATA_VECTOR,
1133 BRW_MATH_PRECISION_FULL);
1134 }
1135
1136 static void emit_rcp(struct brw_wm_compile *c,
1137 const struct prog_instruction *inst)
1138 {
1139 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1140 }
1141
1142 static void emit_rsq(struct brw_wm_compile *c,
1143 const struct prog_instruction *inst)
1144 {
1145 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1146 }
1147
1148 static void emit_sin(struct brw_wm_compile *c,
1149 const struct prog_instruction *inst)
1150 {
1151 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1152 }
1153
1154 static void emit_cos(struct brw_wm_compile *c,
1155 const struct prog_instruction *inst)
1156 {
1157 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1158 }
1159
1160 static void emit_ex2(struct brw_wm_compile *c,
1161 const struct prog_instruction *inst)
1162 {
1163 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1164 }
1165
1166 static void emit_lg2(struct brw_wm_compile *c,
1167 const struct prog_instruction *inst)
1168 {
1169 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1170 }
1171
1172 static void emit_add(struct brw_wm_compile *c,
1173 const struct prog_instruction *inst)
1174 {
1175 struct brw_compile *p = &c->func;
1176 struct brw_reg src0, src1, dst;
1177 GLuint mask = inst->DstReg.WriteMask;
1178 int i;
1179 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1180 for (i = 0 ; i < 4; i++) {
1181 if (mask & (1<<i)) {
1182 dst = get_dst_reg(c, inst, i);
1183 src0 = get_src_reg(c, inst, 0, i);
1184 src1 = get_src_reg_imm(c, inst, 1, i);
1185 brw_ADD(p, dst, src0, src1);
1186 }
1187 }
1188 brw_set_saturate(p, 0);
1189 }
1190
1191 static void emit_arl(struct brw_wm_compile *c,
1192 const struct prog_instruction *inst)
1193 {
1194 struct brw_compile *p = &c->func;
1195 struct brw_reg src0, addr_reg;
1196 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1197 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1198 BRW_ARF_ADDRESS, 0);
1199 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1200 brw_MOV(p, addr_reg, src0);
1201 brw_set_saturate(p, 0);
1202 }
1203
1204
1205 static void emit_mul(struct brw_wm_compile *c,
1206 const struct prog_instruction *inst)
1207 {
1208 struct brw_compile *p = &c->func;
1209 struct brw_reg src0, src1, dst;
1210 GLuint mask = inst->DstReg.WriteMask;
1211 int i;
1212 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1213 for (i = 0 ; i < 4; i++) {
1214 if (mask & (1<<i)) {
1215 dst = get_dst_reg(c, inst, i);
1216 src0 = get_src_reg(c, inst, 0, i);
1217 src1 = get_src_reg_imm(c, inst, 1, i);
1218 brw_MUL(p, dst, src0, src1);
1219 }
1220 }
1221 brw_set_saturate(p, 0);
1222 }
1223
1224 static void emit_frc(struct brw_wm_compile *c,
1225 const struct prog_instruction *inst)
1226 {
1227 struct brw_compile *p = &c->func;
1228 struct brw_reg src0, dst;
1229 GLuint mask = inst->DstReg.WriteMask;
1230 int i;
1231 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1232 for (i = 0 ; i < 4; i++) {
1233 if (mask & (1<<i)) {
1234 dst = get_dst_reg(c, inst, i);
1235 src0 = get_src_reg_imm(c, inst, 0, i);
1236 brw_FRC(p, dst, src0);
1237 }
1238 }
1239 if (inst->SaturateMode != SATURATE_OFF)
1240 brw_set_saturate(p, 0);
1241 }
1242
1243 static void emit_flr(struct brw_wm_compile *c,
1244 const struct prog_instruction *inst)
1245 {
1246 struct brw_compile *p = &c->func;
1247 struct brw_reg src0, dst;
1248 GLuint mask = inst->DstReg.WriteMask;
1249 int i;
1250 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1251 for (i = 0 ; i < 4; i++) {
1252 if (mask & (1<<i)) {
1253 dst = get_dst_reg(c, inst, i);
1254 src0 = get_src_reg_imm(c, inst, 0, i);
1255 brw_RNDD(p, dst, src0);
1256 }
1257 }
1258 brw_set_saturate(p, 0);
1259 }
1260
1261
1262 static void emit_min_max(struct brw_wm_compile *c,
1263 const struct prog_instruction *inst)
1264 {
1265 struct brw_compile *p = &c->func;
1266 const GLuint mask = inst->DstReg.WriteMask;
1267 const int mark = mark_tmps(c);
1268 int i;
1269 brw_push_insn_state(p);
1270 for (i = 0; i < 4; i++) {
1271 if (mask & (1<<i)) {
1272 struct brw_reg real_dst = get_dst_reg(c, inst, i);
1273 struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1274 struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1275 struct brw_reg dst;
1276 /* if dst==src0 or dst==src1 we need to use a temp reg */
1277 GLboolean use_temp = brw_same_reg(dst, src0) ||
1278 brw_same_reg(dst, src1);
1279 if (use_temp)
1280 dst = alloc_tmp(c);
1281 else
1282 dst = real_dst;
1283
1284 /*
1285 printf(" Min/max: dst %d src0 %d src1 %d\n",
1286 dst.nr, src0.nr, src1.nr);
1287 */
1288 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1289 brw_MOV(p, dst, src0);
1290 brw_set_saturate(p, 0);
1291
1292 if (inst->Opcode == OPCODE_MIN)
1293 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1294 else
1295 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1296
1297 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1298 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1299 brw_MOV(p, dst, src1);
1300 brw_set_saturate(p, 0);
1301 brw_set_predicate_control_flag_value(p, 0xff);
1302 if (use_temp)
1303 brw_MOV(p, real_dst, dst);
1304 }
1305 }
1306 brw_pop_insn_state(p);
1307 release_tmps(c, mark);
1308 }
1309
1310 static void emit_pow(struct brw_wm_compile *c,
1311 const struct prog_instruction *inst)
1312 {
1313 struct brw_compile *p = &c->func;
1314 struct brw_reg dst, src0, src1;
1315 GLuint mask = inst->DstReg.WriteMask;
1316 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1317
1318 if (!(mask & WRITEMASK_XYZW))
1319 return;
1320
1321 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1322
1323 dst = get_dst_reg(c, inst, dst_chan);
1324 src0 = get_src_reg_imm(c, inst, 0, 0);
1325 src1 = get_src_reg_imm(c, inst, 1, 0);
1326
1327 brw_MOV(p, brw_message_reg(2), src0);
1328 brw_MOV(p, brw_message_reg(3), src1);
1329
1330 brw_math(p,
1331 dst,
1332 BRW_MATH_FUNCTION_POW,
1333 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1334 2,
1335 brw_null_reg(),
1336 BRW_MATH_DATA_VECTOR,
1337 BRW_MATH_PRECISION_FULL);
1338 }
1339
1340 static void emit_lrp(struct brw_wm_compile *c,
1341 const struct prog_instruction *inst)
1342 {
1343 struct brw_compile *p = &c->func;
1344 GLuint mask = inst->DstReg.WriteMask;
1345 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1346 int i;
1347 int mark = mark_tmps(c);
1348 for (i = 0; i < 4; i++) {
1349 if (mask & (1<<i)) {
1350 dst = get_dst_reg(c, inst, i);
1351 src0 = get_src_reg(c, inst, 0, i);
1352
1353 src1 = get_src_reg_imm(c, inst, 1, i);
1354
1355 if (src1.nr == dst.nr) {
1356 tmp1 = alloc_tmp(c);
1357 brw_MOV(p, tmp1, src1);
1358 } else
1359 tmp1 = src1;
1360
1361 src2 = get_src_reg(c, inst, 2, i);
1362 if (src2.nr == dst.nr) {
1363 tmp2 = alloc_tmp(c);
1364 brw_MOV(p, tmp2, src2);
1365 } else
1366 tmp2 = src2;
1367
1368 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1369 brw_MUL(p, brw_null_reg(), dst, tmp2);
1370 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1371 brw_MAC(p, dst, src0, tmp1);
1372 brw_set_saturate(p, 0);
1373 }
1374 release_tmps(c, mark);
1375 }
1376 }
1377
1378 /**
1379 * For GLSL shaders, this KIL will be unconditional.
1380 * It may be contained inside an IF/ENDIF structure of course.
1381 */
1382 static void emit_kil(struct brw_wm_compile *c)
1383 {
1384 struct brw_compile *p = &c->func;
1385 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1386 brw_push_insn_state(p);
1387 brw_set_mask_control(p, BRW_MASK_DISABLE);
1388 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1389 brw_AND(p, depth, c->emit_mask_reg, depth);
1390 brw_pop_insn_state(p);
1391 }
1392
1393 static void emit_mad(struct brw_wm_compile *c,
1394 const struct prog_instruction *inst)
1395 {
1396 struct brw_compile *p = &c->func;
1397 GLuint mask = inst->DstReg.WriteMask;
1398 struct brw_reg dst, src0, src1, src2;
1399 int i;
1400
1401 for (i = 0; i < 4; i++) {
1402 if (mask & (1<<i)) {
1403 dst = get_dst_reg(c, inst, i);
1404 src0 = get_src_reg(c, inst, 0, i);
1405 src1 = get_src_reg_imm(c, inst, 1, i);
1406 src2 = get_src_reg_imm(c, inst, 2, i);
1407 brw_MUL(p, dst, src0, src1);
1408
1409 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1410 brw_ADD(p, dst, dst, src2);
1411 brw_set_saturate(p, 0);
1412 }
1413 }
1414 }
1415
1416 static void emit_sop(struct brw_wm_compile *c,
1417 const struct prog_instruction *inst, GLuint cond)
1418 {
1419 struct brw_compile *p = &c->func;
1420 GLuint mask = inst->DstReg.WriteMask;
1421 struct brw_reg dst, src0, src1;
1422 int i;
1423
1424 for (i = 0; i < 4; i++) {
1425 if (mask & (1<<i)) {
1426 dst = get_dst_reg(c, inst, i);
1427 src0 = get_src_reg(c, inst, 0, i);
1428 src1 = get_src_reg_imm(c, inst, 1, i);
1429 brw_push_insn_state(p);
1430 brw_CMP(p, brw_null_reg(), cond, src0, src1);
1431 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1432 brw_MOV(p, dst, brw_imm_f(0.0));
1433 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1434 brw_MOV(p, dst, brw_imm_f(1.0));
1435 brw_pop_insn_state(p);
1436 }
1437 }
1438 }
1439
1440 static void emit_slt(struct brw_wm_compile *c,
1441 const struct prog_instruction *inst)
1442 {
1443 emit_sop(c, inst, BRW_CONDITIONAL_L);
1444 }
1445
1446 static void emit_sle(struct brw_wm_compile *c,
1447 const struct prog_instruction *inst)
1448 {
1449 emit_sop(c, inst, BRW_CONDITIONAL_LE);
1450 }
1451
1452 static void emit_sgt(struct brw_wm_compile *c,
1453 const struct prog_instruction *inst)
1454 {
1455 emit_sop(c, inst, BRW_CONDITIONAL_G);
1456 }
1457
1458 static void emit_sge(struct brw_wm_compile *c,
1459 const struct prog_instruction *inst)
1460 {
1461 emit_sop(c, inst, BRW_CONDITIONAL_GE);
1462 }
1463
1464 static void emit_seq(struct brw_wm_compile *c,
1465 const struct prog_instruction *inst)
1466 {
1467 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1468 }
1469
1470 static void emit_sne(struct brw_wm_compile *c,
1471 const struct prog_instruction *inst)
1472 {
1473 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1474 }
1475
1476 static INLINE struct brw_reg high_words( struct brw_reg reg )
1477 {
1478 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1479 0, 8, 2 );
1480 }
1481
1482 static INLINE struct brw_reg low_words( struct brw_reg reg )
1483 {
1484 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1485 }
1486
1487 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1488 {
1489 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1490 }
1491
1492 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1493 {
1494 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1495 0, 16, 2 );
1496 }
1497
1498 /* One-, two- and three-dimensional Perlin noise, similar to the description
1499 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
/**
 * Emit the body of the one-dimensional noise subroutine.
 *
 * The argument is read from the temporary at index (mark - 2) and the
 * result is written back to that same register by the final MUL.
 * NOTE(review): that slot is presumably the parameter temporary the
 * caller allocated before invoke_subroutine() -- confirm against the
 * temporary layout used by invoke_subroutine().
 *
 * All temporaries allocated here are released before returning.
 */
1500 static void noise1_sub( struct brw_wm_compile *c ) {
1501
1502 struct brw_compile *p = &c->func;
1503 struct brw_reg param,
1504 x0, x1, /* gradients at each end */
1505 t, tmp[ 2 ], /* float temporaries */
1506 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1507 int i;
1508 int mark = mark_tmps( c );
1509
1510 x0 = alloc_tmp( c );
1511 x1 = alloc_tmp( c );
1512 t = alloc_tmp( c );
1513 tmp[ 0 ] = alloc_tmp( c );
1514 tmp[ 1 ] = alloc_tmp( c );
/* itmp[] are UD-typed views of the five registers allocated above; no
   additional registers are allocated for them. */
1515 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1516 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1517 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1518 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1519 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1520
1521 param = lookup_tmp( c, mark - 2 );
1522
/* Align1 access mode is selected here and is not changed back within
   this function. */
1523 brw_set_access_mode( p, BRW_ALIGN_1 );
1524
1525 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1526
1527 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1528 be hashed. Also compute the remainder (offset within the unit
1529 length), interleaved to reduce register dependency penalties. */
1530 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1531 brw_FRC( p, param, param );
1532 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1533 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1534 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1535
1536 /* We're now ready to perform the hashing. The two hashes are
1537 interleaved for performance. The hash function used is
1538 designed to rapidly achieve avalanche and require only 32x16
1539 bit multiplication, and 16-bit swizzles (which we get for
1540 free). We can't use immediate operands in the multiplies,
1541 because immediates are permitted only in src1 and the 16-bit
1542 factor is permitted only in src0. */
/* Three rounds, each a multiply by one of the constants held in
   itmp[2..4] followed by XORing the high words into the low words. */
1543 for( i = 0; i < 2; i++ )
1544 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1545 for( i = 0; i < 2; i++ )
1546 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1547 high_words( itmp[ i ] ) );
1548 for( i = 0; i < 2; i++ )
1549 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1550 for( i = 0; i < 2; i++ )
1551 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1552 high_words( itmp[ i ] ) );
1553 for( i = 0; i < 2; i++ )
1554 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1555 for( i = 0; i < 2; i++ )
1556 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1557 high_words( itmp[ i ] ) );
1558
1559 /* Now we want to initialise the two gradients based on the
1560 hashes. Format conversion from signed integer to float leaves
1561 everything scaled too high by a factor of pow( 2, 31 ), but
1562 we correct for that right at the end. */
/* The writes to t, x0 and x1 below overwrite the hash constants that
   were held in those registers through itmp[4], itmp[2] and itmp[3]. */
1563 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1564 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1565 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1566
1567 brw_MUL( p, x0, x0, param );
1568 brw_MUL( p, x1, x1, t );
1569
1570 /* We interpolate between the gradients using the polynomial
1571 6t^5 - 15t^4 + 10t^3 (Perlin). */
1572 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1573 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1574 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1575 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1576 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1577 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1578 pipeline */
1579 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1580 brw_MUL( p, param, tmp[ 0 ], param );
1581 brw_MUL( p, x1, x1, param );
1582 brw_ADD( p, x0, x0, x1 );
1583 /* scale by pow( 2, -30 ), to compensate for the format conversion
1584 above and an extra factor of 2 so that a single gradient covers
1585 the [-1,1] range */
1586 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1587
1588 release_tmps( c, mark );
1589 }
1590
1591 static void emit_noise1( struct brw_wm_compile *c,
1592 const struct prog_instruction *inst )
1593 {
1594 struct brw_compile *p = &c->func;
1595 struct brw_reg src, param, dst;
1596 GLuint mask = inst->DstReg.WriteMask;
1597 int i;
1598 int mark = mark_tmps( c );
1599
1600 assert( mark == 0 );
1601
1602 src = get_src_reg( c, inst, 0, 0 );
1603
1604 param = alloc_tmp( c );
1605
1606 brw_MOV( p, param, src );
1607
1608 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1609
1610 /* Fill in the result: */
1611 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1612 for (i = 0 ; i < 4; i++) {
1613 if (mask & (1<<i)) {
1614 dst = get_dst_reg(c, inst, i);
1615 brw_MOV( p, dst, param );
1616 }
1617 }
1618 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1619 brw_set_saturate( p, 0 );
1620
1621 release_tmps( c, mark );
1622 }
1623
/**
 * Emit the body of the two-dimensional noise subroutine.
 *
 * The two arguments are read from the temporaries at indices (mark - 3)
 * and (mark - 2); the result is written to the first of them (param0)
 * by the final MUL.
 * NOTE(review): those slots are presumably the parameter temporaries
 * the caller allocated before invoke_subroutine() -- confirm against
 * the temporary layout used by invoke_subroutine().
 *
 * All temporaries allocated here are released before returning.
 */
1624 static void noise2_sub( struct brw_wm_compile *c ) {
1625
1626 struct brw_compile *p = &c->func;
1627 struct brw_reg param0, param1,
1628 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1629 t, tmp[ 4 ], /* float temporaries */
1630 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1631 int i;
1632 int mark = mark_tmps( c );
1633
1634 x0y0 = alloc_tmp( c );
1635 x0y1 = alloc_tmp( c );
1636 x1y0 = alloc_tmp( c );
1637 x1y1 = alloc_tmp( c );
1638 t = alloc_tmp( c );
1639 for( i = 0; i < 4; i++ ) {
1640 tmp[ i ] = alloc_tmp( c );
1641 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1642 }
/* itmp[4..6] are UD-typed views of three of the gradient registers;
   they hold the hash constants until the gradients are initialised. */
1643 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1644 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1645 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1646
1647 param0 = lookup_tmp( c, mark - 3 );
1648 param1 = lookup_tmp( c, mark - 2 );
1649
/* Align1 access mode is selected here and is not changed back within
   this function. */
1650 brw_set_access_mode( p, BRW_ALIGN_1 );
1651
1652 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1653 be hashed. Also compute the remainders (offsets within the unit
1654 square), interleaved to reduce register dependency penalties. */
1655 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1656 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1657 brw_FRC( p, param0, param0 );
1658 brw_FRC( p, param1, param1 );
1659 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1660 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1661 low_words( itmp[ 1 ] ) );
1662 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1663 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
/* The other three corners are itmp[0] with 1 added to its high word,
   its low word, or both. */
1664 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1665 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1666 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1667
1668 /* We're now ready to perform the hashing. The four hashes are
1669 interleaved for performance. The hash function used is
1670 designed to rapidly achieve avalanche and require only 32x16
1671 bit multiplication, and 16-bit swizzles (which we get for
1672 free). We can't use immediate operands in the multiplies,
1673 because immediates are permitted only in src1 and the 16-bit
1674 factor is permitted only in src0. */
1675 for( i = 0; i < 4; i++ )
1676 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1677 for( i = 0; i < 4; i++ )
1678 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1679 high_words( itmp[ i ] ) );
1680 for( i = 0; i < 4; i++ )
1681 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1682 for( i = 0; i < 4; i++ )
1683 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1684 high_words( itmp[ i ] ) );
1685 for( i = 0; i < 4; i++ )
1686 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1687 for( i = 0; i < 4; i++ )
1688 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1689 high_words( itmp[ i ] ) );
1690
1691 /* Now we want to initialise the four gradients based on the
1692 hashes. Format conversion from signed integer to float leaves
1693 everything scaled too high by a factor of pow( 2, 15 ), but
1694 we correct for that right at the end. */
/* The low word of each hash becomes the x component of a gradient
   (kept in x?y?), the high word the y component (kept in tmp[]). */
1695 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1696 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1697 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1698 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1699 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1700
1701 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1702 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1703 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1704 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1705
1706 brw_MUL( p, x1y0, x1y0, t );
1707 brw_MUL( p, x1y1, x1y1, t );
1708 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1709 brw_MUL( p, x0y0, x0y0, param0 );
1710 brw_MUL( p, x0y1, x0y1, param0 );
1711
1712 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1713 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1714 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1715 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1716
1717 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1718 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1719 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1720 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1721
1722 /* We interpolate between the gradients using the polynomial
1723 6t^5 - 15t^4 + 10t^3 (Perlin). */
1724 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1725 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1726 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1727 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1728 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1729 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1730 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1731 pipeline */
1732 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1733 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1734 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1735 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1736 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1737 pipeline */
1738 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1739 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
/* From here on param0/param1 hold the interpolation weights rather
   than the fractional coordinates. */
1740 brw_MUL( p, param0, tmp[ 0 ], param0 );
1741 brw_MUL( p, param1, tmp[ 1 ], param1 );
1742
1743 /* Here we interpolate in the y dimension... */
1744 brw_MUL( p, x0y1, x0y1, param1 );
1745 brw_MUL( p, x1y1, x1y1, param1 );
1746 brw_ADD( p, x0y0, x0y0, x0y1 );
1747 brw_ADD( p, x1y0, x1y0, x1y1 );
1748
1749 /* And now in x. There are horrible register dependencies here,
1750 but we have nothing else to do. */
1751 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1752 brw_MUL( p, x1y0, x1y0, param0 );
1753 brw_ADD( p, x0y0, x0y0, x1y0 );
1754
1755 /* scale by pow( 2, -15 ), as described above */
1756 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1757
1758 release_tmps( c, mark );
1759 }
1760
1761 static void emit_noise2( struct brw_wm_compile *c,
1762 const struct prog_instruction *inst )
1763 {
1764 struct brw_compile *p = &c->func;
1765 struct brw_reg src0, src1, param0, param1, dst;
1766 GLuint mask = inst->DstReg.WriteMask;
1767 int i;
1768 int mark = mark_tmps( c );
1769
1770 assert( mark == 0 );
1771
1772 src0 = get_src_reg( c, inst, 0, 0 );
1773 src1 = get_src_reg( c, inst, 0, 1 );
1774
1775 param0 = alloc_tmp( c );
1776 param1 = alloc_tmp( c );
1777
1778 brw_MOV( p, param0, src0 );
1779 brw_MOV( p, param1, src1 );
1780
1781 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1782
1783 /* Fill in the result: */
1784 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1785 for (i = 0 ; i < 4; i++) {
1786 if (mask & (1<<i)) {
1787 dst = get_dst_reg(c, inst, i);
1788 brw_MOV( p, dst, param0 );
1789 }
1790 }
1791 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1792 brw_set_saturate( p, 0 );
1793
1794 release_tmps( c, mark );
1795 }
1796
1797 /**
1798 * The three-dimensional case is much like the one- and two- versions above,
1799 * but since the number of corners is rapidly growing we now pack 16 16-bit
1800 * hashes into each register to extract more parallelism from the EUs.
1801 */
/**
 * Emit the body of the three-dimensional noise subroutine.
 *
 * The three arguments are read from the temporaries at indices
 * (mark - 4), (mark - 3) and (mark - 2); the result is written to the
 * first of them (param0) by the final MUL.
 * NOTE(review): those slots are presumably the parameter temporaries
 * the caller allocated before invoke_subroutine() -- confirm against
 * the temporary layout used by invoke_subroutine().
 *
 * All temporaries allocated here are released before returning.
 */
1802 static void noise3_sub( struct brw_wm_compile *c ) {
1803
1804 struct brw_compile *p = &c->func;
1805 struct brw_reg param0, param1, param2,
1806 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1807 xi, yi, zi, /* interpolation coefficients */
1808 t, tmp[ 8 ], /* float temporaries */
1809 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1810 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1811 int i;
1812 int mark = mark_tmps( c );
1813
1814 x0y0 = alloc_tmp( c );
1815 x0y1 = alloc_tmp( c );
1816 x1y0 = alloc_tmp( c );
1817 x1y1 = alloc_tmp( c );
1818 xi = alloc_tmp( c );
1819 yi = alloc_tmp( c );
1820 zi = alloc_tmp( c );
1821 t = alloc_tmp( c );
/* itmp[i] and wtmp[i] are views of the same GRF as tmp[i]: a UD retype
   and a 16 x UW register with the same register number. */
1822 for( i = 0; i < 8; i++ ) {
1823 tmp[ i ] = alloc_tmp( c );
1824 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1825 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1826 }
1827
1828 param0 = lookup_tmp( c, mark - 4 );
1829 param1 = lookup_tmp( c, mark - 3 );
1830 param2 = lookup_tmp( c, mark - 2 );
1831
/* Align1 access mode is selected here and is not changed back within
   this function. */
1832 brw_set_access_mode( p, BRW_ALIGN_1 );
1833
1834 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1835 be hashed. Also compute the remainders (offsets within the unit
1836 cube), interleaved to reduce register dependency penalties. */
1837 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1838 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1839 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1840 brw_FRC( p, param0, param0 );
1841 brw_FRC( p, param1, param1 );
1842 brw_FRC( p, param2, param2 );
1843 /* Since we now have only 16 bits of precision in the hash, we must
1844 be more careful about thorough mixing to maintain entropy as we
1845 squash the input vector into a small scalar. */
1846 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1847 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1848 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1849 brw_imm_uw( 0x9B93 ) );
1850 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1851 brw_imm_uw( 0xBC8F ) );
1852
1853 /* Temporarily disable the execution mask while we work with ExecSize=16
1854 channels (the mask is set for ExecSize=8 and is probably incorrect).
1855 Although this might cause execution of unwanted channels, the code
1856 writes only to temporary registers and has no side effects, so
1857 disabling the mask is harmless. */
1858 brw_push_insn_state( p );
1859 brw_set_mask_control( p, BRW_MASK_DISABLE );
1860 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1861 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1862 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1863
1864 /* We're now ready to perform the hashing. The eight hashes are
1865 interleaved for performance. The hash function used is
1866 designed to rapidly achieve avalanche and require only 16x16
1867 bit multiplication, and 8-bit swizzles (which we get for
1868 free). */
/* Only wtmp[0..3] are hashed; tmp[4..7] serve as scratch registers
   for the gradient components further down. */
1869 for( i = 0; i < 4; i++ )
1870 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1871 for( i = 0; i < 4; i++ )
1872 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1873 odd_bytes( wtmp[ i ] ) );
1874 for( i = 0; i < 4; i++ )
1875 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1876 for( i = 0; i < 4; i++ )
1877 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1878 odd_bytes( wtmp[ i ] ) );
1879 brw_pop_insn_state( p );
1880
1881 /* Now we want to initialise the four rear gradients based on the
1882 hashes. Format conversion from signed integer to float leaves
1883 everything scaled too high by a factor of pow( 2, 15 ), but
1884 we correct for that right at the end. */
1885 /* x component */
1886 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1887 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1888 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1889 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1890 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1891
/* The hash words are shifted left by 5 bits before they are read again
   for the next component.
   NOTE(review): presumably so each gradient component is derived from
   different hash bits -- confirm. */
1892 brw_push_insn_state( p );
1893 brw_set_mask_control( p, BRW_MASK_DISABLE );
1894 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1895 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1896 brw_pop_insn_state( p );
1897
1898 brw_MUL( p, x1y0, x1y0, t );
1899 brw_MUL( p, x1y1, x1y1, t );
1900 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1901 brw_MUL( p, x0y0, x0y0, param0 );
1902 brw_MUL( p, x0y1, x0y1, param0 );
1903
1904 /* y component */
1905 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1906 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1907 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1908 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1909
1910 brw_push_insn_state( p );
1911 brw_set_mask_control( p, BRW_MASK_DISABLE );
1912 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1913 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1914 brw_pop_insn_state( p );
1915
1916 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1917 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
/* t is set back to (param0 - 1) here; its next reads are the x
   component multiplies of the front four gradients below (the rear z
   component uses param2 directly). */
1918 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1919 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1920 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1921
1922 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1923 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1924 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1925 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1926
1927 /* z component */
1928 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1929 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1930 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1931 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1932
1933 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1934 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1935 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1936 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1937
1938 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1939 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1940 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1941 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1942
1943 /* We interpolate between the gradients using the polynomial
1944 6t^5 - 15t^4 + 10t^3 (Perlin). */
/* Unlike the one- and two-dimensional versions, the weights go into
   separate registers (xi, yi, zi) and param0..param2 keep the
   fractional coordinates, which the front face still needs. */
1945 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1946 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1947 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1948 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1949 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1950 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1951 brw_MUL( p, xi, xi, param0 );
1952 brw_MUL( p, yi, yi, param1 );
1953 brw_MUL( p, zi, zi, param2 );
1954 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1955 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1956 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
1957 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1958 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1959 brw_MUL( p, xi, xi, param0 );
1960 brw_MUL( p, yi, yi, param1 );
1961 brw_MUL( p, zi, zi, param2 );
1962 brw_MUL( p, xi, xi, param0 );
1963 brw_MUL( p, yi, yi, param1 );
1964 brw_MUL( p, zi, zi, param2 );
1965 brw_MUL( p, xi, xi, param0 );
1966 brw_MUL( p, yi, yi, param1 );
1967 brw_MUL( p, zi, zi, param2 );
1968
1969 /* Here we interpolate in the y dimension... */
1970 brw_MUL( p, x0y1, x0y1, yi );
1971 brw_MUL( p, x1y1, x1y1, yi );
1972 brw_ADD( p, x0y0, x0y0, x0y1 );
1973 brw_ADD( p, x1y0, x1y0, x1y1 );
1974
1975 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1976 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1977 brw_MUL( p, x1y0, x1y0, xi );
1978 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1979
1980 /* Now do the same thing for the front four gradients... */
1981 /* x component */
1982 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1983 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1984 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1985 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1986
1987 brw_push_insn_state( p );
1988 brw_set_mask_control( p, BRW_MASK_DISABLE );
1989 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1990 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1991 brw_pop_insn_state( p );
1992
1993 brw_MUL( p, x1y0, x1y0, t );
1994 brw_MUL( p, x1y1, x1y1, t );
1995 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1996 brw_MUL( p, x0y0, x0y0, param0 );
1997 brw_MUL( p, x0y1, x0y1, param0 );
1998
1999 /* y component */
2000 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2001 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2002 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2003 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2004
2005 brw_push_insn_state( p );
2006 brw_set_mask_control( p, BRW_MASK_DISABLE );
2007 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2008 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2009 brw_pop_insn_state( p );
2010
2011 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2012 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2013 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
2014 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
2015 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
2016
2017 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2018 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2019 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2020 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2021
2022 /* z component */
/* All four front gradients use t = (param2 - 1) as their z offset,
   where the rear gradients used param2. */
2023 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2024 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2025 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2026 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2027
2028 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2029 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2030 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2031 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2032
2033 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2034 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2035 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2036 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2037
2038 /* The interpolation coefficients are still around from last time, so
2039 again interpolate in the y dimension... */
2040 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2041 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2042 brw_MUL( p, x0y1, x0y1, yi );
2043 brw_MUL( p, x1y1, x1y1, yi );
2044 brw_ADD( p, x0y0, x0y0, x0y1 );
2045 brw_ADD( p, x1y0, x1y0, x1y1 );
2046
2047 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2048 time put the front face in tmp[ 1 ] and we're nearly there... */
2049 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2050 brw_MUL( p, x1y0, x1y0, xi );
2051 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2052
2053 /* The final interpolation, in the z dimension: */
2054 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2055 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
2056 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2057
2058 /* scale by pow( 2, -15 ), as described above */
2059 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2060
2061 release_tmps( c, mark );
2062 }
2063
2064 static void emit_noise3( struct brw_wm_compile *c,
2065 const struct prog_instruction *inst )
2066 {
2067 struct brw_compile *p = &c->func;
2068 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
2069 GLuint mask = inst->DstReg.WriteMask;
2070 int i;
2071 int mark = mark_tmps( c );
2072
2073 assert( mark == 0 );
2074
2075 src0 = get_src_reg( c, inst, 0, 0 );
2076 src1 = get_src_reg( c, inst, 0, 1 );
2077 src2 = get_src_reg( c, inst, 0, 2 );
2078
2079 param0 = alloc_tmp( c );
2080 param1 = alloc_tmp( c );
2081 param2 = alloc_tmp( c );
2082
2083 brw_MOV( p, param0, src0 );
2084 brw_MOV( p, param1, src1 );
2085 brw_MOV( p, param2, src2 );
2086
2087 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
2088
2089 /* Fill in the result: */
2090 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2091 for (i = 0 ; i < 4; i++) {
2092 if (mask & (1<<i)) {
2093 dst = get_dst_reg(c, inst, i);
2094 brw_MOV( p, dst, param0 );
2095 }
2096 }
2097 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2098 brw_set_saturate( p, 0 );
2099
2100 release_tmps( c, mark );
2101 }
2102
2103 /**
2104 * For the four-dimensional case, the little micro-optimisation benefits
2105 * we obtain by unrolling all the loops aren't worth the massive bloat it
2106 * now causes. Instead, we loop twice around performing a similar operation
2107 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
2108 * code to glue it all together.
2109 */
2110 static void noise4_sub( struct brw_wm_compile *c )
2111 {
2112 struct brw_compile *p = &c->func;
2113 struct brw_reg param[ 4 ],
2114 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
2115 w0, /* noise for the w=0 cube */
2116 floors[ 2 ], /* integer coordinates of base corner of hypercube */
2117 interp[ 4 ], /* interpolation coefficients */
2118 t, tmp[ 8 ], /* float temporaries */
2119 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
2120 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
2121 int i, j;
2122 int mark = mark_tmps( c );
2123 GLuint loop, origin;
2124
2125 x0y0 = alloc_tmp( c );
2126 x0y1 = alloc_tmp( c );
2127 x1y0 = alloc_tmp( c );
2128 x1y1 = alloc_tmp( c );
2129 t = alloc_tmp( c );
2130 w0 = alloc_tmp( c );
2131 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2132 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2133
2134 for( i = 0; i < 4; i++ ) {
2135 param[ i ] = lookup_tmp( c, mark - 5 + i );
2136 interp[ i ] = alloc_tmp( c );
2137 }
2138
2139 for( i = 0; i < 8; i++ ) {
2140 tmp[ i ] = alloc_tmp( c );
2141 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
2142 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
2143 }
2144
2145 brw_set_access_mode( p, BRW_ALIGN_1 );
2146
2147 /* We only want 16 bits of precision from the integral part of each
2148 co-ordinate, but unfortunately the RNDD semantics would saturate
2149 at 16 bits if we performed the operation directly to a 16-bit
2150 destination. Therefore, we round to 32-bit temporaries where
2151 appropriate, and then store only the lower 16 bits. */
2152 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
2153 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
2154 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
2155 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
2156 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
2157 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
2158
2159 /* Modify the flag register here, because the side effect is useful
2160 later (see below). We know for certain that all flags will be
2161 cleared, since the FRC instruction cannot possibly generate
2162 negative results. Even for exceptional inputs (infinities, denormals,
2163 NaNs), the architecture guarantees that the L conditional is false. */
2164 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
2165 brw_FRC( p, param[ 0 ], param[ 0 ] );
2166 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2167 for( i = 1; i < 4; i++ )
2168 brw_FRC( p, param[ i ], param[ i ] );
2169
2170 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
2171 of all. */
2172 for( i = 0; i < 4; i++ )
2173 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
2174 for( i = 0; i < 4; i++ )
2175 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
2176 for( i = 0; i < 4; i++ )
2177 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2178 for( i = 0; i < 4; i++ )
2179 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
2180 for( j = 0; j < 3; j++ )
2181 for( i = 0; i < 4; i++ )
2182 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2183
2184 /* Mark the current address, as it will be a jump destination. The
2185 following code will be executed twice: first, with the flag
2186 register clear indicating the w=0 case, and second with flags
2187 set for w=1. */
2188 loop = p->nr_insn;
2189
2190 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
2191 be hashed. Since we have only 16 bits of precision in the hash, we
2192 must be careful about thorough mixing to maintain entropy as we
2193 squash the input vector into a small scalar. */
2194 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
2195 brw_imm_uw( 0xBC8F ) );
2196 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
2197 brw_imm_uw( 0xD0BD ) );
2198 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
2199 brw_imm_uw( 0x9B93 ) );
2200 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
2201 brw_imm_uw( 0xA359 ) );
2202 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
2203 brw_imm_uw( 0xBC8F ) );
2204
2205 /* Temporarily disable the execution mask while we work with ExecSize=16
2206 channels (the mask is set for ExecSize=8 and is probably incorrect).
2207 Although this might cause execution of unwanted channels, the code
2208 writes only to temporary registers and has no side effects, so
2209 disabling the mask is harmless. */
2210 brw_push_insn_state( p );
2211 brw_set_mask_control( p, BRW_MASK_DISABLE );
2212 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2213 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2214 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2215
2216 /* We're now ready to perform the hashing. The eight hashes are
2217 interleaved for performance. The hash function used is
2218 designed to rapidly achieve avalanche and require only 16x16
2219 bit multiplication, and 8-bit swizzles (which we get for
2220 free). */
2221 for( i = 0; i < 4; i++ )
2222 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2223 for( i = 0; i < 4; i++ )
2224 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2225 odd_bytes( wtmp[ i ] ) );
2226 for( i = 0; i < 4; i++ )
2227 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2228 for( i = 0; i < 4; i++ )
2229 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2230 odd_bytes( wtmp[ i ] ) );
2231 brw_pop_insn_state( p );
2232
2233 /* Now we want to initialise the four rear gradients based on the
2234 hashes. Format conversion from signed integer to float leaves
2235 everything scaled too high by a factor of pow( 2, 15 ), but
2236 we correct for that right at the end. */
2237 /* x component */
2238 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2239 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2240 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2241 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2242 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2243
2244 brw_push_insn_state( p );
2245 brw_set_mask_control( p, BRW_MASK_DISABLE );
2246 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2247 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2248 brw_pop_insn_state( p );
2249
2250 brw_MUL( p, x1y0, x1y0, t );
2251 brw_MUL( p, x1y1, x1y1, t );
2252 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2253 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2254 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2255
2256 /* y component */
2257 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2258 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2259 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2260 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2261
2262 brw_push_insn_state( p );
2263 brw_set_mask_control( p, BRW_MASK_DISABLE );
2264 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2265 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2266 brw_pop_insn_state( p );
2267
2268 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2269 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2270 /* prepare t for the w component (used below): w the first time through
2271 the loop; w - 1 the second time) */
2272 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2273 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2274 p->current->header.predicate_inverse = 1;
2275 brw_MOV( p, t, param[ 3 ] );
2276 p->current->header.predicate_inverse = 0;
2277 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2278 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2279 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2280
2281 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2282 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2283 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2284 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2285
2286 /* z component */
2287 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2288 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2289 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2290 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2291
2292 brw_push_insn_state( p );
2293 brw_set_mask_control( p, BRW_MASK_DISABLE );
2294 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2295 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2296 brw_pop_insn_state( p );
2297
2298 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2299 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2300 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2301 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2302
2303 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2304 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2305 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2306 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2307
2308 /* w component */
2309 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2310 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2311 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2312 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2313
2314 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2315 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2316 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2317 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2318 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2319
2320 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2321 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2322 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2323 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2324
2325 /* Here we interpolate in the y dimension... */
2326 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2327 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2328 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2329 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2330 brw_ADD( p, x0y0, x0y0, x0y1 );
2331 brw_ADD( p, x1y0, x1y0, x1y1 );
2332
2333 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2334 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2335 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2336 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2337
2338 /* Now do the same thing for the front four gradients... */
2339 /* x component */
2340 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2341 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2342 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2343 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2344
2345 brw_push_insn_state( p );
2346 brw_set_mask_control( p, BRW_MASK_DISABLE );
2347 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2348 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2349 brw_pop_insn_state( p );
2350
2351 brw_MUL( p, x1y0, x1y0, t );
2352 brw_MUL( p, x1y1, x1y1, t );
2353 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2354 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2355 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2356
2357 /* y component */
2358 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2359 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2360 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2361 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2362
2363 brw_push_insn_state( p );
2364 brw_set_mask_control( p, BRW_MASK_DISABLE );
2365 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2366 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2367 brw_pop_insn_state( p );
2368
2369 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2370 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2371 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2372 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2373 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2374
2375 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2376 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2377 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2378 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2379
2380 /* z component */
2381 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2382 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2383 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2384 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2385
2386 brw_push_insn_state( p );
2387 brw_set_mask_control( p, BRW_MASK_DISABLE );
2388 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2389 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2390 brw_pop_insn_state( p );
2391
2392 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2393 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2394 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2395 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2396 /* prepare t for the w component (used below): w the first time through
2397 the loop; w - 1 the second time) */
2398 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2399 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2400 p->current->header.predicate_inverse = 1;
2401 brw_MOV( p, t, param[ 3 ] );
2402 p->current->header.predicate_inverse = 0;
2403 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2404
2405 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2406 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2407 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2408 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2409
2410 /* w component */
2411 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2412 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2413 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2414 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2415
2416 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2417 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2418 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2419 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2420
2421 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2422 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2423 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2424 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2425
2426 /* Interpolate in the y dimension: */
2427 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2428 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2429 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2430 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2431 brw_ADD( p, x0y0, x0y0, x0y1 );
2432 brw_ADD( p, x1y0, x1y0, x1y1 );
2433
2434 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2435 time put the front face in tmp[ 1 ] and we're nearly there... */
2436 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2437 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2438 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2439
2440 /* Another interpolation, in the z dimension: */
2441 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2442 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2443 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2444
2445 /* Exit the loop if we've computed both cubes... */
2446 origin = p->nr_insn;
2447 brw_push_insn_state( p );
2448 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2449 brw_set_mask_control( p, BRW_MASK_DISABLE );
2450 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2451 brw_pop_insn_state( p );
2452
2453 /* Save the result for the w=0 case, and increment the w coordinate: */
2454 brw_MOV( p, w0, tmp[ 0 ] );
2455 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2456 brw_imm_uw( 1 ) );
2457
2458 /* Loop around for the other cube. Explicitly set the flag register
2459 (unfortunately we must spend an extra instruction to do this: we
2460 can't rely on a side effect of the previous MOV or ADD because
2461 conditional modifiers which are normally true might be false in
2462 exceptional circumstances, e.g. given a NaN input; the add to
2463 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2464 brw_push_insn_state( p );
2465 brw_set_mask_control( p, BRW_MASK_DISABLE );
2466 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2467 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2468 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2469 brw_pop_insn_state( p );
2470
2471 /* Patch the previous conditional branch now that we know the
2472 destination address. */
2473 brw_set_src1( p->store + origin,
2474 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2475
2476 /* The very last interpolation. */
2477 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2478 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2479 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2480
2481 /* scale by pow( 2, -15 ), as described above */
2482 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2483
2484 release_tmps( c, mark );
2485 }
2486
2487 static void emit_noise4( struct brw_wm_compile *c,
2488 const struct prog_instruction *inst )
2489 {
2490 struct brw_compile *p = &c->func;
2491 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2492 GLuint mask = inst->DstReg.WriteMask;
2493 int i;
2494 int mark = mark_tmps( c );
2495
2496 assert( mark == 0 );
2497
2498 src0 = get_src_reg( c, inst, 0, 0 );
2499 src1 = get_src_reg( c, inst, 0, 1 );
2500 src2 = get_src_reg( c, inst, 0, 2 );
2501 src3 = get_src_reg( c, inst, 0, 3 );
2502
2503 param0 = alloc_tmp( c );
2504 param1 = alloc_tmp( c );
2505 param2 = alloc_tmp( c );
2506 param3 = alloc_tmp( c );
2507
2508 brw_MOV( p, param0, src0 );
2509 brw_MOV( p, param1, src1 );
2510 brw_MOV( p, param2, src2 );
2511 brw_MOV( p, param3, src3 );
2512
2513 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2514
2515 /* Fill in the result: */
2516 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2517 for (i = 0 ; i < 4; i++) {
2518 if (mask & (1<<i)) {
2519 dst = get_dst_reg(c, inst, i);
2520 brw_MOV( p, dst, param0 );
2521 }
2522 }
2523 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2524 brw_set_saturate( p, 0 );
2525
2526 release_tmps( c, mark );
2527 }
2528
2529 static void emit_wpos_xy(struct brw_wm_compile *c,
2530 const struct prog_instruction *inst)
2531 {
2532 struct brw_compile *p = &c->func;
2533 GLuint mask = inst->DstReg.WriteMask;
2534 struct brw_reg src0[2], dst[2];
2535
2536 dst[0] = get_dst_reg(c, inst, 0);
2537 dst[1] = get_dst_reg(c, inst, 1);
2538
2539 src0[0] = get_src_reg(c, inst, 0, 0);
2540 src0[1] = get_src_reg(c, inst, 0, 1);
2541
2542 /* Calculate the pixel offset from window bottom left into destination
2543 * X and Y channels.
2544 */
2545 if (mask & WRITEMASK_X) {
2546 /* X' = X - origin_x */
2547 brw_ADD(p,
2548 dst[0],
2549 retype(src0[0], BRW_REGISTER_TYPE_W),
2550 brw_imm_d(0 - c->key.origin_x));
2551 }
2552
2553 if (mask & WRITEMASK_Y) {
2554 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2555 brw_ADD(p,
2556 dst[1],
2557 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2558 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2559 }
2560 }
2561
2562 /* TODO
2563 BIAS on SIMD8 not working yet...
2564 */
2565 static void emit_txb(struct brw_wm_compile *c,
2566 const struct prog_instruction *inst)
2567 {
2568 struct brw_compile *p = &c->func;
2569 struct brw_reg dst[4], src[4], payload_reg;
2570 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2571 const GLuint unit = inst->TexSrcUnit;
2572 GLuint i;
2573 GLuint msg_type;
2574
2575 assert(unit < BRW_MAX_TEX_UNIT);
2576
2577 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2578
2579 for (i = 0; i < 4; i++)
2580 dst[i] = get_dst_reg(c, inst, i);
2581 for (i = 0; i < 4; i++)
2582 src[i] = get_src_reg(c, inst, 0, i);
2583
2584 switch (inst->TexSrcTarget) {
2585 case TEXTURE_1D_INDEX:
2586 brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
2587 brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
2588 brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
2589 break;
2590 case TEXTURE_2D_INDEX:
2591 case TEXTURE_RECT_INDEX:
2592 brw_MOV(p, brw_message_reg(2), src[0]);
2593 brw_MOV(p, brw_message_reg(3), src[1]);
2594 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2595 break;
2596 case TEXTURE_3D_INDEX:
2597 case TEXTURE_CUBE_INDEX:
2598 brw_MOV(p, brw_message_reg(2), src[0]);
2599 brw_MOV(p, brw_message_reg(3), src[1]);
2600 brw_MOV(p, brw_message_reg(4), src[2]);
2601 break;
2602 default:
2603 /* invalid target */
2604 abort();
2605 }
2606 brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
2607 brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
2608
2609 if (BRW_IS_IGDNG(p->brw)) {
2610 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
2611 } else {
2612 /* Does it work well on SIMD8? */
2613 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2614 }
2615
2616 brw_SAMPLE(p,
2617 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2618 1, /* msg_reg_nr */
2619 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2620 SURF_INDEX_TEXTURE(unit),
2621 unit, /* sampler */
2622 inst->DstReg.WriteMask, /* writemask */
2623 msg_type, /* msg_type */
2624 4, /* response_length */
2625 4, /* msg_length */
2626 0, /* eot */
2627 1,
2628 BRW_SAMPLER_SIMD_MODE_SIMD8);
2629 }
2630
2631
2632 static void emit_tex(struct brw_wm_compile *c,
2633 const struct prog_instruction *inst)
2634 {
2635 struct brw_compile *p = &c->func;
2636 struct brw_reg dst[4], src[4], payload_reg;
2637 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2638 const GLuint unit = inst->TexSrcUnit;
2639 GLuint msg_len;
2640 GLuint i, nr;
2641 GLuint emit;
2642 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2643 GLuint msg_type;
2644
2645 assert(unit < BRW_MAX_TEX_UNIT);
2646
2647 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2648
2649 for (i = 0; i < 4; i++)
2650 dst[i] = get_dst_reg(c, inst, i);
2651 for (i = 0; i < 4; i++)
2652 src[i] = get_src_reg(c, inst, 0, i);
2653
2654 switch (inst->TexSrcTarget) {
2655 case TEXTURE_1D_INDEX:
2656 emit = WRITEMASK_X;
2657 nr = 1;
2658 break;
2659 case TEXTURE_2D_INDEX:
2660 case TEXTURE_RECT_INDEX:
2661 emit = WRITEMASK_XY;
2662 nr = 2;
2663 break;
2664 case TEXTURE_3D_INDEX:
2665 case TEXTURE_CUBE_INDEX:
2666 emit = WRITEMASK_XYZ;
2667 nr = 3;
2668 break;
2669 default:
2670 /* invalid target */
2671 abort();
2672 }
2673 msg_len = 1;
2674
2675 /* move/load S, T, R coords */
2676 for (i = 0; i < nr; i++) {
2677 static const GLuint swz[4] = {0,1,2,2};
2678 if (emit & (1<<i))
2679 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2680 else
2681 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2682 msg_len += 1;
2683 }
2684
2685 if (shadow) {
2686 brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2687 brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
2688 }
2689
2690 if (BRW_IS_IGDNG(p->brw)) {
2691 if (shadow)
2692 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
2693 else
2694 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
2695 } else {
2696 /* Does it work for shadow on SIMD8 ? */
2697 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2698 }
2699
2700 brw_SAMPLE(p,
2701 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2702 1, /* msg_reg_nr */
2703 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2704 SURF_INDEX_TEXTURE(unit),
2705 unit, /* sampler */
2706 inst->DstReg.WriteMask, /* writemask */
2707 msg_type, /* msg_type */
2708 4, /* response_length */
2709 shadow ? 6 : 4, /* msg_length */
2710 0, /* eot */
2711 1,
2712 BRW_SAMPLER_SIMD_MODE_SIMD8);
2713
2714 if (shadow)
2715 brw_MOV(p, dst[3], brw_imm_f(1.0));
2716 }
2717
2718
2719 /**
2720 * Resolve subroutine calls after code emit is done.
2721 */
2722 static void post_wm_emit( struct brw_wm_compile *c )
2723 {
2724 brw_resolve_cals(&c->func);
2725 }
2726
2727 static void
2728 get_argument_regs(struct brw_wm_compile *c,
2729 const struct prog_instruction *inst,
2730 int index,
2731 struct brw_reg *regs,
2732 int mask)
2733 {
2734 int i;
2735
2736 for (i = 0; i < 4; i++) {
2737 if (mask & (1 << i))
2738 regs[i] = get_src_reg(c, inst, index, i);
2739 }
2740 }
2741
2742 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2743 {
2744 #define MAX_IF_DEPTH 32
2745 #define MAX_LOOP_DEPTH 32
2746 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
2747 GLuint i, if_depth = 0, loop_depth = 0;
2748 struct brw_compile *p = &c->func;
2749 struct brw_indirect stack_index = brw_indirect(0, 0);
2750
2751 c->out_of_regs = GL_FALSE;
2752
2753 prealloc_reg(c);
2754 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2755 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2756
2757 for (i = 0; i < c->nr_fp_insns; i++) {
2758 const struct prog_instruction *inst = &c->prog_instructions[i];
2759 int dst_flags;
2760 struct brw_reg args[3][4], dst[4];
2761 int j;
2762
2763 c->cur_inst = i;
2764
2765 #if 0
2766 _mesa_printf("Inst %d: ", i);
2767 _mesa_print_instruction(inst);
2768 #endif
2769
2770 /* fetch any constants that this instruction needs */
2771 if (c->fp->use_const_buffer)
2772 fetch_constants(c, inst);
2773
2774 if (inst->CondUpdate)
2775 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2776 else
2777 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2778
2779 dst_flags = inst->DstReg.WriteMask;
2780 if (inst->SaturateMode == SATURATE_ZERO_ONE)
2781 dst_flags |= SATURATE;
2782
2783 switch (inst->Opcode) {
2784 case WM_PIXELXY:
2785 emit_pixel_xy(c, inst);
2786 break;
2787 case WM_DELTAXY:
2788 emit_delta_xy(c, inst);
2789 break;
2790 case WM_PIXELW:
2791 emit_pixel_w(c, inst);
2792 break;
2793 case WM_LINTERP:
2794 emit_linterp(c, inst);
2795 break;
2796 case WM_PINTERP:
2797 emit_pinterp(c, inst);
2798 break;
2799 case WM_CINTERP:
2800 emit_cinterp(c, inst);
2801 break;
2802 case WM_WPOSXY:
2803 emit_wpos_xy(c, inst);
2804 break;
2805 case WM_FB_WRITE:
2806 emit_fb_write(c, inst);
2807 break;
2808 case WM_FRONTFACING:
2809 emit_frontfacing(c, inst);
2810 break;
2811 case OPCODE_ADD:
2812 emit_add(c, inst);
2813 break;
2814 case OPCODE_ARL:
2815 emit_arl(c, inst);
2816 break;
2817 case OPCODE_FRC:
2818 emit_frc(c, inst);
2819 break;
2820 case OPCODE_FLR:
2821 emit_flr(c, inst);
2822 break;
2823 case OPCODE_LRP:
2824 emit_lrp(c, inst);
2825 break;
2826 case OPCODE_TRUNC:
2827 emit_trunc(c, inst);
2828 break;
2829 case OPCODE_MOV:
2830 case OPCODE_SWZ:
2831 emit_mov(c, inst);
2832 break;
2833 case OPCODE_DP3:
2834 emit_dp3(c, inst);
2835 break;
2836 case OPCODE_DP4:
2837 emit_dp4(c, inst);
2838 break;
2839 case OPCODE_XPD:
2840 emit_xpd(c, inst);
2841 break;
2842 case OPCODE_DPH:
2843 emit_dph(c, inst);
2844 break;
2845 case OPCODE_RCP:
2846 emit_rcp(c, inst);
2847 break;
2848 case OPCODE_RSQ:
2849 emit_rsq(c, inst);
2850 break;
2851 case OPCODE_SIN:
2852 emit_sin(c, inst);
2853 break;
2854 case OPCODE_COS:
2855 emit_cos(c, inst);
2856 break;
2857 case OPCODE_EX2:
2858 emit_ex2(c, inst);
2859 break;
2860 case OPCODE_LG2:
2861 emit_lg2(c, inst);
2862 break;
2863 case OPCODE_MIN:
2864 case OPCODE_MAX:
2865 emit_min_max(c, inst);
2866 break;
2867 case OPCODE_DDX:
2868 case OPCODE_DDY:
2869 for (j = 0; j < 4; j++) {
2870 if (inst->DstReg.WriteMask & (1 << j))
2871 dst[j] = get_dst_reg(c, inst, j);
2872 else
2873 dst[j] = brw_null_reg();
2874 }
2875 get_argument_regs(c, inst, 0, args[0], WRITEMASK_XYZW);
2876 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
2877 args[0]);
2878 break;
2879 case OPCODE_SLT:
2880 emit_slt(c, inst);
2881 break;
2882 case OPCODE_SLE:
2883 emit_sle(c, inst);
2884 break;
2885 case OPCODE_SGT:
2886 emit_sgt(c, inst);
2887 break;
2888 case OPCODE_SGE:
2889 emit_sge(c, inst);
2890 break;
2891 case OPCODE_SEQ:
2892 emit_seq(c, inst);
2893 break;
2894 case OPCODE_SNE:
2895 emit_sne(c, inst);
2896 break;
2897 case OPCODE_MUL:
2898 emit_mul(c, inst);
2899 break;
2900 case OPCODE_POW:
2901 emit_pow(c, inst);
2902 break;
2903 case OPCODE_MAD:
2904 emit_mad(c, inst);
2905 break;
2906 case OPCODE_NOISE1:
2907 emit_noise1(c, inst);
2908 break;
2909 case OPCODE_NOISE2:
2910 emit_noise2(c, inst);
2911 break;
2912 case OPCODE_NOISE3:
2913 emit_noise3(c, inst);
2914 break;
2915 case OPCODE_NOISE4:
2916 emit_noise4(c, inst);
2917 break;
2918 case OPCODE_TEX:
2919 emit_tex(c, inst);
2920 break;
2921 case OPCODE_TXB:
2922 emit_txb(c, inst);
2923 break;
2924 case OPCODE_KIL_NV:
2925 emit_kil(c);
2926 break;
2927 case OPCODE_IF:
2928 assert(if_depth < MAX_IF_DEPTH);
2929 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2930 break;
2931 case OPCODE_ELSE:
2932 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
2933 break;
2934 case OPCODE_ENDIF:
2935 assert(if_depth > 0);
2936 brw_ENDIF(p, if_inst[--if_depth]);
2937 break;
2938 case OPCODE_BGNSUB:
2939 brw_save_label(p, inst->Comment, p->nr_insn);
2940 break;
2941 case OPCODE_ENDSUB:
2942 /* no-op */
2943 break;
2944 case OPCODE_CAL:
2945 brw_push_insn_state(p);
2946 brw_set_mask_control(p, BRW_MASK_DISABLE);
2947 brw_set_access_mode(p, BRW_ALIGN_1);
2948 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2949 brw_set_access_mode(p, BRW_ALIGN_16);
2950 brw_ADD(p, get_addr_reg(stack_index),
2951 get_addr_reg(stack_index), brw_imm_d(4));
2952 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2953 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2954 brw_pop_insn_state(p);
2955 break;
2956
2957 case OPCODE_RET:
2958 brw_push_insn_state(p);
2959 brw_set_mask_control(p, BRW_MASK_DISABLE);
2960 brw_ADD(p, get_addr_reg(stack_index),
2961 get_addr_reg(stack_index), brw_imm_d(-4));
2962 brw_set_access_mode(p, BRW_ALIGN_1);
2963 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2964 brw_set_access_mode(p, BRW_ALIGN_16);
2965 brw_pop_insn_state(p);
2966
2967 break;
2968 case OPCODE_BGNLOOP:
2969 /* XXX may need to invalidate the current_constant regs */
2970 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2971 break;
2972 case OPCODE_BRK:
2973 brw_BREAK(p);
2974 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2975 break;
2976 case OPCODE_CONT:
2977 brw_CONT(p);
2978 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2979 break;
2980 case OPCODE_ENDLOOP:
2981 {
2982 struct brw_instruction *inst0, *inst1;
2983 GLuint br = 1;
2984
2985 if (BRW_IS_IGDNG(brw))
2986 br = 2;
2987
2988 loop_depth--;
2989 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2990 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2991 while (inst0 > loop_inst[loop_depth]) {
2992 inst0--;
2993 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2994 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2995 inst0->bits3.if_else.pop_count = 0;
2996 }
2997 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2998 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2999 inst0->bits3.if_else.pop_count = 0;
3000 }
3001 }
3002 }
3003 break;
3004 default:
3005 _mesa_printf("unsupported IR in fragment shader %d\n",
3006 inst->Opcode);
3007 }
3008
3009 if (inst->CondUpdate)
3010 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
3011 else
3012 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3013 }
3014 post_wm_emit(c);
3015
3016 if (INTEL_DEBUG & DEBUG_WM) {
3017 _mesa_printf("wm-native:\n");
3018 for (i = 0; i < p->nr_insn; i++)
3019 brw_disasm(stderr, &p->store[i]);
3020 _mesa_printf("\n");
3021 }
3022 }
3023
3024 /**
3025 * Do GPU code generation for shaders that use GLSL features such as
3026 * flow control. Other shaders will be compiled with the
3027 */
3028 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
3029 {
3030 if (INTEL_DEBUG & DEBUG_WM) {
3031 _mesa_printf("brw_wm_glsl_emit:\n");
3032 }
3033
3034 /* initial instruction translation/simplification */
3035 brw_wm_pass_fp(c);
3036
3037 /* actual code generation */
3038 brw_wm_emit_glsl(brw, c);
3039
3040 if (INTEL_DEBUG & DEBUG_WM) {
3041 brw_wm_print_program(c, "brw_wm_glsl_emit done");
3042 }
3043
3044 c->prog_data.total_grf = num_grf_used(c);
3045 c->prog_data.total_scratch = 0;
3046 }