117460842a30fcaf71ff8ac026c342e042a367f8
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "brw_context.h"
4 #include "brw_eu.h"
5 #include "brw_wm.h"
6
/**
 * Identifiers for the instruction sequences that invoke_subroutine()
 * emits once and re-enters on subsequent requests.  The value is used
 * as an index into the compile context's subroutines[] table.
 */
enum _subroutine {
    SUB_NOISE1,
    SUB_NOISE2,
    SUB_NOISE3,
    SUB_NOISE4
};
10
11
12 /**
13 * Determine if the given fragment program uses GLSL features such
14 * as flow conditionals, loops, subroutines.
15 * Some GLSL shaders may use these features, others might not.
16 */
17 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
18 {
19 int i;
20 for (i = 0; i < fp->Base.NumInstructions; i++) {
21 const struct prog_instruction *inst = &fp->Base.Instructions[i];
22 switch (inst->Opcode) {
23 case OPCODE_IF:
24 case OPCODE_TRUNC:
25 case OPCODE_ENDIF:
26 case OPCODE_CAL:
27 case OPCODE_BRK:
28 case OPCODE_RET:
29 case OPCODE_DDX:
30 case OPCODE_DDY:
31 case OPCODE_NOISE1:
32 case OPCODE_NOISE2:
33 case OPCODE_NOISE3:
34 case OPCODE_NOISE4:
35 case OPCODE_BGNLOOP:
36 return GL_TRUE;
37 default:
38 break;
39 }
40 }
41 return GL_FALSE;
42 }
43
44
45 /**
46 * Record the mapping of a Mesa register to a hardware register.
47 */
48 static void set_reg(struct brw_wm_compile *c, int file, int index,
49 int component, struct brw_reg reg)
50 {
51 c->wm_regs[file][index][component].reg = reg;
52 c->wm_regs[file][index][component].inited = GL_TRUE;
53 }
54
55 /**
56 * Examine instruction's write mask to find index of first component
57 * enabled for writing.
58 */
59 static int get_scalar_dst_index(const struct prog_instruction *inst)
60 {
61 int i;
62 for (i = 0; i < 4; i++)
63 if (inst->DstReg.WriteMask & (1<<i))
64 break;
65 return i;
66 }
67
68 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
69 {
70 struct brw_reg reg;
71 if(c->tmp_index == c->tmp_max)
72 c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
73
74 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
75 return reg;
76 }
77
78 /**
79 * Save current temp register info.
80 * There must be a matching call to release_tmps().
81 */
82 static int mark_tmps(struct brw_wm_compile *c)
83 {
84 return c->tmp_index;
85 }
86
87 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
88 {
89 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
90 }
91
92 static void release_tmps(struct brw_wm_compile *c, int mark)
93 {
94 c->tmp_index = mark;
95 }
96
97 /**
98 * Convert Mesa src register to brw register.
99 *
100 * Since we're running in SOA mode each Mesa register corresponds to four
101 * hardware registers. We allocate the hardware registers as needed here.
102 *
103 * \param file register file, one of PROGRAM_x
104 * \param index register number
105 * \param component src component (X=0, Y=1, Z=2, W=3)
106 * \param nr not used?!?
107 * \param neg negate value?
108 * \param abs take absolute value?
109 */
110 static struct brw_reg
111 get_reg(struct brw_wm_compile *c, int file, int index, int component,
112 int nr, GLuint neg, GLuint abs)
113 {
114 struct brw_reg reg;
115 switch (file) {
116 case PROGRAM_STATE_VAR:
117 case PROGRAM_CONSTANT:
118 case PROGRAM_UNIFORM:
119 file = PROGRAM_STATE_VAR;
120 break;
121 case PROGRAM_UNDEFINED:
122 return brw_null_reg();
123 case PROGRAM_TEMPORARY:
124 case PROGRAM_INPUT:
125 case PROGRAM_OUTPUT:
126 case PROGRAM_PAYLOAD:
127 break;
128 default:
129 _mesa_problem(NULL, "Unexpected file in get_reg()");
130 return brw_null_reg();
131 }
132
133 /* see if we've already allocated a HW register for this Mesa register */
134 if (c->wm_regs[file][index][component].inited) {
135 /* yes, re-use */
136 reg = c->wm_regs[file][index][component].reg;
137 }
138 else {
139 /* no, allocate new register */
140 reg = brw_vec8_grf(c->reg_index, 0);
141 }
142
143 /* if this is a new register allocation, record it in the table */
144 if (!c->wm_regs[file][index][component].inited) {
145 set_reg(c, file, index, component, reg);
146 c->reg_index++;
147 }
148
149 if (c->reg_index >= BRW_WM_MAX_GRF - 12) {
150 /* ran out of temporary registers! */
151 #if 1
152 /* This is a big hack for now.
153 * Return bad register index, just don't hang the GPU.
154 */
155 _mesa_fprintf(stderr, "out of regs %d\n", c->reg_index);
156 c->reg_index = BRW_WM_MAX_GRF - 13;
157 #else
158 return brw_null_reg();
159 #endif
160 }
161
162 if (neg & (1 << component)) {
163 reg = negate(reg);
164 }
165 if (abs)
166 reg = brw_abs(reg);
167 return reg;
168 }
169
170
171 /**
172 * Preallocate registers. This sets up the Mesa to hardware register
173 * mapping for certain registers, such as constants (uniforms/state vars)
174 * and shader inputs.
175 */
176 static void prealloc_reg(struct brw_wm_compile *c)
177 {
178 int i, j;
179 struct brw_reg reg;
180 int nr_interp_regs = 0;
181 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
182
183 for (i = 0; i < 4; i++) {
184 if (i < c->key.nr_depth_regs)
185 reg = brw_vec8_grf(i * 2, 0);
186 else
187 reg = brw_vec8_grf(0, 0);
188 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
189 }
190 c->reg_index += 2 * c->key.nr_depth_regs;
191
192 /* constants */
193 {
194 const int nr_params = c->fp->program.Base.Parameters->NumParameters;
195
196 /* use a real constant buffer, or just use a section of the GRF? */
197 c->fp->use_const_buffer = GL_FALSE; /* (nr_params > 8);*/
198
199 if (c->fp->use_const_buffer) {
200 /* We'll use a real constant buffer and fetch constants from
201 * it with a dataport read message.
202 */
203
204 /* number of float constants in CURBE */
205 c->prog_data.nr_params = 0;
206 }
207 else {
208 const struct gl_program_parameter_list *plist =
209 c->fp->program.Base.Parameters;
210 int index = 0;
211
212 /* number of float constants in CURBE */
213 c->prog_data.nr_params = 4 * nr_params;
214
215 /* loop over program constants (float[4]) */
216 for (i = 0; i < nr_params; i++) {
217 /* loop over XYZW channels */
218 for (j = 0; j < 4; j++, index++) {
219 reg = brw_vec1_grf(c->reg_index + index / 8, index % 8);
220 /* Save pointer to parameter/constant value.
221 * Constants will be copied in prepare_constant_buffer()
222 */
223 c->prog_data.param[index] = &plist->ParameterValues[i][j];
224 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
225 }
226 }
227 /* number of constant regs used (each reg is float[8]) */
228 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
229 c->reg_index += c->nr_creg;
230 }
231 }
232
233 /* fragment shader inputs */
234 for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
235 if (inputs & (1<<i)) {
236 nr_interp_regs++;
237 reg = brw_vec8_grf(c->reg_index, 0);
238 for (j = 0; j < 4; j++)
239 set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
240 c->reg_index += 2;
241 }
242 }
243
244 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
245 c->prog_data.urb_read_length = nr_interp_regs * 2;
246 c->prog_data.curb_read_length = c->nr_creg;
247 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
248 c->reg_index++;
249 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
250 c->reg_index += 2;
251
252 /* An instruction may reference up to three constants.
253 * They'll be found in these registers.
254 * XXX alloc these on demand!
255 */
256 if (c->fp->use_const_buffer) {
257 for (i = 0; i < 3; i++) {
258 c->current_const[i].index = -1;
259 c->current_const[i].reg = alloc_tmp(c);
260 }
261 }
262 #if 0
263 printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
264 printf("AFTER PRE_ALLOC, reg_index = %d\n", c->reg_index);
265 #endif
266 }
267
268
269 /**
270 * Check if any of the instruction's src registers are constants, uniforms,
271 * or statevars. If so, fetch any constants that we don't already have in
272 * the three GRF slots.
273 */
274 static void fetch_constants(struct brw_wm_compile *c,
275 const struct prog_instruction *inst)
276 {
277 struct brw_compile *p = &c->func;
278 GLuint i;
279
280 /* loop over instruction src regs */
281 for (i = 0; i < 3; i++) {
282 const struct prog_src_register *src = &inst->SrcReg[i];
283 if (src->File == PROGRAM_STATE_VAR ||
284 src->File == PROGRAM_CONSTANT ||
285 src->File == PROGRAM_UNIFORM) {
286 if (c->current_const[i].index != src->Index) {
287 c->current_const[i].index = src->Index;
288
289 #if 0
290 printf(" fetch const[%d] for arg %d into reg %d\n",
291 src->Index, i, c->current_const[i].reg.nr);
292 #endif
293
294 /* need to fetch the constant now */
295 brw_dp_READ_4(p,
296 c->current_const[i].reg, /* writeback dest */
297 1, /* msg_reg */
298 src->RelAddr, /* relative indexing? */
299 16 * src->Index, /* byte offset */
300 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
301 );
302 }
303 }
304 }
305 }
306
307
308 /**
309 * Convert Mesa dst register to brw register.
310 */
311 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
312 const struct prog_instruction *inst,
313 GLuint component)
314 {
315 const int nr = 1;
316 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
317 0, 0);
318 }
319
320
321 static struct brw_reg
322 get_src_reg_const(struct brw_wm_compile *c,
323 const struct prog_instruction *inst,
324 GLuint srcRegIndex, GLuint component)
325 {
326 /* We should have already fetched the constant from the constant
327 * buffer in fetch_constants(). Now we just have to return a
328 * register description that extracts the needed component and
329 * smears it across all eight vector components.
330 */
331 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
332 struct brw_reg const_reg;
333
334 assert(component < 4);
335 assert(srcRegIndex < 3);
336 assert(c->current_const[srcRegIndex].index != -1);
337 const_reg = c->current_const[srcRegIndex].reg;
338
339 /* extract desired float from the const_reg, and smear */
340 const_reg = stride(const_reg, 0, 1, 0);
341 const_reg.subnr = component * 4;
342
343 if (src->Negate & (1 << component))
344 const_reg = negate(const_reg);
345 if (src->Abs)
346 const_reg = brw_abs(const_reg);
347
348 #if 0
349 printf(" form const[%d].%d for arg %d, reg %d\n",
350 c->current_const[srcRegIndex].index,
351 component,
352 srcRegIndex,
353 const_reg.nr);
354 #endif
355
356 return const_reg;
357 }
358
359
360 /**
361 * Convert Mesa src register to brw register.
362 */
363 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
364 const struct prog_instruction *inst,
365 GLuint srcRegIndex, GLuint channel)
366 {
367 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
368 const GLuint nr = 1;
369 const GLuint component = GET_SWZ(src->Swizzle, channel);
370
371 if (c->fp->use_const_buffer &&
372 (src->File == PROGRAM_STATE_VAR ||
373 src->File == PROGRAM_CONSTANT ||
374 src->File == PROGRAM_UNIFORM)) {
375 return get_src_reg_const(c, inst, srcRegIndex, component);
376 }
377 else {
378 /* other type of source register */
379 return get_reg(c, src->File, src->Index, component, nr,
380 src->Negate, src->Abs);
381 }
382 }
383
384
385 /**
386 * Same as \sa get_src_reg() but if the register is a literal, emit
387 * a brw_reg encoding the literal.
388 * Note that a brw instruction only allows one src operand to be a literal.
389 * For instructions with more than one operand, only the second can be a
390 * literal. This means that we treat some literals as constants/uniforms
391 * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
392 *
393 */
394 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
395 const struct prog_instruction *inst,
396 GLuint srcRegIndex, GLuint channel)
397 {
398 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
399 if (src->File == PROGRAM_CONSTANT) {
400 /* a literal */
401 const int component = GET_SWZ(src->Swizzle, channel);
402 const GLfloat *param =
403 c->fp->program.Base.Parameters->ParameterValues[src->Index];
404 GLfloat value = param[component];
405 if (src->Negate & (1 << channel))
406 value = -value;
407 if (src->Abs)
408 value = FABSF(value);
409 #if 0
410 printf(" form immed value %f for chan %d\n", value, channel);
411 #endif
412 return brw_imm_f(value);
413 }
414 else {
415 return get_src_reg(c, inst, srcRegIndex, channel);
416 }
417 }
418
419
420 /**
421 * Subroutines are minimal support for resusable instruction sequences.
422 * They are implemented as simply as possible to minimise overhead: there
423 * is no explicit support for communication between the caller and callee
424 * other than saving the return address in a temporary register, nor is
425 * there any automatic local storage. This implies that great care is
426 * required before attempting reentrancy or any kind of nested
427 * subroutine invocations.
428 */
429 static void invoke_subroutine( struct brw_wm_compile *c,
430 enum _subroutine subroutine,
431 void (*emit)( struct brw_wm_compile * ) )
432 {
433 struct brw_compile *p = &c->func;
434
435 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
436
437 if( c->subroutines[ subroutine ] ) {
438 /* subroutine previously emitted: reuse existing instructions */
439
440 int mark = mark_tmps( c );
441 struct brw_reg return_address = retype( alloc_tmp( c ),
442 BRW_REGISTER_TYPE_UD );
443 int here = p->nr_insn;
444
445 brw_push_insn_state(p);
446 brw_set_mask_control(p, BRW_MASK_DISABLE);
447 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
448
449 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
450 brw_imm_d( ( c->subroutines[ subroutine ] -
451 here - 1 ) << 4 ) );
452 brw_pop_insn_state(p);
453
454 release_tmps( c, mark );
455 } else {
456 /* previously unused subroutine: emit, and mark for later reuse */
457
458 int mark = mark_tmps( c );
459 struct brw_reg return_address = retype( alloc_tmp( c ),
460 BRW_REGISTER_TYPE_UD );
461 struct brw_instruction *calc;
462 int base = p->nr_insn;
463
464 brw_push_insn_state(p);
465 brw_set_mask_control(p, BRW_MASK_DISABLE);
466 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
467 brw_pop_insn_state(p);
468
469 c->subroutines[ subroutine ] = p->nr_insn;
470
471 emit( c );
472
473 brw_push_insn_state(p);
474 brw_set_mask_control(p, BRW_MASK_DISABLE);
475 brw_MOV( p, brw_ip_reg(), return_address );
476 brw_pop_insn_state(p);
477
478 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
479
480 release_tmps( c, mark );
481 }
482 }
483
484 static void emit_abs( struct brw_wm_compile *c,
485 const struct prog_instruction *inst)
486 {
487 int i;
488 struct brw_compile *p = &c->func;
489 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
490 for (i = 0; i < 4; i++) {
491 if (inst->DstReg.WriteMask & (1<<i)) {
492 struct brw_reg src, dst;
493 dst = get_dst_reg(c, inst, i);
494 src = get_src_reg(c, inst, 0, i);
495 brw_MOV(p, dst, brw_abs(src));
496 }
497 }
498 brw_set_saturate(p, 0);
499 }
500
501 static void emit_trunc( struct brw_wm_compile *c,
502 const struct prog_instruction *inst)
503 {
504 int i;
505 struct brw_compile *p = &c->func;
506 GLuint mask = inst->DstReg.WriteMask;
507 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
508 for (i = 0; i < 4; i++) {
509 if (mask & (1<<i)) {
510 struct brw_reg src, dst;
511 dst = get_dst_reg(c, inst, i);
512 src = get_src_reg(c, inst, 0, i);
513 brw_RNDZ(p, dst, src);
514 }
515 }
516 brw_set_saturate(p, 0);
517 }
518
519 static void emit_mov( struct brw_wm_compile *c,
520 const struct prog_instruction *inst)
521 {
522 int i;
523 struct brw_compile *p = &c->func;
524 GLuint mask = inst->DstReg.WriteMask;
525 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
526 for (i = 0; i < 4; i++) {
527 if (mask & (1<<i)) {
528 struct brw_reg src, dst;
529 dst = get_dst_reg(c, inst, i);
530 /* XXX some moves from immediate value don't work reliably!!! */
531 /*src = get_src_reg_imm(c, inst, 0, i);*/
532 src = get_src_reg(c, inst, 0, i);
533 brw_MOV(p, dst, src);
534 }
535 }
536 brw_set_saturate(p, 0);
537 }
538
539 static void emit_pixel_xy(struct brw_wm_compile *c,
540 const struct prog_instruction *inst)
541 {
542 struct brw_reg r1 = brw_vec1_grf(1, 0);
543 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
544
545 struct brw_reg dst0, dst1;
546 struct brw_compile *p = &c->func;
547 GLuint mask = inst->DstReg.WriteMask;
548
549 dst0 = get_dst_reg(c, inst, 0);
550 dst1 = get_dst_reg(c, inst, 1);
551 /* Calculate pixel centers by adding 1 or 0 to each of the
552 * micro-tile coordinates passed in r1.
553 */
554 if (mask & WRITEMASK_X) {
555 brw_ADD(p,
556 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
557 stride(suboffset(r1_uw, 4), 2, 4, 0),
558 brw_imm_v(0x10101010));
559 }
560
561 if (mask & WRITEMASK_Y) {
562 brw_ADD(p,
563 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
564 stride(suboffset(r1_uw, 5), 2, 4, 0),
565 brw_imm_v(0x11001100));
566 }
567 }
568
569 static void emit_delta_xy(struct brw_wm_compile *c,
570 const struct prog_instruction *inst)
571 {
572 struct brw_reg r1 = brw_vec1_grf(1, 0);
573 struct brw_reg dst0, dst1, src0, src1;
574 struct brw_compile *p = &c->func;
575 GLuint mask = inst->DstReg.WriteMask;
576
577 dst0 = get_dst_reg(c, inst, 0);
578 dst1 = get_dst_reg(c, inst, 1);
579 src0 = get_src_reg(c, inst, 0, 0);
580 src1 = get_src_reg(c, inst, 0, 1);
581 /* Calc delta X,Y by subtracting origin in r1 from the pixel
582 * centers.
583 */
584 if (mask & WRITEMASK_X) {
585 brw_ADD(p,
586 dst0,
587 retype(src0, BRW_REGISTER_TYPE_UW),
588 negate(r1));
589 }
590
591 if (mask & WRITEMASK_Y) {
592 brw_ADD(p,
593 dst1,
594 retype(src1, BRW_REGISTER_TYPE_UW),
595 negate(suboffset(r1,1)));
596
597 }
598 }
599
600 static void fire_fb_write( struct brw_wm_compile *c,
601 GLuint base_reg,
602 GLuint nr,
603 GLuint target,
604 GLuint eot)
605 {
606 struct brw_compile *p = &c->func;
607 /* Pass through control information:
608 */
609 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
610 {
611 brw_push_insn_state(p);
612 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
613 brw_MOV(p,
614 brw_message_reg(base_reg + 1),
615 brw_vec8_grf(1, 0));
616 brw_pop_insn_state(p);
617 }
618 /* Send framebuffer write message: */
619 brw_fb_WRITE(p,
620 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
621 base_reg,
622 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
623 target,
624 nr,
625 0,
626 eot);
627 }
628
629 static void emit_fb_write(struct brw_wm_compile *c,
630 const struct prog_instruction *inst)
631 {
632 struct brw_compile *p = &c->func;
633 int nr = 2;
634 int channel;
635 GLuint target, eot;
636 struct brw_reg src0;
637
638 /* Reserve a space for AA - may not be needed:
639 */
640 if (c->key.aa_dest_stencil_reg)
641 nr += 1;
642
643 brw_push_insn_state(p);
644 for (channel = 0; channel < 4; channel++) {
645 src0 = get_src_reg(c, inst, 0, channel);
646 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
647 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
648 brw_MOV(p, brw_message_reg(nr + channel), src0);
649 }
650 /* skip over the regs populated above: */
651 nr += 8;
652 brw_pop_insn_state(p);
653
654 if (c->key.source_depth_to_render_target) {
655 if (c->key.computes_depth) {
656 src0 = get_src_reg(c, inst, 2, 2);
657 brw_MOV(p, brw_message_reg(nr), src0);
658 }
659 else {
660 src0 = get_src_reg(c, inst, 1, 1);
661 brw_MOV(p, brw_message_reg(nr), src0);
662 }
663
664 nr += 2;
665 }
666
667 if (c->key.dest_depth_reg) {
668 GLuint comp = c->key.dest_depth_reg / 2;
669 GLuint off = c->key.dest_depth_reg % 2;
670
671 assert(comp == 1);
672 assert(off == 0);
673 #if 0
674 /* XXX do we need this code? comp always 1, off always 0, it seems */
675 if (off != 0) {
676 brw_push_insn_state(p);
677 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
678
679 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
680 /* 2nd half? */
681 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
682 brw_pop_insn_state(p);
683 }
684 else
685 #endif
686 {
687 struct brw_reg src = get_src_reg(c, inst, 1, 1);
688 brw_MOV(p, brw_message_reg(nr), src);
689 }
690 nr += 2;
691 }
692
693 target = inst->Aux >> 1;
694 eot = inst->Aux & 1;
695 fire_fb_write(c, 0, nr, target, eot);
696 }
697
698 static void emit_pixel_w( struct brw_wm_compile *c,
699 const struct prog_instruction *inst)
700 {
701 struct brw_compile *p = &c->func;
702 GLuint mask = inst->DstReg.WriteMask;
703 if (mask & WRITEMASK_W) {
704 struct brw_reg dst, src0, delta0, delta1;
705 struct brw_reg interp3;
706
707 dst = get_dst_reg(c, inst, 3);
708 src0 = get_src_reg(c, inst, 0, 0);
709 delta0 = get_src_reg(c, inst, 1, 0);
710 delta1 = get_src_reg(c, inst, 1, 1);
711
712 interp3 = brw_vec1_grf(src0.nr+1, 4);
713 /* Calc 1/w - just linterp wpos[3] optimized by putting the
714 * result straight into a message reg.
715 */
716 brw_LINE(p, brw_null_reg(), interp3, delta0);
717 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
718
719 /* Calc w */
720 brw_math_16( p, dst,
721 BRW_MATH_FUNCTION_INV,
722 BRW_MATH_SATURATE_NONE,
723 2, brw_null_reg(),
724 BRW_MATH_PRECISION_FULL);
725 }
726 }
727
728 static void emit_linterp(struct brw_wm_compile *c,
729 const struct prog_instruction *inst)
730 {
731 struct brw_compile *p = &c->func;
732 GLuint mask = inst->DstReg.WriteMask;
733 struct brw_reg interp[4];
734 struct brw_reg dst, delta0, delta1;
735 struct brw_reg src0;
736 GLuint nr, i;
737
738 src0 = get_src_reg(c, inst, 0, 0);
739 delta0 = get_src_reg(c, inst, 1, 0);
740 delta1 = get_src_reg(c, inst, 1, 1);
741 nr = src0.nr;
742
743 interp[0] = brw_vec1_grf(nr, 0);
744 interp[1] = brw_vec1_grf(nr, 4);
745 interp[2] = brw_vec1_grf(nr+1, 0);
746 interp[3] = brw_vec1_grf(nr+1, 4);
747
748 for(i = 0; i < 4; i++ ) {
749 if (mask & (1<<i)) {
750 dst = get_dst_reg(c, inst, i);
751 brw_LINE(p, brw_null_reg(), interp[i], delta0);
752 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
753 }
754 }
755 }
756
757 static void emit_cinterp(struct brw_wm_compile *c,
758 const struct prog_instruction *inst)
759 {
760 struct brw_compile *p = &c->func;
761 GLuint mask = inst->DstReg.WriteMask;
762
763 struct brw_reg interp[4];
764 struct brw_reg dst, src0;
765 GLuint nr, i;
766
767 src0 = get_src_reg(c, inst, 0, 0);
768 nr = src0.nr;
769
770 interp[0] = brw_vec1_grf(nr, 0);
771 interp[1] = brw_vec1_grf(nr, 4);
772 interp[2] = brw_vec1_grf(nr+1, 0);
773 interp[3] = brw_vec1_grf(nr+1, 4);
774
775 for(i = 0; i < 4; i++ ) {
776 if (mask & (1<<i)) {
777 dst = get_dst_reg(c, inst, i);
778 brw_MOV(p, dst, suboffset(interp[i],3));
779 }
780 }
781 }
782
783 static void emit_pinterp(struct brw_wm_compile *c,
784 const struct prog_instruction *inst)
785 {
786 struct brw_compile *p = &c->func;
787 GLuint mask = inst->DstReg.WriteMask;
788
789 struct brw_reg interp[4];
790 struct brw_reg dst, delta0, delta1;
791 struct brw_reg src0, w;
792 GLuint nr, i;
793
794 src0 = get_src_reg(c, inst, 0, 0);
795 delta0 = get_src_reg(c, inst, 1, 0);
796 delta1 = get_src_reg(c, inst, 1, 1);
797 w = get_src_reg(c, inst, 2, 3);
798 nr = src0.nr;
799
800 interp[0] = brw_vec1_grf(nr, 0);
801 interp[1] = brw_vec1_grf(nr, 4);
802 interp[2] = brw_vec1_grf(nr+1, 0);
803 interp[3] = brw_vec1_grf(nr+1, 4);
804
805 for(i = 0; i < 4; i++ ) {
806 if (mask & (1<<i)) {
807 dst = get_dst_reg(c, inst, i);
808 brw_LINE(p, brw_null_reg(), interp[i], delta0);
809 brw_MAC(p, dst, suboffset(interp[i],1),
810 delta1);
811 brw_MUL(p, dst, dst, w);
812 }
813 }
814 }
815
816 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
817 static void emit_frontfacing(struct brw_wm_compile *c,
818 const struct prog_instruction *inst)
819 {
820 struct brw_compile *p = &c->func;
821 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
822 struct brw_reg dst;
823 GLuint mask = inst->DstReg.WriteMask;
824 int i;
825
826 for (i = 0; i < 4; i++) {
827 if (mask & (1<<i)) {
828 dst = get_dst_reg(c, inst, i);
829 brw_MOV(p, dst, brw_imm_f(0.0));
830 }
831 }
832
833 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
834 * us front face
835 */
836 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
837 for (i = 0; i < 4; i++) {
838 if (mask & (1<<i)) {
839 dst = get_dst_reg(c, inst, i);
840 brw_MOV(p, dst, brw_imm_f(1.0));
841 }
842 }
843 brw_set_predicate_control_flag_value(p, 0xff);
844 }
845
846 static void emit_xpd(struct brw_wm_compile *c,
847 const struct prog_instruction *inst)
848 {
849 int i;
850 struct brw_compile *p = &c->func;
851 GLuint mask = inst->DstReg.WriteMask;
852 for (i = 0; i < 4; i++) {
853 GLuint i2 = (i+2)%3;
854 GLuint i1 = (i+1)%3;
855 if (mask & (1<<i)) {
856 struct brw_reg src0, src1, dst;
857 dst = get_dst_reg(c, inst, i);
858 src0 = negate(get_src_reg(c, inst, 0, i2));
859 src1 = get_src_reg_imm(c, inst, 1, i1);
860 brw_MUL(p, brw_null_reg(), src0, src1);
861 src0 = get_src_reg(c, inst, 0, i1);
862 src1 = get_src_reg_imm(c, inst, 1, i2);
863 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
864 brw_MAC(p, dst, src0, src1);
865 brw_set_saturate(p, 0);
866 }
867 }
868 brw_set_saturate(p, 0);
869 }
870
871 static void emit_dp3(struct brw_wm_compile *c,
872 const struct prog_instruction *inst)
873 {
874 struct brw_reg src0[3], src1[3], dst;
875 int i;
876 struct brw_compile *p = &c->func;
877 for (i = 0; i < 3; i++) {
878 src0[i] = get_src_reg(c, inst, 0, i);
879 src1[i] = get_src_reg_imm(c, inst, 1, i);
880 }
881
882 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
883 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
884 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
885 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
886 brw_MAC(p, dst, src0[2], src1[2]);
887 brw_set_saturate(p, 0);
888 }
889
890 static void emit_dp4(struct brw_wm_compile *c,
891 const struct prog_instruction *inst)
892 {
893 struct brw_reg src0[4], src1[4], dst;
894 int i;
895 struct brw_compile *p = &c->func;
896 for (i = 0; i < 4; i++) {
897 src0[i] = get_src_reg(c, inst, 0, i);
898 src1[i] = get_src_reg_imm(c, inst, 1, i);
899 }
900 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
901 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
902 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
903 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
904 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
905 brw_MAC(p, dst, src0[3], src1[3]);
906 brw_set_saturate(p, 0);
907 }
908
909 static void emit_dph(struct brw_wm_compile *c,
910 const struct prog_instruction *inst)
911 {
912 struct brw_reg src0[4], src1[4], dst;
913 int i;
914 struct brw_compile *p = &c->func;
915 for (i = 0; i < 4; i++) {
916 src0[i] = get_src_reg(c, inst, 0, i);
917 src1[i] = get_src_reg_imm(c, inst, 1, i);
918 }
919 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
920 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
921 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
922 brw_MAC(p, dst, src0[2], src1[2]);
923 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
924 brw_ADD(p, dst, dst, src1[3]);
925 brw_set_saturate(p, 0);
926 }
927
928 /**
929 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
930 * Note that the result of the function is smeared across the dest
931 * register's X, Y, Z and W channels (subject to writemasking of course).
932 */
933 static void emit_math1(struct brw_wm_compile *c,
934 const struct prog_instruction *inst, GLuint func)
935 {
936 struct brw_compile *p = &c->func;
937 struct brw_reg src0, dst, tmp;
938 const int mark = mark_tmps( c );
939 int i;
940
941 tmp = alloc_tmp(c);
942
943 /* Get first component of source register */
944 src0 = get_src_reg(c, inst, 0, 0);
945
946 /* tmp = func(src0) */
947 brw_MOV(p, brw_message_reg(2), src0);
948 brw_math(p,
949 tmp,
950 func,
951 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
952 2,
953 brw_null_reg(),
954 BRW_MATH_DATA_VECTOR,
955 BRW_MATH_PRECISION_FULL);
956
957 /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
958
959 /* replicate tmp value across enabled dest channels */
960 for (i = 0; i < 4; i++) {
961 if (inst->DstReg.WriteMask & (1 << i)) {
962 dst = get_dst_reg(c, inst, i);
963 brw_MOV(p, dst, tmp);
964 }
965 }
966
967 release_tmps(c, mark);
968 }
969
970 static void emit_rcp(struct brw_wm_compile *c,
971 const struct prog_instruction *inst)
972 {
973 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
974 }
975
976 static void emit_rsq(struct brw_wm_compile *c,
977 const struct prog_instruction *inst)
978 {
979 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
980 }
981
982 static void emit_sin(struct brw_wm_compile *c,
983 const struct prog_instruction *inst)
984 {
985 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
986 }
987
988 static void emit_cos(struct brw_wm_compile *c,
989 const struct prog_instruction *inst)
990 {
991 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
992 }
993
994 static void emit_ex2(struct brw_wm_compile *c,
995 const struct prog_instruction *inst)
996 {
997 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
998 }
999
1000 static void emit_lg2(struct brw_wm_compile *c,
1001 const struct prog_instruction *inst)
1002 {
1003 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1004 }
1005
1006 static void emit_add(struct brw_wm_compile *c,
1007 const struct prog_instruction *inst)
1008 {
1009 struct brw_compile *p = &c->func;
1010 struct brw_reg src0, src1, dst;
1011 GLuint mask = inst->DstReg.WriteMask;
1012 int i;
1013 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1014 for (i = 0 ; i < 4; i++) {
1015 if (mask & (1<<i)) {
1016 dst = get_dst_reg(c, inst, i);
1017 src0 = get_src_reg(c, inst, 0, i);
1018 src1 = get_src_reg_imm(c, inst, 1, i);
1019 brw_ADD(p, dst, src0, src1);
1020 }
1021 }
1022 brw_set_saturate(p, 0);
1023 }
1024
1025 static void emit_arl(struct brw_wm_compile *c,
1026 const struct prog_instruction *inst)
1027 {
1028 struct brw_compile *p = &c->func;
1029 struct brw_reg src0, addr_reg;
1030 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1031 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1032 BRW_ARF_ADDRESS, 0);
1033 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1034 brw_MOV(p, addr_reg, src0);
1035 brw_set_saturate(p, 0);
1036 }
1037
1038 static void emit_sub(struct brw_wm_compile *c,
1039 const struct prog_instruction *inst)
1040 {
1041 struct brw_compile *p = &c->func;
1042 struct brw_reg src0, src1, dst;
1043 GLuint mask = inst->DstReg.WriteMask;
1044 int i;
1045 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1046 for (i = 0 ; i < 4; i++) {
1047 if (mask & (1<<i)) {
1048 dst = get_dst_reg(c, inst, i);
1049 src0 = get_src_reg(c, inst, 0, i);
1050 src1 = get_src_reg_imm(c, inst, 1, i);
1051 brw_ADD(p, dst, src0, negate(src1));
1052 }
1053 }
1054 brw_set_saturate(p, 0);
1055 }
1056
1057 static void emit_mul(struct brw_wm_compile *c,
1058 const struct prog_instruction *inst)
1059 {
1060 struct brw_compile *p = &c->func;
1061 struct brw_reg src0, src1, dst;
1062 GLuint mask = inst->DstReg.WriteMask;
1063 int i;
1064 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1065 for (i = 0 ; i < 4; i++) {
1066 if (mask & (1<<i)) {
1067 dst = get_dst_reg(c, inst, i);
1068 src0 = get_src_reg(c, inst, 0, i);
1069 src1 = get_src_reg_imm(c, inst, 1, i);
1070 brw_MUL(p, dst, src0, src1);
1071 }
1072 }
1073 brw_set_saturate(p, 0);
1074 }
1075
1076 static void emit_frc(struct brw_wm_compile *c,
1077 const struct prog_instruction *inst)
1078 {
1079 struct brw_compile *p = &c->func;
1080 struct brw_reg src0, dst;
1081 GLuint mask = inst->DstReg.WriteMask;
1082 int i;
1083 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1084 for (i = 0 ; i < 4; i++) {
1085 if (mask & (1<<i)) {
1086 dst = get_dst_reg(c, inst, i);
1087 src0 = get_src_reg_imm(c, inst, 0, i);
1088 brw_FRC(p, dst, src0);
1089 }
1090 }
1091 if (inst->SaturateMode != SATURATE_OFF)
1092 brw_set_saturate(p, 0);
1093 }
1094
1095 static void emit_flr(struct brw_wm_compile *c,
1096 const struct prog_instruction *inst)
1097 {
1098 struct brw_compile *p = &c->func;
1099 struct brw_reg src0, dst;
1100 GLuint mask = inst->DstReg.WriteMask;
1101 int i;
1102 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1103 for (i = 0 ; i < 4; i++) {
1104 if (mask & (1<<i)) {
1105 dst = get_dst_reg(c, inst, i);
1106 src0 = get_src_reg_imm(c, inst, 0, i);
1107 brw_RNDD(p, dst, src0);
1108 }
1109 }
1110 brw_set_saturate(p, 0);
1111 }
1112
1113
1114 static void emit_min_max(struct brw_wm_compile *c,
1115 const struct prog_instruction *inst)
1116 {
1117 struct brw_compile *p = &c->func;
1118 const GLuint mask = inst->DstReg.WriteMask;
1119 const int mark = mark_tmps(c);
1120 int i;
1121 brw_push_insn_state(p);
1122 for (i = 0; i < 4; i++) {
1123 if (mask & (1<<i)) {
1124 struct brw_reg real_dst = get_dst_reg(c, inst, i);
1125 struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1126 struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1127 struct brw_reg dst;
1128 /* if dst==src0 or dst==src1 we need to use a temp reg */
1129 GLboolean use_temp = brw_same_reg(dst, src0) ||
1130 brw_same_reg(dst, src1);
1131 if (use_temp)
1132 dst = alloc_tmp(c);
1133 else
1134 dst = real_dst;
1135
1136 /*
1137 printf(" Min/max: dst %d src0 %d src1 %d\n",
1138 dst.nr, src0.nr, src1.nr);
1139 */
1140 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1141 brw_MOV(p, dst, src0);
1142 brw_set_saturate(p, 0);
1143
1144 if (inst->Opcode == OPCODE_MIN)
1145 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1146 else
1147 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1148
1149 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1150 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1151 brw_MOV(p, dst, src1);
1152 brw_set_saturate(p, 0);
1153 brw_set_predicate_control_flag_value(p, 0xff);
1154 if (use_temp)
1155 brw_MOV(p, real_dst, dst);
1156 }
1157 }
1158 brw_pop_insn_state(p);
1159 release_tmps(c, mark);
1160 }
1161
1162 static void emit_pow(struct brw_wm_compile *c,
1163 const struct prog_instruction *inst)
1164 {
1165 struct brw_compile *p = &c->func;
1166 struct brw_reg dst, src0, src1;
1167 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1168 src0 = get_src_reg_imm(c, inst, 0, 0);
1169 src1 = get_src_reg_imm(c, inst, 1, 0);
1170
1171 brw_MOV(p, brw_message_reg(2), src0);
1172 brw_MOV(p, brw_message_reg(3), src1);
1173
1174 brw_math(p,
1175 dst,
1176 BRW_MATH_FUNCTION_POW,
1177 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1178 2,
1179 brw_null_reg(),
1180 BRW_MATH_DATA_VECTOR,
1181 BRW_MATH_PRECISION_FULL);
1182 }
1183
1184 static void emit_lrp(struct brw_wm_compile *c,
1185 const struct prog_instruction *inst)
1186 {
1187 struct brw_compile *p = &c->func;
1188 GLuint mask = inst->DstReg.WriteMask;
1189 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1190 int i;
1191 int mark = mark_tmps(c);
1192 for (i = 0; i < 4; i++) {
1193 if (mask & (1<<i)) {
1194 dst = get_dst_reg(c, inst, i);
1195 src0 = get_src_reg(c, inst, 0, i);
1196
1197 src1 = get_src_reg_imm(c, inst, 1, i);
1198
1199 if (src1.nr == dst.nr) {
1200 tmp1 = alloc_tmp(c);
1201 brw_MOV(p, tmp1, src1);
1202 } else
1203 tmp1 = src1;
1204
1205 src2 = get_src_reg(c, inst, 2, i);
1206 if (src2.nr == dst.nr) {
1207 tmp2 = alloc_tmp(c);
1208 brw_MOV(p, tmp2, src2);
1209 } else
1210 tmp2 = src2;
1211
1212 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1213 brw_MUL(p, brw_null_reg(), dst, tmp2);
1214 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1215 brw_MAC(p, dst, src0, tmp1);
1216 brw_set_saturate(p, 0);
1217 }
1218 release_tmps(c, mark);
1219 }
1220 }
1221
1222 /**
1223 * For GLSL shaders, this KIL will be unconditional.
1224 * It may be contained inside an IF/ENDIF structure of course.
1225 */
1226 static void emit_kil(struct brw_wm_compile *c)
1227 {
1228 struct brw_compile *p = &c->func;
1229 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1230 brw_push_insn_state(p);
1231 brw_set_mask_control(p, BRW_MASK_DISABLE);
1232 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1233 brw_AND(p, depth, c->emit_mask_reg, depth);
1234 brw_pop_insn_state(p);
1235 }
1236
1237 static void emit_mad(struct brw_wm_compile *c,
1238 const struct prog_instruction *inst)
1239 {
1240 struct brw_compile *p = &c->func;
1241 GLuint mask = inst->DstReg.WriteMask;
1242 struct brw_reg dst, src0, src1, src2;
1243 int i;
1244
1245 for (i = 0; i < 4; i++) {
1246 if (mask & (1<<i)) {
1247 dst = get_dst_reg(c, inst, i);
1248 src0 = get_src_reg(c, inst, 0, i);
1249 src1 = get_src_reg_imm(c, inst, 1, i);
1250 src2 = get_src_reg_imm(c, inst, 2, i);
1251 brw_MUL(p, dst, src0, src1);
1252
1253 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1254 brw_ADD(p, dst, dst, src2);
1255 brw_set_saturate(p, 0);
1256 }
1257 }
1258 }
1259
1260 static void emit_sop(struct brw_wm_compile *c,
1261 const struct prog_instruction *inst, GLuint cond)
1262 {
1263 struct brw_compile *p = &c->func;
1264 GLuint mask = inst->DstReg.WriteMask;
1265 struct brw_reg dst, src0, src1;
1266 int i;
1267
1268 for (i = 0; i < 4; i++) {
1269 if (mask & (1<<i)) {
1270 dst = get_dst_reg(c, inst, i);
1271 src0 = get_src_reg(c, inst, 0, i);
1272 src1 = get_src_reg_imm(c, inst, 1, i);
1273 brw_push_insn_state(p);
1274 brw_CMP(p, brw_null_reg(), cond, src0, src1);
1275 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1276 brw_MOV(p, dst, brw_imm_f(0.0));
1277 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1278 brw_MOV(p, dst, brw_imm_f(1.0));
1279 brw_pop_insn_state(p);
1280 }
1281 }
1282 }
1283
1284 static void emit_slt(struct brw_wm_compile *c,
1285 const struct prog_instruction *inst)
1286 {
1287 emit_sop(c, inst, BRW_CONDITIONAL_L);
1288 }
1289
1290 static void emit_sle(struct brw_wm_compile *c,
1291 const struct prog_instruction *inst)
1292 {
1293 emit_sop(c, inst, BRW_CONDITIONAL_LE);
1294 }
1295
1296 static void emit_sgt(struct brw_wm_compile *c,
1297 const struct prog_instruction *inst)
1298 {
1299 emit_sop(c, inst, BRW_CONDITIONAL_G);
1300 }
1301
1302 static void emit_sge(struct brw_wm_compile *c,
1303 const struct prog_instruction *inst)
1304 {
1305 emit_sop(c, inst, BRW_CONDITIONAL_GE);
1306 }
1307
1308 static void emit_seq(struct brw_wm_compile *c,
1309 const struct prog_instruction *inst)
1310 {
1311 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1312 }
1313
1314 static void emit_sne(struct brw_wm_compile *c,
1315 const struct prog_instruction *inst)
1316 {
1317 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1318 }
1319
1320 static void emit_ddx(struct brw_wm_compile *c,
1321 const struct prog_instruction *inst)
1322 {
1323 struct brw_compile *p = &c->func;
1324 GLuint mask = inst->DstReg.WriteMask;
1325 struct brw_reg interp[4];
1326 struct brw_reg dst;
1327 struct brw_reg src0, w;
1328 GLuint nr, i;
1329 src0 = get_src_reg(c, inst, 0, 0);
1330 w = get_src_reg(c, inst, 1, 3);
1331 nr = src0.nr;
1332 interp[0] = brw_vec1_grf(nr, 0);
1333 interp[1] = brw_vec1_grf(nr, 4);
1334 interp[2] = brw_vec1_grf(nr+1, 0);
1335 interp[3] = brw_vec1_grf(nr+1, 4);
1336 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1337 for(i = 0; i < 4; i++ ) {
1338 if (mask & (1<<i)) {
1339 dst = get_dst_reg(c, inst, i);
1340 brw_MOV(p, dst, interp[i]);
1341 brw_MUL(p, dst, dst, w);
1342 }
1343 }
1344 brw_set_saturate(p, 0);
1345 }
1346
1347 static void emit_ddy(struct brw_wm_compile *c,
1348 const struct prog_instruction *inst)
1349 {
1350 struct brw_compile *p = &c->func;
1351 GLuint mask = inst->DstReg.WriteMask;
1352 struct brw_reg interp[4];
1353 struct brw_reg dst;
1354 struct brw_reg src0, w;
1355 GLuint nr, i;
1356
1357 src0 = get_src_reg(c, inst, 0, 0);
1358 nr = src0.nr;
1359 w = get_src_reg(c, inst, 1, 3);
1360 interp[0] = brw_vec1_grf(nr, 0);
1361 interp[1] = brw_vec1_grf(nr, 4);
1362 interp[2] = brw_vec1_grf(nr+1, 0);
1363 interp[3] = brw_vec1_grf(nr+1, 4);
1364 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1365 for(i = 0; i < 4; i++ ) {
1366 if (mask & (1<<i)) {
1367 dst = get_dst_reg(c, inst, i);
1368 brw_MOV(p, dst, suboffset(interp[i], 1));
1369 brw_MUL(p, dst, dst, w);
1370 }
1371 }
1372 brw_set_saturate(p, 0);
1373 }
1374
1375 static INLINE struct brw_reg high_words( struct brw_reg reg )
1376 {
1377 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1378 0, 8, 2 );
1379 }
1380
1381 static INLINE struct brw_reg low_words( struct brw_reg reg )
1382 {
1383 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1384 }
1385
1386 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1387 {
1388 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1389 }
1390
1391 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1392 {
1393 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1394 0, 16, 2 );
1395 }
1396
/* One-, two- and three-dimensional Perlin noise, similar to the description
   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */

/**
 * Emit the body of the one-dimensional noise subroutine.
 *
 * The argument is read from, and the result is written back to, the
 * temporary obtained with lookup_tmp( c, mark - 2 ), where mark is the
 * temp mark taken on entry.
 * NOTE(review): the "- 2" offset presumably skips a temp allocated by
 * invoke_subroutine() after the caller's parameter temp -- confirm there.
 *
 * Five temporaries are allocated and all are released before returning.
 * The access mode is switched to BRW_ALIGN_1 and is not restored here.
 */
static void noise1_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param,
        x0, x1, /* gradients at each end */
        t, tmp[ 2 ], /* float temporaries */
        itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0 = alloc_tmp( c );
    x1 = alloc_tmp( c );
    t = alloc_tmp( c );
    tmp[ 0 ] = alloc_tmp( c );
    tmp[ 1 ] = alloc_tmp( c );
    /* itmp[ 0..1 ] alias tmp[ 0..1 ]; itmp[ 2..4 ] alias x0, x1 and t,
       which are free to hold the hash constants until the gradients are
       computed. */
    itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
    itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
    itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
    itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
    itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );

    /* The caller's argument (and, on exit, the result). */
    param = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

    /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
       be hashed. Also compute the remainder (offset within the unit
       length), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
    brw_FRC( p, param, param );
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
    brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

    /* We're now ready to perform the hashing. The two hashes are
       interleaved for performance. The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free). We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    /* Three rounds of: multiply by a constant, then XOR the high words
       into the low words. */
    for( i = 0; i < 2; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );

    /* Now we want to initialise the two gradients based on the
       hashes. Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 31 ), but
       we correct for that right at the end. */
    /* From here on x0, x1 and t are used as floats again; the hash
       constants they held are no longer needed. */
    brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
    brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

    brw_MUL( p, x0, x0, param );
    brw_MUL( p, x1, x1, t );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    /* Evaluated as ((6t - 15)t + 10) * t * t * t, leaving the weight
       in param. */
    brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
                                           pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_MUL( p, param, tmp[ 0 ], param );
    /* x0 + weight * (x1 - x0) */
    brw_MUL( p, x1, x1, param );
    brw_ADD( p, x0, x0, x1 );
    /* scale by pow( 2, -30 ), to compensate for the format conversion
       above and an extra factor of 2 so that a single gradient covers
       the [-1,1] range */
    brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

    release_tmps( c, mark );
}
1489
1490 static void emit_noise1( struct brw_wm_compile *c,
1491 const struct prog_instruction *inst )
1492 {
1493 struct brw_compile *p = &c->func;
1494 struct brw_reg src, param, dst;
1495 GLuint mask = inst->DstReg.WriteMask;
1496 int i;
1497 int mark = mark_tmps( c );
1498
1499 assert( mark == 0 );
1500
1501 src = get_src_reg( c, inst, 0, 0 );
1502
1503 param = alloc_tmp( c );
1504
1505 brw_MOV( p, param, src );
1506
1507 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1508
1509 /* Fill in the result: */
1510 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1511 for (i = 0 ; i < 4; i++) {
1512 if (mask & (1<<i)) {
1513 dst = get_dst_reg(c, inst, i);
1514 brw_MOV( p, dst, param );
1515 }
1516 }
1517 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1518 brw_set_saturate( p, 0 );
1519
1520 release_tmps( c, mark );
1521 }
1522
/**
 * Emit the body of the two-dimensional noise subroutine.
 *
 * The two arguments are read from the temporaries obtained with
 * lookup_tmp( c, mark - 3 ) and lookup_tmp( c, mark - 2 ), where mark is
 * the temp mark taken on entry; the result is written back to the first
 * of them (param0).
 * NOTE(review): the offsets presumably skip a temp allocated by
 * invoke_subroutine() after the caller's parameter temps -- confirm there.
 *
 * Nine temporaries are allocated and all are released before returning.
 * The access mode is switched to BRW_ALIGN_1 and is not restored here.
 */
static void noise2_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1,
        x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
        t, tmp[ 4 ], /* float temporaries */
        itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    t = alloc_tmp( c );
    for( i = 0; i < 4; i++ ) {
        tmp[ i ] = alloc_tmp( c );
        itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
    }
    /* itmp[ 4..6 ] alias three of the gradient registers, which are free
       to hold the hash constants until the gradients are computed. */
    itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
    itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
    itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );

    /* The caller's arguments; param0 also receives the result. */
    param0 = lookup_tmp( c, mark - 3 );
    param1 = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
       be hashed. Also compute the remainders (offsets within the unit
       square), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
    /* Fold the low words of the second coordinate into the high words of
       itmp[ 0 ], so one register carries both coordinates. */
    brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
             low_words( itmp[ 1 ] ) );
    brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
    /* The other three corners: +1 in the high word, +1 in the low word,
       and +1 in both. */
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
    brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
    brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

    /* We're now ready to perform the hashing. The four hashes are
       interleaved for performance. The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free). We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    /* Three rounds of: multiply by a constant, then XOR the high words
       into the low words. */
    for( i = 0; i < 4; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );

    /* Now we want to initialise the four gradients based on the
       hashes. Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    /* The low words of each hash feed the x term of a gradient, the high
       words the y term. */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );

    brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );

    /* x terms: t holds param0 - 1 for the x1 corners. */
    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y terms: t now holds param1 - 1 for the y1 corners. */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
    brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
    brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    /* Evaluated per axis as ((6t - 15)t + 10) * t * t * t, leaving the
       weights in param0 and param1. */
    brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
                                                 pipeline */
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
                                                 pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_MUL( p, param0, tmp[ 0 ], param0 );
    brw_MUL( p, param1, tmp[ 1 ], param1 );

    /* Here we interpolate in the y dimension... */
    brw_MUL( p, x0y1, x0y1, param1 );
    brw_MUL( p, x1y1, x1y1, param1 );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x. There are horrible register dependencies here,
       but we have nothing else to do. */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, param0 );
    brw_ADD( p, x0y0, x0y0, x1y0 );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
1659
1660 static void emit_noise2( struct brw_wm_compile *c,
1661 const struct prog_instruction *inst )
1662 {
1663 struct brw_compile *p = &c->func;
1664 struct brw_reg src0, src1, param0, param1, dst;
1665 GLuint mask = inst->DstReg.WriteMask;
1666 int i;
1667 int mark = mark_tmps( c );
1668
1669 assert( mark == 0 );
1670
1671 src0 = get_src_reg( c, inst, 0, 0 );
1672 src1 = get_src_reg( c, inst, 0, 1 );
1673
1674 param0 = alloc_tmp( c );
1675 param1 = alloc_tmp( c );
1676
1677 brw_MOV( p, param0, src0 );
1678 brw_MOV( p, param1, src1 );
1679
1680 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1681
1682 /* Fill in the result: */
1683 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1684 for (i = 0 ; i < 4; i++) {
1685 if (mask & (1<<i)) {
1686 dst = get_dst_reg(c, inst, i);
1687 brw_MOV( p, dst, param0 );
1688 }
1689 }
1690 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1691 brw_set_saturate( p, 0 );
1692
1693 release_tmps( c, mark );
1694 }
1695
/**
 * Emit the body of the three-dimensional noise subroutine.
 *
 * The three-dimensional case is much like the one- and two- versions above,
 * but since the number of corners is rapidly growing we now pack 16 16-bit
 * hashes into each register to extract more parallelism from the EUs.
 *
 * The three arguments are read from the temporaries obtained with
 * lookup_tmp( c, mark - 4 ) .. lookup_tmp( c, mark - 2 ), where mark is
 * the temp mark taken on entry; the result is written back to the first
 * of them (param0).
 * NOTE(review): the offsets presumably skip a temp allocated by
 * invoke_subroutine() after the caller's parameter temps -- confirm there.
 *
 * Sixteen temporaries are allocated and all are released before
 * returning.  The access mode is switched to BRW_ALIGN_1 and is not
 * restored here.
 */
static void noise3_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1, param2,
        x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
        xi, yi, zi, /* interpolation coefficients */
        t, tmp[ 8 ], /* float temporaries */
        itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
        wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    xi = alloc_tmp( c );
    yi = alloc_tmp( c );
    zi = alloc_tmp( c );
    t = alloc_tmp( c );
    for( i = 0; i < 8; i++ ) {
        tmp[ i ] = alloc_tmp( c );
        itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
        /* same register number as tmp[ i ], viewed through brw_uw16_grf */
        wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
    }

    /* The caller's arguments; param0 also receives the result. */
    param0 = lookup_tmp( c, mark - 4 );
    param1 = lookup_tmp( c, mark - 3 );
    param2 = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
       be hashed. Also compute the remainders (offsets within the unit
       cube), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_FRC( p, param2, param2 );
    /* Since we now have only 16 bits of precision in the hash, we must
       be more careful about thorough mixing to maintain entropy as we
       squash the input vector into a small scalar. */
    brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
    brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
    brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
             brw_imm_uw( 0x9B93 ) );
    /* The high words receive the low-word value plus the first constant. */
    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
             brw_imm_uw( 0xBC8F ) );

    /* Temporarily disable the execution mask while we work with ExecSize=16
       channels (the mask is set for ExecSize=8 and is probably incorrect).
       Although this might cause execution of unwanted channels, the code
       writes only to temporary registers and has no side effects, so
       disabling the mask is harmless. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    /* Derive wtmp[ 1..3 ] from wtmp[ 0 ] by adding the second constant,
       the third constant, and both. */
    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

    /* We're now ready to perform the hashing. The eight hashes are
       interleaved for performance. The hash function used is
       designed to rapidly achieve avalanche and require only 16x16
       bit multiplication, and 8-bit swizzles (which we get for
       free). */
    /* Two rounds of: multiply by a constant, then XOR the odd bytes into
       the even bytes. */
    for( i = 0; i < 4; i++ )
        brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
                 odd_bytes( wtmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
                 odd_bytes( wtmp[ i ] ) );
    brw_pop_insn_state( p );

    /* Now we want to initialise the four rear gradients based on the
       hashes. Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    /* x component */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

    /* Shift the packed hashes left by 5; the y component below reads the
       same registers after this shift. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

    /* Shift again; the z component below reads the shifted hashes. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* t is reloaded with param0 - 1 here; its next use is the x component
       of the front gradients further down. */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    /* The rear face uses param2 itself as the z offset. */
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    /* Evaluated per axis as ((6t - 15)t + 10) * t * t * t, leaving the
       weights in xi, yi and zi (param0..2 keep the raw offsets). */
    brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
    brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
    brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
    brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
    brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
    brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
    brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );

    /* Here we interpolate in the y dimension... */
    brw_MUL( p, x0y1, x0y1, yi );
    brw_MUL( p, x1y1, x1y1, yi );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, xi );
    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

    /* Now do the same thing for the front four gradients... */
    /* x component */
    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    /* t still holds param0 - 1 from the rear-face section. */
    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    /* The front face uses t = param2 - 1 as the z offset. */
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* The interpolation coefficients are still around from last time, so
       again interpolate in the y dimension... */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, yi );
    brw_MUL( p, x1y1, x1y1, yi );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
       time put the front face in tmp[ 1 ] and we're nearly there... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, xi );
    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

    /* The final interpolation, in the z dimension: */
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
1962
1963 static void emit_noise3( struct brw_wm_compile *c,
1964 const struct prog_instruction *inst )
1965 {
1966 struct brw_compile *p = &c->func;
1967 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1968 GLuint mask = inst->DstReg.WriteMask;
1969 int i;
1970 int mark = mark_tmps( c );
1971
1972 assert( mark == 0 );
1973
1974 src0 = get_src_reg( c, inst, 0, 0 );
1975 src1 = get_src_reg( c, inst, 0, 1 );
1976 src2 = get_src_reg( c, inst, 0, 2 );
1977
1978 param0 = alloc_tmp( c );
1979 param1 = alloc_tmp( c );
1980 param2 = alloc_tmp( c );
1981
1982 brw_MOV( p, param0, src0 );
1983 brw_MOV( p, param1, src1 );
1984 brw_MOV( p, param2, src2 );
1985
1986 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1987
1988 /* Fill in the result: */
1989 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1990 for (i = 0 ; i < 4; i++) {
1991 if (mask & (1<<i)) {
1992 dst = get_dst_reg(c, inst, i);
1993 brw_MOV( p, dst, param0 );
1994 }
1995 }
1996 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1997 brw_set_saturate( p, 0 );
1998
1999 release_tmps( c, mark );
2000 }
2001
2002 /**
2003 * For the four-dimensional case, the little micro-optimisation benefits
2004 * we obtain by unrolling all the loops aren't worth the massive bloat it
2005 * now causes. Instead, we loop twice around performing a similar operation
2006 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
2007 * code to glue it all together.
2008 */
2009 static void noise4_sub( struct brw_wm_compile *c )
2010 {
2011 struct brw_compile *p = &c->func;
2012 struct brw_reg param[ 4 ],
2013 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
2014 w0, /* noise for the w=0 cube */
2015 floors[ 2 ], /* integer coordinates of base corner of hypercube */
2016 interp[ 4 ], /* interpolation coefficients */
2017 t, tmp[ 8 ], /* float temporaries */
2018 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
2019 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
2020 int i, j;
2021 int mark = mark_tmps( c );
2022 GLuint loop, origin;
2023
2024 x0y0 = alloc_tmp( c );
2025 x0y1 = alloc_tmp( c );
2026 x1y0 = alloc_tmp( c );
2027 x1y1 = alloc_tmp( c );
2028 t = alloc_tmp( c );
2029 w0 = alloc_tmp( c );
2030 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2031 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2032
2033 for( i = 0; i < 4; i++ ) {
2034 param[ i ] = lookup_tmp( c, mark - 5 + i );
2035 interp[ i ] = alloc_tmp( c );
2036 }
2037
2038 for( i = 0; i < 8; i++ ) {
2039 tmp[ i ] = alloc_tmp( c );
2040 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
2041 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
2042 }
2043
2044 brw_set_access_mode( p, BRW_ALIGN_1 );
2045
2046 /* We only want 16 bits of precision from the integral part of each
2047 co-ordinate, but unfortunately the RNDD semantics would saturate
2048 at 16 bits if we performed the operation directly to a 16-bit
2049 destination. Therefore, we round to 32-bit temporaries where
2050 appropriate, and then store only the lower 16 bits. */
2051 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
2052 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
2053 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
2054 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
2055 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
2056 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
2057
2058 /* Modify the flag register here, because the side effect is useful
2059 later (see below). We know for certain that all flags will be
2060 cleared, since the FRC instruction cannot possibly generate
2061 negative results. Even for exceptional inputs (infinities, denormals,
2062 NaNs), the architecture guarantees that the L conditional is false. */
2063 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
2064 brw_FRC( p, param[ 0 ], param[ 0 ] );
2065 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2066 for( i = 1; i < 4; i++ )
2067 brw_FRC( p, param[ i ], param[ i ] );
2068
2069 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
2070 of all. */
2071 for( i = 0; i < 4; i++ )
2072 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
2073 for( i = 0; i < 4; i++ )
2074 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
2075 for( i = 0; i < 4; i++ )
2076 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2077 for( i = 0; i < 4; i++ )
2078 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
2079 for( j = 0; j < 3; j++ )
2080 for( i = 0; i < 4; i++ )
2081 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2082
2083 /* Mark the current address, as it will be a jump destination. The
2084 following code will be executed twice: first, with the flag
2085 register clear indicating the w=0 case, and second with flags
2086 set for w=1. */
2087 loop = p->nr_insn;
2088
2089 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
2090 be hashed. Since we have only 16 bits of precision in the hash, we
2091 must be careful about thorough mixing to maintain entropy as we
2092 squash the input vector into a small scalar. */
2093 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
2094 brw_imm_uw( 0xBC8F ) );
2095 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
2096 brw_imm_uw( 0xD0BD ) );
2097 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
2098 brw_imm_uw( 0x9B93 ) );
2099 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
2100 brw_imm_uw( 0xA359 ) );
2101 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
2102 brw_imm_uw( 0xBC8F ) );
2103
2104 /* Temporarily disable the execution mask while we work with ExecSize=16
2105 channels (the mask is set for ExecSize=8 and is probably incorrect).
2106 Although this might cause execution of unwanted channels, the code
2107 writes only to temporary registers and has no side effects, so
2108 disabling the mask is harmless. */
2109 brw_push_insn_state( p );
2110 brw_set_mask_control( p, BRW_MASK_DISABLE );
2111 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2112 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2113 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2114
2115 /* We're now ready to perform the hashing. The eight hashes are
2116 interleaved for performance. The hash function used is
2117 designed to rapidly achieve avalanche and require only 16x16
2118 bit multiplication, and 8-bit swizzles (which we get for
2119 free). */
2120 for( i = 0; i < 4; i++ )
2121 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2122 for( i = 0; i < 4; i++ )
2123 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2124 odd_bytes( wtmp[ i ] ) );
2125 for( i = 0; i < 4; i++ )
2126 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2127 for( i = 0; i < 4; i++ )
2128 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2129 odd_bytes( wtmp[ i ] ) );
2130 brw_pop_insn_state( p );
2131
2132 /* Now we want to initialise the four rear gradients based on the
2133 hashes. Format conversion from signed integer to float leaves
2134 everything scaled too high by a factor of pow( 2, 15 ), but
2135 we correct for that right at the end. */
2136 /* x component */
2137 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2138 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2139 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2140 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2141 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2142
2143 brw_push_insn_state( p );
2144 brw_set_mask_control( p, BRW_MASK_DISABLE );
2145 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2146 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2147 brw_pop_insn_state( p );
2148
2149 brw_MUL( p, x1y0, x1y0, t );
2150 brw_MUL( p, x1y1, x1y1, t );
2151 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2152 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2153 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2154
2155 /* y component */
2156 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2157 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2158 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2159 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2160
2161 brw_push_insn_state( p );
2162 brw_set_mask_control( p, BRW_MASK_DISABLE );
2163 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2164 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2165 brw_pop_insn_state( p );
2166
2167 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2168 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2169 /* prepare t for the w component (used below): w the first time through
2170 the loop; w - 1 the second time) */
2171 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2172 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2173 p->current->header.predicate_inverse = 1;
2174 brw_MOV( p, t, param[ 3 ] );
2175 p->current->header.predicate_inverse = 0;
2176 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2177 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2178 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2179
2180 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2181 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2182 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2183 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2184
2185 /* z component */
2186 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2187 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2188 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2189 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2190
2191 brw_push_insn_state( p );
2192 brw_set_mask_control( p, BRW_MASK_DISABLE );
2193 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2194 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2195 brw_pop_insn_state( p );
2196
2197 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2198 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2199 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2200 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2201
2202 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2203 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2204 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2205 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2206
2207 /* w component */
2208 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2209 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2210 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2211 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2212
2213 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2214 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2215 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2216 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2217 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2218
2219 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2220 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2221 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2222 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2223
2224 /* Here we interpolate in the y dimension... */
2225 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2226 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2227 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2228 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2229 brw_ADD( p, x0y0, x0y0, x0y1 );
2230 brw_ADD( p, x1y0, x1y0, x1y1 );
2231
2232 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2233 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2234 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2235 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2236
2237 /* Now do the same thing for the front four gradients... */
2238 /* x component */
2239 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2240 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2241 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2242 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2243
2244 brw_push_insn_state( p );
2245 brw_set_mask_control( p, BRW_MASK_DISABLE );
2246 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2247 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2248 brw_pop_insn_state( p );
2249
2250 brw_MUL( p, x1y0, x1y0, t );
2251 brw_MUL( p, x1y1, x1y1, t );
2252 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2253 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2254 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2255
2256 /* y component */
2257 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2258 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2259 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2260 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2261
2262 brw_push_insn_state( p );
2263 brw_set_mask_control( p, BRW_MASK_DISABLE );
2264 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2265 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2266 brw_pop_insn_state( p );
2267
2268 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2269 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2270 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2271 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2272 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2273
2274 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2275 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2276 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2277 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2278
2279 /* z component */
2280 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2281 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2282 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2283 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2284
2285 brw_push_insn_state( p );
2286 brw_set_mask_control( p, BRW_MASK_DISABLE );
2287 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2288 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2289 brw_pop_insn_state( p );
2290
2291 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2292 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2293 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2294 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2295 /* prepare t for the w component (used below): w the first time through
2296 the loop; w - 1 the second time) */
2297 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2298 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2299 p->current->header.predicate_inverse = 1;
2300 brw_MOV( p, t, param[ 3 ] );
2301 p->current->header.predicate_inverse = 0;
2302 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2303
2304 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2305 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2306 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2307 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2308
2309 /* w component */
2310 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2311 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2312 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2313 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2314
2315 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2316 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2317 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2318 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2319
2320 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2321 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2322 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2323 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2324
2325 /* Interpolate in the y dimension: */
2326 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2327 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2328 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2329 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2330 brw_ADD( p, x0y0, x0y0, x0y1 );
2331 brw_ADD( p, x1y0, x1y0, x1y1 );
2332
2333 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2334 time put the front face in tmp[ 1 ] and we're nearly there... */
2335 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2336 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2337 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2338
2339 /* Another interpolation, in the z dimension: */
2340 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2341 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2342 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2343
2344 /* Exit the loop if we've computed both cubes... */
2345 origin = p->nr_insn;
2346 brw_push_insn_state( p );
2347 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2348 brw_set_mask_control( p, BRW_MASK_DISABLE );
2349 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2350 brw_pop_insn_state( p );
2351
2352 /* Save the result for the w=0 case, and increment the w coordinate: */
2353 brw_MOV( p, w0, tmp[ 0 ] );
2354 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2355 brw_imm_uw( 1 ) );
2356
2357 /* Loop around for the other cube. Explicitly set the flag register
2358 (unfortunately we must spend an extra instruction to do this: we
2359 can't rely on a side effect of the previous MOV or ADD because
2360 conditional modifiers which are normally true might be false in
2361 exceptional circumstances, e.g. given a NaN input; the add to
2362 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2363 brw_push_insn_state( p );
2364 brw_set_mask_control( p, BRW_MASK_DISABLE );
2365 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2366 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2367 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2368 brw_pop_insn_state( p );
2369
2370 /* Patch the previous conditional branch now that we know the
2371 destination address. */
2372 brw_set_src1( p->store + origin,
2373 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2374
2375 /* The very last interpolation. */
2376 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2377 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2378 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2379
2380 /* scale by pow( 2, -15 ), as described above */
2381 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2382
2383 release_tmps( c, mark );
2384 }
2385
2386 static void emit_noise4( struct brw_wm_compile *c,
2387 const struct prog_instruction *inst )
2388 {
2389 struct brw_compile *p = &c->func;
2390 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2391 GLuint mask = inst->DstReg.WriteMask;
2392 int i;
2393 int mark = mark_tmps( c );
2394
2395 assert( mark == 0 );
2396
2397 src0 = get_src_reg( c, inst, 0, 0 );
2398 src1 = get_src_reg( c, inst, 0, 1 );
2399 src2 = get_src_reg( c, inst, 0, 2 );
2400 src3 = get_src_reg( c, inst, 0, 3 );
2401
2402 param0 = alloc_tmp( c );
2403 param1 = alloc_tmp( c );
2404 param2 = alloc_tmp( c );
2405 param3 = alloc_tmp( c );
2406
2407 brw_MOV( p, param0, src0 );
2408 brw_MOV( p, param1, src1 );
2409 brw_MOV( p, param2, src2 );
2410 brw_MOV( p, param3, src3 );
2411
2412 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2413
2414 /* Fill in the result: */
2415 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2416 for (i = 0 ; i < 4; i++) {
2417 if (mask & (1<<i)) {
2418 dst = get_dst_reg(c, inst, i);
2419 brw_MOV( p, dst, param0 );
2420 }
2421 }
2422 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2423 brw_set_saturate( p, 0 );
2424
2425 release_tmps( c, mark );
2426 }
2427
2428 static void emit_wpos_xy(struct brw_wm_compile *c,
2429 const struct prog_instruction *inst)
2430 {
2431 struct brw_compile *p = &c->func;
2432 GLuint mask = inst->DstReg.WriteMask;
2433 struct brw_reg src0[2], dst[2];
2434
2435 dst[0] = get_dst_reg(c, inst, 0);
2436 dst[1] = get_dst_reg(c, inst, 1);
2437
2438 src0[0] = get_src_reg(c, inst, 0, 0);
2439 src0[1] = get_src_reg(c, inst, 0, 1);
2440
2441 /* Calculate the pixel offset from window bottom left into destination
2442 * X and Y channels.
2443 */
2444 if (mask & WRITEMASK_X) {
2445 /* X' = X - origin_x */
2446 brw_ADD(p,
2447 dst[0],
2448 retype(src0[0], BRW_REGISTER_TYPE_W),
2449 brw_imm_d(0 - c->key.origin_x));
2450 }
2451
2452 if (mask & WRITEMASK_Y) {
2453 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2454 brw_ADD(p,
2455 dst[1],
2456 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2457 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2458 }
2459 }
2460
2461 /* TODO
2462 BIAS on SIMD8 not working yet...
2463 */
2464 static void emit_txb(struct brw_wm_compile *c,
2465 const struct prog_instruction *inst)
2466 {
2467 struct brw_compile *p = &c->func;
2468 struct brw_reg dst[4], src[4], payload_reg;
2469 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2470 GLuint i;
2471
2472 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2473
2474 for (i = 0; i < 4; i++)
2475 dst[i] = get_dst_reg(c, inst, i);
2476 for (i = 0; i < 4; i++)
2477 src[i] = get_src_reg(c, inst, 0, i);
2478
2479 switch (inst->TexSrcTarget) {
2480 case TEXTURE_1D_INDEX:
2481 brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
2482 brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
2483 brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
2484 break;
2485 case TEXTURE_2D_INDEX:
2486 case TEXTURE_RECT_INDEX:
2487 brw_MOV(p, brw_message_reg(2), src[0]);
2488 brw_MOV(p, brw_message_reg(3), src[1]);
2489 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2490 break;
2491 default:
2492 brw_MOV(p, brw_message_reg(2), src[0]);
2493 brw_MOV(p, brw_message_reg(3), src[1]);
2494 brw_MOV(p, brw_message_reg(4), src[2]);
2495 break;
2496 }
2497 brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
2498 brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
2499 brw_SAMPLE(p,
2500 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2501 1, /* msg_reg_nr */
2502 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2503 SURF_INDEX_TEXTURE(unit),
2504 unit, /* sampler */
2505 inst->DstReg.WriteMask, /* writemask */
2506 BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS, /* msg_type */
2507 4, /* response_length */
2508 4, /* msg_length */
2509 0); /* eot */
2510 }
2511
2512
2513 static void emit_tex(struct brw_wm_compile *c,
2514 const struct prog_instruction *inst)
2515 {
2516 struct brw_compile *p = &c->func;
2517 struct brw_reg dst[4], src[4], payload_reg;
2518 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2519 GLuint msg_len;
2520 GLuint i, nr;
2521 GLuint emit;
2522 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2523
2524 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2525
2526 for (i = 0; i < 4; i++)
2527 dst[i] = get_dst_reg(c, inst, i);
2528 for (i = 0; i < 4; i++)
2529 src[i] = get_src_reg(c, inst, 0, i);
2530
2531 switch (inst->TexSrcTarget) {
2532 case TEXTURE_1D_INDEX:
2533 emit = WRITEMASK_X;
2534 nr = 1;
2535 break;
2536 case TEXTURE_2D_INDEX:
2537 case TEXTURE_RECT_INDEX:
2538 emit = WRITEMASK_XY;
2539 nr = 2;
2540 break;
2541 default:
2542 emit = WRITEMASK_XYZ;
2543 nr = 3;
2544 break;
2545 }
2546 msg_len = 1;
2547
2548 /* move/load S, T, R coords */
2549 for (i = 0; i < nr; i++) {
2550 static const GLuint swz[4] = {0,1,2,2};
2551 if (emit & (1<<i))
2552 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2553 else
2554 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2555 msg_len += 1;
2556 }
2557
2558 if (shadow) {
2559 brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2560 brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
2561 }
2562
2563 brw_SAMPLE(p,
2564 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2565 1, /* msg_reg_nr */
2566 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2567 SURF_INDEX_TEXTURE(unit),
2568 unit, /* sampler */
2569 inst->DstReg.WriteMask, /* writemask */
2570 BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE, /* msg_type */
2571 4, /* response_length */
2572 shadow ? 6 : 4, /* msg_length */
2573 0); /* eot */
2574
2575 if (shadow)
2576 brw_MOV(p, dst[3], brw_imm_f(1.0));
2577 }
2578
2579
2580 /**
2581 * Resolve subroutine calls after code emit is done.
2582 */
2583 static void post_wm_emit( struct brw_wm_compile *c )
2584 {
2585 brw_resolve_cals(&c->func);
2586 }
2587
2588 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2589 {
2590 #define MAX_IFSN 32
2591 #define MAX_LOOP_DEPTH 32
2592 struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2593 struct brw_instruction *inst0, *inst1;
2594 int i, if_insn = 0, loop_insn = 0;
2595 struct brw_compile *p = &c->func;
2596 struct brw_indirect stack_index = brw_indirect(0, 0);
2597
2598 c->reg_index = 0;
2599 prealloc_reg(c);
2600 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2601 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2602
2603 for (i = 0; i < c->nr_fp_insns; i++) {
2604 const struct prog_instruction *inst = &c->prog_instructions[i];
2605
2606 #if 0
2607 _mesa_printf("Inst %d: ", i);
2608 _mesa_print_instruction(inst);
2609 #endif
2610
2611 /* fetch any constants that this instruction needs */
2612 if (c->fp->use_const_buffer)
2613 fetch_constants(c, inst);
2614
2615 if (inst->CondUpdate)
2616 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2617 else
2618 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2619
2620 switch (inst->Opcode) {
2621 case WM_PIXELXY:
2622 emit_pixel_xy(c, inst);
2623 break;
2624 case WM_DELTAXY:
2625 emit_delta_xy(c, inst);
2626 break;
2627 case WM_PIXELW:
2628 emit_pixel_w(c, inst);
2629 break;
2630 case WM_LINTERP:
2631 emit_linterp(c, inst);
2632 break;
2633 case WM_PINTERP:
2634 emit_pinterp(c, inst);
2635 break;
2636 case WM_CINTERP:
2637 emit_cinterp(c, inst);
2638 break;
2639 case WM_WPOSXY:
2640 emit_wpos_xy(c, inst);
2641 break;
2642 case WM_FB_WRITE:
2643 emit_fb_write(c, inst);
2644 break;
2645 case WM_FRONTFACING:
2646 emit_frontfacing(c, inst);
2647 break;
2648 case OPCODE_ABS:
2649 emit_abs(c, inst);
2650 break;
2651 case OPCODE_ADD:
2652 emit_add(c, inst);
2653 break;
2654 case OPCODE_ARL:
2655 emit_arl(c, inst);
2656 break;
2657 case OPCODE_SUB:
2658 emit_sub(c, inst);
2659 break;
2660 case OPCODE_FRC:
2661 emit_frc(c, inst);
2662 break;
2663 case OPCODE_FLR:
2664 emit_flr(c, inst);
2665 break;
2666 case OPCODE_LRP:
2667 emit_lrp(c, inst);
2668 break;
2669 case OPCODE_TRUNC:
2670 emit_trunc(c, inst);
2671 break;
2672 case OPCODE_MOV:
2673 emit_mov(c, inst);
2674 break;
2675 case OPCODE_DP3:
2676 emit_dp3(c, inst);
2677 break;
2678 case OPCODE_DP4:
2679 emit_dp4(c, inst);
2680 break;
2681 case OPCODE_XPD:
2682 emit_xpd(c, inst);
2683 break;
2684 case OPCODE_DPH:
2685 emit_dph(c, inst);
2686 break;
2687 case OPCODE_RCP:
2688 emit_rcp(c, inst);
2689 break;
2690 case OPCODE_RSQ:
2691 emit_rsq(c, inst);
2692 break;
2693 case OPCODE_SIN:
2694 emit_sin(c, inst);
2695 break;
2696 case OPCODE_COS:
2697 emit_cos(c, inst);
2698 break;
2699 case OPCODE_EX2:
2700 emit_ex2(c, inst);
2701 break;
2702 case OPCODE_LG2:
2703 emit_lg2(c, inst);
2704 break;
2705 case OPCODE_MIN:
2706 case OPCODE_MAX:
2707 emit_min_max(c, inst);
2708 break;
2709 case OPCODE_DDX:
2710 emit_ddx(c, inst);
2711 break;
2712 case OPCODE_DDY:
2713 emit_ddy(c, inst);
2714 break;
2715 case OPCODE_SLT:
2716 emit_slt(c, inst);
2717 break;
2718 case OPCODE_SLE:
2719 emit_sle(c, inst);
2720 break;
2721 case OPCODE_SGT:
2722 emit_sgt(c, inst);
2723 break;
2724 case OPCODE_SGE:
2725 emit_sge(c, inst);
2726 break;
2727 case OPCODE_SEQ:
2728 emit_seq(c, inst);
2729 break;
2730 case OPCODE_SNE:
2731 emit_sne(c, inst);
2732 break;
2733 case OPCODE_MUL:
2734 emit_mul(c, inst);
2735 break;
2736 case OPCODE_POW:
2737 emit_pow(c, inst);
2738 break;
2739 case OPCODE_MAD:
2740 emit_mad(c, inst);
2741 break;
2742 case OPCODE_NOISE1:
2743 emit_noise1(c, inst);
2744 break;
2745 case OPCODE_NOISE2:
2746 emit_noise2(c, inst);
2747 break;
2748 case OPCODE_NOISE3:
2749 emit_noise3(c, inst);
2750 break;
2751 case OPCODE_NOISE4:
2752 emit_noise4(c, inst);
2753 break;
2754 case OPCODE_TEX:
2755 emit_tex(c, inst);
2756 break;
2757 case OPCODE_TXB:
2758 emit_txb(c, inst);
2759 break;
2760 case OPCODE_KIL_NV:
2761 emit_kil(c);
2762 break;
2763 case OPCODE_IF:
2764 assert(if_insn < MAX_IFSN);
2765 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2766 break;
2767 case OPCODE_ELSE:
2768 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
2769 break;
2770 case OPCODE_ENDIF:
2771 assert(if_insn > 0);
2772 brw_ENDIF(p, if_inst[--if_insn]);
2773 break;
2774 case OPCODE_BGNSUB:
2775 brw_save_label(p, inst->Comment, p->nr_insn);
2776 break;
2777 case OPCODE_ENDSUB:
2778 /* no-op */
2779 break;
2780 case OPCODE_CAL:
2781 brw_push_insn_state(p);
2782 brw_set_mask_control(p, BRW_MASK_DISABLE);
2783 brw_set_access_mode(p, BRW_ALIGN_1);
2784 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2785 brw_set_access_mode(p, BRW_ALIGN_16);
2786 brw_ADD(p, get_addr_reg(stack_index),
2787 get_addr_reg(stack_index), brw_imm_d(4));
2788 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2789 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2790 brw_pop_insn_state(p);
2791 break;
2792
2793 case OPCODE_RET:
2794 brw_push_insn_state(p);
2795 brw_set_mask_control(p, BRW_MASK_DISABLE);
2796 brw_ADD(p, get_addr_reg(stack_index),
2797 get_addr_reg(stack_index), brw_imm_d(-4));
2798 brw_set_access_mode(p, BRW_ALIGN_1);
2799 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2800 brw_set_access_mode(p, BRW_ALIGN_16);
2801 brw_pop_insn_state(p);
2802
2803 break;
2804 case OPCODE_BGNLOOP:
2805 /* XXX may need to invalidate the current_constant regs */
2806 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2807 break;
2808 case OPCODE_BRK:
2809 brw_BREAK(p);
2810 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2811 break;
2812 case OPCODE_CONT:
2813 brw_CONT(p);
2814 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2815 break;
2816 case OPCODE_ENDLOOP:
2817 loop_insn--;
2818 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2819 /* patch all the BREAK instructions from
2820 last BEGINLOOP */
2821 while (inst0 > loop_inst[loop_insn]) {
2822 inst0--;
2823 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2824 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2825 inst0->bits3.if_else.pop_count = 0;
2826 } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2827 inst0->bits3.if_else.jump_count = inst1 - inst0;
2828 inst0->bits3.if_else.pop_count = 0;
2829 }
2830 }
2831 break;
2832 default:
2833 _mesa_printf("unsupported IR in fragment shader %d\n",
2834 inst->Opcode);
2835 }
2836 if (inst->CondUpdate)
2837 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2838 else
2839 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2840 }
2841 post_wm_emit(c);
2842
2843 if (c->reg_index >= BRW_WM_MAX_GRF) {
2844 _mesa_problem(NULL, "Ran out of registers in brw_wm_emit_glsl()");
2845 /* XXX we need to do some proper error recovery here */
2846 }
2847 }
2848
2849
2850 /**
2851 * Do GPU code generation for shaders that use GLSL features such as
2852 * flow control. Other shaders will be compiled with the
2853 */
2854 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2855 {
2856 if (INTEL_DEBUG & DEBUG_WM) {
2857 _mesa_printf("brw_wm_glsl_emit:\n");
2858 }
2859
2860 /* initial instruction translation/simplification */
2861 brw_wm_pass_fp(c);
2862
2863 /* actual code generation */
2864 brw_wm_emit_glsl(brw, c);
2865
2866 if (INTEL_DEBUG & DEBUG_WM) {
2867 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2868 }
2869
2870 c->prog_data.total_grf = c->reg_index;
2871 c->prog_data.total_scratch = 0;
2872 }