i965: fix fetching constants from constant buffer in glsl path
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "brw_context.h"
4 #include "brw_eu.h"
5 #include "brw_wm.h"
6
/** Identifiers of the subroutines that invoke_subroutine() can emit/reuse. */
enum _subroutine {
   SUB_NOISE1,
   SUB_NOISE2,
   SUB_NOISE3,
   SUB_NOISE4
};
10
11
12 /**
13 * Determine if the given fragment program uses GLSL features such
14 * as flow conditionals, loops, subroutines.
15 * Some GLSL shaders may use these features, others might not.
16 */
17 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
18 {
19 int i;
20 for (i = 0; i < fp->Base.NumInstructions; i++) {
21 const struct prog_instruction *inst = &fp->Base.Instructions[i];
22 switch (inst->Opcode) {
23 case OPCODE_IF:
24 case OPCODE_TRUNC:
25 case OPCODE_ENDIF:
26 case OPCODE_CAL:
27 case OPCODE_BRK:
28 case OPCODE_RET:
29 case OPCODE_DDX:
30 case OPCODE_DDY:
31 case OPCODE_NOISE1:
32 case OPCODE_NOISE2:
33 case OPCODE_NOISE3:
34 case OPCODE_NOISE4:
35 case OPCODE_BGNLOOP:
36 return GL_TRUE;
37 default:
38 break;
39 }
40 }
41 return GL_FALSE;
42 }
43
44
45 /**
46 * Record the mapping of a Mesa register to a hardware register.
47 */
48 static void set_reg(struct brw_wm_compile *c, int file, int index,
49 int component, struct brw_reg reg)
50 {
51 c->wm_regs[file][index][component].reg = reg;
52 c->wm_regs[file][index][component].inited = GL_TRUE;
53 }
54
55 /**
56 * Examine instruction's write mask to find index of first component
57 * enabled for writing.
58 */
59 static int get_scalar_dst_index(const struct prog_instruction *inst)
60 {
61 int i;
62 for (i = 0; i < 4; i++)
63 if (inst->DstReg.WriteMask & (1<<i))
64 break;
65 return i;
66 }
67
68 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
69 {
70 struct brw_reg reg;
71 if(c->tmp_index == c->tmp_max)
72 c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
73
74 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
75 return reg;
76 }
77
78 /**
79 * Save current temp register info.
80 * There must be a matching call to release_tmps().
81 */
82 static int mark_tmps(struct brw_wm_compile *c)
83 {
84 return c->tmp_index;
85 }
86
87 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
88 {
89 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
90 }
91
92 static void release_tmps(struct brw_wm_compile *c, int mark)
93 {
94 c->tmp_index = mark;
95 }
96
97 /**
98 * Convert Mesa src register to brw register.
99 *
100 * Since we're running in SOA mode each Mesa register corresponds to four
101 * hardware registers. We allocate the hardware registers as needed here.
102 *
103 * \param file register file, one of PROGRAM_x
104 * \param index register number
105 * \param component src component (X=0, Y=1, Z=2, W=3)
106 * \param nr not used?!?
107 * \param neg negate value?
108 * \param abs take absolute value?
109 */
110 static struct brw_reg
111 get_reg(struct brw_wm_compile *c, int file, int index, int component,
112 int nr, GLuint neg, GLuint abs)
113 {
114 struct brw_reg reg;
115 switch (file) {
116 case PROGRAM_STATE_VAR:
117 case PROGRAM_CONSTANT:
118 case PROGRAM_UNIFORM:
119 file = PROGRAM_STATE_VAR;
120 break;
121 case PROGRAM_UNDEFINED:
122 return brw_null_reg();
123 case PROGRAM_TEMPORARY:
124 case PROGRAM_INPUT:
125 case PROGRAM_OUTPUT:
126 case PROGRAM_PAYLOAD:
127 break;
128 default:
129 _mesa_problem(NULL, "Unexpected file in get_reg()");
130 return brw_null_reg();
131 }
132
133 /* see if we've already allocated a HW register for this Mesa register */
134 if (c->wm_regs[file][index][component].inited) {
135 /* yes, re-use */
136 reg = c->wm_regs[file][index][component].reg;
137 }
138 else {
139 /* no, allocate new register */
140 reg = brw_vec8_grf(c->reg_index, 0);
141 }
142
143 /* if this is a new register allocation, record it in the table */
144 if (!c->wm_regs[file][index][component].inited) {
145 set_reg(c, file, index, component, reg);
146 c->reg_index++;
147 }
148
149 if (c->reg_index >= BRW_WM_MAX_GRF - 12) {
150 /* ran out of temporary registers! */
151 #if 1
152 /* This is a big hack for now.
153 * Return bad register index, just don't hang the GPU.
154 */
155 _mesa_fprintf(stderr, "out of regs %d\n", c->reg_index);
156 c->reg_index = BRW_WM_MAX_GRF - 13;
157 #else
158 return brw_null_reg();
159 #endif
160 }
161
162 if (neg & (1 << component)) {
163 reg = negate(reg);
164 }
165 if (abs)
166 reg = brw_abs(reg);
167 return reg;
168 }
169
170
171 /**
172 * Preallocate registers. This sets up the Mesa to hardware register
173 * mapping for certain registers, such as constants (uniforms/state vars)
174 * and shader inputs.
175 */
176 static void prealloc_reg(struct brw_wm_compile *c)
177 {
178 int i, j;
179 struct brw_reg reg;
180 int urb_read_length = 0;
181 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
182
183 for (i = 0; i < 4; i++) {
184 if (i < c->key.nr_depth_regs)
185 reg = brw_vec8_grf(i * 2, 0);
186 else
187 reg = brw_vec8_grf(0, 0);
188 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
189 }
190 c->reg_index += 2 * c->key.nr_depth_regs;
191
192 /* constants */
193 {
194 const int nr_params = c->fp->program.Base.Parameters->NumParameters;
195
196 /* use a real constant buffer, or just use a section of the GRF? */
197 c->fp->use_const_buffer = GL_FALSE; /* (nr_params > 8);*/
198
199 if (c->fp->use_const_buffer) {
200 /* We'll use a real constant buffer and fetch constants from
201 * it with a dataport read message.
202 */
203
204 /* number of float constants in CURBE */
205 c->prog_data.nr_params = 0;
206 }
207 else {
208 const struct gl_program_parameter_list *plist =
209 c->fp->program.Base.Parameters;
210 int index = 0;
211
212 /* number of float constants in CURBE */
213 c->prog_data.nr_params = 4 * nr_params;
214
215 /* loop over program constants (float[4]) */
216 for (i = 0; i < nr_params; i++) {
217 /* loop over XYZW channels */
218 for (j = 0; j < 4; j++, index++) {
219 reg = brw_vec1_grf(c->reg_index + index / 8, index % 8);
220 /* Save pointer to parameter/constant value.
221 * Constants will be copied in prepare_constant_buffer()
222 */
223 c->prog_data.param[index] = &plist->ParameterValues[i][j];
224 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
225 }
226 }
227 /* number of constant regs used (each reg is float[8]) */
228 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
229 c->reg_index += c->nr_creg;
230 }
231 }
232
233 /* fragment shader inputs */
234 for (i = 0; i < VERT_RESULT_MAX; i++) {
235 int fp_input;
236
237 if (i >= VERT_RESULT_VAR0)
238 fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
239 else if (i <= VERT_RESULT_TEX7)
240 fp_input = i;
241 else
242 fp_input = -1;
243
244 if (fp_input >= 0 && inputs & (1 << fp_input)) {
245 urb_read_length = c->reg_index;
246 reg = brw_vec8_grf(c->reg_index, 0);
247 for (j = 0; j < 4; j++)
248 set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
249 }
250 if (c->key.vp_outputs_written & (1 << i)) {
251 c->reg_index += 2;
252 }
253 }
254
255 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
256 c->prog_data.urb_read_length = urb_read_length;
257 c->prog_data.curb_read_length = c->nr_creg;
258 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
259 c->reg_index++;
260 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
261 c->reg_index += 2;
262
263 /* An instruction may reference up to three constants.
264 * They'll be found in these registers.
265 * XXX alloc these on demand!
266 */
267 if (c->fp->use_const_buffer) {
268 for (i = 0; i < 3; i++) {
269 c->current_const[i].index = -1;
270 c->current_const[i].reg = alloc_tmp(c);
271 }
272 }
273 #if 0
274 printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
275 printf("AFTER PRE_ALLOC, reg_index = %d\n", c->reg_index);
276 #endif
277 }
278
279
280 /**
281 * Check if any of the instruction's src registers are constants, uniforms,
282 * or statevars. If so, fetch any constants that we don't already have in
283 * the three GRF slots.
284 */
285 static void fetch_constants(struct brw_wm_compile *c,
286 const struct prog_instruction *inst)
287 {
288 struct brw_compile *p = &c->func;
289 GLuint i;
290
291 /* loop over instruction src regs */
292 for (i = 0; i < 3; i++) {
293 const struct prog_src_register *src = &inst->SrcReg[i];
294 if (src->File == PROGRAM_STATE_VAR ||
295 src->File == PROGRAM_CONSTANT ||
296 src->File == PROGRAM_UNIFORM) {
297 c->current_const[i].index = src->Index;
298
299 #if 0
300 printf(" fetch const[%d] for arg %d into reg %d\n",
301 src->Index, i, c->current_const[i].reg.nr);
302 #endif
303
304 /* need to fetch the constant now */
305 brw_dp_READ_4(p,
306 c->current_const[i].reg, /* writeback dest */
307 src->RelAddr, /* relative indexing? */
308 16 * src->Index, /* byte offset */
309 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
310 );
311 }
312 }
313 }
314
315
316 /**
317 * Convert Mesa dst register to brw register.
318 */
319 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
320 const struct prog_instruction *inst,
321 GLuint component)
322 {
323 const int nr = 1;
324 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
325 0, 0);
326 }
327
328
329 static struct brw_reg
330 get_src_reg_const(struct brw_wm_compile *c,
331 const struct prog_instruction *inst,
332 GLuint srcRegIndex, GLuint component)
333 {
334 /* We should have already fetched the constant from the constant
335 * buffer in fetch_constants(). Now we just have to return a
336 * register description that extracts the needed component and
337 * smears it across all eight vector components.
338 */
339 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
340 struct brw_reg const_reg;
341
342 assert(component < 4);
343 assert(srcRegIndex < 3);
344 assert(c->current_const[srcRegIndex].index != -1);
345 const_reg = c->current_const[srcRegIndex].reg;
346
347 /* extract desired float from the const_reg, and smear */
348 const_reg = stride(const_reg, 0, 1, 0);
349 const_reg.subnr = component * 4;
350
351 if (src->Negate & (1 << component))
352 const_reg = negate(const_reg);
353 if (src->Abs)
354 const_reg = brw_abs(const_reg);
355
356 #if 0
357 printf(" form const[%d].%d for arg %d, reg %d\n",
358 c->current_const[srcRegIndex].index,
359 component,
360 srcRegIndex,
361 const_reg.nr);
362 #endif
363
364 return const_reg;
365 }
366
367
368 /**
369 * Convert Mesa src register to brw register.
370 */
371 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
372 const struct prog_instruction *inst,
373 GLuint srcRegIndex, GLuint channel)
374 {
375 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
376 const GLuint nr = 1;
377 const GLuint component = GET_SWZ(src->Swizzle, channel);
378
379 if (c->fp->use_const_buffer &&
380 (src->File == PROGRAM_STATE_VAR ||
381 src->File == PROGRAM_CONSTANT ||
382 src->File == PROGRAM_UNIFORM)) {
383 return get_src_reg_const(c, inst, srcRegIndex, component);
384 }
385 else {
386 /* other type of source register */
387 return get_reg(c, src->File, src->Index, component, nr,
388 src->Negate, src->Abs);
389 }
390 }
391
392
393 /**
394 * Same as \sa get_src_reg() but if the register is a literal, emit
395 * a brw_reg encoding the literal.
396 * Note that a brw instruction only allows one src operand to be a literal.
397 * For instructions with more than one operand, only the second can be a
398 * literal. This means that we treat some literals as constants/uniforms
399 * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
400 *
401 */
402 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
403 const struct prog_instruction *inst,
404 GLuint srcRegIndex, GLuint channel)
405 {
406 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
407 if (src->File == PROGRAM_CONSTANT) {
408 /* a literal */
409 const int component = GET_SWZ(src->Swizzle, channel);
410 const GLfloat *param =
411 c->fp->program.Base.Parameters->ParameterValues[src->Index];
412 GLfloat value = param[component];
413 if (src->Negate & (1 << channel))
414 value = -value;
415 if (src->Abs)
416 value = FABSF(value);
417 #if 0
418 printf(" form immed value %f for chan %d\n", value, channel);
419 #endif
420 return brw_imm_f(value);
421 }
422 else {
423 return get_src_reg(c, inst, srcRegIndex, channel);
424 }
425 }
426
427
428 /**
429 * Subroutines are minimal support for resusable instruction sequences.
430 * They are implemented as simply as possible to minimise overhead: there
431 * is no explicit support for communication between the caller and callee
432 * other than saving the return address in a temporary register, nor is
433 * there any automatic local storage. This implies that great care is
434 * required before attempting reentrancy or any kind of nested
435 * subroutine invocations.
436 */
437 static void invoke_subroutine( struct brw_wm_compile *c,
438 enum _subroutine subroutine,
439 void (*emit)( struct brw_wm_compile * ) )
440 {
441 struct brw_compile *p = &c->func;
442
443 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
444
445 if( c->subroutines[ subroutine ] ) {
446 /* subroutine previously emitted: reuse existing instructions */
447
448 int mark = mark_tmps( c );
449 struct brw_reg return_address = retype( alloc_tmp( c ),
450 BRW_REGISTER_TYPE_UD );
451 int here = p->nr_insn;
452
453 brw_push_insn_state(p);
454 brw_set_mask_control(p, BRW_MASK_DISABLE);
455 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
456
457 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
458 brw_imm_d( ( c->subroutines[ subroutine ] -
459 here - 1 ) << 4 ) );
460 brw_pop_insn_state(p);
461
462 release_tmps( c, mark );
463 } else {
464 /* previously unused subroutine: emit, and mark for later reuse */
465
466 int mark = mark_tmps( c );
467 struct brw_reg return_address = retype( alloc_tmp( c ),
468 BRW_REGISTER_TYPE_UD );
469 struct brw_instruction *calc;
470 int base = p->nr_insn;
471
472 brw_push_insn_state(p);
473 brw_set_mask_control(p, BRW_MASK_DISABLE);
474 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
475 brw_pop_insn_state(p);
476
477 c->subroutines[ subroutine ] = p->nr_insn;
478
479 emit( c );
480
481 brw_push_insn_state(p);
482 brw_set_mask_control(p, BRW_MASK_DISABLE);
483 brw_MOV( p, brw_ip_reg(), return_address );
484 brw_pop_insn_state(p);
485
486 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
487
488 release_tmps( c, mark );
489 }
490 }
491
492 static void emit_abs( struct brw_wm_compile *c,
493 const struct prog_instruction *inst)
494 {
495 int i;
496 struct brw_compile *p = &c->func;
497 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
498 for (i = 0; i < 4; i++) {
499 if (inst->DstReg.WriteMask & (1<<i)) {
500 struct brw_reg src, dst;
501 dst = get_dst_reg(c, inst, i);
502 src = get_src_reg(c, inst, 0, i);
503 brw_MOV(p, dst, brw_abs(src));
504 }
505 }
506 brw_set_saturate(p, 0);
507 }
508
509 static void emit_trunc( struct brw_wm_compile *c,
510 const struct prog_instruction *inst)
511 {
512 int i;
513 struct brw_compile *p = &c->func;
514 GLuint mask = inst->DstReg.WriteMask;
515 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
516 for (i = 0; i < 4; i++) {
517 if (mask & (1<<i)) {
518 struct brw_reg src, dst;
519 dst = get_dst_reg(c, inst, i);
520 src = get_src_reg(c, inst, 0, i);
521 brw_RNDZ(p, dst, src);
522 }
523 }
524 brw_set_saturate(p, 0);
525 }
526
527 static void emit_mov( struct brw_wm_compile *c,
528 const struct prog_instruction *inst)
529 {
530 int i;
531 struct brw_compile *p = &c->func;
532 GLuint mask = inst->DstReg.WriteMask;
533 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
534 for (i = 0; i < 4; i++) {
535 if (mask & (1<<i)) {
536 struct brw_reg src, dst;
537 dst = get_dst_reg(c, inst, i);
538 /* XXX some moves from immediate value don't work reliably!!! */
539 /*src = get_src_reg_imm(c, inst, 0, i);*/
540 src = get_src_reg(c, inst, 0, i);
541 brw_MOV(p, dst, src);
542 }
543 }
544 brw_set_saturate(p, 0);
545 }
546
547 static void emit_pixel_xy(struct brw_wm_compile *c,
548 const struct prog_instruction *inst)
549 {
550 struct brw_reg r1 = brw_vec1_grf(1, 0);
551 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
552
553 struct brw_reg dst0, dst1;
554 struct brw_compile *p = &c->func;
555 GLuint mask = inst->DstReg.WriteMask;
556
557 dst0 = get_dst_reg(c, inst, 0);
558 dst1 = get_dst_reg(c, inst, 1);
559 /* Calculate pixel centers by adding 1 or 0 to each of the
560 * micro-tile coordinates passed in r1.
561 */
562 if (mask & WRITEMASK_X) {
563 brw_ADD(p,
564 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
565 stride(suboffset(r1_uw, 4), 2, 4, 0),
566 brw_imm_v(0x10101010));
567 }
568
569 if (mask & WRITEMASK_Y) {
570 brw_ADD(p,
571 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
572 stride(suboffset(r1_uw, 5), 2, 4, 0),
573 brw_imm_v(0x11001100));
574 }
575 }
576
577 static void emit_delta_xy(struct brw_wm_compile *c,
578 const struct prog_instruction *inst)
579 {
580 struct brw_reg r1 = brw_vec1_grf(1, 0);
581 struct brw_reg dst0, dst1, src0, src1;
582 struct brw_compile *p = &c->func;
583 GLuint mask = inst->DstReg.WriteMask;
584
585 dst0 = get_dst_reg(c, inst, 0);
586 dst1 = get_dst_reg(c, inst, 1);
587 src0 = get_src_reg(c, inst, 0, 0);
588 src1 = get_src_reg(c, inst, 0, 1);
589 /* Calc delta X,Y by subtracting origin in r1 from the pixel
590 * centers.
591 */
592 if (mask & WRITEMASK_X) {
593 brw_ADD(p,
594 dst0,
595 retype(src0, BRW_REGISTER_TYPE_UW),
596 negate(r1));
597 }
598
599 if (mask & WRITEMASK_Y) {
600 brw_ADD(p,
601 dst1,
602 retype(src1, BRW_REGISTER_TYPE_UW),
603 negate(suboffset(r1,1)));
604
605 }
606 }
607
608 static void fire_fb_write( struct brw_wm_compile *c,
609 GLuint base_reg,
610 GLuint nr,
611 GLuint target,
612 GLuint eot)
613 {
614 struct brw_compile *p = &c->func;
615 /* Pass through control information:
616 */
617 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
618 {
619 brw_push_insn_state(p);
620 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
621 brw_MOV(p,
622 brw_message_reg(base_reg + 1),
623 brw_vec8_grf(1, 0));
624 brw_pop_insn_state(p);
625 }
626 /* Send framebuffer write message: */
627 brw_fb_WRITE(p,
628 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
629 base_reg,
630 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
631 target,
632 nr,
633 0,
634 eot);
635 }
636
637 static void emit_fb_write(struct brw_wm_compile *c,
638 const struct prog_instruction *inst)
639 {
640 struct brw_compile *p = &c->func;
641 int nr = 2;
642 int channel;
643 GLuint target, eot;
644 struct brw_reg src0;
645
646 /* Reserve a space for AA - may not be needed:
647 */
648 if (c->key.aa_dest_stencil_reg)
649 nr += 1;
650
651 brw_push_insn_state(p);
652 for (channel = 0; channel < 4; channel++) {
653 src0 = get_src_reg(c, inst, 0, channel);
654 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
655 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
656 brw_MOV(p, brw_message_reg(nr + channel), src0);
657 }
658 /* skip over the regs populated above: */
659 nr += 8;
660 brw_pop_insn_state(p);
661
662 if (c->key.source_depth_to_render_target) {
663 if (c->key.computes_depth) {
664 src0 = get_src_reg(c, inst, 2, 2);
665 brw_MOV(p, brw_message_reg(nr), src0);
666 }
667 else {
668 src0 = get_src_reg(c, inst, 1, 1);
669 brw_MOV(p, brw_message_reg(nr), src0);
670 }
671
672 nr += 2;
673 }
674
675 if (c->key.dest_depth_reg) {
676 GLuint comp = c->key.dest_depth_reg / 2;
677 GLuint off = c->key.dest_depth_reg % 2;
678
679 assert(comp == 1);
680 assert(off == 0);
681 #if 0
682 /* XXX do we need this code? comp always 1, off always 0, it seems */
683 if (off != 0) {
684 brw_push_insn_state(p);
685 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
686
687 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
688 /* 2nd half? */
689 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
690 brw_pop_insn_state(p);
691 }
692 else
693 #endif
694 {
695 struct brw_reg src = get_src_reg(c, inst, 1, 1);
696 brw_MOV(p, brw_message_reg(nr), src);
697 }
698 nr += 2;
699 }
700
701 target = inst->Aux >> 1;
702 eot = inst->Aux & 1;
703 fire_fb_write(c, 0, nr, target, eot);
704 }
705
706 static void emit_pixel_w( struct brw_wm_compile *c,
707 const struct prog_instruction *inst)
708 {
709 struct brw_compile *p = &c->func;
710 GLuint mask = inst->DstReg.WriteMask;
711 if (mask & WRITEMASK_W) {
712 struct brw_reg dst, src0, delta0, delta1;
713 struct brw_reg interp3;
714
715 dst = get_dst_reg(c, inst, 3);
716 src0 = get_src_reg(c, inst, 0, 0);
717 delta0 = get_src_reg(c, inst, 1, 0);
718 delta1 = get_src_reg(c, inst, 1, 1);
719
720 interp3 = brw_vec1_grf(src0.nr+1, 4);
721 /* Calc 1/w - just linterp wpos[3] optimized by putting the
722 * result straight into a message reg.
723 */
724 brw_LINE(p, brw_null_reg(), interp3, delta0);
725 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
726
727 /* Calc w */
728 brw_math_16( p, dst,
729 BRW_MATH_FUNCTION_INV,
730 BRW_MATH_SATURATE_NONE,
731 2, brw_null_reg(),
732 BRW_MATH_PRECISION_FULL);
733 }
734 }
735
736 static void emit_linterp(struct brw_wm_compile *c,
737 const struct prog_instruction *inst)
738 {
739 struct brw_compile *p = &c->func;
740 GLuint mask = inst->DstReg.WriteMask;
741 struct brw_reg interp[4];
742 struct brw_reg dst, delta0, delta1;
743 struct brw_reg src0;
744 GLuint nr, i;
745
746 src0 = get_src_reg(c, inst, 0, 0);
747 delta0 = get_src_reg(c, inst, 1, 0);
748 delta1 = get_src_reg(c, inst, 1, 1);
749 nr = src0.nr;
750
751 interp[0] = brw_vec1_grf(nr, 0);
752 interp[1] = brw_vec1_grf(nr, 4);
753 interp[2] = brw_vec1_grf(nr+1, 0);
754 interp[3] = brw_vec1_grf(nr+1, 4);
755
756 for(i = 0; i < 4; i++ ) {
757 if (mask & (1<<i)) {
758 dst = get_dst_reg(c, inst, i);
759 brw_LINE(p, brw_null_reg(), interp[i], delta0);
760 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
761 }
762 }
763 }
764
765 static void emit_cinterp(struct brw_wm_compile *c,
766 const struct prog_instruction *inst)
767 {
768 struct brw_compile *p = &c->func;
769 GLuint mask = inst->DstReg.WriteMask;
770
771 struct brw_reg interp[4];
772 struct brw_reg dst, src0;
773 GLuint nr, i;
774
775 src0 = get_src_reg(c, inst, 0, 0);
776 nr = src0.nr;
777
778 interp[0] = brw_vec1_grf(nr, 0);
779 interp[1] = brw_vec1_grf(nr, 4);
780 interp[2] = brw_vec1_grf(nr+1, 0);
781 interp[3] = brw_vec1_grf(nr+1, 4);
782
783 for(i = 0; i < 4; i++ ) {
784 if (mask & (1<<i)) {
785 dst = get_dst_reg(c, inst, i);
786 brw_MOV(p, dst, suboffset(interp[i],3));
787 }
788 }
789 }
790
791 static void emit_pinterp(struct brw_wm_compile *c,
792 const struct prog_instruction *inst)
793 {
794 struct brw_compile *p = &c->func;
795 GLuint mask = inst->DstReg.WriteMask;
796
797 struct brw_reg interp[4];
798 struct brw_reg dst, delta0, delta1;
799 struct brw_reg src0, w;
800 GLuint nr, i;
801
802 src0 = get_src_reg(c, inst, 0, 0);
803 delta0 = get_src_reg(c, inst, 1, 0);
804 delta1 = get_src_reg(c, inst, 1, 1);
805 w = get_src_reg(c, inst, 2, 3);
806 nr = src0.nr;
807
808 interp[0] = brw_vec1_grf(nr, 0);
809 interp[1] = brw_vec1_grf(nr, 4);
810 interp[2] = brw_vec1_grf(nr+1, 0);
811 interp[3] = brw_vec1_grf(nr+1, 4);
812
813 for(i = 0; i < 4; i++ ) {
814 if (mask & (1<<i)) {
815 dst = get_dst_reg(c, inst, i);
816 brw_LINE(p, brw_null_reg(), interp[i], delta0);
817 brw_MAC(p, dst, suboffset(interp[i],1),
818 delta1);
819 brw_MUL(p, dst, dst, w);
820 }
821 }
822 }
823
824 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
825 static void emit_frontfacing(struct brw_wm_compile *c,
826 const struct prog_instruction *inst)
827 {
828 struct brw_compile *p = &c->func;
829 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
830 struct brw_reg dst;
831 GLuint mask = inst->DstReg.WriteMask;
832 int i;
833
834 for (i = 0; i < 4; i++) {
835 if (mask & (1<<i)) {
836 dst = get_dst_reg(c, inst, i);
837 brw_MOV(p, dst, brw_imm_f(0.0));
838 }
839 }
840
841 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
842 * us front face
843 */
844 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
845 for (i = 0; i < 4; i++) {
846 if (mask & (1<<i)) {
847 dst = get_dst_reg(c, inst, i);
848 brw_MOV(p, dst, brw_imm_f(1.0));
849 }
850 }
851 brw_set_predicate_control_flag_value(p, 0xff);
852 }
853
854 static void emit_xpd(struct brw_wm_compile *c,
855 const struct prog_instruction *inst)
856 {
857 int i;
858 struct brw_compile *p = &c->func;
859 GLuint mask = inst->DstReg.WriteMask;
860 for (i = 0; i < 4; i++) {
861 GLuint i2 = (i+2)%3;
862 GLuint i1 = (i+1)%3;
863 if (mask & (1<<i)) {
864 struct brw_reg src0, src1, dst;
865 dst = get_dst_reg(c, inst, i);
866 src0 = negate(get_src_reg(c, inst, 0, i2));
867 src1 = get_src_reg_imm(c, inst, 1, i1);
868 brw_MUL(p, brw_null_reg(), src0, src1);
869 src0 = get_src_reg(c, inst, 0, i1);
870 src1 = get_src_reg_imm(c, inst, 1, i2);
871 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
872 brw_MAC(p, dst, src0, src1);
873 brw_set_saturate(p, 0);
874 }
875 }
876 brw_set_saturate(p, 0);
877 }
878
879 static void emit_dp3(struct brw_wm_compile *c,
880 const struct prog_instruction *inst)
881 {
882 struct brw_reg src0[3], src1[3], dst;
883 int i;
884 struct brw_compile *p = &c->func;
885 for (i = 0; i < 3; i++) {
886 src0[i] = get_src_reg(c, inst, 0, i);
887 src1[i] = get_src_reg_imm(c, inst, 1, i);
888 }
889
890 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
891 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
892 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
893 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
894 brw_MAC(p, dst, src0[2], src1[2]);
895 brw_set_saturate(p, 0);
896 }
897
898 static void emit_dp4(struct brw_wm_compile *c,
899 const struct prog_instruction *inst)
900 {
901 struct brw_reg src0[4], src1[4], dst;
902 int i;
903 struct brw_compile *p = &c->func;
904 for (i = 0; i < 4; i++) {
905 src0[i] = get_src_reg(c, inst, 0, i);
906 src1[i] = get_src_reg_imm(c, inst, 1, i);
907 }
908 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
909 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
910 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
911 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
912 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
913 brw_MAC(p, dst, src0[3], src1[3]);
914 brw_set_saturate(p, 0);
915 }
916
917 static void emit_dph(struct brw_wm_compile *c,
918 const struct prog_instruction *inst)
919 {
920 struct brw_reg src0[4], src1[4], dst;
921 int i;
922 struct brw_compile *p = &c->func;
923 for (i = 0; i < 4; i++) {
924 src0[i] = get_src_reg(c, inst, 0, i);
925 src1[i] = get_src_reg_imm(c, inst, 1, i);
926 }
927 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
928 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
929 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
930 brw_MAC(p, dst, src0[2], src1[2]);
931 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
932 brw_ADD(p, dst, dst, src1[3]);
933 brw_set_saturate(p, 0);
934 }
935
936 /**
937 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
938 * Note that the result of the function is smeared across the dest
939 * register's X, Y, Z and W channels (subject to writemasking of course).
940 */
941 static void emit_math1(struct brw_wm_compile *c,
942 const struct prog_instruction *inst, GLuint func)
943 {
944 struct brw_compile *p = &c->func;
945 struct brw_reg src0, dst, tmp;
946 const int mark = mark_tmps( c );
947 int i;
948
949 tmp = alloc_tmp(c);
950
951 /* Get first component of source register */
952 src0 = get_src_reg(c, inst, 0, 0);
953
954 /* tmp = func(src0) */
955 brw_MOV(p, brw_message_reg(2), src0);
956 brw_math(p,
957 tmp,
958 func,
959 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
960 2,
961 brw_null_reg(),
962 BRW_MATH_DATA_VECTOR,
963 BRW_MATH_PRECISION_FULL);
964
965 /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
966
967 /* replicate tmp value across enabled dest channels */
968 for (i = 0; i < 4; i++) {
969 if (inst->DstReg.WriteMask & (1 << i)) {
970 dst = get_dst_reg(c, inst, i);
971 brw_MOV(p, dst, tmp);
972 }
973 }
974
975 release_tmps(c, mark);
976 }
977
978 static void emit_rcp(struct brw_wm_compile *c,
979 const struct prog_instruction *inst)
980 {
981 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
982 }
983
984 static void emit_rsq(struct brw_wm_compile *c,
985 const struct prog_instruction *inst)
986 {
987 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
988 }
989
990 static void emit_sin(struct brw_wm_compile *c,
991 const struct prog_instruction *inst)
992 {
993 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
994 }
995
996 static void emit_cos(struct brw_wm_compile *c,
997 const struct prog_instruction *inst)
998 {
999 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1000 }
1001
1002 static void emit_ex2(struct brw_wm_compile *c,
1003 const struct prog_instruction *inst)
1004 {
1005 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1006 }
1007
1008 static void emit_lg2(struct brw_wm_compile *c,
1009 const struct prog_instruction *inst)
1010 {
1011 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1012 }
1013
1014 static void emit_add(struct brw_wm_compile *c,
1015 const struct prog_instruction *inst)
1016 {
1017 struct brw_compile *p = &c->func;
1018 struct brw_reg src0, src1, dst;
1019 GLuint mask = inst->DstReg.WriteMask;
1020 int i;
1021 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1022 for (i = 0 ; i < 4; i++) {
1023 if (mask & (1<<i)) {
1024 dst = get_dst_reg(c, inst, i);
1025 src0 = get_src_reg(c, inst, 0, i);
1026 src1 = get_src_reg_imm(c, inst, 1, i);
1027 brw_ADD(p, dst, src0, src1);
1028 }
1029 }
1030 brw_set_saturate(p, 0);
1031 }
1032
1033 static void emit_arl(struct brw_wm_compile *c,
1034 const struct prog_instruction *inst)
1035 {
1036 struct brw_compile *p = &c->func;
1037 struct brw_reg src0, addr_reg;
1038 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1039 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1040 BRW_ARF_ADDRESS, 0);
1041 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1042 brw_MOV(p, addr_reg, src0);
1043 brw_set_saturate(p, 0);
1044 }
1045
1046 static void emit_sub(struct brw_wm_compile *c,
1047 const struct prog_instruction *inst)
1048 {
1049 struct brw_compile *p = &c->func;
1050 struct brw_reg src0, src1, dst;
1051 GLuint mask = inst->DstReg.WriteMask;
1052 int i;
1053 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1054 for (i = 0 ; i < 4; i++) {
1055 if (mask & (1<<i)) {
1056 dst = get_dst_reg(c, inst, i);
1057 src0 = get_src_reg(c, inst, 0, i);
1058 src1 = get_src_reg_imm(c, inst, 1, i);
1059 brw_ADD(p, dst, src0, negate(src1));
1060 }
1061 }
1062 brw_set_saturate(p, 0);
1063 }
1064
1065 static void emit_mul(struct brw_wm_compile *c,
1066 const struct prog_instruction *inst)
1067 {
1068 struct brw_compile *p = &c->func;
1069 struct brw_reg src0, src1, dst;
1070 GLuint mask = inst->DstReg.WriteMask;
1071 int i;
1072 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1073 for (i = 0 ; i < 4; i++) {
1074 if (mask & (1<<i)) {
1075 dst = get_dst_reg(c, inst, i);
1076 src0 = get_src_reg(c, inst, 0, i);
1077 src1 = get_src_reg_imm(c, inst, 1, i);
1078 brw_MUL(p, dst, src0, src1);
1079 }
1080 }
1081 brw_set_saturate(p, 0);
1082 }
1083
1084 static void emit_frc(struct brw_wm_compile *c,
1085 const struct prog_instruction *inst)
1086 {
1087 struct brw_compile *p = &c->func;
1088 struct brw_reg src0, dst;
1089 GLuint mask = inst->DstReg.WriteMask;
1090 int i;
1091 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1092 for (i = 0 ; i < 4; i++) {
1093 if (mask & (1<<i)) {
1094 dst = get_dst_reg(c, inst, i);
1095 src0 = get_src_reg_imm(c, inst, 0, i);
1096 brw_FRC(p, dst, src0);
1097 }
1098 }
1099 if (inst->SaturateMode != SATURATE_OFF)
1100 brw_set_saturate(p, 0);
1101 }
1102
1103 static void emit_flr(struct brw_wm_compile *c,
1104 const struct prog_instruction *inst)
1105 {
1106 struct brw_compile *p = &c->func;
1107 struct brw_reg src0, dst;
1108 GLuint mask = inst->DstReg.WriteMask;
1109 int i;
1110 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1111 for (i = 0 ; i < 4; i++) {
1112 if (mask & (1<<i)) {
1113 dst = get_dst_reg(c, inst, i);
1114 src0 = get_src_reg_imm(c, inst, 0, i);
1115 brw_RNDD(p, dst, src0);
1116 }
1117 }
1118 brw_set_saturate(p, 0);
1119 }
1120
1121
1122 static void emit_min_max(struct brw_wm_compile *c,
1123 const struct prog_instruction *inst)
1124 {
1125 struct brw_compile *p = &c->func;
1126 const GLuint mask = inst->DstReg.WriteMask;
1127 const int mark = mark_tmps(c);
1128 int i;
1129 brw_push_insn_state(p);
1130 for (i = 0; i < 4; i++) {
1131 if (mask & (1<<i)) {
1132 struct brw_reg real_dst = get_dst_reg(c, inst, i);
1133 struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1134 struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1135 struct brw_reg dst;
1136 /* if dst==src0 or dst==src1 we need to use a temp reg */
1137 GLboolean use_temp = brw_same_reg(dst, src0) ||
1138 brw_same_reg(dst, src1);
1139 if (use_temp)
1140 dst = alloc_tmp(c);
1141 else
1142 dst = real_dst;
1143
1144 /*
1145 printf(" Min/max: dst %d src0 %d src1 %d\n",
1146 dst.nr, src0.nr, src1.nr);
1147 */
1148 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1149 brw_MOV(p, dst, src0);
1150 brw_set_saturate(p, 0);
1151
1152 if (inst->Opcode == OPCODE_MIN)
1153 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1154 else
1155 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1156
1157 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1158 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1159 brw_MOV(p, dst, src1);
1160 brw_set_saturate(p, 0);
1161 brw_set_predicate_control_flag_value(p, 0xff);
1162 if (use_temp)
1163 brw_MOV(p, real_dst, dst);
1164 }
1165 }
1166 brw_pop_insn_state(p);
1167 release_tmps(c, mark);
1168 }
1169
1170 static void emit_pow(struct brw_wm_compile *c,
1171 const struct prog_instruction *inst)
1172 {
1173 struct brw_compile *p = &c->func;
1174 struct brw_reg dst, src0, src1;
1175 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1176 src0 = get_src_reg_imm(c, inst, 0, 0);
1177 src1 = get_src_reg_imm(c, inst, 1, 0);
1178
1179 brw_MOV(p, brw_message_reg(2), src0);
1180 brw_MOV(p, brw_message_reg(3), src1);
1181
1182 brw_math(p,
1183 dst,
1184 BRW_MATH_FUNCTION_POW,
1185 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1186 2,
1187 brw_null_reg(),
1188 BRW_MATH_DATA_VECTOR,
1189 BRW_MATH_PRECISION_FULL);
1190 }
1191
1192 static void emit_lrp(struct brw_wm_compile *c,
1193 const struct prog_instruction *inst)
1194 {
1195 struct brw_compile *p = &c->func;
1196 GLuint mask = inst->DstReg.WriteMask;
1197 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1198 int i;
1199 int mark = mark_tmps(c);
1200 for (i = 0; i < 4; i++) {
1201 if (mask & (1<<i)) {
1202 dst = get_dst_reg(c, inst, i);
1203 src0 = get_src_reg(c, inst, 0, i);
1204
1205 src1 = get_src_reg_imm(c, inst, 1, i);
1206
1207 if (src1.nr == dst.nr) {
1208 tmp1 = alloc_tmp(c);
1209 brw_MOV(p, tmp1, src1);
1210 } else
1211 tmp1 = src1;
1212
1213 src2 = get_src_reg(c, inst, 2, i);
1214 if (src2.nr == dst.nr) {
1215 tmp2 = alloc_tmp(c);
1216 brw_MOV(p, tmp2, src2);
1217 } else
1218 tmp2 = src2;
1219
1220 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1221 brw_MUL(p, brw_null_reg(), dst, tmp2);
1222 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1223 brw_MAC(p, dst, src0, tmp1);
1224 brw_set_saturate(p, 0);
1225 }
1226 release_tmps(c, mark);
1227 }
1228 }
1229
1230 /**
1231 * For GLSL shaders, this KIL will be unconditional.
1232 * It may be contained inside an IF/ENDIF structure of course.
1233 */
1234 static void emit_kil(struct brw_wm_compile *c)
1235 {
1236 struct brw_compile *p = &c->func;
1237 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1238 brw_push_insn_state(p);
1239 brw_set_mask_control(p, BRW_MASK_DISABLE);
1240 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1241 brw_AND(p, depth, c->emit_mask_reg, depth);
1242 brw_pop_insn_state(p);
1243 }
1244
1245 static void emit_mad(struct brw_wm_compile *c,
1246 const struct prog_instruction *inst)
1247 {
1248 struct brw_compile *p = &c->func;
1249 GLuint mask = inst->DstReg.WriteMask;
1250 struct brw_reg dst, src0, src1, src2;
1251 int i;
1252
1253 for (i = 0; i < 4; i++) {
1254 if (mask & (1<<i)) {
1255 dst = get_dst_reg(c, inst, i);
1256 src0 = get_src_reg(c, inst, 0, i);
1257 src1 = get_src_reg_imm(c, inst, 1, i);
1258 src2 = get_src_reg_imm(c, inst, 2, i);
1259 brw_MUL(p, dst, src0, src1);
1260
1261 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1262 brw_ADD(p, dst, dst, src2);
1263 brw_set_saturate(p, 0);
1264 }
1265 }
1266 }
1267
1268 static void emit_sop(struct brw_wm_compile *c,
1269 const struct prog_instruction *inst, GLuint cond)
1270 {
1271 struct brw_compile *p = &c->func;
1272 GLuint mask = inst->DstReg.WriteMask;
1273 struct brw_reg dst, src0, src1;
1274 int i;
1275
1276 for (i = 0; i < 4; i++) {
1277 if (mask & (1<<i)) {
1278 dst = get_dst_reg(c, inst, i);
1279 src0 = get_src_reg(c, inst, 0, i);
1280 src1 = get_src_reg_imm(c, inst, 1, i);
1281 brw_push_insn_state(p);
1282 brw_CMP(p, brw_null_reg(), cond, src0, src1);
1283 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1284 brw_MOV(p, dst, brw_imm_f(0.0));
1285 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1286 brw_MOV(p, dst, brw_imm_f(1.0));
1287 brw_pop_insn_state(p);
1288 }
1289 }
1290 }
1291
1292 static void emit_slt(struct brw_wm_compile *c,
1293 const struct prog_instruction *inst)
1294 {
1295 emit_sop(c, inst, BRW_CONDITIONAL_L);
1296 }
1297
1298 static void emit_sle(struct brw_wm_compile *c,
1299 const struct prog_instruction *inst)
1300 {
1301 emit_sop(c, inst, BRW_CONDITIONAL_LE);
1302 }
1303
1304 static void emit_sgt(struct brw_wm_compile *c,
1305 const struct prog_instruction *inst)
1306 {
1307 emit_sop(c, inst, BRW_CONDITIONAL_G);
1308 }
1309
1310 static void emit_sge(struct brw_wm_compile *c,
1311 const struct prog_instruction *inst)
1312 {
1313 emit_sop(c, inst, BRW_CONDITIONAL_GE);
1314 }
1315
1316 static void emit_seq(struct brw_wm_compile *c,
1317 const struct prog_instruction *inst)
1318 {
1319 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1320 }
1321
1322 static void emit_sne(struct brw_wm_compile *c,
1323 const struct prog_instruction *inst)
1324 {
1325 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1326 }
1327
1328 static void emit_ddx(struct brw_wm_compile *c,
1329 const struct prog_instruction *inst)
1330 {
1331 struct brw_compile *p = &c->func;
1332 GLuint mask = inst->DstReg.WriteMask;
1333 struct brw_reg interp[4];
1334 struct brw_reg dst;
1335 struct brw_reg src0, w;
1336 GLuint nr, i;
1337 src0 = get_src_reg(c, inst, 0, 0);
1338 w = get_src_reg(c, inst, 1, 3);
1339 nr = src0.nr;
1340 interp[0] = brw_vec1_grf(nr, 0);
1341 interp[1] = brw_vec1_grf(nr, 4);
1342 interp[2] = brw_vec1_grf(nr+1, 0);
1343 interp[3] = brw_vec1_grf(nr+1, 4);
1344 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1345 for(i = 0; i < 4; i++ ) {
1346 if (mask & (1<<i)) {
1347 dst = get_dst_reg(c, inst, i);
1348 brw_MOV(p, dst, interp[i]);
1349 brw_MUL(p, dst, dst, w);
1350 }
1351 }
1352 brw_set_saturate(p, 0);
1353 }
1354
1355 static void emit_ddy(struct brw_wm_compile *c,
1356 const struct prog_instruction *inst)
1357 {
1358 struct brw_compile *p = &c->func;
1359 GLuint mask = inst->DstReg.WriteMask;
1360 struct brw_reg interp[4];
1361 struct brw_reg dst;
1362 struct brw_reg src0, w;
1363 GLuint nr, i;
1364
1365 src0 = get_src_reg(c, inst, 0, 0);
1366 nr = src0.nr;
1367 w = get_src_reg(c, inst, 1, 3);
1368 interp[0] = brw_vec1_grf(nr, 0);
1369 interp[1] = brw_vec1_grf(nr, 4);
1370 interp[2] = brw_vec1_grf(nr+1, 0);
1371 interp[3] = brw_vec1_grf(nr+1, 4);
1372 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1373 for(i = 0; i < 4; i++ ) {
1374 if (mask & (1<<i)) {
1375 dst = get_dst_reg(c, inst, i);
1376 brw_MOV(p, dst, suboffset(interp[i], 1));
1377 brw_MUL(p, dst, dst, w);
1378 }
1379 }
1380 brw_set_saturate(p, 0);
1381 }
1382
1383 static INLINE struct brw_reg high_words( struct brw_reg reg )
1384 {
1385 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1386 0, 8, 2 );
1387 }
1388
1389 static INLINE struct brw_reg low_words( struct brw_reg reg )
1390 {
1391 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1392 }
1393
1394 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1395 {
1396 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1397 }
1398
1399 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1400 {
1401 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1402 0, 16, 2 );
1403 }
1404
/* One-, two- and three-dimensional Perlin noise, similar to the description
   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */

/**
 * Emit the body of the one-dimensional noise subroutine.
 *
 * The argument is read from the temporary found at lookup_tmp(c, mark - 2)
 * and the result is written back to that same temporary.  All other
 * temporaries allocated here are released before returning.
 * NOTE(review): the "- 2" offset presumably accounts for a temporary
 * allocated by invoke_subroutine() between the caller's parameter and
 * this function's mark - confirm.
 */
static void noise1_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param,
        x0, x1, /* gradients at each end */
        t, tmp[ 2 ], /* float temporaries */
        itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0 = alloc_tmp( c );
    x1 = alloc_tmp( c );
    t = alloc_tmp( c );
    tmp[ 0 ] = alloc_tmp( c );
    tmp[ 1 ] = alloc_tmp( c );
    /* itmp[] are the same registers as above, viewed as unsigned dwords;
       itmp[ 2..4 ] hold the hash multipliers until x0/x1/t are needed as
       floats. */
    itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
    itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
    itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
    itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
    itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );

    /* The caller's argument register; also receives the result. */
    param = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

    /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
       be hashed. Also compute the remainder (offset within the unit
       length), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
    brw_FRC( p, param, param );
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
    brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

    /* We're now ready to perform the hashing. The two hashes are
       interleaved for performance. The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free). We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    /* Three rounds of: multiply by a constant, then fold the high word
       into the low word. */
    for( i = 0; i < 2; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );

    /* Now we want to initialise the two gradients based on the
       hashes. Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 31 ), but
       we correct for that right at the end. */
    /* From here on x0, x1 and t are used as floats; the multiplier
       constants they held are no longer needed. */
    brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
    brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

    brw_MUL( p, x0, x0, param );
    brw_MUL( p, x1, x1, t );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    /* Evaluated in Horner form: ((6t - 15)t + 10) * t * t * t. */
    brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
                                           pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_MUL( p, param, tmp[ 0 ], param );
    /* x0 + (x1 - x0) * weight */
    brw_MUL( p, x1, x1, param );
    brw_ADD( p, x0, x0, x1 );
    /* scale by pow( 2, -30 ), to compensate for the format conversion
       above and an extra factor of 2 so that a single gradient covers
       the [-1,1] range */
    brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

    release_tmps( c, mark );
}
1497
1498 static void emit_noise1( struct brw_wm_compile *c,
1499 const struct prog_instruction *inst )
1500 {
1501 struct brw_compile *p = &c->func;
1502 struct brw_reg src, param, dst;
1503 GLuint mask = inst->DstReg.WriteMask;
1504 int i;
1505 int mark = mark_tmps( c );
1506
1507 assert( mark == 0 );
1508
1509 src = get_src_reg( c, inst, 0, 0 );
1510
1511 param = alloc_tmp( c );
1512
1513 brw_MOV( p, param, src );
1514
1515 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1516
1517 /* Fill in the result: */
1518 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1519 for (i = 0 ; i < 4; i++) {
1520 if (mask & (1<<i)) {
1521 dst = get_dst_reg(c, inst, i);
1522 brw_MOV( p, dst, param );
1523 }
1524 }
1525 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1526 brw_set_saturate( p, 0 );
1527
1528 release_tmps( c, mark );
1529 }
1530
/**
 * Emit the body of the two-dimensional noise subroutine.
 *
 * The two arguments are read from the temporaries found at
 * lookup_tmp(c, mark - 3) and lookup_tmp(c, mark - 2); the result is
 * written to the first of them (param0).  All other temporaries
 * allocated here are released before returning.
 */
static void noise2_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1,
        x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
        t, tmp[ 4 ], /* float temporaries */
        itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    t = alloc_tmp( c );
    for( i = 0; i < 4; i++ ) {
        tmp[ i ] = alloc_tmp( c );
        itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
    }
    /* itmp[ 4..6 ] alias three of the gradient registers; they hold the
       hash multipliers until the gradients are initialised. */
    itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
    itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
    itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );

    /* The caller's argument registers; param0 also receives the result. */
    param0 = lookup_tmp( c, mark - 3 );
    param1 = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
       be hashed. Also compute the remainders (offsets within the unit
       square), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
    /* Combine both integer coordinates into itmp[ 0 ]: the low word of
       itmp[ 1 ] is added into the high word of itmp[ 0 ]. */
    brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
             low_words( itmp[ 1 ] ) );
    brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
    /* Derive the other three corners by incrementing the high word, the
       low word, or both. */
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
    brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
    brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

    /* We're now ready to perform the hashing. The four hashes are
       interleaved for performance. The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free). We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    /* Three rounds of: multiply by a constant, then fold the high word
       into the low word. */
    for( i = 0; i < 4; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );

    /* Now we want to initialise the four gradients based on the
       hashes. Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    /* The low word of each hash supplies the x gradient component... */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );

    /* ...and the high word supplies the y component (converted in place). */
    brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );

    /* x contribution; t is (param0 - 1) here and becomes (param1 - 1)
       for the y contribution below. */
    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
    brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
    brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    /* Evaluated in Horner form for both axes at once; param0 and param1
       end up holding the interpolation weights. */
    brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
                                                 pipeline */
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
                                                 pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_MUL( p, param0, tmp[ 0 ], param0 );
    brw_MUL( p, param1, tmp[ 1 ], param1 );

    /* Here we interpolate in the y dimension... */
    brw_MUL( p, x0y1, x0y1, param1 );
    brw_MUL( p, x1y1, x1y1, param1 );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x. There are horrible register dependencies here,
       but we have nothing else to do. */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, param0 );
    brw_ADD( p, x0y0, x0y0, x1y0 );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
1667
1668 static void emit_noise2( struct brw_wm_compile *c,
1669 const struct prog_instruction *inst )
1670 {
1671 struct brw_compile *p = &c->func;
1672 struct brw_reg src0, src1, param0, param1, dst;
1673 GLuint mask = inst->DstReg.WriteMask;
1674 int i;
1675 int mark = mark_tmps( c );
1676
1677 assert( mark == 0 );
1678
1679 src0 = get_src_reg( c, inst, 0, 0 );
1680 src1 = get_src_reg( c, inst, 0, 1 );
1681
1682 param0 = alloc_tmp( c );
1683 param1 = alloc_tmp( c );
1684
1685 brw_MOV( p, param0, src0 );
1686 brw_MOV( p, param1, src1 );
1687
1688 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1689
1690 /* Fill in the result: */
1691 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1692 for (i = 0 ; i < 4; i++) {
1693 if (mask & (1<<i)) {
1694 dst = get_dst_reg(c, inst, i);
1695 brw_MOV( p, dst, param0 );
1696 }
1697 }
1698 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1699 brw_set_saturate( p, 0 );
1700
1701 release_tmps( c, mark );
1702 }
1703
/**
 * The three-dimensional case is much like the one- and two- versions above,
 * but since the number of corners is rapidly growing we now pack 16 16-bit
 * hashes into each register to extract more parallelism from the EUs.
 *
 * The three arguments are read from the temporaries found at
 * lookup_tmp(c, mark - 4) .. lookup_tmp(c, mark - 2); the result is
 * written to the first of them (param0).  All other temporaries allocated
 * here are released before returning.
 */
static void noise3_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1, param2,
        x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
        xi, yi, zi, /* interpolation coefficients */
        t, tmp[ 8 ], /* float temporaries */
        itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
        wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    xi = alloc_tmp( c );
    yi = alloc_tmp( c );
    zi = alloc_tmp( c );
    t = alloc_tmp( c );
    /* Each tmp[] register gets two further views: as unsigned dwords
       (itmp) and as sixteen unsigned words (wtmp). */
    for( i = 0; i < 8; i++ ) {
        tmp[ i ] = alloc_tmp( c );
        itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
        wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
    }

    /* The caller's argument registers; param0 also receives the result. */
    param0 = lookup_tmp( c, mark - 4 );
    param1 = lookup_tmp( c, mark - 3 );
    param2 = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
       be hashed. Also compute the remainders (offsets within the unit
       cube), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_FRC( p, param2, param2 );
    /* Since we now have only 16 bits of precision in the hash, we must
       be more careful about thorough mixing to maintain entropy as we
       squash the input vector into a small scalar. */
    /* NOTE(review): the MUL/MAC chain with a null destination presumably
       accumulates the three products before the last MAC writes the sum -
       confirm against the EU documentation. */
    brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
    brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
    brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
             brw_imm_uw( 0x9B93 ) );
    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
             brw_imm_uw( 0xBC8F ) );

    /* Temporarily disable the execution mask while we work with ExecSize=16
       channels (the mask is set for ExecSize=8 and is probably incorrect).
       Although this might cause execution of unwanted channels, the code
       writes only to temporary registers and has no side effects, so
       disabling the mask is harmless. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

    /* We're now ready to perform the hashing. The eight hashes are
       interleaved for performance. The hash function used is
       designed to rapidly achieve avalanche and require only 16x16
       bit multiplication, and 8-bit swizzles (which we get for
       free). */
    /* Two rounds of: multiply by a constant, then fold the odd byte of
       each word into the even byte. */
    for( i = 0; i < 4; i++ )
        brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
                 odd_bytes( wtmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
                 odd_bytes( wtmp[ i ] ) );
    brw_pop_insn_state( p );

    /* Now we want to initialise the four rear gradients based on the
       hashes. Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    /* x component */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

    /* Shift the hashes left by 5 bits so the next component is derived
       from different bits of the same hash. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    /* t is (param0 - 1) for these two multiplies, then becomes
       (param1 - 1) for the y component. */
    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    /* After these multiplies t is reset to (param0 - 1), ready for the
       x component of the front face further down. */
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    /* The rear face uses param2 directly as its z offset. */
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    /* Evaluated in Horner form for all three axes at once; the weights
       end up in xi, yi and zi and are reused for the front face. */
    brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
    brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
    brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
    brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
    brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
    brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
    brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );

    /* Here we interpolate in the y dimension... */
    brw_MUL( p, x0y1, x0y1, yi );
    brw_MUL( p, x1y1, x1y1, yi );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, xi );
    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

    /* Now do the same thing for the front four gradients... */
    /* x component */
    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    /* t is still (param0 - 1) from the rear-face pass. */
    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    /* After these multiplies t becomes (param2 - 1): the front face's
       z offset. */
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* The interpolation coefficients are still around from last time, so
       again interpolate in the y dimension... */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, yi );
    brw_MUL( p, x1y1, x1y1, yi );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
       time put the front face in tmp[ 1 ] and we're nearly there... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, xi );
    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

    /* The final interpolation, in the z dimension: */
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
1970
1971 static void emit_noise3( struct brw_wm_compile *c,
1972 const struct prog_instruction *inst )
1973 {
1974 struct brw_compile *p = &c->func;
1975 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1976 GLuint mask = inst->DstReg.WriteMask;
1977 int i;
1978 int mark = mark_tmps( c );
1979
1980 assert( mark == 0 );
1981
1982 src0 = get_src_reg( c, inst, 0, 0 );
1983 src1 = get_src_reg( c, inst, 0, 1 );
1984 src2 = get_src_reg( c, inst, 0, 2 );
1985
1986 param0 = alloc_tmp( c );
1987 param1 = alloc_tmp( c );
1988 param2 = alloc_tmp( c );
1989
1990 brw_MOV( p, param0, src0 );
1991 brw_MOV( p, param1, src1 );
1992 brw_MOV( p, param2, src2 );
1993
1994 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1995
1996 /* Fill in the result: */
1997 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1998 for (i = 0 ; i < 4; i++) {
1999 if (mask & (1<<i)) {
2000 dst = get_dst_reg(c, inst, i);
2001 brw_MOV( p, dst, param0 );
2002 }
2003 }
2004 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2005 brw_set_saturate( p, 0 );
2006
2007 release_tmps( c, mark );
2008 }
2009
2010 /**
2011 * For the four-dimensional case, the little micro-optimisation benefits
2012 * we obtain by unrolling all the loops aren't worth the massive bloat it
2013 * now causes. Instead, we loop twice around performing a similar operation
2014 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
2015 * code to glue it all together.
 *
 * Inputs: the four coordinates are read from the temporaries at slots
 * mark-5 .. mark-2 (see the lookup_tmp() calls below); emit_noise4()
 * allocates four temporaries immediately before invoking this subroutine.
 * NOTE(review): slot mark-1 is presumably claimed by invoke_subroutine()
 * itself -- confirm against its definition.
 *
 * Output: the final noise value, scaled by 2^-15, is written back to the
 * first of those temporaries (param[ 0 ]).
 *
 * All temporaries allocated here are released again before returning.
2016 */
2017 static void noise4_sub( struct brw_wm_compile *c )
2018 {
2019 struct brw_compile *p = &c->func;
2020 struct brw_reg param[ 4 ],
2021 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
2022 w0, /* noise for the w=0 cube */
2023 floors[ 2 ], /* integer coordinates of base corner of hypercube */
2024 interp[ 4 ], /* interpolation coefficients */
2025 t, tmp[ 8 ], /* float temporaries */
2026 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
2027 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
2028 int i, j;
2029 int mark = mark_tmps( c );
/* loop: instruction index of the top of the twice-executed body;
   origin: instruction index of the conditional exit branch, whose
   offset is patched once the exit address is known. */
2030 GLuint loop, origin;
2031
2032 x0y0 = alloc_tmp( c );
2033 x0y1 = alloc_tmp( c );
2034 x1y0 = alloc_tmp( c );
2035 x1y1 = alloc_tmp( c );
2036 t = alloc_tmp( c );
2037 w0 = alloc_tmp( c );
2038 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2039 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2040
/* The parameters are looked up (not allocated): they already exist
   below this subroutine's temp mark. */
2041 for( i = 0; i < 4; i++ ) {
2042 param[ i ] = lookup_tmp( c, mark - 5 + i );
2043 interp[ i ] = alloc_tmp( c );
2044 }
2045
/* tmp[ i ], itmp[ i ] and wtmp[ i ] all refer to the same register
   number; only the type/width of the view differs. */
2046 for( i = 0; i < 8; i++ ) {
2047 tmp[ i ] = alloc_tmp( c );
2048 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
2049 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
2050 }
2051
2052 brw_set_access_mode( p, BRW_ALIGN_1 );
2053
2054 /* We only want 16 bits of precision from the integral part of each
2055 co-ordinate, but unfortunately the RNDD semantics would saturate
2056 at 16 bits if we performed the operation directly to a 16-bit
2057 destination. Therefore, we round to 32-bit temporaries where
2058 appropriate, and then store only the lower 16 bits. */
/* After this, floors[ 0 ] packs floor(x) (low words) and floor(y) (high
   words); floors[ 1 ] packs floor(z) and floor(w) the same way. */
2059 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
2060 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
2061 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
2062 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
2063 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
2064 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
2065
2066 /* Modify the flag register here, because the side effect is useful
2067 later (see below). We know for certain that all flags will be
2068 cleared, since the FRC instruction cannot possibly generate
2069 negative results. Even for exceptional inputs (infinities, denormals,
2070 NaNs), the architecture guarantees that the L conditional is false. */
2071 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
2072 brw_FRC( p, param[ 0 ], param[ 0 ] );
/* Make sure the remaining instructions are emitted unpredicated. */
2073 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2074 for( i = 1; i < 4; i++ )
2075 brw_FRC( p, param[ i ], param[ i ] );
2076
2077 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
2078 of all. */
/* Evaluated in Horner form: ((6t - 15)t + 10) * t * t * t. */
2079 for( i = 0; i < 4; i++ )
2080 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
2081 for( i = 0; i < 4; i++ )
2082 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
2083 for( i = 0; i < 4; i++ )
2084 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2085 for( i = 0; i < 4; i++ )
2086 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
2087 for( j = 0; j < 3; j++ )
2088 for( i = 0; i < 4; i++ )
2089 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2090
2091 /* Mark the current address, as it will be a jump destination. The
2092 following code will be executed twice: first, with the flag
2093 register clear indicating the w=0 case, and second with flags
2094 set for w=1. */
2095 loop = p->nr_insn;
2096
2097 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
2098 be hashed. Since we have only 16 bits of precision in the hash, we
2099 must be careful about thorough mixing to maintain entropy as we
2100 squash the input vector into a small scalar. */
2101 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
2102 brw_imm_uw( 0xBC8F ) );
2103 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
2104 brw_imm_uw( 0xD0BD ) );
2105 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
2106 brw_imm_uw( 0x9B93 ) );
2107 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
2108 brw_imm_uw( 0xA359 ) );
2109 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
2110 brw_imm_uw( 0xBC8F ) );
2111
2112 /* Temporarily disable the execution mask while we work with ExecSize=16
2113 channels (the mask is set for ExecSize=8 and is probably incorrect).
2114 Although this might cause execution of unwanted channels, the code
2115 writes only to temporary registers and has no side effects, so
2116 disabling the mask is harmless. */
2117 brw_push_insn_state( p );
2118 brw_set_mask_control( p, BRW_MASK_DISABLE );
2119 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2120 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2121 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2122
2123 /* We're now ready to perform the hashing. The eight hashes are
2124 interleaved for performance. The hash function used is
2125 designed to rapidly achieve avalanche and require only 16x16
2126 bit multiplication, and 8-bit swizzles (which we get for
2127 free). */
2128 for( i = 0; i < 4; i++ )
2129 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2130 for( i = 0; i < 4; i++ )
2131 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2132 odd_bytes( wtmp[ i ] ) );
2133 for( i = 0; i < 4; i++ )
2134 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2135 for( i = 0; i < 4; i++ )
2136 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2137 odd_bytes( wtmp[ i ] ) );
2138 brw_pop_insn_state( p );
2139
2140 /* Now we want to initialise the four rear gradients based on the
2141 hashes. Format conversion from signed integer to float leaves
2142 everything scaled too high by a factor of pow( 2, 15 ), but
2143 we correct for that right at the end. */
/* Throughout the gradient code below, t holds (coordinate - 1) for the
   far corner along the axis being processed; it is always prepared one
   step ahead of its use. */
2144 /* x component */
2145 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2146 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2147 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2148 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2149 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2150
/* Shift the hash words left by 4 so the next gradient component is
   taken from different hash bits (presumably 4 bits per component --
   confirm). */
2151 brw_push_insn_state( p );
2152 brw_set_mask_control( p, BRW_MASK_DISABLE );
2153 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2154 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2155 brw_pop_insn_state( p );
2156
2157 brw_MUL( p, x1y0, x1y0, t );
2158 brw_MUL( p, x1y1, x1y1, t );
2159 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2160 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2161 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2162
2163 /* y component */
2164 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2165 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2166 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2167 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2168
2169 brw_push_insn_state( p );
2170 brw_set_mask_control( p, BRW_MASK_DISABLE );
2171 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2172 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2173 brw_pop_insn_state( p );
2174
2175 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2176 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2177 /* prepare t for the w component (used below): w the first time through
2178 the loop; w - 1 the second time) */
/* The ADD is predicated on the flag register; the MOV is emitted with
   the predicate inverted, so exactly one of the two writes t. */
2179 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2180 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2181 p->current->header.predicate_inverse = 1;
2182 brw_MOV( p, t, param[ 3 ] );
2183 p->current->header.predicate_inverse = 0;
2184 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2185 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2186 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2187
2188 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2189 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2190 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2191 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2192
2193 /* z component */
2194 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2195 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2196 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2197 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2198
2199 brw_push_insn_state( p );
2200 brw_set_mask_control( p, BRW_MASK_DISABLE );
2201 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2202 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2203 brw_pop_insn_state( p );
2204
2205 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2206 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2207 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2208 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2209
2210 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2211 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2212 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2213 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2214
2215 /* w component */
2216 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2217 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2218 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2219 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2220
2221 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2222 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2223 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2224 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
/* Re-arm t with (x - 1) for the front-face x component further down. */
2225 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2226
2227 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2228 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2229 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2230 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2231
2232 /* Here we interpolate in the y dimension... */
2233 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2234 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2235 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2236 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2237 brw_ADD( p, x0y0, x0y0, x0y1 );
2238 brw_ADD( p, x1y0, x1y0, x1y1 );
2239
2240 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2241 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2242 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2243 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2244
2245 /* Now do the same thing for the front four gradients... */
2246 /* x component */
2247 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2248 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2249 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2250 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2251
2252 brw_push_insn_state( p );
2253 brw_set_mask_control( p, BRW_MASK_DISABLE );
2254 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2255 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2256 brw_pop_insn_state( p );
2257
2258 brw_MUL( p, x1y0, x1y0, t );
2259 brw_MUL( p, x1y1, x1y1, t );
2260 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2261 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2262 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2263
2264 /* y component */
2265 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2266 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2267 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2268 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2269
2270 brw_push_insn_state( p );
2271 brw_set_mask_control( p, BRW_MASK_DISABLE );
2272 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2273 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2274 brw_pop_insn_state( p );
2275
2276 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2277 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
/* Unlike the rear face, the front face sits at z + 1, so every z
   gradient term below is multiplied by t = (z - 1). */
2278 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2279 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2280 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2281
2282 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2283 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2284 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2285 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2286
2287 /* z component */
2288 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2289 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2290 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2291 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2292
2293 brw_push_insn_state( p );
2294 brw_set_mask_control( p, BRW_MASK_DISABLE );
2295 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2296 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2297 brw_pop_insn_state( p );
2298
2299 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2300 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2301 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2302 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2303 /* prepare t for the w component (used below): w the first time through
2304 the loop; w - 1 the second time) */
2305 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2306 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2307 p->current->header.predicate_inverse = 1;
2308 brw_MOV( p, t, param[ 3 ] );
2309 p->current->header.predicate_inverse = 0;
2310 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2311
2312 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2313 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2314 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2315 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2316
2317 /* w component */
2318 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2319 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2320 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2321 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2322
2323 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2324 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2325 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2326 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2327
2328 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2329 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2330 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2331 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2332
2333 /* Interpolate in the y dimension: */
2334 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2335 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2336 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2337 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2338 brw_ADD( p, x0y0, x0y0, x0y1 );
2339 brw_ADD( p, x1y0, x1y0, x1y1 );
2340
2341 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2342 time put the front face in tmp[ 1 ] and we're nearly there... */
2343 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2344 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2345 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2346
2347 /* Another interpolation, in the z dimension: */
2348 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2349 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2350 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2351
2352 /* Exit the loop if we've computed both cubes... */
/* This predicated add to the instruction pointer is emitted with a zero
   offset; the real offset is filled in by brw_set_src1() further down. */
2353 origin = p->nr_insn;
2354 brw_push_insn_state( p );
2355 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2356 brw_set_mask_control( p, BRW_MASK_DISABLE );
2357 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2358 brw_pop_insn_state( p );
2359
2360 /* Save the result for the w=0 case, and increment the w coordinate: */
2361 brw_MOV( p, w0, tmp[ 0 ] );
2362 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2363 brw_imm_uw( 1 ) );
2364
2365 /* Loop around for the other cube. Explicitly set the flag register
2366 (unfortunately we must spend an extra instruction to do this: we
2367 can't rely on a side effect of the previous MOV or ADD because
2368 conditional modifiers which are normally true might be false in
2369 exceptional circumstances, e.g. given a NaN input; the add to
2370 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
/* Jump distances are instruction-count differences shifted left by 4
   (presumably 16 bytes per instruction -- confirm). */
2371 brw_push_insn_state( p );
2372 brw_set_mask_control( p, BRW_MASK_DISABLE );
2373 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2374 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2375 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2376 brw_pop_insn_state( p );
2377
2378 /* Patch the previous conditional branch now that we know the
2379 destination address. */
2380 brw_set_src1( p->store + origin,
2381 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2382
2383 /* The very last interpolation. */
/* At this point tmp[ 0 ] holds the w=1 cube result and w0 the w=0 one. */
2384 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2385 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2386 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2387
2388 /* scale by pow( 2, -15 ), as described above */
2389 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2390
2391 release_tmps( c, mark );
2392 }
2393
2394 static void emit_noise4( struct brw_wm_compile *c,
2395 const struct prog_instruction *inst )
2396 {
2397 struct brw_compile *p = &c->func;
2398 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2399 GLuint mask = inst->DstReg.WriteMask;
2400 int i;
2401 int mark = mark_tmps( c );
2402
2403 assert( mark == 0 );
2404
2405 src0 = get_src_reg( c, inst, 0, 0 );
2406 src1 = get_src_reg( c, inst, 0, 1 );
2407 src2 = get_src_reg( c, inst, 0, 2 );
2408 src3 = get_src_reg( c, inst, 0, 3 );
2409
2410 param0 = alloc_tmp( c );
2411 param1 = alloc_tmp( c );
2412 param2 = alloc_tmp( c );
2413 param3 = alloc_tmp( c );
2414
2415 brw_MOV( p, param0, src0 );
2416 brw_MOV( p, param1, src1 );
2417 brw_MOV( p, param2, src2 );
2418 brw_MOV( p, param3, src3 );
2419
2420 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2421
2422 /* Fill in the result: */
2423 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2424 for (i = 0 ; i < 4; i++) {
2425 if (mask & (1<<i)) {
2426 dst = get_dst_reg(c, inst, i);
2427 brw_MOV( p, dst, param0 );
2428 }
2429 }
2430 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2431 brw_set_saturate( p, 0 );
2432
2433 release_tmps( c, mark );
2434 }
2435
2436 static void emit_wpos_xy(struct brw_wm_compile *c,
2437 const struct prog_instruction *inst)
2438 {
2439 struct brw_compile *p = &c->func;
2440 GLuint mask = inst->DstReg.WriteMask;
2441 struct brw_reg src0[2], dst[2];
2442
2443 dst[0] = get_dst_reg(c, inst, 0);
2444 dst[1] = get_dst_reg(c, inst, 1);
2445
2446 src0[0] = get_src_reg(c, inst, 0, 0);
2447 src0[1] = get_src_reg(c, inst, 0, 1);
2448
2449 /* Calculate the pixel offset from window bottom left into destination
2450 * X and Y channels.
2451 */
2452 if (mask & WRITEMASK_X) {
2453 /* X' = X - origin_x */
2454 brw_ADD(p,
2455 dst[0],
2456 retype(src0[0], BRW_REGISTER_TYPE_W),
2457 brw_imm_d(0 - c->key.origin_x));
2458 }
2459
2460 if (mask & WRITEMASK_Y) {
2461 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2462 brw_ADD(p,
2463 dst[1],
2464 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2465 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2466 }
2467 }
2468
2469 /* TODO
2470 BIAS on SIMD8 not working yet...
2471 */
2472 static void emit_txb(struct brw_wm_compile *c,
2473 const struct prog_instruction *inst)
2474 {
2475 struct brw_compile *p = &c->func;
2476 struct brw_reg dst[4], src[4], payload_reg;
2477 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2478 GLuint i;
2479
2480 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2481
2482 for (i = 0; i < 4; i++)
2483 dst[i] = get_dst_reg(c, inst, i);
2484 for (i = 0; i < 4; i++)
2485 src[i] = get_src_reg(c, inst, 0, i);
2486
2487 switch (inst->TexSrcTarget) {
2488 case TEXTURE_1D_INDEX:
2489 brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
2490 brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
2491 brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
2492 break;
2493 case TEXTURE_2D_INDEX:
2494 case TEXTURE_RECT_INDEX:
2495 brw_MOV(p, brw_message_reg(2), src[0]);
2496 brw_MOV(p, brw_message_reg(3), src[1]);
2497 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2498 break;
2499 default:
2500 brw_MOV(p, brw_message_reg(2), src[0]);
2501 brw_MOV(p, brw_message_reg(3), src[1]);
2502 brw_MOV(p, brw_message_reg(4), src[2]);
2503 break;
2504 }
2505 brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
2506 brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
2507 brw_SAMPLE(p,
2508 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2509 1, /* msg_reg_nr */
2510 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2511 SURF_INDEX_TEXTURE(unit),
2512 unit, /* sampler */
2513 inst->DstReg.WriteMask, /* writemask */
2514 BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS, /* msg_type */
2515 4, /* response_length */
2516 4, /* msg_length */
2517 0); /* eot */
2518 }
2519
2520
2521 static void emit_tex(struct brw_wm_compile *c,
2522 const struct prog_instruction *inst)
2523 {
2524 struct brw_compile *p = &c->func;
2525 struct brw_reg dst[4], src[4], payload_reg;
2526 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2527 GLuint msg_len;
2528 GLuint i, nr;
2529 GLuint emit;
2530 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2531
2532 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2533
2534 for (i = 0; i < 4; i++)
2535 dst[i] = get_dst_reg(c, inst, i);
2536 for (i = 0; i < 4; i++)
2537 src[i] = get_src_reg(c, inst, 0, i);
2538
2539 switch (inst->TexSrcTarget) {
2540 case TEXTURE_1D_INDEX:
2541 emit = WRITEMASK_X;
2542 nr = 1;
2543 break;
2544 case TEXTURE_2D_INDEX:
2545 case TEXTURE_RECT_INDEX:
2546 emit = WRITEMASK_XY;
2547 nr = 2;
2548 break;
2549 default:
2550 emit = WRITEMASK_XYZ;
2551 nr = 3;
2552 break;
2553 }
2554 msg_len = 1;
2555
2556 /* move/load S, T, R coords */
2557 for (i = 0; i < nr; i++) {
2558 static const GLuint swz[4] = {0,1,2,2};
2559 if (emit & (1<<i))
2560 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2561 else
2562 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2563 msg_len += 1;
2564 }
2565
2566 if (shadow) {
2567 brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2568 brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
2569 }
2570
2571 brw_SAMPLE(p,
2572 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2573 1, /* msg_reg_nr */
2574 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2575 SURF_INDEX_TEXTURE(unit),
2576 unit, /* sampler */
2577 inst->DstReg.WriteMask, /* writemask */
2578 BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE, /* msg_type */
2579 4, /* response_length */
2580 shadow ? 6 : 4, /* msg_length */
2581 0); /* eot */
2582
2583 if (shadow)
2584 brw_MOV(p, dst[3], brw_imm_f(1.0));
2585 }
2586
2587
2588 /**
2589 * Resolve subroutine calls after code emit is done.
2590 */
2591 static void post_wm_emit( struct brw_wm_compile *c )
2592 {
2593 brw_resolve_cals(&c->func);
2594 }
2595
2596 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2597 {
2598 #define MAX_IFSN 32
2599 #define MAX_LOOP_DEPTH 32
2600 struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2601 struct brw_instruction *inst0, *inst1;
2602 int i, if_insn = 0, loop_insn = 0;
2603 struct brw_compile *p = &c->func;
2604 struct brw_indirect stack_index = brw_indirect(0, 0);
2605
2606 c->reg_index = 0;
2607 prealloc_reg(c);
2608 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2609 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2610
2611 for (i = 0; i < c->nr_fp_insns; i++) {
2612 const struct prog_instruction *inst = &c->prog_instructions[i];
2613
2614 #if 0
2615 _mesa_printf("Inst %d: ", i);
2616 _mesa_print_instruction(inst);
2617 #endif
2618
2619 /* fetch any constants that this instruction needs */
2620 if (c->fp->use_const_buffer)
2621 fetch_constants(c, inst);
2622
2623 if (inst->CondUpdate)
2624 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2625 else
2626 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2627
2628 switch (inst->Opcode) {
2629 case WM_PIXELXY:
2630 emit_pixel_xy(c, inst);
2631 break;
2632 case WM_DELTAXY:
2633 emit_delta_xy(c, inst);
2634 break;
2635 case WM_PIXELW:
2636 emit_pixel_w(c, inst);
2637 break;
2638 case WM_LINTERP:
2639 emit_linterp(c, inst);
2640 break;
2641 case WM_PINTERP:
2642 emit_pinterp(c, inst);
2643 break;
2644 case WM_CINTERP:
2645 emit_cinterp(c, inst);
2646 break;
2647 case WM_WPOSXY:
2648 emit_wpos_xy(c, inst);
2649 break;
2650 case WM_FB_WRITE:
2651 emit_fb_write(c, inst);
2652 break;
2653 case WM_FRONTFACING:
2654 emit_frontfacing(c, inst);
2655 break;
2656 case OPCODE_ABS:
2657 emit_abs(c, inst);
2658 break;
2659 case OPCODE_ADD:
2660 emit_add(c, inst);
2661 break;
2662 case OPCODE_ARL:
2663 emit_arl(c, inst);
2664 break;
2665 case OPCODE_SUB:
2666 emit_sub(c, inst);
2667 break;
2668 case OPCODE_FRC:
2669 emit_frc(c, inst);
2670 break;
2671 case OPCODE_FLR:
2672 emit_flr(c, inst);
2673 break;
2674 case OPCODE_LRP:
2675 emit_lrp(c, inst);
2676 break;
2677 case OPCODE_TRUNC:
2678 emit_trunc(c, inst);
2679 break;
2680 case OPCODE_MOV:
2681 case OPCODE_SWZ:
2682 emit_mov(c, inst);
2683 break;
2684 case OPCODE_DP3:
2685 emit_dp3(c, inst);
2686 break;
2687 case OPCODE_DP4:
2688 emit_dp4(c, inst);
2689 break;
2690 case OPCODE_XPD:
2691 emit_xpd(c, inst);
2692 break;
2693 case OPCODE_DPH:
2694 emit_dph(c, inst);
2695 break;
2696 case OPCODE_RCP:
2697 emit_rcp(c, inst);
2698 break;
2699 case OPCODE_RSQ:
2700 emit_rsq(c, inst);
2701 break;
2702 case OPCODE_SIN:
2703 emit_sin(c, inst);
2704 break;
2705 case OPCODE_COS:
2706 emit_cos(c, inst);
2707 break;
2708 case OPCODE_EX2:
2709 emit_ex2(c, inst);
2710 break;
2711 case OPCODE_LG2:
2712 emit_lg2(c, inst);
2713 break;
2714 case OPCODE_MIN:
2715 case OPCODE_MAX:
2716 emit_min_max(c, inst);
2717 break;
2718 case OPCODE_DDX:
2719 emit_ddx(c, inst);
2720 break;
2721 case OPCODE_DDY:
2722 emit_ddy(c, inst);
2723 break;
2724 case OPCODE_SLT:
2725 emit_slt(c, inst);
2726 break;
2727 case OPCODE_SLE:
2728 emit_sle(c, inst);
2729 break;
2730 case OPCODE_SGT:
2731 emit_sgt(c, inst);
2732 break;
2733 case OPCODE_SGE:
2734 emit_sge(c, inst);
2735 break;
2736 case OPCODE_SEQ:
2737 emit_seq(c, inst);
2738 break;
2739 case OPCODE_SNE:
2740 emit_sne(c, inst);
2741 break;
2742 case OPCODE_MUL:
2743 emit_mul(c, inst);
2744 break;
2745 case OPCODE_POW:
2746 emit_pow(c, inst);
2747 break;
2748 case OPCODE_MAD:
2749 emit_mad(c, inst);
2750 break;
2751 case OPCODE_NOISE1:
2752 emit_noise1(c, inst);
2753 break;
2754 case OPCODE_NOISE2:
2755 emit_noise2(c, inst);
2756 break;
2757 case OPCODE_NOISE3:
2758 emit_noise3(c, inst);
2759 break;
2760 case OPCODE_NOISE4:
2761 emit_noise4(c, inst);
2762 break;
2763 case OPCODE_TEX:
2764 emit_tex(c, inst);
2765 break;
2766 case OPCODE_TXB:
2767 emit_txb(c, inst);
2768 break;
2769 case OPCODE_KIL_NV:
2770 emit_kil(c);
2771 break;
2772 case OPCODE_IF:
2773 assert(if_insn < MAX_IFSN);
2774 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2775 break;
2776 case OPCODE_ELSE:
2777 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
2778 break;
2779 case OPCODE_ENDIF:
2780 assert(if_insn > 0);
2781 brw_ENDIF(p, if_inst[--if_insn]);
2782 break;
2783 case OPCODE_BGNSUB:
2784 brw_save_label(p, inst->Comment, p->nr_insn);
2785 break;
2786 case OPCODE_ENDSUB:
2787 /* no-op */
2788 break;
2789 case OPCODE_CAL:
2790 brw_push_insn_state(p);
2791 brw_set_mask_control(p, BRW_MASK_DISABLE);
2792 brw_set_access_mode(p, BRW_ALIGN_1);
2793 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2794 brw_set_access_mode(p, BRW_ALIGN_16);
2795 brw_ADD(p, get_addr_reg(stack_index),
2796 get_addr_reg(stack_index), brw_imm_d(4));
2797 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2798 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2799 brw_pop_insn_state(p);
2800 break;
2801
2802 case OPCODE_RET:
2803 brw_push_insn_state(p);
2804 brw_set_mask_control(p, BRW_MASK_DISABLE);
2805 brw_ADD(p, get_addr_reg(stack_index),
2806 get_addr_reg(stack_index), brw_imm_d(-4));
2807 brw_set_access_mode(p, BRW_ALIGN_1);
2808 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2809 brw_set_access_mode(p, BRW_ALIGN_16);
2810 brw_pop_insn_state(p);
2811
2812 break;
2813 case OPCODE_BGNLOOP:
2814 /* XXX may need to invalidate the current_constant regs */
2815 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2816 break;
2817 case OPCODE_BRK:
2818 brw_BREAK(p);
2819 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2820 break;
2821 case OPCODE_CONT:
2822 brw_CONT(p);
2823 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2824 break;
2825 case OPCODE_ENDLOOP:
2826 loop_insn--;
2827 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2828 /* patch all the BREAK instructions from
2829 last BEGINLOOP */
2830 while (inst0 > loop_inst[loop_insn]) {
2831 inst0--;
2832 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2833 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2834 inst0->bits3.if_else.pop_count = 0;
2835 } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2836 inst0->bits3.if_else.jump_count = inst1 - inst0;
2837 inst0->bits3.if_else.pop_count = 0;
2838 }
2839 }
2840 break;
2841 default:
2842 _mesa_printf("unsupported IR in fragment shader %d\n",
2843 inst->Opcode);
2844 }
2845 if (inst->CondUpdate)
2846 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2847 else
2848 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2849 }
2850 post_wm_emit(c);
2851
2852 if (c->reg_index >= BRW_WM_MAX_GRF) {
2853 _mesa_problem(NULL, "Ran out of registers in brw_wm_emit_glsl()");
2854 /* XXX we need to do some proper error recovery here */
2855 }
2856 }
2857
2858
2859 /**
2860 * Do GPU code generation for shaders that use GLSL features such as
2861 * flow control. Other shaders will be compiled with the
2862 */
2863 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2864 {
2865 if (INTEL_DEBUG & DEBUG_WM) {
2866 _mesa_printf("brw_wm_glsl_emit:\n");
2867 }
2868
2869 /* initial instruction translation/simplification */
2870 brw_wm_pass_fp(c);
2871
2872 /* actual code generation */
2873 brw_wm_emit_glsl(brw, c);
2874
2875 if (INTEL_DEBUG & DEBUG_WM) {
2876 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2877 }
2878
2879 c->prog_data.total_grf = c->reg_index;
2880 c->prog_data.total_scratch = 0;
2881 }