Merge commit 'origin/master' into gallium-0.2
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "brw_context.h"
4 #include "brw_eu.h"
5 #include "brw_wm.h"
6
/* Identifiers of the instruction sequences that can be passed to
   invoke_subroutine(); numbered consecutively from zero. */
enum _subroutine {
    SUB_NOISE1,
    SUB_NOISE2,
    SUB_NOISE3,
    SUB_NOISE4
};
10
11 /* Only guess, need a flag in gl_fragment_program later */
12 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
13 {
14 int i;
15 for (i = 0; i < fp->Base.NumInstructions; i++) {
16 struct prog_instruction *inst = &fp->Base.Instructions[i];
17 switch (inst->Opcode) {
18 case OPCODE_IF:
19 case OPCODE_TRUNC:
20 case OPCODE_ENDIF:
21 case OPCODE_CAL:
22 case OPCODE_BRK:
23 case OPCODE_RET:
24 case OPCODE_DDX:
25 case OPCODE_DDY:
26 case OPCODE_NOISE1:
27 case OPCODE_NOISE2:
28 case OPCODE_NOISE3:
29 case OPCODE_NOISE4:
30 case OPCODE_BGNLOOP:
31 return GL_TRUE;
32 default:
33 break;
34 }
35 }
36 return GL_FALSE;
37 }
38
39 static void set_reg(struct brw_wm_compile *c, int file, int index,
40 int component, struct brw_reg reg)
41 {
42 c->wm_regs[file][index][component].reg = reg;
43 c->wm_regs[file][index][component].inited = GL_TRUE;
44 }
45
46 static int get_scalar_dst_index(struct prog_instruction *inst)
47 {
48 int i;
49 for (i = 0; i < 4; i++)
50 if (inst->DstReg.WriteMask & (1<<i))
51 break;
52 return i;
53 }
54
55 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
56 {
57 struct brw_reg reg;
58 if(c->tmp_index == c->tmp_max)
59 c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
60
61 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
62 return reg;
63 }
64
65 static int mark_tmps(struct brw_wm_compile *c)
66 {
67 return c->tmp_index;
68 }
69
70 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
71 {
72 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
73 }
74
75 static void release_tmps(struct brw_wm_compile *c, int mark)
76 {
77 c->tmp_index = mark;
78 }
79
80 static struct brw_reg
81 get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GLuint neg, GLuint abs)
82 {
83 struct brw_reg reg;
84 switch (file) {
85 case PROGRAM_STATE_VAR:
86 case PROGRAM_CONSTANT:
87 case PROGRAM_UNIFORM:
88 file = PROGRAM_STATE_VAR;
89 break;
90 case PROGRAM_UNDEFINED:
91 return brw_null_reg();
92 default:
93 break;
94 }
95
96 if(c->wm_regs[file][index][component].inited)
97 reg = c->wm_regs[file][index][component].reg;
98 else
99 reg = brw_vec8_grf(c->reg_index, 0);
100
101 if(!c->wm_regs[file][index][component].inited) {
102 set_reg(c, file, index, component, reg);
103 c->reg_index++;
104 }
105
106 if (neg & (1<< component)) {
107 reg = negate(reg);
108 }
109 if (abs)
110 reg = brw_abs(reg);
111 return reg;
112 }
113
114 static void prealloc_reg(struct brw_wm_compile *c)
115 {
116 int i, j;
117 struct brw_reg reg;
118 int nr_interp_regs = 0;
119 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
120
121 for (i = 0; i < 4; i++) {
122 reg = (i < c->key.nr_depth_regs)
123 ? brw_vec8_grf(i*2, 0) : brw_vec8_grf(0, 0);
124 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
125 }
126 c->reg_index += 2*c->key.nr_depth_regs;
127 {
128 int nr_params = c->fp->program.Base.Parameters->NumParameters;
129 struct gl_program_parameter_list *plist =
130 c->fp->program.Base.Parameters;
131 int index = 0;
132 c->prog_data.nr_params = 4*nr_params;
133 for (i = 0; i < nr_params; i++) {
134 for (j = 0; j < 4; j++, index++) {
135 reg = brw_vec1_grf(c->reg_index + index/8,
136 index%8);
137 c->prog_data.param[index] =
138 &plist->ParameterValues[i][j];
139 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
140 }
141 }
142 c->nr_creg = 2*((4*nr_params+15)/16);
143 c->reg_index += c->nr_creg;
144 }
145 for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
146 if (inputs & (1<<i)) {
147 nr_interp_regs++;
148 reg = brw_vec8_grf(c->reg_index, 0);
149 for (j = 0; j < 4; j++)
150 set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
151 c->reg_index += 2;
152
153 }
154 }
155 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
156 c->prog_data.urb_read_length = nr_interp_regs * 2;
157 c->prog_data.curb_read_length = c->nr_creg;
158 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
159 c->reg_index++;
160 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
161 c->reg_index += 2;
162 }
163
164 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
165 struct prog_instruction *inst, int component, int nr)
166 {
167 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
168 0, 0);
169 }
170
171 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
172 struct prog_src_register *src, int index, int nr)
173 {
174 int component = GET_SWZ(src->Swizzle, index);
175 return get_reg(c, src->File, src->Index, component, nr,
176 src->NegateBase, src->Abs);
177 }
178
/* Subroutines are minimal support for reusable instruction sequences.
   They are implemented as simply as possible to minimise overhead: there
   is no explicit support for communication between the caller and callee
   other than saving the return address in a temporary register, nor is
   there any automatic local storage.  This implies that great care is
   required before attempting reentrancy or any kind of nested
   subroutine invocations. */
186 static void invoke_subroutine( struct brw_wm_compile *c,
187 enum _subroutine subroutine,
188 void (*emit)( struct brw_wm_compile * ) )
189 {
190 struct brw_compile *p = &c->func;
191
192 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
193
194 if( c->subroutines[ subroutine ] ) {
195 /* subroutine previously emitted: reuse existing instructions */
196
197 int mark = mark_tmps( c );
198 struct brw_reg return_address = retype( alloc_tmp( c ),
199 BRW_REGISTER_TYPE_UD );
200 int here = p->nr_insn;
201
202 brw_push_insn_state(p);
203 brw_set_mask_control(p, BRW_MASK_DISABLE);
204 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
205
206 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
207 brw_imm_d( ( c->subroutines[ subroutine ] -
208 here - 1 ) << 4 ) );
209 brw_pop_insn_state(p);
210
211 release_tmps( c, mark );
212 } else {
213 /* previously unused subroutine: emit, and mark for later reuse */
214
215 int mark = mark_tmps( c );
216 struct brw_reg return_address = retype( alloc_tmp( c ),
217 BRW_REGISTER_TYPE_UD );
218 struct brw_instruction *calc;
219 int base = p->nr_insn;
220
221 brw_push_insn_state(p);
222 brw_set_mask_control(p, BRW_MASK_DISABLE);
223 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
224 brw_pop_insn_state(p);
225
226 c->subroutines[ subroutine ] = p->nr_insn;
227
228 emit( c );
229
230 brw_push_insn_state(p);
231 brw_set_mask_control(p, BRW_MASK_DISABLE);
232 brw_MOV( p, brw_ip_reg(), return_address );
233 brw_pop_insn_state(p);
234
235 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
236
237 release_tmps( c, mark );
238 }
239 }
240
241 static void emit_abs( struct brw_wm_compile *c,
242 struct prog_instruction *inst)
243 {
244 int i;
245 struct brw_compile *p = &c->func;
246 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
247 for (i = 0; i < 4; i++) {
248 if (inst->DstReg.WriteMask & (1<<i)) {
249 struct brw_reg src, dst;
250 dst = get_dst_reg(c, inst, i, 1);
251 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
252 brw_MOV(p, dst, brw_abs(src));
253 }
254 }
255 brw_set_saturate(p, 0);
256 }
257
258 static void emit_trunc( struct brw_wm_compile *c,
259 struct prog_instruction *inst)
260 {
261 int i;
262 struct brw_compile *p = &c->func;
263 GLuint mask = inst->DstReg.WriteMask;
264 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
265 for (i = 0; i < 4; i++) {
266 if (mask & (1<<i)) {
267 struct brw_reg src, dst;
268 dst = get_dst_reg(c, inst, i, 1) ;
269 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
270 brw_RNDZ(p, dst, src);
271 }
272 }
273 brw_set_saturate(p, 0);
274 }
275
276 static void emit_mov( struct brw_wm_compile *c,
277 struct prog_instruction *inst)
278 {
279 int i;
280 struct brw_compile *p = &c->func;
281 GLuint mask = inst->DstReg.WriteMask;
282 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
283 for (i = 0; i < 4; i++) {
284 if (mask & (1<<i)) {
285 struct brw_reg src, dst;
286 dst = get_dst_reg(c, inst, i, 1);
287 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
288 brw_MOV(p, dst, src);
289 }
290 }
291 brw_set_saturate(p, 0);
292 }
293
294 static void emit_pixel_xy(struct brw_wm_compile *c,
295 struct prog_instruction *inst)
296 {
297 struct brw_reg r1 = brw_vec1_grf(1, 0);
298 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
299
300 struct brw_reg dst0, dst1;
301 struct brw_compile *p = &c->func;
302 GLuint mask = inst->DstReg.WriteMask;
303
304 dst0 = get_dst_reg(c, inst, 0, 1);
305 dst1 = get_dst_reg(c, inst, 1, 1);
306 /* Calculate pixel centers by adding 1 or 0 to each of the
307 * micro-tile coordinates passed in r1.
308 */
309 if (mask & WRITEMASK_X) {
310 brw_ADD(p,
311 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
312 stride(suboffset(r1_uw, 4), 2, 4, 0),
313 brw_imm_v(0x10101010));
314 }
315
316 if (mask & WRITEMASK_Y) {
317 brw_ADD(p,
318 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
319 stride(suboffset(r1_uw, 5), 2, 4, 0),
320 brw_imm_v(0x11001100));
321 }
322
323 }
324
325 static void emit_delta_xy(struct brw_wm_compile *c,
326 struct prog_instruction *inst)
327 {
328 struct brw_reg r1 = brw_vec1_grf(1, 0);
329 struct brw_reg dst0, dst1, src0, src1;
330 struct brw_compile *p = &c->func;
331 GLuint mask = inst->DstReg.WriteMask;
332
333 dst0 = get_dst_reg(c, inst, 0, 1);
334 dst1 = get_dst_reg(c, inst, 1, 1);
335 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
336 src1 = get_src_reg(c, &inst->SrcReg[0], 1, 1);
337 /* Calc delta X,Y by subtracting origin in r1 from the pixel
338 * centers.
339 */
340 if (mask & WRITEMASK_X) {
341 brw_ADD(p,
342 dst0,
343 retype(src0, BRW_REGISTER_TYPE_UW),
344 negate(r1));
345 }
346
347 if (mask & WRITEMASK_Y) {
348 brw_ADD(p,
349 dst1,
350 retype(src1, BRW_REGISTER_TYPE_UW),
351 negate(suboffset(r1,1)));
352
353 }
354
355 }
356
357
358 static void fire_fb_write( struct brw_wm_compile *c,
359 GLuint base_reg,
360 GLuint nr,
361 GLuint target,
362 GLuint eot)
363 {
364 struct brw_compile *p = &c->func;
365 /* Pass through control information:
366 */
367 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
368 {
369 brw_push_insn_state(p);
370 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
371 brw_MOV(p,
372 brw_message_reg(base_reg + 1),
373 brw_vec8_grf(1, 0));
374 brw_pop_insn_state(p);
375 }
376 /* Send framebuffer write message: */
377 brw_fb_WRITE(p,
378 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
379 base_reg,
380 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
381 target,
382 nr,
383 0,
384 eot);
385 }
386
387 static void emit_fb_write(struct brw_wm_compile *c,
388 struct prog_instruction *inst)
389 {
390 struct brw_compile *p = &c->func;
391 int nr = 2;
392 int channel;
393 GLuint target, eot;
394 struct brw_reg src0;
395
396 /* Reserve a space for AA - may not be needed:
397 */
398 if (c->key.aa_dest_stencil_reg)
399 nr += 1;
400 {
401 brw_push_insn_state(p);
402 for (channel = 0; channel < 4; channel++) {
403 src0 = get_src_reg(c, &inst->SrcReg[0], channel, 1);
404 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
405 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
406 brw_MOV(p, brw_message_reg(nr + channel), src0);
407 }
408 /* skip over the regs populated above: */
409 nr += 8;
410 brw_pop_insn_state(p);
411 }
412
413 if (c->key.source_depth_to_render_target)
414 {
415 if (c->key.computes_depth) {
416 src0 = get_src_reg(c, &inst->SrcReg[2], 2, 1);
417 brw_MOV(p, brw_message_reg(nr), src0);
418 } else {
419 src0 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
420 brw_MOV(p, brw_message_reg(nr), src0);
421 }
422
423 nr += 2;
424 }
425 target = inst->Sampler >> 1;
426 eot = inst->Sampler & 1;
427 fire_fb_write(c, 0, nr, target, eot);
428 }
429
430 static void emit_pixel_w( struct brw_wm_compile *c,
431 struct prog_instruction *inst)
432 {
433 struct brw_compile *p = &c->func;
434 GLuint mask = inst->DstReg.WriteMask;
435 if (mask & WRITEMASK_W) {
436 struct brw_reg dst, src0, delta0, delta1;
437 struct brw_reg interp3;
438
439 dst = get_dst_reg(c, inst, 3, 1);
440 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
441 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
442 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
443
444 interp3 = brw_vec1_grf(src0.nr+1, 4);
445 /* Calc 1/w - just linterp wpos[3] optimized by putting the
446 * result straight into a message reg.
447 */
448 brw_LINE(p, brw_null_reg(), interp3, delta0);
449 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
450
451 /* Calc w */
452 brw_math_16( p, dst,
453 BRW_MATH_FUNCTION_INV,
454 BRW_MATH_SATURATE_NONE,
455 2, brw_null_reg(),
456 BRW_MATH_PRECISION_FULL);
457 }
458 }
459
460 static void emit_linterp(struct brw_wm_compile *c,
461 struct prog_instruction *inst)
462 {
463 struct brw_compile *p = &c->func;
464 GLuint mask = inst->DstReg.WriteMask;
465 struct brw_reg interp[4];
466 struct brw_reg dst, delta0, delta1;
467 struct brw_reg src0;
468
469 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
470 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
471 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
472 GLuint nr = src0.nr;
473 int i;
474
475 interp[0] = brw_vec1_grf(nr, 0);
476 interp[1] = brw_vec1_grf(nr, 4);
477 interp[2] = brw_vec1_grf(nr+1, 0);
478 interp[3] = brw_vec1_grf(nr+1, 4);
479
480 for(i = 0; i < 4; i++ ) {
481 if (mask & (1<<i)) {
482 dst = get_dst_reg(c, inst, i, 1);
483 brw_LINE(p, brw_null_reg(), interp[i], delta0);
484 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
485 }
486 }
487 }
488
489 static void emit_cinterp(struct brw_wm_compile *c,
490 struct prog_instruction *inst)
491 {
492 struct brw_compile *p = &c->func;
493 GLuint mask = inst->DstReg.WriteMask;
494
495 struct brw_reg interp[4];
496 struct brw_reg dst, src0;
497
498 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
499 GLuint nr = src0.nr;
500 int i;
501
502 interp[0] = brw_vec1_grf(nr, 0);
503 interp[1] = brw_vec1_grf(nr, 4);
504 interp[2] = brw_vec1_grf(nr+1, 0);
505 interp[3] = brw_vec1_grf(nr+1, 4);
506
507 for(i = 0; i < 4; i++ ) {
508 if (mask & (1<<i)) {
509 dst = get_dst_reg(c, inst, i, 1);
510 brw_MOV(p, dst, suboffset(interp[i],3));
511 }
512 }
513 }
514
515 static void emit_pinterp(struct brw_wm_compile *c,
516 struct prog_instruction *inst)
517 {
518 struct brw_compile *p = &c->func;
519 GLuint mask = inst->DstReg.WriteMask;
520
521 struct brw_reg interp[4];
522 struct brw_reg dst, delta0, delta1;
523 struct brw_reg src0, w;
524
525 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
526 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
527 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
528 w = get_src_reg(c, &inst->SrcReg[2], 3, 1);
529 GLuint nr = src0.nr;
530 int i;
531
532 interp[0] = brw_vec1_grf(nr, 0);
533 interp[1] = brw_vec1_grf(nr, 4);
534 interp[2] = brw_vec1_grf(nr+1, 0);
535 interp[3] = brw_vec1_grf(nr+1, 4);
536
537 for(i = 0; i < 4; i++ ) {
538 if (mask & (1<<i)) {
539 dst = get_dst_reg(c, inst, i, 1);
540 brw_LINE(p, brw_null_reg(), interp[i], delta0);
541 brw_MAC(p, dst, suboffset(interp[i],1),
542 delta1);
543 brw_MUL(p, dst, dst, w);
544 }
545 }
546 }
547
548 static void emit_xpd(struct brw_wm_compile *c,
549 struct prog_instruction *inst)
550 {
551 int i;
552 struct brw_compile *p = &c->func;
553 GLuint mask = inst->DstReg.WriteMask;
554 for (i = 0; i < 4; i++) {
555 GLuint i2 = (i+2)%3;
556 GLuint i1 = (i+1)%3;
557 if (mask & (1<<i)) {
558 struct brw_reg src0, src1, dst;
559 dst = get_dst_reg(c, inst, i, 1);
560 src0 = negate(get_src_reg(c, &inst->SrcReg[0], i2, 1));
561 src1 = get_src_reg(c, &inst->SrcReg[1], i1, 1);
562 brw_MUL(p, brw_null_reg(), src0, src1);
563 src0 = get_src_reg(c, &inst->SrcReg[0], i1, 1);
564 src1 = get_src_reg(c, &inst->SrcReg[1], i2, 1);
565 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
566 brw_MAC(p, dst, src0, src1);
567 brw_set_saturate(p, 0);
568 }
569 }
570 brw_set_saturate(p, 0);
571 }
572
573 static void emit_dp3(struct brw_wm_compile *c,
574 struct prog_instruction *inst)
575 {
576 struct brw_reg src0[3], src1[3], dst;
577 int i;
578 struct brw_compile *p = &c->func;
579 for (i = 0; i < 3; i++) {
580 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
581 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
582 }
583
584 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
585 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
586 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
587 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
588 brw_MAC(p, dst, src0[2], src1[2]);
589 brw_set_saturate(p, 0);
590 }
591
592 static void emit_dp4(struct brw_wm_compile *c,
593 struct prog_instruction *inst)
594 {
595 struct brw_reg src0[4], src1[4], dst;
596 int i;
597 struct brw_compile *p = &c->func;
598 for (i = 0; i < 4; i++) {
599 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
600 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
601 }
602 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
603 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
604 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
605 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
606 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
607 brw_MAC(p, dst, src0[3], src1[3]);
608 brw_set_saturate(p, 0);
609 }
610
611 static void emit_dph(struct brw_wm_compile *c,
612 struct prog_instruction *inst)
613 {
614 struct brw_reg src0[4], src1[4], dst;
615 int i;
616 struct brw_compile *p = &c->func;
617 for (i = 0; i < 4; i++) {
618 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
619 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
620 }
621 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
622 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
623 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
624 brw_MAC(p, dst, src0[2], src1[2]);
625 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
626 brw_ADD(p, dst, src0[3], src1[3]);
627 brw_set_saturate(p, 0);
628 }
629
630 static void emit_math1(struct brw_wm_compile *c,
631 struct prog_instruction *inst, GLuint func)
632 {
633 struct brw_compile *p = &c->func;
634 struct brw_reg src0, dst;
635
636 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
637 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
638 brw_MOV(p, brw_message_reg(2), src0);
639 brw_math(p,
640 dst,
641 func,
642 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
643 2,
644 brw_null_reg(),
645 BRW_MATH_DATA_VECTOR,
646 BRW_MATH_PRECISION_FULL);
647 }
648
649 static void emit_rcp(struct brw_wm_compile *c,
650 struct prog_instruction *inst)
651 {
652 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
653 }
654
655 static void emit_rsq(struct brw_wm_compile *c,
656 struct prog_instruction *inst)
657 {
658 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
659 }
660
661 static void emit_sin(struct brw_wm_compile *c,
662 struct prog_instruction *inst)
663 {
664 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
665 }
666
667 static void emit_cos(struct brw_wm_compile *c,
668 struct prog_instruction *inst)
669 {
670 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
671 }
672
673 static void emit_ex2(struct brw_wm_compile *c,
674 struct prog_instruction *inst)
675 {
676 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
677 }
678
679 static void emit_lg2(struct brw_wm_compile *c,
680 struct prog_instruction *inst)
681 {
682 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
683 }
684
685 static void emit_add(struct brw_wm_compile *c,
686 struct prog_instruction *inst)
687 {
688 struct brw_compile *p = &c->func;
689 struct brw_reg src0, src1, dst;
690 GLuint mask = inst->DstReg.WriteMask;
691 int i;
692 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
693 for (i = 0 ; i < 4; i++) {
694 if (mask & (1<<i)) {
695 dst = get_dst_reg(c, inst, i, 1);
696 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
697 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
698 brw_ADD(p, dst, src0, src1);
699 }
700 }
701 brw_set_saturate(p, 0);
702 }
703
704 static void emit_sub(struct brw_wm_compile *c,
705 struct prog_instruction *inst)
706 {
707 struct brw_compile *p = &c->func;
708 struct brw_reg src0, src1, dst;
709 GLuint mask = inst->DstReg.WriteMask;
710 int i;
711 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
712 for (i = 0 ; i < 4; i++) {
713 if (mask & (1<<i)) {
714 dst = get_dst_reg(c, inst, i, 1);
715 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
716 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
717 brw_ADD(p, dst, src0, negate(src1));
718 }
719 }
720 brw_set_saturate(p, 0);
721 }
722
723 static void emit_mul(struct brw_wm_compile *c,
724 struct prog_instruction *inst)
725 {
726 struct brw_compile *p = &c->func;
727 struct brw_reg src0, src1, dst;
728 GLuint mask = inst->DstReg.WriteMask;
729 int i;
730 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
731 for (i = 0 ; i < 4; i++) {
732 if (mask & (1<<i)) {
733 dst = get_dst_reg(c, inst, i, 1);
734 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
735 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
736 brw_MUL(p, dst, src0, src1);
737 }
738 }
739 brw_set_saturate(p, 0);
740 }
741
742 static void emit_frc(struct brw_wm_compile *c,
743 struct prog_instruction *inst)
744 {
745 struct brw_compile *p = &c->func;
746 struct brw_reg src0, dst;
747 GLuint mask = inst->DstReg.WriteMask;
748 int i;
749 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
750 for (i = 0 ; i < 4; i++) {
751 if (mask & (1<<i)) {
752 dst = get_dst_reg(c, inst, i, 1);
753 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
754 brw_FRC(p, dst, src0);
755 }
756 }
757 if (inst->SaturateMode != SATURATE_OFF)
758 brw_set_saturate(p, 0);
759 }
760
761 static void emit_flr(struct brw_wm_compile *c,
762 struct prog_instruction *inst)
763 {
764 struct brw_compile *p = &c->func;
765 struct brw_reg src0, dst;
766 GLuint mask = inst->DstReg.WriteMask;
767 int i;
768 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
769 for (i = 0 ; i < 4; i++) {
770 if (mask & (1<<i)) {
771 dst = get_dst_reg(c, inst, i, 1);
772 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
773 brw_RNDD(p, dst, src0);
774 }
775 }
776 brw_set_saturate(p, 0);
777 }
778
779 static void emit_max(struct brw_wm_compile *c,
780 struct prog_instruction *inst)
781 {
782 struct brw_compile *p = &c->func;
783 GLuint mask = inst->DstReg.WriteMask;
784 struct brw_reg src0, src1, dst;
785 int i;
786 brw_push_insn_state(p);
787 for (i = 0; i < 4; i++) {
788 if (mask & (1<<i)) {
789 dst = get_dst_reg(c, inst, i, 1);
790 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
791 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
792 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
793 brw_MOV(p, dst, src0);
794 brw_set_saturate(p, 0);
795
796 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
797 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
798 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
799 brw_MOV(p, dst, src1);
800 brw_set_saturate(p, 0);
801 brw_set_predicate_control_flag_value(p, 0xff);
802 }
803 }
804 brw_pop_insn_state(p);
805 }
806
807 static void emit_min(struct brw_wm_compile *c,
808 struct prog_instruction *inst)
809 {
810 struct brw_compile *p = &c->func;
811 GLuint mask = inst->DstReg.WriteMask;
812 struct brw_reg src0, src1, dst;
813 int i;
814 brw_push_insn_state(p);
815 for (i = 0; i < 4; i++) {
816 if (mask & (1<<i)) {
817 dst = get_dst_reg(c, inst, i, 1);
818 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
819 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
820 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
821 brw_MOV(p, dst, src0);
822 brw_set_saturate(p, 0);
823
824 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
825 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
826 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
827 brw_MOV(p, dst, src1);
828 brw_set_saturate(p, 0);
829 brw_set_predicate_control_flag_value(p, 0xff);
830 }
831 }
832 brw_pop_insn_state(p);
833 }
834
835 static void emit_pow(struct brw_wm_compile *c,
836 struct prog_instruction *inst)
837 {
838 struct brw_compile *p = &c->func;
839 struct brw_reg dst, src0, src1;
840 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
841 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
842 src1 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
843
844 brw_MOV(p, brw_message_reg(2), src0);
845 brw_MOV(p, brw_message_reg(3), src1);
846
847 brw_math(p,
848 dst,
849 BRW_MATH_FUNCTION_POW,
850 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
851 2,
852 brw_null_reg(),
853 BRW_MATH_DATA_VECTOR,
854 BRW_MATH_PRECISION_FULL);
855 }
856
857 static void emit_lrp(struct brw_wm_compile *c,
858 struct prog_instruction *inst)
859 {
860 struct brw_compile *p = &c->func;
861 GLuint mask = inst->DstReg.WriteMask;
862 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
863 int i;
864 int mark = mark_tmps(c);
865 for (i = 0; i < 4; i++) {
866 if (mask & (1<<i)) {
867 dst = get_dst_reg(c, inst, i, 1);
868 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
869
870 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
871
872 if (src1.nr == dst.nr) {
873 tmp1 = alloc_tmp(c);
874 brw_MOV(p, tmp1, src1);
875 } else
876 tmp1 = src1;
877
878 src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
879 if (src2.nr == dst.nr) {
880 tmp2 = alloc_tmp(c);
881 brw_MOV(p, tmp2, src2);
882 } else
883 tmp2 = src2;
884
885 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
886 brw_MUL(p, brw_null_reg(), dst, tmp2);
887 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
888 brw_MAC(p, dst, src0, tmp1);
889 brw_set_saturate(p, 0);
890 }
891 release_tmps(c, mark);
892 }
893 }
894
895 static void emit_kil(struct brw_wm_compile *c)
896 {
897 struct brw_compile *p = &c->func;
898 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
899 brw_push_insn_state(p);
900 brw_set_mask_control(p, BRW_MASK_DISABLE);
901 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
902 brw_AND(p, depth, c->emit_mask_reg, depth);
903 brw_pop_insn_state(p);
904 }
905
906 static void emit_mad(struct brw_wm_compile *c,
907 struct prog_instruction *inst)
908 {
909 struct brw_compile *p = &c->func;
910 GLuint mask = inst->DstReg.WriteMask;
911 struct brw_reg dst, src0, src1, src2;
912 int i;
913
914 for (i = 0; i < 4; i++) {
915 if (mask & (1<<i)) {
916 dst = get_dst_reg(c, inst, i, 1);
917 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
918 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
919 src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
920 brw_MUL(p, dst, src0, src1);
921
922 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
923 brw_ADD(p, dst, dst, src2);
924 brw_set_saturate(p, 0);
925 }
926 }
927 }
928
929 static void emit_sop(struct brw_wm_compile *c,
930 struct prog_instruction *inst, GLuint cond)
931 {
932 struct brw_compile *p = &c->func;
933 GLuint mask = inst->DstReg.WriteMask;
934 struct brw_reg dst, src0, src1;
935 int i;
936
937 for (i = 0; i < 4; i++) {
938 if (mask & (1<<i)) {
939 dst = get_dst_reg(c, inst, i, 1);
940 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
941 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
942 brw_push_insn_state(p);
943 brw_CMP(p, brw_null_reg(), cond, src0, src1);
944 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
945 brw_MOV(p, dst, brw_imm_f(0.0));
946 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
947 brw_MOV(p, dst, brw_imm_f(1.0));
948 brw_pop_insn_state(p);
949 }
950 }
951 }
952
953 static void emit_slt(struct brw_wm_compile *c,
954 struct prog_instruction *inst)
955 {
956 emit_sop(c, inst, BRW_CONDITIONAL_L);
957 }
958
959 static void emit_sle(struct brw_wm_compile *c,
960 struct prog_instruction *inst)
961 {
962 emit_sop(c, inst, BRW_CONDITIONAL_LE);
963 }
964
965 static void emit_sgt(struct brw_wm_compile *c,
966 struct prog_instruction *inst)
967 {
968 emit_sop(c, inst, BRW_CONDITIONAL_G);
969 }
970
971 static void emit_sge(struct brw_wm_compile *c,
972 struct prog_instruction *inst)
973 {
974 emit_sop(c, inst, BRW_CONDITIONAL_GE);
975 }
976
977 static void emit_seq(struct brw_wm_compile *c,
978 struct prog_instruction *inst)
979 {
980 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
981 }
982
983 static void emit_sne(struct brw_wm_compile *c,
984 struct prog_instruction *inst)
985 {
986 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
987 }
988
989 static void emit_ddx(struct brw_wm_compile *c,
990 struct prog_instruction *inst)
991 {
992 struct brw_compile *p = &c->func;
993 GLuint mask = inst->DstReg.WriteMask;
994 struct brw_reg interp[4];
995 struct brw_reg dst;
996 struct brw_reg src0, w;
997 GLuint nr, i;
998 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
999 w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
1000 nr = src0.nr;
1001 interp[0] = brw_vec1_grf(nr, 0);
1002 interp[1] = brw_vec1_grf(nr, 4);
1003 interp[2] = brw_vec1_grf(nr+1, 0);
1004 interp[3] = brw_vec1_grf(nr+1, 4);
1005 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1006 for(i = 0; i < 4; i++ ) {
1007 if (mask & (1<<i)) {
1008 dst = get_dst_reg(c, inst, i, 1);
1009 brw_MOV(p, dst, interp[i]);
1010 brw_MUL(p, dst, dst, w);
1011 }
1012 }
1013 brw_set_saturate(p, 0);
1014 }
1015
1016 static void emit_ddy(struct brw_wm_compile *c,
1017 struct prog_instruction *inst)
1018 {
1019 struct brw_compile *p = &c->func;
1020 GLuint mask = inst->DstReg.WriteMask;
1021 struct brw_reg interp[4];
1022 struct brw_reg dst;
1023 struct brw_reg src0, w;
1024 GLuint nr, i;
1025
1026 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
1027 nr = src0.nr;
1028 w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
1029 interp[0] = brw_vec1_grf(nr, 0);
1030 interp[1] = brw_vec1_grf(nr, 4);
1031 interp[2] = brw_vec1_grf(nr+1, 0);
1032 interp[3] = brw_vec1_grf(nr+1, 4);
1033 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1034 for(i = 0; i < 4; i++ ) {
1035 if (mask & (1<<i)) {
1036 dst = get_dst_reg(c, inst, i, 1);
1037 brw_MOV(p, dst, suboffset(interp[i], 1));
1038 brw_MUL(p, dst, dst, w);
1039 }
1040 }
1041 brw_set_saturate(p, 0);
1042 }
1043
1044 static __inline struct brw_reg high_words( struct brw_reg reg )
1045 {
1046 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1047 0, 8, 2 );
1048 }
1049
1050 static __inline struct brw_reg low_words( struct brw_reg reg )
1051 {
1052 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1053 }
1054
1055 static __inline struct brw_reg even_bytes( struct brw_reg reg )
1056 {
1057 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1058 }
1059
1060 static __inline struct brw_reg odd_bytes( struct brw_reg reg )
1061 {
1062 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1063 0, 16, 2 );
1064 }
1065
1066 /* One-, two- and three-dimensional Perlin noise, similar to the description
1067 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1068 static void noise1_sub( struct brw_wm_compile *c ) {
1069
1070 struct brw_compile *p = &c->func;
1071 struct brw_reg param,
1072 x0, x1, /* gradients at each end */
1073 t, tmp[ 2 ], /* float temporaries */
1074 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1075 int i;
1076 int mark = mark_tmps( c );
1077
1078 x0 = alloc_tmp( c );
1079 x1 = alloc_tmp( c );
1080 t = alloc_tmp( c );
1081 tmp[ 0 ] = alloc_tmp( c );
1082 tmp[ 1 ] = alloc_tmp( c );
1083 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1084 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1085 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1086 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1087 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1088
1089 param = lookup_tmp( c, mark - 2 );
1090
1091 brw_set_access_mode( p, BRW_ALIGN_1 );
1092
1093 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1094
1095 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1096 be hashed. Also compute the remainder (offset within the unit
1097 length), interleaved to reduce register dependency penalties. */
1098 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1099 brw_FRC( p, param, param );
1100 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1101 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1102 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1103
1104 /* We're now ready to perform the hashing. The two hashes are
1105 interleaved for performance. The hash function used is
1106 designed to rapidly achieve avalanche and require only 32x16
1107 bit multiplication, and 16-bit swizzles (which we get for
1108 free). We can't use immediate operands in the multiplies,
1109 because immediates are permitted only in src1 and the 16-bit
1110 factor is permitted only in src0. */
1111 for( i = 0; i < 2; i++ )
1112 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1113 for( i = 0; i < 2; i++ )
1114 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1115 high_words( itmp[ i ] ) );
1116 for( i = 0; i < 2; i++ )
1117 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1118 for( i = 0; i < 2; i++ )
1119 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1120 high_words( itmp[ i ] ) );
1121 for( i = 0; i < 2; i++ )
1122 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1123 for( i = 0; i < 2; i++ )
1124 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1125 high_words( itmp[ i ] ) );
1126
1127 /* Now we want to initialise the two gradients based on the
1128 hashes. Format conversion from signed integer to float leaves
1129 everything scaled too high by a factor of pow( 2, 31 ), but
1130 we correct for that right at the end. */
1131 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1132 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1133 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1134
1135 brw_MUL( p, x0, x0, param );
1136 brw_MUL( p, x1, x1, t );
1137
1138 /* We interpolate between the gradients using the polynomial
1139 6t^5 - 15t^4 + 10t^3 (Perlin). */
1140 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1141 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1142 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1143 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1144 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1145 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1146 pipeline */
1147 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1148 brw_MUL( p, param, tmp[ 0 ], param );
1149 brw_MUL( p, x1, x1, param );
1150 brw_ADD( p, x0, x0, x1 );
1151 /* scale by pow( 2, -30 ), to compensate for the format conversion
1152 above and an extra factor of 2 so that a single gradient covers
1153 the [-1,1] range */
1154 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1155
1156 release_tmps( c, mark );
1157 }
1158
1159 static void emit_noise1( struct brw_wm_compile *c,
1160 struct prog_instruction *inst )
1161 {
1162 struct brw_compile *p = &c->func;
1163 struct brw_reg src, param, dst;
1164 GLuint mask = inst->DstReg.WriteMask;
1165 int i;
1166 int mark = mark_tmps( c );
1167
1168 assert( mark == 0 );
1169
1170 src = get_src_reg( c, inst->SrcReg, 0, 1 );
1171
1172 param = alloc_tmp( c );
1173
1174 brw_MOV( p, param, src );
1175
1176 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1177
1178 /* Fill in the result: */
1179 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1180 for (i = 0 ; i < 4; i++) {
1181 if (mask & (1<<i)) {
1182 dst = get_dst_reg(c, inst, i, 1);
1183 brw_MOV( p, dst, param );
1184 }
1185 }
1186 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1187 brw_set_saturate( p, 0 );
1188
1189 release_tmps( c, mark );
1190 }
1191
1192 static void noise2_sub( struct brw_wm_compile *c ) {
1193
1194 struct brw_compile *p = &c->func;
1195 struct brw_reg param0, param1,
1196 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1197 t, tmp[ 4 ], /* float temporaries */
1198 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1199 int i;
1200 int mark = mark_tmps( c );
1201
1202 x0y0 = alloc_tmp( c );
1203 x0y1 = alloc_tmp( c );
1204 x1y0 = alloc_tmp( c );
1205 x1y1 = alloc_tmp( c );
1206 t = alloc_tmp( c );
1207 for( i = 0; i < 4; i++ ) {
1208 tmp[ i ] = alloc_tmp( c );
1209 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1210 }
1211 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1212 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1213 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1214
1215 param0 = lookup_tmp( c, mark - 3 );
1216 param1 = lookup_tmp( c, mark - 2 );
1217
1218 brw_set_access_mode( p, BRW_ALIGN_1 );
1219
1220 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1221 be hashed. Also compute the remainders (offsets within the unit
1222 square), interleaved to reduce register dependency penalties. */
1223 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1224 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1225 brw_FRC( p, param0, param0 );
1226 brw_FRC( p, param1, param1 );
1227 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1228 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1229 low_words( itmp[ 1 ] ) );
1230 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1231 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1232 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1233 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1234 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1235
1236 /* We're now ready to perform the hashing. The four hashes are
1237 interleaved for performance. The hash function used is
1238 designed to rapidly achieve avalanche and require only 32x16
1239 bit multiplication, and 16-bit swizzles (which we get for
1240 free). We can't use immediate operands in the multiplies,
1241 because immediates are permitted only in src1 and the 16-bit
1242 factor is permitted only in src0. */
1243 for( i = 0; i < 4; i++ )
1244 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1245 for( i = 0; i < 4; i++ )
1246 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1247 high_words( itmp[ i ] ) );
1248 for( i = 0; i < 4; i++ )
1249 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1250 for( i = 0; i < 4; i++ )
1251 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1252 high_words( itmp[ i ] ) );
1253 for( i = 0; i < 4; i++ )
1254 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1255 for( i = 0; i < 4; i++ )
1256 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1257 high_words( itmp[ i ] ) );
1258
1259 /* Now we want to initialise the four gradients based on the
1260 hashes. Format conversion from signed integer to float leaves
1261 everything scaled too high by a factor of pow( 2, 15 ), but
1262 we correct for that right at the end. */
1263 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1264 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1265 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1266 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1267 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1268
1269 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1270 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1271 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1272 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1273
1274 brw_MUL( p, x1y0, x1y0, t );
1275 brw_MUL( p, x1y1, x1y1, t );
1276 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1277 brw_MUL( p, x0y0, x0y0, param0 );
1278 brw_MUL( p, x0y1, x0y1, param0 );
1279
1280 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1281 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1282 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1283 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1284
1285 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1286 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1287 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1288 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1289
1290 /* We interpolate between the gradients using the polynomial
1291 6t^5 - 15t^4 + 10t^3 (Perlin). */
1292 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1293 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1294 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1295 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1296 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1297 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1298 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1299 pipeline */
1300 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1301 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1302 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1303 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1304 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1305 pipeline */
1306 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1307 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1308 brw_MUL( p, param0, tmp[ 0 ], param0 );
1309 brw_MUL( p, param1, tmp[ 1 ], param1 );
1310
1311 /* Here we interpolate in the y dimension... */
1312 brw_MUL( p, x0y1, x0y1, param1 );
1313 brw_MUL( p, x1y1, x1y1, param1 );
1314 brw_ADD( p, x0y0, x0y0, x0y1 );
1315 brw_ADD( p, x1y0, x1y0, x1y1 );
1316
1317 /* And now in x. There are horrible register dependencies here,
1318 but we have nothing else to do. */
1319 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1320 brw_MUL( p, x1y0, x1y0, param0 );
1321 brw_ADD( p, x0y0, x0y0, x1y0 );
1322
1323 /* scale by pow( 2, -15 ), as described above */
1324 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1325
1326 release_tmps( c, mark );
1327 }
1328
1329 static void emit_noise2( struct brw_wm_compile *c,
1330 struct prog_instruction *inst )
1331 {
1332 struct brw_compile *p = &c->func;
1333 struct brw_reg src0, src1, param0, param1, dst;
1334 GLuint mask = inst->DstReg.WriteMask;
1335 int i;
1336 int mark = mark_tmps( c );
1337
1338 assert( mark == 0 );
1339
1340 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
1341 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
1342
1343 param0 = alloc_tmp( c );
1344 param1 = alloc_tmp( c );
1345
1346 brw_MOV( p, param0, src0 );
1347 brw_MOV( p, param1, src1 );
1348
1349 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1350
1351 /* Fill in the result: */
1352 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1353 for (i = 0 ; i < 4; i++) {
1354 if (mask & (1<<i)) {
1355 dst = get_dst_reg(c, inst, i, 1);
1356 brw_MOV( p, dst, param0 );
1357 }
1358 }
1359 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1360 brw_set_saturate( p, 0 );
1361
1362 release_tmps( c, mark );
1363 }
1364
1365 /* The three-dimensional case is much like the one- and two- versions above,
1366 but since the number of corners is rapidly growing we now pack 16 16-bit
1367 hashes into each register to extract more parallelism from the EUs. */
1368 static void noise3_sub( struct brw_wm_compile *c ) {
1369
1370 struct brw_compile *p = &c->func;
1371 struct brw_reg param0, param1, param2,
1372 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1373 xi, yi, zi, /* interpolation coefficients */
1374 t, tmp[ 8 ], /* float temporaries */
1375 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1376 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1377 int i;
1378 int mark = mark_tmps( c );
1379
1380 x0y0 = alloc_tmp( c );
1381 x0y1 = alloc_tmp( c );
1382 x1y0 = alloc_tmp( c );
1383 x1y1 = alloc_tmp( c );
1384 xi = alloc_tmp( c );
1385 yi = alloc_tmp( c );
1386 zi = alloc_tmp( c );
1387 t = alloc_tmp( c );
1388 for( i = 0; i < 8; i++ ) {
1389 tmp[ i ] = alloc_tmp( c );
1390 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1391 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1392 }
1393
1394 param0 = lookup_tmp( c, mark - 4 );
1395 param1 = lookup_tmp( c, mark - 3 );
1396 param2 = lookup_tmp( c, mark - 2 );
1397
1398 brw_set_access_mode( p, BRW_ALIGN_1 );
1399
1400 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1401 be hashed. Also compute the remainders (offsets within the unit
1402 cube), interleaved to reduce register dependency penalties. */
1403 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1404 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1405 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1406 brw_FRC( p, param0, param0 );
1407 brw_FRC( p, param1, param1 );
1408 brw_FRC( p, param2, param2 );
1409 /* Since we now have only 16 bits of precision in the hash, we must
1410 be more careful about thorough mixing to maintain entropy as we
1411 squash the input vector into a small scalar. */
1412 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1413 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1414 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1415 brw_imm_uw( 0x9B93 ) );
1416 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1417 brw_imm_uw( 0xBC8F ) );
1418
1419 /* Temporarily disable the execution mask while we work with ExecSize=16
1420 channels (the mask is set for ExecSize=8 and is probably incorrect).
1421 Although this might cause execution of unwanted channels, the code
1422 writes only to temporary registers and has no side effects, so
1423 disabling the mask is harmless. */
1424 brw_push_insn_state( p );
1425 brw_set_mask_control( p, BRW_MASK_DISABLE );
1426 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1427 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1428 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1429
1430 /* We're now ready to perform the hashing. The eight hashes are
1431 interleaved for performance. The hash function used is
1432 designed to rapidly achieve avalanche and require only 16x16
1433 bit multiplication, and 8-bit swizzles (which we get for
1434 free). */
1435 for( i = 0; i < 4; i++ )
1436 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1437 for( i = 0; i < 4; i++ )
1438 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1439 odd_bytes( wtmp[ i ] ) );
1440 for( i = 0; i < 4; i++ )
1441 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1442 for( i = 0; i < 4; i++ )
1443 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1444 odd_bytes( wtmp[ i ] ) );
1445 brw_pop_insn_state( p );
1446
1447 /* Now we want to initialise the four rear gradients based on the
1448 hashes. Format conversion from signed integer to float leaves
1449 everything scaled too high by a factor of pow( 2, 15 ), but
1450 we correct for that right at the end. */
1451 /* x component */
1452 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1453 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1454 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1455 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1456 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1457
1458 brw_push_insn_state( p );
1459 brw_set_mask_control( p, BRW_MASK_DISABLE );
1460 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1461 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1462 brw_pop_insn_state( p );
1463
1464 brw_MUL( p, x1y0, x1y0, t );
1465 brw_MUL( p, x1y1, x1y1, t );
1466 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1467 brw_MUL( p, x0y0, x0y0, param0 );
1468 brw_MUL( p, x0y1, x0y1, param0 );
1469
1470 /* y component */
1471 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1472 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1473 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1474 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1475
1476 brw_push_insn_state( p );
1477 brw_set_mask_control( p, BRW_MASK_DISABLE );
1478 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1479 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1480 brw_pop_insn_state( p );
1481
1482 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1483 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1484 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1485 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1486 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1487
1488 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1489 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1490 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1491 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1492
1493 /* z component */
1494 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1495 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1496 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1497 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1498
1499 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1500 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1501 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1502 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1503
1504 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1505 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1506 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1507 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1508
1509 /* We interpolate between the gradients using the polynomial
1510 6t^5 - 15t^4 + 10t^3 (Perlin). */
1511 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1512 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1513 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1514 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1515 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1516 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1517 brw_MUL( p, xi, xi, param0 );
1518 brw_MUL( p, yi, yi, param1 );
1519 brw_MUL( p, zi, zi, param2 );
1520 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1521 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1522 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
1523 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1524 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1525 brw_MUL( p, xi, xi, param0 );
1526 brw_MUL( p, yi, yi, param1 );
1527 brw_MUL( p, zi, zi, param2 );
1528 brw_MUL( p, xi, xi, param0 );
1529 brw_MUL( p, yi, yi, param1 );
1530 brw_MUL( p, zi, zi, param2 );
1531 brw_MUL( p, xi, xi, param0 );
1532 brw_MUL( p, yi, yi, param1 );
1533 brw_MUL( p, zi, zi, param2 );
1534
1535 /* Here we interpolate in the y dimension... */
1536 brw_MUL( p, x0y1, x0y1, yi );
1537 brw_MUL( p, x1y1, x1y1, yi );
1538 brw_ADD( p, x0y0, x0y0, x0y1 );
1539 brw_ADD( p, x1y0, x1y0, x1y1 );
1540
1541 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1542 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1543 brw_MUL( p, x1y0, x1y0, xi );
1544 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1545
1546 /* Now do the same thing for the front four gradients... */
1547 /* x component */
1548 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1549 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1550 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1551 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1552
1553 brw_push_insn_state( p );
1554 brw_set_mask_control( p, BRW_MASK_DISABLE );
1555 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1556 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1557 brw_pop_insn_state( p );
1558
1559 brw_MUL( p, x1y0, x1y0, t );
1560 brw_MUL( p, x1y1, x1y1, t );
1561 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1562 brw_MUL( p, x0y0, x0y0, param0 );
1563 brw_MUL( p, x0y1, x0y1, param0 );
1564
1565 /* y component */
1566 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1567 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1568 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1569 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1570
1571 brw_push_insn_state( p );
1572 brw_set_mask_control( p, BRW_MASK_DISABLE );
1573 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1574 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1575 brw_pop_insn_state( p );
1576
1577 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1578 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1579 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
1580 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1581 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1582
1583 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1584 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1585 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1586 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1587
1588 /* z component */
1589 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1590 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1591 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1592 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1593
1594 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1595 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1596 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1597 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1598
1599 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1600 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1601 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1602 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1603
1604 /* The interpolation coefficients are still around from last time, so
1605 again interpolate in the y dimension... */
1606 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1607 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1608 brw_MUL( p, x0y1, x0y1, yi );
1609 brw_MUL( p, x1y1, x1y1, yi );
1610 brw_ADD( p, x0y0, x0y0, x0y1 );
1611 brw_ADD( p, x1y0, x1y0, x1y1 );
1612
1613 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1614 time put the front face in tmp[ 1 ] and we're nearly there... */
1615 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1616 brw_MUL( p, x1y0, x1y0, xi );
1617 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1618
1619 /* The final interpolation, in the z dimension: */
1620 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1621 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
1622 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1623
1624 /* scale by pow( 2, -15 ), as described above */
1625 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1626
1627 release_tmps( c, mark );
1628 }
1629
1630 static void emit_noise3( struct brw_wm_compile *c,
1631 struct prog_instruction *inst )
1632 {
1633 struct brw_compile *p = &c->func;
1634 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1635 GLuint mask = inst->DstReg.WriteMask;
1636 int i;
1637 int mark = mark_tmps( c );
1638
1639 assert( mark == 0 );
1640
1641 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
1642 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
1643 src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
1644
1645 param0 = alloc_tmp( c );
1646 param1 = alloc_tmp( c );
1647 param2 = alloc_tmp( c );
1648
1649 brw_MOV( p, param0, src0 );
1650 brw_MOV( p, param1, src1 );
1651 brw_MOV( p, param2, src2 );
1652
1653 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1654
1655 /* Fill in the result: */
1656 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1657 for (i = 0 ; i < 4; i++) {
1658 if (mask & (1<<i)) {
1659 dst = get_dst_reg(c, inst, i, 1);
1660 brw_MOV( p, dst, param0 );
1661 }
1662 }
1663 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1664 brw_set_saturate( p, 0 );
1665
1666 release_tmps( c, mark );
1667 }
1668
1669 /* For the four-dimensional case, the little micro-optimisation benefits
1670 we obtain by unrolling all the loops aren't worth the massive bloat it
1671 now causes. Instead, we loop twice around performing a similar operation
1672 to noise3, once for the w=0 cube and once for the w=1, with a bit more
1673 code to glue it all together. */
1674 static void noise4_sub( struct brw_wm_compile *c ) {
1675
1676 struct brw_compile *p = &c->func;
1677 struct brw_reg param[ 4 ],
1678 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1679 w0, /* noise for the w=0 cube */
1680 floors[ 2 ], /* integer coordinates of base corner of hypercube */
1681 interp[ 4 ], /* interpolation coefficients */
1682 t, tmp[ 8 ], /* float temporaries */
1683 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1684 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1685 int i, j;
1686 int mark = mark_tmps( c );
1687 GLuint loop, origin;
1688
1689 x0y0 = alloc_tmp( c );
1690 x0y1 = alloc_tmp( c );
1691 x1y0 = alloc_tmp( c );
1692 x1y1 = alloc_tmp( c );
1693 t = alloc_tmp( c );
1694 w0 = alloc_tmp( c );
1695 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1696 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1697
1698 for( i = 0; i < 4; i++ ) {
1699 param[ i ] = lookup_tmp( c, mark - 5 + i );
1700 interp[ i ] = alloc_tmp( c );
1701 }
1702
1703 for( i = 0; i < 8; i++ ) {
1704 tmp[ i ] = alloc_tmp( c );
1705 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1706 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1707 }
1708
1709 brw_set_access_mode( p, BRW_ALIGN_1 );
1710
1711 /* We only want 16 bits of precision from the integral part of each
1712 co-ordinate, but unfortunately the RNDD semantics would saturate
1713 at 16 bits if we performed the operation directly to a 16-bit
1714 destination. Therefore, we round to 32-bit temporaries where
1715 appropriate, and then store only the lower 16 bits. */
1716 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
1717 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
1718 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
1719 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
1720 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
1721 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
1722
1723 /* Modify the flag register here, because the side effect is useful
1724 later (see below). We know for certain that all flags will be
1725 cleared, since the FRC instruction cannot possibly generate
1726 negative results. Even for exceptional inputs (infinities, denormals,
1727 NaNs), the architecture guarantees that the L conditional is false. */
1728 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
1729 brw_FRC( p, param[ 0 ], param[ 0 ] );
1730 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1731 for( i = 1; i < 4; i++ )
1732 brw_FRC( p, param[ i ], param[ i ] );
1733
1734 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
1735 of all. */
1736 for( i = 0; i < 4; i++ )
1737 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
1738 for( i = 0; i < 4; i++ )
1739 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
1740 for( i = 0; i < 4; i++ )
1741 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1742 for( i = 0; i < 4; i++ )
1743 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
1744 for( j = 0; j < 3; j++ )
1745 for( i = 0; i < 4; i++ )
1746 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1747
1748 /* Mark the current address, as it will be a jump destination. The
1749 following code will be executed twice: first, with the flag
1750 register clear indicating the w=0 case, and second with flags
1751 set for w=1. */
1752 loop = p->nr_insn;
1753
1754 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1755 be hashed. Since we have only 16 bits of precision in the hash, we
1756 must be careful about thorough mixing to maintain entropy as we
1757 squash the input vector into a small scalar. */
1758 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
1759 brw_imm_uw( 0xBC8F ) );
1760 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
1761 brw_imm_uw( 0xD0BD ) );
1762 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
1763 brw_imm_uw( 0x9B93 ) );
1764 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
1765 brw_imm_uw( 0xA359 ) );
1766 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1767 brw_imm_uw( 0xBC8F ) );
1768
1769 /* Temporarily disable the execution mask while we work with ExecSize=16
1770 channels (the mask is set for ExecSize=8 and is probably incorrect).
1771 Although this might cause execution of unwanted channels, the code
1772 writes only to temporary registers and has no side effects, so
1773 disabling the mask is harmless. */
1774 brw_push_insn_state( p );
1775 brw_set_mask_control( p, BRW_MASK_DISABLE );
1776 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1777 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1778 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1779
1780 /* We're now ready to perform the hashing. The eight hashes are
1781 interleaved for performance. The hash function used is
1782 designed to rapidly achieve avalanche and require only 16x16
1783 bit multiplication, and 8-bit swizzles (which we get for
1784 free). */
1785 for( i = 0; i < 4; i++ )
1786 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1787 for( i = 0; i < 4; i++ )
1788 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1789 odd_bytes( wtmp[ i ] ) );
1790 for( i = 0; i < 4; i++ )
1791 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1792 for( i = 0; i < 4; i++ )
1793 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1794 odd_bytes( wtmp[ i ] ) );
1795 brw_pop_insn_state( p );
1796
1797 /* Now we want to initialise the four rear gradients based on the
1798 hashes. Format conversion from signed integer to float leaves
1799 everything scaled too high by a factor of pow( 2, 15 ), but
1800 we correct for that right at the end. */
1801 /* x component */
1802 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1803 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1804 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1805 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1806 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1807
1808 brw_push_insn_state( p );
1809 brw_set_mask_control( p, BRW_MASK_DISABLE );
1810 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1811 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1812 brw_pop_insn_state( p );
1813
1814 brw_MUL( p, x1y0, x1y0, t );
1815 brw_MUL( p, x1y1, x1y1, t );
1816 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1817 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1818 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1819
1820 /* y component */
1821 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1822 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1823 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1824 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1825
1826 brw_push_insn_state( p );
1827 brw_set_mask_control( p, BRW_MASK_DISABLE );
1828 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1829 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1830 brw_pop_insn_state( p );
1831
1832 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1833 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1834 /* prepare t for the w component (used below): w the first time through
1835 the loop; w - 1 the second time) */
1836 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
1837 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
1838 p->current->header.predicate_inverse = 1;
1839 brw_MOV( p, t, param[ 3 ] );
1840 p->current->header.predicate_inverse = 0;
1841 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1842 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
1843 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
1844
1845 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1846 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1847 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1848 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1849
1850 /* z component */
1851 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1852 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1853 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1854 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1855
1856 brw_push_insn_state( p );
1857 brw_set_mask_control( p, BRW_MASK_DISABLE );
1858 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1859 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1860 brw_pop_insn_state( p );
1861
1862 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
1863 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
1864 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
1865 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
1866
1867 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1868 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1869 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1870 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1871
1872 /* w component */
1873 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1874 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1875 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1876 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1877
1878 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1879 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1880 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1881 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1882 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1883
1884 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1885 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1886 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1887 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1888
1889 /* Here we interpolate in the y dimension... */
1890 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1891 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1892 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
1893 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
1894 brw_ADD( p, x0y0, x0y0, x0y1 );
1895 brw_ADD( p, x1y0, x1y0, x1y1 );
1896
1897 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1898 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1899 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
1900 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1901
1902 /* Now do the same thing for the front four gradients... */
1903 /* x component */
1904 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1905 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1906 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1907 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1908
1909 brw_push_insn_state( p );
1910 brw_set_mask_control( p, BRW_MASK_DISABLE );
1911 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1912 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1913 brw_pop_insn_state( p );
1914
1915 brw_MUL( p, x1y0, x1y0, t );
1916 brw_MUL( p, x1y1, x1y1, t );
1917 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1918 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1919 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1920
1921 /* y component */
1922 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1923 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1924 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1925 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1926
1927 brw_push_insn_state( p );
1928 brw_set_mask_control( p, BRW_MASK_DISABLE );
1929 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1930 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1931 brw_pop_insn_state( p );
1932
1933 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1934 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1935 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
1936 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
1937 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
1938
1939 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1940 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1941 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1942 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1943
1944 /* z component */
1945 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1946 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1947 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1948 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1949
1950 brw_push_insn_state( p );
1951 brw_set_mask_control( p, BRW_MASK_DISABLE );
1952 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1953 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1954 brw_pop_insn_state( p );
1955
1956 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1957 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1958 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1959 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1960 /* prepare t for the w component (used below): w the first time through
1961 the loop; w - 1 the second time) */
1962 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
1963 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
1964 p->current->header.predicate_inverse = 1;
1965 brw_MOV( p, t, param[ 3 ] );
1966 p->current->header.predicate_inverse = 0;
1967 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1968
1969 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1970 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1971 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1972 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1973
1974 /* w component */
1975 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1976 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1977 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1978 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1979
1980 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1981 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1982 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1983 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1984
1985 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1986 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1987 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1988 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1989
1990 /* Interpolate in the y dimension: */
1991 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1992 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1993 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
1994 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
1995 brw_ADD( p, x0y0, x0y0, x0y1 );
1996 brw_ADD( p, x1y0, x1y0, x1y1 );
1997
1998 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1999 time put the front face in tmp[ 1 ] and we're nearly there... */
2000 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2001 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2002 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2003
2004 /* Another interpolation, in the z dimension: */
2005 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2006 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2007 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2008
2009 /* Exit the loop if we've computed both cubes... */
2010 origin = p->nr_insn;
2011 brw_push_insn_state( p );
2012 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2013 brw_set_mask_control( p, BRW_MASK_DISABLE );
2014 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2015 brw_pop_insn_state( p );
2016
2017 /* Save the result for the w=0 case, and increment the w coordinate: */
2018 brw_MOV( p, w0, tmp[ 0 ] );
2019 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2020 brw_imm_uw( 1 ) );
2021
2022 /* Loop around for the other cube. Explicitly set the flag register
2023 (unfortunately we must spend an extra instruction to do this: we
2024 can't rely on a side effect of the previous MOV or ADD because
2025 conditional modifiers which are normally true might be false in
2026 exceptional circumstances, e.g. given a NaN input; the add to
2027 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2028 brw_push_insn_state( p );
2029 brw_set_mask_control( p, BRW_MASK_DISABLE );
2030 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2031 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2032 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2033 brw_pop_insn_state( p );
2034
2035 /* Patch the previous conditional branch now that we know the
2036 destination address. */
2037 brw_set_src1( p->store + origin,
2038 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2039
2040 /* The very last interpolation. */
2041 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2042 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2043 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2044
2045 /* scale by pow( 2, -15 ), as described above */
2046 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2047
2048 release_tmps( c, mark );
2049 }
2050
2051 static void emit_noise4( struct brw_wm_compile *c,
2052 struct prog_instruction *inst )
2053 {
2054 struct brw_compile *p = &c->func;
2055 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2056 GLuint mask = inst->DstReg.WriteMask;
2057 int i;
2058 int mark = mark_tmps( c );
2059
2060 assert( mark == 0 );
2061
2062 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
2063 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
2064 src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
2065 src3 = get_src_reg( c, inst->SrcReg, 3, 1 );
2066
2067 param0 = alloc_tmp( c );
2068 param1 = alloc_tmp( c );
2069 param2 = alloc_tmp( c );
2070 param3 = alloc_tmp( c );
2071
2072 brw_MOV( p, param0, src0 );
2073 brw_MOV( p, param1, src1 );
2074 brw_MOV( p, param2, src2 );
2075 brw_MOV( p, param3, src3 );
2076
2077 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2078
2079 /* Fill in the result: */
2080 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2081 for (i = 0 ; i < 4; i++) {
2082 if (mask & (1<<i)) {
2083 dst = get_dst_reg(c, inst, i, 1);
2084 brw_MOV( p, dst, param0 );
2085 }
2086 }
2087 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2088 brw_set_saturate( p, 0 );
2089
2090 release_tmps( c, mark );
2091 }
2092
2093 static void emit_wpos_xy(struct brw_wm_compile *c,
2094 struct prog_instruction *inst)
2095 {
2096 struct brw_compile *p = &c->func;
2097 GLuint mask = inst->DstReg.WriteMask;
2098 struct brw_reg src0[2], dst[2];
2099
2100 dst[0] = get_dst_reg(c, inst, 0, 1);
2101 dst[1] = get_dst_reg(c, inst, 1, 1);
2102
2103 src0[0] = get_src_reg(c, &inst->SrcReg[0], 0, 1);
2104 src0[1] = get_src_reg(c, &inst->SrcReg[0], 1, 1);
2105
2106 /* Calculate the pixel offset from window bottom left into destination
2107 * X and Y channels.
2108 */
2109 if (mask & WRITEMASK_X) {
2110 /* X' = X - origin_x */
2111 brw_ADD(p,
2112 dst[0],
2113 retype(src0[0], BRW_REGISTER_TYPE_W),
2114 brw_imm_d(0 - c->key.origin_x));
2115 }
2116
2117 if (mask & WRITEMASK_Y) {
2118 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2119 brw_ADD(p,
2120 dst[1],
2121 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2122 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2123 }
2124 }
2125
/* TODO
 * BIAS on SIMD8 not working yet...
 */
2129 static void emit_txb(struct brw_wm_compile *c,
2130 struct prog_instruction *inst)
2131 {
2132 struct brw_compile *p = &c->func;
2133 struct brw_reg dst[4], src[4], payload_reg;
2134 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2135
2136 GLuint i;
2137 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2138 for (i = 0; i < 4; i++)
2139 dst[i] = get_dst_reg(c, inst, i, 1);
2140 for (i = 0; i < 4; i++)
2141 src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
2142
2143 switch (inst->TexSrcTarget) {
2144 case TEXTURE_1D_INDEX:
2145 brw_MOV(p, brw_message_reg(2), src[0]);
2146 brw_MOV(p, brw_message_reg(3), brw_imm_f(0));
2147 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2148 break;
2149 case TEXTURE_2D_INDEX:
2150 case TEXTURE_RECT_INDEX:
2151 brw_MOV(p, brw_message_reg(2), src[0]);
2152 brw_MOV(p, brw_message_reg(3), src[1]);
2153 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2154 break;
2155 default:
2156 brw_MOV(p, brw_message_reg(2), src[0]);
2157 brw_MOV(p, brw_message_reg(3), src[1]);
2158 brw_MOV(p, brw_message_reg(4), src[2]);
2159 break;
2160 }
2161 brw_MOV(p, brw_message_reg(5), src[3]);
2162 brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
2163 brw_SAMPLE(p,
2164 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
2165 1,
2166 retype(payload_reg, BRW_REGISTER_TYPE_UW),
2167 unit + MAX_DRAW_BUFFERS, /* surface */
2168 unit, /* sampler */
2169 inst->DstReg.WriteMask,
2170 BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
2171 4,
2172 4,
2173 0);
2174 }
2175
2176 static void emit_tex(struct brw_wm_compile *c,
2177 struct prog_instruction *inst)
2178 {
2179 struct brw_compile *p = &c->func;
2180 struct brw_reg dst[4], src[4], payload_reg;
2181 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2182
2183 GLuint msg_len;
2184 GLuint i, nr;
2185 GLuint emit;
2186 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2187
2188 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2189
2190 for (i = 0; i < 4; i++)
2191 dst[i] = get_dst_reg(c, inst, i, 1);
2192 for (i = 0; i < 4; i++)
2193 src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
2194
2195
2196 switch (inst->TexSrcTarget) {
2197 case TEXTURE_1D_INDEX:
2198 emit = WRITEMASK_X;
2199 nr = 1;
2200 break;
2201 case TEXTURE_2D_INDEX:
2202 case TEXTURE_RECT_INDEX:
2203 emit = WRITEMASK_XY;
2204 nr = 2;
2205 break;
2206 default:
2207 emit = WRITEMASK_XYZ;
2208 nr = 3;
2209 break;
2210 }
2211 msg_len = 1;
2212
2213 for (i = 0; i < nr; i++) {
2214 static const GLuint swz[4] = {0,1,2,2};
2215 if (emit & (1<<i))
2216 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2217 else
2218 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2219 msg_len += 1;
2220 }
2221
2222 if (shadow) {
2223 brw_MOV(p, brw_message_reg(5), brw_imm_f(0));
2224 brw_MOV(p, brw_message_reg(6), src[2]);
2225 }
2226
2227 brw_SAMPLE(p,
2228 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
2229 1,
2230 retype(payload_reg, BRW_REGISTER_TYPE_UW),
2231 unit + MAX_DRAW_BUFFERS, /* surface */
2232 unit, /* sampler */
2233 inst->DstReg.WriteMask,
2234 BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
2235 4,
2236 shadow ? 6 : 4,
2237 0);
2238
2239 if (shadow)
2240 brw_MOV(p, dst[3], brw_imm_f(1.0));
2241 }
2242
2243 static void post_wm_emit( struct brw_wm_compile *c )
2244 {
2245 GLuint nr_insns = c->fp->program.Base.NumInstructions;
2246 GLuint insn, target_insn;
2247 struct prog_instruction *inst1, *inst2;
2248 struct brw_instruction *brw_inst1, *brw_inst2;
2249 int offset;
2250 for (insn = 0; insn < nr_insns; insn++) {
2251 inst1 = &c->fp->program.Base.Instructions[insn];
2252 brw_inst1 = inst1->Data;
2253 switch (inst1->Opcode) {
2254 case OPCODE_CAL:
2255 target_insn = inst1->BranchTarget;
2256 inst2 = &c->fp->program.Base.Instructions[target_insn];
2257 brw_inst2 = inst2->Data;
2258 offset = brw_inst2 - brw_inst1;
2259 brw_set_src1(brw_inst1, brw_imm_d(offset*16));
2260 break;
2261 default:
2262 break;
2263 }
2264 }
2265 }
2266
2267 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2268 {
2269 #define MAX_IFSN 32
2270 #define MAX_LOOP_DEPTH 32
2271 struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2272 struct brw_instruction *inst0, *inst1;
2273 int i, if_insn = 0, loop_insn = 0;
2274 struct brw_compile *p = &c->func;
2275 struct brw_indirect stack_index = brw_indirect(0, 0);
2276
2277 c->reg_index = 0;
2278 prealloc_reg(c);
2279 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2280 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2281
2282 for (i = 0; i < c->nr_fp_insns; i++) {
2283 struct prog_instruction *inst = &c->prog_instructions[i];
2284 struct prog_instruction *orig_inst;
2285
2286 if ((orig_inst = inst->Data) != 0)
2287 orig_inst->Data = current_insn(p);
2288
2289 if (inst->CondUpdate)
2290 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2291 else
2292 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2293
2294 switch (inst->Opcode) {
2295 case WM_PIXELXY:
2296 emit_pixel_xy(c, inst);
2297 break;
2298 case WM_DELTAXY:
2299 emit_delta_xy(c, inst);
2300 break;
2301 case WM_PIXELW:
2302 emit_pixel_w(c, inst);
2303 break;
2304 case WM_LINTERP:
2305 emit_linterp(c, inst);
2306 break;
2307 case WM_PINTERP:
2308 emit_pinterp(c, inst);
2309 break;
2310 case WM_CINTERP:
2311 emit_cinterp(c, inst);
2312 break;
2313 case WM_WPOSXY:
2314 emit_wpos_xy(c, inst);
2315 break;
2316 case WM_FB_WRITE:
2317 emit_fb_write(c, inst);
2318 break;
2319 case OPCODE_ABS:
2320 emit_abs(c, inst);
2321 break;
2322 case OPCODE_ADD:
2323 emit_add(c, inst);
2324 break;
2325 case OPCODE_SUB:
2326 emit_sub(c, inst);
2327 break;
2328 case OPCODE_FRC:
2329 emit_frc(c, inst);
2330 break;
2331 case OPCODE_FLR:
2332 emit_flr(c, inst);
2333 break;
2334 case OPCODE_LRP:
2335 emit_lrp(c, inst);
2336 break;
2337 case OPCODE_TRUNC:
2338 emit_trunc(c, inst);
2339 break;
2340 case OPCODE_MOV:
2341 emit_mov(c, inst);
2342 break;
2343 case OPCODE_DP3:
2344 emit_dp3(c, inst);
2345 break;
2346 case OPCODE_DP4:
2347 emit_dp4(c, inst);
2348 break;
2349 case OPCODE_XPD:
2350 emit_xpd(c, inst);
2351 break;
2352 case OPCODE_DPH:
2353 emit_dph(c, inst);
2354 break;
2355 case OPCODE_RCP:
2356 emit_rcp(c, inst);
2357 break;
2358 case OPCODE_RSQ:
2359 emit_rsq(c, inst);
2360 break;
2361 case OPCODE_SIN:
2362 emit_sin(c, inst);
2363 break;
2364 case OPCODE_COS:
2365 emit_cos(c, inst);
2366 break;
2367 case OPCODE_EX2:
2368 emit_ex2(c, inst);
2369 break;
2370 case OPCODE_LG2:
2371 emit_lg2(c, inst);
2372 break;
2373 case OPCODE_MAX:
2374 emit_max(c, inst);
2375 break;
2376 case OPCODE_MIN:
2377 emit_min(c, inst);
2378 break;
2379 case OPCODE_DDX:
2380 emit_ddx(c, inst);
2381 break;
2382 case OPCODE_DDY:
2383 emit_ddy(c, inst);
2384 break;
2385 case OPCODE_SLT:
2386 emit_slt(c, inst);
2387 break;
2388 case OPCODE_SLE:
2389 emit_sle(c, inst);
2390 break;
2391 case OPCODE_SGT:
2392 emit_sgt(c, inst);
2393 break;
2394 case OPCODE_SGE:
2395 emit_sge(c, inst);
2396 break;
2397 case OPCODE_SEQ:
2398 emit_seq(c, inst);
2399 break;
2400 case OPCODE_SNE:
2401 emit_sne(c, inst);
2402 break;
2403 case OPCODE_MUL:
2404 emit_mul(c, inst);
2405 break;
2406 case OPCODE_POW:
2407 emit_pow(c, inst);
2408 break;
2409 case OPCODE_MAD:
2410 emit_mad(c, inst);
2411 break;
2412 case OPCODE_NOISE1:
2413 emit_noise1(c, inst);
2414 break;
2415 case OPCODE_NOISE2:
2416 emit_noise2(c, inst);
2417 break;
2418 case OPCODE_NOISE3:
2419 emit_noise3(c, inst);
2420 break;
2421 case OPCODE_NOISE4:
2422 emit_noise4(c, inst);
2423 break;
2424 case OPCODE_TEX:
2425 emit_tex(c, inst);
2426 break;
2427 case OPCODE_TXB:
2428 emit_txb(c, inst);
2429 break;
2430 case OPCODE_KIL_NV:
2431 emit_kil(c);
2432 break;
2433 case OPCODE_IF:
2434 assert(if_insn < MAX_IFSN);
2435 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2436 break;
2437 case OPCODE_ELSE:
2438 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
2439 break;
2440 case OPCODE_ENDIF:
2441 assert(if_insn > 0);
2442 brw_ENDIF(p, if_inst[--if_insn]);
2443 break;
2444 case OPCODE_BGNSUB:
2445 case OPCODE_ENDSUB:
2446 break;
2447 case OPCODE_CAL:
2448 brw_push_insn_state(p);
2449 brw_set_mask_control(p, BRW_MASK_DISABLE);
2450 brw_set_access_mode(p, BRW_ALIGN_1);
2451 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2452 brw_set_access_mode(p, BRW_ALIGN_16);
2453 brw_ADD(p, get_addr_reg(stack_index),
2454 get_addr_reg(stack_index), brw_imm_d(4));
2455 orig_inst = inst->Data;
2456 orig_inst->Data = &p->store[p->nr_insn];
2457 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2458 brw_pop_insn_state(p);
2459 break;
2460
2461 case OPCODE_RET:
2462 brw_push_insn_state(p);
2463 brw_set_mask_control(p, BRW_MASK_DISABLE);
2464 brw_ADD(p, get_addr_reg(stack_index),
2465 get_addr_reg(stack_index), brw_imm_d(-4));
2466 brw_set_access_mode(p, BRW_ALIGN_1);
2467 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2468 brw_set_access_mode(p, BRW_ALIGN_16);
2469 brw_pop_insn_state(p);
2470
2471 break;
2472 case OPCODE_BGNLOOP:
2473 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2474 break;
2475 case OPCODE_BRK:
2476 brw_BREAK(p);
2477 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2478 break;
2479 case OPCODE_CONT:
2480 brw_CONT(p);
2481 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2482 break;
2483 case OPCODE_ENDLOOP:
2484 loop_insn--;
2485 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2486 /* patch all the BREAK instructions from
2487 last BEGINLOOP */
2488 while (inst0 > loop_inst[loop_insn]) {
2489 inst0--;
2490 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2491 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2492 inst0->bits3.if_else.pop_count = 0;
2493 } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2494 inst0->bits3.if_else.jump_count = inst1 - inst0;
2495 inst0->bits3.if_else.pop_count = 0;
2496 }
2497 }
2498 break;
2499 default:
2500 _mesa_printf("unsupported IR in fragment shader %d\n",
2501 inst->Opcode);
2502 }
2503 if (inst->CondUpdate)
2504 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2505 else
2506 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2507 }
2508 post_wm_emit(c);
2509 for (i = 0; i < c->fp->program.Base.NumInstructions; i++)
2510 c->fp->program.Base.Instructions[i].Data = NULL;
2511 }
2512
2513 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2514 {
2515 brw_wm_pass_fp(c);
2516 brw_wm_emit_glsl(brw, c);
2517 c->prog_data.total_grf = c->reg_index;
2518 c->prog_data.total_scratch = 0;
2519 }