+/* We shouldn't execute TEXLOD if any of the pixels in a quad have
+ * different LOD values, so branch off groups of equal LOD.
+ */
+static void
+emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod,
+ struct nv50_reg *src, struct nv50_program_exec *tex)
+{
+ struct nv50_program_exec *join_at;
+ unsigned i, target = pc->p->exec_size + 7 * 2;
+
+ /* Subtract lod of each pixel from lod of top left pixel, jump
+ * texlod insn if result is 0, then repeat for 2 other pixels.
+ */
+ emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55);
+ emit_branch(pc, 0, 2, &join_at)->param.index = target;
+
+ for (i = 1; i < 4; ++i) {
+ emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55);
+ emit_branch(pc, 0, 2, NULL)->param.index = target;
+ }
+
+ emit_mov(pc, tlod, src); /* target */
+ emit(pc, tex); /* texlod */
+
+ join_at->param.index = target + 2 * 2;
+ emit_nop(pc);
+ pc->p->exec_tail->inst[1] |= 2; /* join _after_ tex */
+}
+
+static void
+emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg,
+ struct nv50_program_exec *tex)
+{
+ struct nv50_program_exec *e;
+ struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL);
+ int r_pred = 0;
+ unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 };
+
+ pc->allow32 = FALSE;
+ ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4);
+
+ /* Subtract bias value of thread i from bias values of each thread,
+ * store result in r_pred, and set bit i in r_bits if result was 0.
+ */
+ assert(arg < 4);
+ for (i = 0; i < 4; ++i, ++imm_1248.hw) {
+ emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55);
+ emit_mov(pc, r_bits, &imm_1248);
+ set_pred(pc, 2, r_pred, pc->p->exec_tail);
+ }
+ emit_mov_to_pred(pc, r_pred, r_bits);
+
+ /* The lanes of a quad are now grouped by the bit in r_pred they have
+ * set. Put the input values for TEX into a new register set for each
+ * group and execute TEX only for a specific group.
+ * We cannot use the same register set for each group because we need
+ * the derivatives, which are implicitly calculated, to be correct.
+ */
+ for (i = 1; i < 4; ++i) {
+ alloc_temp4(pc, t123[i], 0);
+
+ for (c = 0; c <= arg; ++c)
+ emit_mov(pc, t123[i][c], t[c]);
+
+ *(e = exec(pc)) = *(tex);
+ e->inst[0] &= ~0x01fc;
+ set_dst(pc, t123[i][0], e);
+ set_pred(pc, cc[i], r_pred, e);
+ emit(pc, e);
+ }
+ /* finally TEX on the original regs (where we kept the input) */
+ set_pred(pc, cc[0], r_pred, tex);
+ emit(pc, tex);
+
+ /* put the 3 * n other results into regs for lane 0 */
+ n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc));
+ for (i = 1; i < 4; ++i) {
+ for (c = 0; c < n; ++c) {
+ emit_mov(pc, t[c], t123[i][c]);
+ set_pred(pc, cc[i], r_pred, pc->p->exec_tail);
+ }
+ free_temp4(pc, t123[i]);
+ }
+
+ emit_nop(pc);
+ free_temp(pc, r_bits);
+}
+